vc=68: audit-fix sprint round 1 (11 HIGH + MED batch)
Block B — enrichment lifecycle drift:
* SubscriptionFeedViewModel tracks enrichJob, cancelled in refresh
+ clearInMemoryCache so spam-refresh and cache-toggle no longer
leave a GlobalScope coroutine writing to a destroyed _ui
* Enrich now runs on viewModelScope, channels snapshotted at job
start so the terminal merge doesn't read a stale subs list
* mergeFromCache moved off Main on both the refresh path AND the
init-hydration path — 750-item flatMap+sort+regex no longer
blocks the UI thread
* VideoDetailViewModel dual loadedUrl bookkeeping collapsed to
the UiState field only; the rejected-URL path also stamps
loadedUrl so the gate reads coherently
Block A — auto-update authenticity:
* AppUpdateClient pins the fdroid.sulkta.com leaf SPKI + the
Let's Encrypt E7 intermediate via OkHttp CertificatePinner
* file.name accepted only when matching ^/[A-Za-z0-9._-]+\.apk$
* versionCode clamped to (0, 10_000_000] before we trust the
'update available' notification — a hostile index can no longer
pin us to MAX_VALUE
Block C — captureResumePosition perf:
* ResumePositionsStore.record short-circuits when the existing
entry matches position+duration so the 5s poll's
before !== next guard actually skips the SP write
* JSON encode + SP write moved off Main via GlobalScope IO
Block D — Rust feed.rs hardening:
* Shared reqwest Client via OnceLock — 50 channels no longer
pay 50 TLS handshakes
* Response body capped at 2 MiB via bytes_stream — adversarial
feeds can't OOM the JVM
* parse_rss returns partial results on quick-xml errors instead
of nuking everything already parsed
* extract_channel_id widened (m./www./http(s)?/trailing path)
and validates exact 24-char UC<22 base64-ish>
* Skip entries with empty title/published
* iso_to_relative future dates → 'just now' (clock skew
no longer pins items to top)
* civil_to_days year clamp 1970..=2200 before the i64 arithmetic
* Redirect chain capped at 3
* Dropped the broken lexicographic sort on upload_date_relative
* Cap parsed entries at 50 per channel
MED batch:
* ThumbnailProgressOverlay uses derivedStateOf so only rows
whose specific entry changed recompose on the 5s positions tick
* EnrichmentStore.put short-circuits on identical view+duration
so re-enrich within TTL doesn't write SP
* EnrichmentStore.load prunes TTL-expired entries on hydration
* FeedRefreshWorker distinguishes transient (Result.retry) from
parse (Result.success) failures
* WorkManager interval coerceAtLeast(15L) on both schedulers
This commit is contained in:
parent
796244e065
commit
c960a1f424
11 changed files with 385 additions and 83 deletions
|
|
@ -13,6 +13,7 @@
|
|||
// the full stream_info path to fetch the rich metadata when actually
|
||||
// needed.
|
||||
|
||||
use std::sync::OnceLock;
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::stream::{self, StreamExt};
|
||||
|
|
@ -24,6 +25,23 @@ use crate::search::SearchItem;
|
|||
const RSS_BASE: &str = "https://www.youtube.com/feeds/videos.xml?channel_id=";
|
||||
const MAX_CONCURRENT: usize = 50;
|
||||
const PER_CHANNEL_TIMEOUT_S: u64 = 8;
|
||||
/// Cap on the body bytes we'll read for a single RSS fetch. Real YT
|
||||
/// Atom feeds are ~5-30 KB; 2 MiB leaves comfortable headroom while
|
||||
/// blocking a hostile or compromised host from streaming GB-scale
|
||||
/// bodies into JVM memory inside the 8s timeout. Round-67 audit
|
||||
/// rust-HIGH-5.
|
||||
const RSS_MAX_BYTES: usize = 2 * 1024 * 1024;
|
||||
/// Cap on parsed entries per channel — RSS normally returns 15.
|
||||
/// 50 leaves headroom for one-off legitimate variance; anything
|
||||
/// past that is a sign the feed isn't what we expect.
|
||||
/// Round-67 audit rust-MED-6.
|
||||
const RSS_MAX_ENTRIES: usize = 50;
|
||||
/// Year range we trust civil-to-days math for. Strawcore RSS only
|
||||
/// emits real-world recent uploads; clamping here turns adversarial
|
||||
/// year fields into a parse failure rather than i64 overflow.
|
||||
/// Round-67 audit rust-CRIT-1.
|
||||
const YEAR_MIN: i32 = 1970;
|
||||
const YEAR_MAX: i32 = 2200;
|
||||
|
||||
/// Hybrid-backfill metadata: just the two fields RSS doesn't return
|
||||
/// (view count + duration). Kotlin calls this lazily for visible feed
|
||||
|
|
@ -55,6 +73,28 @@ pub async fn enrich_feed_item(
|
|||
})
|
||||
}
|
||||
|
||||
/// Shared reqwest Client — DNS resolver + TLS keepalive + connection
|
||||
/// pool live here so a 50-channel fan-out reuses one pool instead of
|
||||
/// paying 50 handshakes. Round-67 audit rust-HIGH-4.
|
||||
static RSS_CLIENT: OnceLock<Client> = OnceLock::new();
|
||||
|
||||
fn rss_client() -> Result<&'static Client, StrawcoreError> {
|
||||
if let Some(c) = RSS_CLIENT.get() {
|
||||
return Ok(c);
|
||||
}
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(PER_CHANNEL_TIMEOUT_S))
|
||||
.user_agent(concat!("Mozilla/5.0 (Android; Mobile; Straw/", env!("CARGO_PKG_VERSION"), ")"))
|
||||
// Cap redirect chains so a misconfigured/hostile feed can't
|
||||
// spin a server out of our 8s budget. Round-67 audit rust-LOW-8.
|
||||
.redirect(reqwest::redirect::Policy::limited(3))
|
||||
.build()
|
||||
.map_err(|e| StrawcoreError::Extractor {
|
||||
msg: format!("http client build: {e}"),
|
||||
})?;
|
||||
Ok(RSS_CLIENT.get_or_init(|| client))
|
||||
}
|
||||
|
||||
/// Single-channel RSS — Kotlin keeps its per-channel cache + fan-out
|
||||
/// (parallelism cranked to 50 in the wrapper). Each call is ~50-150ms
|
||||
/// instead of the ~500ms channelInfo page-scrape, so a 50-sub refresh
|
||||
|
|
@ -65,14 +105,8 @@ pub async fn channel_feed_rss(
|
|||
) -> Result<Vec<SearchItem>, StrawcoreError> {
|
||||
crate::runtime::ensure_initialized();
|
||||
log::info!("strawcore::channel_feed_rss url_len={}", channel_url.len());
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(PER_CHANNEL_TIMEOUT_S))
|
||||
.user_agent("Mozilla/5.0 (Android; Mobile; Straw/0.1)")
|
||||
.build()
|
||||
.map_err(|e| StrawcoreError::Extractor {
|
||||
msg: format!("http client build: {e}"),
|
||||
})?;
|
||||
Ok(fetch_channel_rss(&client, &channel_url).await.unwrap_or_default())
|
||||
let client = rss_client()?;
|
||||
Ok(fetch_channel_rss(client, &channel_url).await.unwrap_or_default())
|
||||
}
|
||||
|
||||
/// Bulk subscription feed fan-out — for callers that want one round-trip
|
||||
|
|
@ -88,68 +122,109 @@ pub async fn subscription_feed(
|
|||
if channel_urls.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(PER_CHANNEL_TIMEOUT_S))
|
||||
.user_agent("Mozilla/5.0 (Android; Mobile; Straw/0.1)")
|
||||
.build()
|
||||
.map_err(|e| StrawcoreError::Extractor {
|
||||
msg: format!("http client build: {e}"),
|
||||
})?;
|
||||
let client = rss_client()?;
|
||||
|
||||
let results: Vec<Vec<SearchItem>> = stream::iter(channel_urls.into_iter())
|
||||
.map(|url| {
|
||||
let client = client.clone();
|
||||
async move { fetch_channel_rss(&client, &url).await.unwrap_or_default() }
|
||||
})
|
||||
.map(|url| async move { fetch_channel_rss(client, &url).await.unwrap_or_default() })
|
||||
.buffer_unordered(MAX_CONCURRENT)
|
||||
.collect()
|
||||
.await;
|
||||
|
||||
let mut flat: Vec<SearchItem> = results.into_iter().flatten().collect();
|
||||
// Newest first by published timestamp baked into the upload_date_relative
|
||||
// field at parse time — RSS already returns entries newest-first per
|
||||
// channel so we mostly just need cross-channel interleave.
|
||||
flat.sort_by(|a, b| b.upload_date_relative.cmp(&a.upload_date_relative));
|
||||
Ok(flat)
|
||||
// Per-channel ordering is RSS-served-newest-first. Cross-channel
|
||||
// interleave is the caller's responsibility — Kotlin's mergeFromCache
|
||||
// sorts by parsed recency, which is the source of truth. Returning
|
||||
// the flat list as-is. (vc=66 prior code sorted lexicographically
|
||||
// on the relative-date STRING, which is wrong because "10 hours
|
||||
// ago" < "2 hours ago" in cmp order — round-67 audit rust-HIGH-6.)
|
||||
Ok(results.into_iter().flatten().collect())
|
||||
}
|
||||
|
||||
async fn fetch_channel_rss(client: &Client, channel_url: &str) -> Option<Vec<SearchItem>> {
|
||||
let channel_id = extract_channel_id(channel_url)?;
|
||||
let url = format!("{RSS_BASE}{channel_id}");
|
||||
let body = client
|
||||
let resp = client
|
||||
.get(&url)
|
||||
.send()
|
||||
.await
|
||||
.ok()?
|
||||
.error_for_status()
|
||||
.ok()?
|
||||
.text()
|
||||
.await
|
||||
.ok()?;
|
||||
// Streaming body read with a hard byte cap — `.text()` reads
|
||||
// unbounded into a String. Round-67 audit rust-HIGH-5.
|
||||
let body = read_capped_body(resp).await?;
|
||||
parse_rss(&body, channel_id)
|
||||
}
|
||||
|
||||
/// Extract the `UCxxx` channel ID from a channel URL. Handles the
|
||||
/// common shapes:
|
||||
/// Drain a reqwest Response into a String, bailing out (return None) if
|
||||
/// the body exceeds RSS_MAX_BYTES. Round-67 audit rust-HIGH-5.
|
||||
async fn read_capped_body(resp: reqwest::Response) -> Option<String> {
|
||||
use futures::StreamExt;
|
||||
let mut total = 0usize;
|
||||
let mut buf: Vec<u8> = Vec::with_capacity(32 * 1024);
|
||||
let mut stream = resp.bytes_stream();
|
||||
while let Some(chunk_result) = stream.next().await {
|
||||
let chunk = chunk_result.ok()?;
|
||||
total = total.saturating_add(chunk.len());
|
||||
if total > RSS_MAX_BYTES {
|
||||
log::warn!("strawcore::rss body exceeded {RSS_MAX_BYTES} bytes; aborting");
|
||||
return None;
|
||||
}
|
||||
buf.extend_from_slice(&chunk);
|
||||
}
|
||||
String::from_utf8(buf).ok()
|
||||
}
|
||||
|
||||
/// Extract the `UCxxx` channel ID from a channel URL. Accepts the
/// shapes the Android app actually has in Subscriptions plus the ones
/// users paste from share intents:
///   * `https://www.youtube.com/channel/UCxxx...`
///   * `https://youtube.com/channel/UCxxx...`
///   * `http(s)://m.youtube.com/channel/UCxxx...`
///   * trailing `/videos`, `?si=...`, `#...` — anything after the ID is dropped
///   * raw `UCxxx...` (already an ID)
///
/// The scheme/host/`/channel/` prefix is matched case-insensitively and
/// may appear anywhere in the input, so text surrounding a pasted URL is
/// tolerated. The ID itself is taken verbatim from the input. URLs
/// without a `/channel/` segment are not matched.
///
/// Real YT channel IDs are EXACTLY 24 chars (`UC` + 22 base64-ish).
/// Round-67 audit rust-HIGH-1.
///
/// `@handle` URLs are NOT supported here — RSS requires the channel ID.
/// Callers with @handles should resolve via channel_info() once and
/// cache the ID into Subscriptions.
///
/// Returns `None` when no well-formed channel ID can be extracted.
fn extract_channel_id(input: &str) -> Option<String> {
    // One table instead of a ladder of strip_prefix calls; covers
    // http/https and the bare, www. and m. hosts.
    const PREFIXES: &[&str] = &[
        "https://www.youtube.com/channel/",
        "https://youtube.com/channel/",
        "https://m.youtube.com/channel/",
        "http://www.youtube.com/channel/",
        "http://youtube.com/channel/",
        "http://m.youtube.com/channel/",
    ];

    let trimmed = input.trim();
    // ASCII-only lowercasing keeps every byte offset identical to
    // `trimmed`. Full Unicode `to_lowercase()` can change the byte length
    // (e.g. 'İ' grows from 2 to 3 bytes), which would make the offset
    // found below point at the wrong place in `trimmed` — or past its end.
    let lowered = trimmed.to_ascii_lowercase();

    for prefix in PREFIXES {
        if let Some(idx) = lowered.find(prefix) {
            // Slice the ORIGINAL string: channel IDs are case-sensitive.
            let rest = &trimmed[idx + prefix.len()..];
            // Drop any trailing path, query string or fragment.
            let id = rest
                .split(|c: char| c == '/' || c == '?' || c == '#')
                .next()?;
            return validate_channel_id(id);
        }
    }

    // No URL prefix found: the input may already be a bare channel ID.
    validate_channel_id(trimmed)
}

/// A real YouTube channel ID is `UC` followed by exactly 22 chars from
/// `[A-Za-z0-9_-]`. Round-67 audit rust-HIGH-1.
///
/// Returns the ID as an owned `String` when it is well-formed, `None`
/// otherwise.
fn validate_channel_id(id: &str) -> Option<String> {
    let tail = id.strip_prefix("UC")?;
    // Byte length is the right measure here: every accepted byte is ASCII,
    // so 22 bytes means exactly 22 characters.
    let well_formed = tail.len() == 22
        && tail
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-');
    well_formed.then(|| id.to_string())
}
|
||||
|
||||
fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
|
||||
|
|
@ -242,7 +317,11 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
|
|||
let name = e.name();
|
||||
let local = local_name(name.as_ref());
|
||||
if local == "entry" {
|
||||
if !video_id.is_empty() {
|
||||
// Skip entries missing the load-bearing fields —
|
||||
// an empty title renders as a blank card the user
|
||||
// can't tap, and an empty published collapses the
|
||||
// recency sort. Round-67 audit rust-HIGH-2.
|
||||
if !video_id.is_empty() && !title.is_empty() && !published.is_empty() {
|
||||
items.push(SearchItem {
|
||||
url: format!("https://www.youtube.com/watch?v={video_id}"),
|
||||
title: title.clone(),
|
||||
|
|
@ -266,6 +345,12 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
|
|||
// first in the fan-out. Caught 2026-05-26.
|
||||
upload_date_relative: iso_to_relative(&published),
|
||||
});
|
||||
if items.len() >= RSS_MAX_ENTRIES {
|
||||
// Defense-in-depth against a feed that
|
||||
// ships thousands of <entry> blocks.
|
||||
// Round-67 audit rust-MED-6.
|
||||
return Some(items);
|
||||
}
|
||||
}
|
||||
in_entry = false;
|
||||
depth = 0;
|
||||
|
|
@ -275,7 +360,15 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
|
|||
text_target = None;
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
Err(_) => return None,
|
||||
// Partial-parse on error: return whatever we've already
|
||||
// collected rather than throwing the whole batch away.
|
||||
// A truncated body (EOF mid-stream on a flaky network)
|
||||
// would otherwise silently disappear the channel.
|
||||
// Round-67 audit rust-CRIT-3.
|
||||
Err(e) => {
|
||||
log::warn!("strawcore::rss parse error after {} items: {e}", items.len());
|
||||
return Some(items);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
|
|
@ -307,7 +400,16 @@ fn iso_to_relative(iso: &str) -> String {
|
|||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map(|d| d.as_secs() as i64)
|
||||
.unwrap_or(0);
|
||||
format_relative(now_secs.saturating_sub(secs))
|
||||
// A device with a skewed clock can see RSS timestamps as future-
|
||||
// dated. saturating_sub returns 0 → "0 seconds ago" → sorts to
|
||||
// top, which is the LTT/WTYP-recurrence vector. Treat future
|
||||
// dates as "just now" so the relative-string sort behaves and
|
||||
// a single skewed item doesn't pin itself at the top of the
|
||||
// feed. Round-67 audit rust-HIGH-7.
|
||||
if secs > now_secs {
|
||||
return "just now".to_string();
|
||||
}
|
||||
format_relative(now_secs - secs)
|
||||
}
|
||||
|
||||
fn parse_rfc3339_secs(s: &str) -> Option<i64> {
|
||||
|
|
@ -327,6 +429,13 @@ fn parse_rfc3339_secs(s: &str) -> Option<i64> {
|
|||
let hh: u32 = time_parts.next()?.parse().ok()?;
|
||||
let mm: u32 = time_parts.next()?.parse().ok()?;
|
||||
let ss: u32 = time_parts.next()?.parse().ok()?;
|
||||
// Year clamp BEFORE civil_to_days — out-of-range years overflow
|
||||
// the era arithmetic (panic in debug, wrap in release). A hostile feed
|
||||
// serving year=2147483647 must not produce junk timestamps.
|
||||
// Round-67 audit rust-CRIT-1.
|
||||
if !(YEAR_MIN..=YEAR_MAX).contains(&y) {
|
||||
return None;
|
||||
}
|
||||
if !(1..=12).contains(&m) || !(1..=31).contains(&d) || hh > 23 || mm > 59 || ss > 60 {
|
||||
return None;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue