vc=68: audit-fix sprint round 1 (11 HIGH + MED batch)

Block B — enrichment lifecycle drift:
  * SubscriptionFeedViewModel tracks enrichJob, cancelled in refresh
    + clearInMemoryCache so spam-refresh and cache-toggle no longer
    leave a globalScope coroutine writing to a destroyed _ui
  * Enrich now runs on viewModelScope, channels snapshotted at job
    start so the terminal merge doesn't read a stale subs list
  * mergeFromCache moved off Main on both the refresh path AND the
    init-hydration path — 750-item flatMap+sort+regex no longer
    blocks the UI thread
  * VideoDetailViewModel dual loadedUrl bookkeeping collapsed to
    the UiState field only; the rejected-URL path also stamps
    loadedUrl so the gate reads coherently

Block A — auto-update authenticity:
  * AppUpdateClient pins the fdroid.sulkta.com leaf SPKI + the
    Let's Encrypt E7 intermediate via OkHttp CertificatePinner
  * file.name accepted only when matching ^/[A-Za-z0-9._-]+\.apk$
  * versionCode clamped to (0, 10_000_000] before we trust the
    'update available' notification — a hostile index can no longer
    pin us to MAX_VALUE

Block C — captureResumePosition perf:
  * ResumePositionsStore.record short-circuits when the existing
    entry matches position+duration so the 5s poll's
    before !== next guard actually skips the SP write
  * JSON encode + SP write off Main via globalScope IO

Block D — Rust feed.rs hardening:
  * Shared reqwest Client via OnceLock — 50 channels no longer
    pay 50 TLS handshakes
  * Response body capped at 2 MiB via bytes_stream — adversarial
    feeds can't OOM the JVM
  * parse_rss returns partial results on quick-xml errors instead
    of nuking everything already parsed
  * extract_channel_id widened (m./www./http(s)?/trailing path)
    and validates exact 24-char UC<22 base64-ish>
  * Skip entries with empty title/published
  * iso_to_relative future dates → 'just now' (clock skew
    no longer pins items to top)
  * civil_to_days year clamp 1970..=2200 before the i64 arithmetic
  * Redirect chain capped at 3
  * Dropped the broken lexicographic sort on upload_date_relative
  * Cap parsed entries at 50 per channel

MED batch:
  * ThumbnailProgressOverlay uses derivedStateOf so only rows
    whose specific entry changed recompose on the 5s positions tick
  * EnrichmentStore.put short-circuits on identical view+duration
    so re-enrich within TTL doesn't write SP
  * EnrichmentStore.load prunes TTL-expired entries on hydration
  * FeedRefreshWorker distinguishes transient (Result.retry) from
    parse (Result.success) failures
  * WorkManager interval coerceAtLeast(15L) on both schedulers
This commit is contained in:
Kayos 2026-05-26 20:53:25 -07:00
parent 796244e065
commit c960a1f424
11 changed files with 385 additions and 83 deletions

View file

@ -13,6 +13,7 @@
// the full stream_info path to fetch the rich metadata when actually
// needed.
use std::sync::OnceLock;
use std::time::Duration;
use futures::stream::{self, StreamExt};
@ -24,6 +25,23 @@ use crate::search::SearchItem;
const RSS_BASE: &str = "https://www.youtube.com/feeds/videos.xml?channel_id=";
const MAX_CONCURRENT: usize = 50;
const PER_CHANNEL_TIMEOUT_S: u64 = 8;
/// Cap on the body bytes we'll read for a single RSS fetch; enforced
/// by `read_capped_body`, which aborts the fetch once the running
/// total exceeds this. Real YT Atom feeds are ~5-30 KB; 2 MiB leaves
/// comfortable headroom while blocking a hostile or compromised host
/// from streaming GB-scale bodies into the host process's memory
/// inside the 8s timeout. Round-67 audit rust-HIGH-5.
const RSS_MAX_BYTES: usize = 2 * 1024 * 1024;
/// Cap on parsed entries per channel — RSS normally returns 15.
/// 50 leaves headroom for one-off legitimate variance; anything
/// past that is a sign the feed isn't what we expect, and `parse_rss`
/// stops and returns what it has once this many items are collected.
/// Round-67 audit rust-MED-6.
const RSS_MAX_ENTRIES: usize = 50;
/// Lower bound (inclusive) of the year range we trust civil-to-days
/// math for. Strawcore RSS only emits real-world recent uploads;
/// `parse_rfc3339_secs` rejects years outside `YEAR_MIN..=YEAR_MAX`,
/// which turns adversarial year fields into a parse failure rather
/// than i64 overflow. Round-67 audit rust-CRIT-1.
const YEAR_MIN: i32 = 1970;
/// Upper bound (inclusive) of the trusted year range; see `YEAR_MIN`.
const YEAR_MAX: i32 = 2200;
/// Hybrid-backfill metadata: just the two fields RSS doesn't return
/// (view count + duration). Kotlin calls this lazily for visible feed
@ -55,6 +73,28 @@ pub async fn enrich_feed_item(
})
}
/// Process-wide reqwest `Client` for every RSS fetch. The DNS resolver,
/// TLS keepalive and connection pool live inside it, so a 50-channel
/// fan-out reuses one pool instead of paying 50 handshakes.
/// Round-67 audit rust-HIGH-4.
static RSS_CLIENT: OnceLock<Client> = OnceLock::new();

/// Return the shared RSS client, building it on first use.
///
/// # Errors
/// Returns `StrawcoreError::Extractor` when the reqwest builder fails.
/// Nothing is cached in that case, so a later call will try again.
fn rss_client() -> Result<&'static Client, StrawcoreError> {
    match RSS_CLIENT.get() {
        Some(shared) => Ok(shared),
        None => {
            let per_request_timeout = Duration::from_secs(PER_CHANNEL_TIMEOUT_S);
            // Cap redirect chains so a misconfigured or hostile feed
            // can't burn the whole 8s budget bouncing us between
            // hosts. Round-67 audit rust-LOW-8.
            let redirect_policy = reqwest::redirect::Policy::limited(3);
            let built = Client::builder()
                .timeout(per_request_timeout)
                .user_agent(concat!("Mozilla/5.0 (Android; Mobile; Straw/", env!("CARGO_PKG_VERSION"), ")"))
                .redirect(redirect_policy)
                .build()
                .map_err(|e| StrawcoreError::Extractor {
                    msg: format!("http client build: {e}"),
                })?;
            // If another caller initialised the cell while we were
            // building, `get_or_init` hands back theirs and ours is
            // simply dropped.
            Ok(RSS_CLIENT.get_or_init(|| built))
        }
    }
}
/// Single-channel RSS — Kotlin keeps its per-channel cache + fan-out
/// (parallelism cranked to 50 in the wrapper). Each call is ~50-150ms
/// instead of the ~500ms channelInfo page-scrape, so a 50-sub refresh
@ -65,14 +105,8 @@ pub async fn channel_feed_rss(
) -> Result<Vec<SearchItem>, StrawcoreError> {
crate::runtime::ensure_initialized();
log::info!("strawcore::channel_feed_rss url_len={}", channel_url.len());
let client = Client::builder()
.timeout(Duration::from_secs(PER_CHANNEL_TIMEOUT_S))
.user_agent("Mozilla/5.0 (Android; Mobile; Straw/0.1)")
.build()
.map_err(|e| StrawcoreError::Extractor {
msg: format!("http client build: {e}"),
})?;
Ok(fetch_channel_rss(&client, &channel_url).await.unwrap_or_default())
let client = rss_client()?;
Ok(fetch_channel_rss(client, &channel_url).await.unwrap_or_default())
}
/// Bulk subscription feed fan-out — for callers that want one round-trip
@ -88,68 +122,109 @@ pub async fn subscription_feed(
if channel_urls.is_empty() {
return Ok(Vec::new());
}
let client = Client::builder()
.timeout(Duration::from_secs(PER_CHANNEL_TIMEOUT_S))
.user_agent("Mozilla/5.0 (Android; Mobile; Straw/0.1)")
.build()
.map_err(|e| StrawcoreError::Extractor {
msg: format!("http client build: {e}"),
})?;
let client = rss_client()?;
let results: Vec<Vec<SearchItem>> = stream::iter(channel_urls.into_iter())
.map(|url| {
let client = client.clone();
async move { fetch_channel_rss(&client, &url).await.unwrap_or_default() }
})
.map(|url| async move { fetch_channel_rss(client, &url).await.unwrap_or_default() })
.buffer_unordered(MAX_CONCURRENT)
.collect()
.await;
let mut flat: Vec<SearchItem> = results.into_iter().flatten().collect();
// Newest first by published timestamp baked into the upload_date_relative
// field at parse time — RSS already returns entries newest-first per
// channel so we mostly just need cross-channel interleave.
flat.sort_by(|a, b| b.upload_date_relative.cmp(&a.upload_date_relative));
Ok(flat)
// Per-channel ordering is RSS-served-newest-first. Cross-channel
// interleave is the caller's responsibility — Kotlin's mergeFromCache
// sorts by parsed recency, which is the source of truth. Returning
// the flat list as-is. (vc=66 prior code sorted lexicographically
// on the relative-date STRING, which is wrong because "10 hours
// ago" < "2 hours ago" in cmp order — round-67 audit rust-HIGH-6.)
Ok(results.into_iter().flatten().collect())
}
/// Fetch and parse the RSS feed for a single channel.
///
/// Returns `None` when no channel ID can be extracted from
/// `channel_url`, when the request fails or comes back with an HTTP
/// error status, or when the body cannot be read within the byte cap.
/// Otherwise returns whatever `parse_rss` produces for the body.
async fn fetch_channel_rss(client: &Client, channel_url: &str) -> Option<Vec<SearchItem>> {
    let channel_id = extract_channel_id(channel_url)?;
    let url = format!("{RSS_BASE}{channel_id}");
    let resp = client
        .get(&url)
        .send()
        .await
        .ok()?
        .error_for_status()
        .ok()?;
    // Streaming body read with a hard byte cap — `.text()` reads
    // unbounded into a String. Round-67 audit rust-HIGH-5.
    let body = read_capped_body(resp).await?;
    parse_rss(&body, channel_id)
}
/// Extract the `UCxxx` channel ID from a channel URL. Handles the
/// common shapes:
/// Drain a reqwest `Response` into a `String`, refusing bodies larger
/// than `RSS_MAX_BYTES`. Round-67 audit rust-HIGH-5.
///
/// Returns `None` when a chunk fails to arrive or when the accumulated
/// size would exceed the cap (nothing past the cap is ever buffered).
///
/// Invalid UTF-8 is decoded lossily (bad sequences become U+FFFD)
/// instead of discarding the whole feed, matching what the `.text()`
/// call this helper replaced did for malformed bytes.
async fn read_capped_body(resp: reqwest::Response) -> Option<String> {
    use futures::StreamExt;
    let mut buf: Vec<u8> = Vec::with_capacity(32 * 1024);
    let mut stream = resp.bytes_stream();
    while let Some(chunk_result) = stream.next().await {
        let chunk = chunk_result.ok()?;
        // `buf.len()` is the running total; check BEFORE copying so the
        // buffer itself never grows past the cap.
        if buf.len().saturating_add(chunk.len()) > RSS_MAX_BYTES {
            log::warn!("strawcore::rss body exceeded {RSS_MAX_BYTES} bytes; aborting");
            return None;
        }
        buf.extend_from_slice(&chunk);
    }
    // Fast path reuses the allocation; only malformed input pays for
    // the lossy copy.
    let text = String::from_utf8(buf)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned());
    Some(text)
}
/// Extract the `UCxxx` channel ID from a channel URL. Accepts the
/// shapes the Android app actually has in Subscriptions plus the ones
/// users paste from share intents:
///   * `https://www.youtube.com/channel/UCxxx...`
///   * `https://youtube.com/channel/UCxxx...`
///   * `http(s)://m.youtube.com/channel/UCxxx...`
///   * trailing `/videos`, `?si=...`, etc — anything after the ID is dropped
///   * raw `UCxxx...` (already an ID)
///
/// The scheme/host prefix is matched case-insensitively and may appear
/// anywhere in the (trimmed) input; the ID itself keeps its original
/// case and must pass `validate_channel_id`.
///
/// NOTE(review): earlier docs also listed the bare
/// `https://www.youtube.com/UCxxx...` shape (no `/channel/` segment).
/// No prefix below matches it, so it yields `None` — confirm whether
/// it should be supported.
///
/// Real YT channel IDs are EXACTLY 24 chars (`UC` + 22 base64-ish).
/// Round-67 audit rust-HIGH-1.
///
/// `@handle` URLs are NOT supported here — RSS requires the channel ID.
/// Callers with @handles should resolve via channel_info() once and
/// cache the ID into Subscriptions.
fn extract_channel_id(input: &str) -> Option<String> {
    let trimmed = input.trim();
    // ASCII-only lowercasing is byte-for-byte length preserving, so an
    // index found in `trimmed_lower` is valid in `trimmed`. Full Unicode
    // `to_lowercase()` can change byte length (e.g. 'İ'), which would
    // misalign the slice below or panic on a non-char-boundary.
    let trimmed_lower = trimmed.to_ascii_lowercase();
    // Match the "<scheme>://<host>/channel/" prefix in a single sweep
    // so we accept http/https + www./m. variants without four-way
    // string-strip ladders.
    const PREFIXES: &[&str] = &[
        "https://www.youtube.com/channel/",
        "https://youtube.com/channel/",
        "https://m.youtube.com/channel/",
        "http://www.youtube.com/channel/",
        "http://youtube.com/channel/",
        "http://m.youtube.com/channel/",
    ];
    for p in PREFIXES {
        if let Some(idx) = trimmed_lower.find(p) {
            let rest = &trimmed[idx + p.len()..];
            // Everything from the first path/query/fragment delimiter on
            // is not part of the ID.
            let id = rest.split(['/', '?', '#']).next()?;
            return validate_channel_id(id);
        }
    }
    // No URL prefix: the input must itself be a bare channel ID.
    validate_channel_id(trimmed)
}

/// A real YouTube channel ID is `UC` followed by exactly 22 chars from
/// `[A-Za-z0-9_-]`. Returns the ID as an owned `String` when it matches
/// that shape, `None` otherwise. Round-67 audit rust-HIGH-1.
fn validate_channel_id(id: &str) -> Option<String> {
    let bytes = id.as_bytes();
    let well_formed = bytes.len() == 24
        && bytes.starts_with(b"UC")
        && bytes[2..]
            .iter()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-'));
    well_formed.then(|| id.to_string())
}
fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
@ -242,7 +317,11 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
let name = e.name();
let local = local_name(name.as_ref());
if local == "entry" {
if !video_id.is_empty() {
// Skip entries missing the load-bearing fields —
// an empty title renders as a blank card the user
// can't tap, and an empty published collapses the
// recency sort. Round-67 audit rust-HIGH-2.
if !video_id.is_empty() && !title.is_empty() && !published.is_empty() {
items.push(SearchItem {
url: format!("https://www.youtube.com/watch?v={video_id}"),
title: title.clone(),
@ -266,6 +345,12 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
// first in the fan-out. Caught 2026-05-26.
upload_date_relative: iso_to_relative(&published),
});
if items.len() >= RSS_MAX_ENTRIES {
// Defense-in-depth against a feed that
// ships thousands of <entry> blocks.
// Round-67 audit rust-MED-6.
return Some(items);
}
}
in_entry = false;
depth = 0;
@ -275,7 +360,15 @@ fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
text_target = None;
}
Ok(Event::Eof) => break,
Err(_) => return None,
// Partial-parse on error: return whatever we've already
// collected rather than throwing the whole batch away.
// A truncated body (EOF mid-stream on a flaky network)
// would otherwise silently disappear the channel.
// Round-67 audit rust-CRIT-3.
Err(e) => {
log::warn!("strawcore::rss parse error after {} items: {e}", items.len());
return Some(items);
}
_ => {}
}
buf.clear();
@ -307,7 +400,16 @@ fn iso_to_relative(iso: &str) -> String {
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0);
format_relative(now_secs.saturating_sub(secs))
// A device with a skewed clock can see RSS timestamps as future-
// dated. Clamping the age with saturating_sub would yield 0 →
// "0 seconds ago" → sorts to top, which is the LTT/WTYP-recurrence
// vector. Treat future dates as "just now" instead so the
// relative-string sort behaves and a single skewed item doesn't pin
// itself at the top of the feed. Round-67 audit rust-HIGH-7.
if secs > now_secs {
return "just now".to_string();
}
format_relative(now_secs - secs)
}
fn parse_rfc3339_secs(s: &str) -> Option<i64> {
@ -327,6 +429,13 @@ fn parse_rfc3339_secs(s: &str) -> Option<i64> {
let hh: u32 = time_parts.next()?.parse().ok()?;
let mm: u32 = time_parts.next()?.parse().ok()?;
let ss: u32 = time_parts.next()?.parse().ok()?;
// Year clamp BEFORE civil_to_days — out-of-range years overflow
// the era arithmetic in debug, wrap in release. A hostile feed
// serving year=2147483647 must not produce junk timestamps.
// Round-67 audit rust-CRIT-1.
if !(YEAR_MIN..=YEAR_MAX).contains(&y) {
return None;
}
if !(1..=12).contains(&m) || !(1..=31).contains(&d) || hh > 23 || mm > 59 || ss > 60 {
return None;
}