// straw/rust/strawcore/src/feed.rs

// vc=56 — fast subscription feed via YouTube's per-channel RSS endpoint.
//
// YouTube serves `https://www.youtube.com/feeds/videos.xml?channel_id=UCxxx`
// — small Atom XML, no auth, no JS, no InnerTube round-trip. Replaces the
// per-channel `channel_info()` page-scrape that was costing ~500ms each
// (the bottleneck behind NewPipe's "pull to refresh takes 30 seconds for
// 50 subs" UX). Fan-out 50× concurrent via `futures::stream::buffer_unordered`
// turns a 50-sub refresh from ~5-8s parallel-12 to ~1s parallel-50.
//
// RSS is intentionally lossy — it returns title/url/published/thumbnail
// only. No duration, no view count, no shorts/age/paid flags. That's the
// right trade for a feed-refresh use case: tap-through still goes through
// the full stream_info path to fetch the rich metadata when actually
// needed.

use std::sync::OnceLock;
use std::time::Duration;

use futures::stream::{self, StreamExt};
use reqwest::Client;

use crate::error::StrawcoreError;
use crate::search::SearchItem;

/// Feed endpoint prefix. The validated channel ID is appended verbatim
/// to form the per-channel request URL.
const RSS_BASE: &str = "https://www.youtube.com/feeds/videos.xml?channel_id=";
/// Upper bound on in-flight channel fetches in the bulk
/// `subscription_feed` fan-out (handed to `buffer_unordered`).
const MAX_CONCURRENT: usize = 50;
/// Request timeout, in seconds, configured on the shared RSS client.
const PER_CHANNEL_TIMEOUT_S: u64 = 8;
/// Cap on the body bytes we'll read for a single RSS fetch. Real YT
/// Atom feeds are ~5-30 KB; 2 MiB leaves comfortable headroom while
/// blocking a hostile or compromised host from streaming GB-scale
/// bodies into JVM memory inside the 8s timeout. Round-67 audit
/// rust-HIGH-5.
const RSS_MAX_BYTES: usize = 2 * 1024 * 1024;
/// Cap on parsed entries per channel — RSS normally returns 15.
/// 50 leaves headroom for one-off legitimate variance; anything
/// past that is a sign the feed isn't what we expect.
/// Round-67 audit rust-MED-6.
const RSS_MAX_ENTRIES: usize = 50;
/// Year range we trust civil-to-days math for. Strawcore RSS only
/// emits real-world recent uploads; clamping here turns adversarial
/// year fields into a parse failure rather than i64 overflow.
/// Round-67 audit rust-CRIT-1.
///
/// Inclusive lower bound of the accepted year range.
const YEAR_MIN: i32 = 1970;
/// Inclusive upper bound of the accepted year range (see `YEAR_MIN`).
const YEAR_MAX: i32 = 2200;

/// Hybrid-backfill metadata: just the two fields RSS doesn't return
/// (view count + duration). Kotlin calls this lazily for visible feed
/// items after the RSS-fed paint to fill in the gaps that
/// channel_feed_rss leaves empty (it emits both fields as 0).
///
/// vc=66 — built specifically so the subs feed can show 'N views ·
/// X duration' the way YT does, without paying the full channel_info
/// page-scrape cost on initial paint. The underlying stream_info IS
/// heavier than we'd like (~500ms each, runs JS deobf for play URLs
/// we'll discard) — a future optimisation would be to parse the
/// watch-page HTML JSON state directly for just these two fields:
/// ~100ms savings per call but ~150 lines of HTML/JSON pluck logic.
/// Punted until needed.
#[derive(Debug, Clone, uniffi::Record)]
pub struct EnrichedFeedMetadata {
    /// View count, copied from the `view_count` field of the
    /// `stream_info` result.
    pub view_count: i64,
    /// Duration, copied from the `duration_seconds` field of the
    /// `stream_info` result.
    pub duration_seconds: i64,
}

/// Look up the two fields the RSS feed omits for a single video.
///
/// Runs the full `stream_info` lookup for `video_url` and keeps only
/// its view count and duration. A `stream_info` failure is propagated
/// to the caller via `?`.
#[uniffi::export(async_runtime = "tokio")]
pub async fn enrich_feed_item(
    video_url: String,
) -> Result<EnrichedFeedMetadata, StrawcoreError> {
    crate::runtime::ensure_initialized();
    let details = crate::stream::stream_info(video_url).await?;
    let metadata = EnrichedFeedMetadata {
        duration_seconds: details.duration_seconds,
        view_count: details.view_count,
    };
    Ok(metadata)
}

/// Process-wide reqwest client for RSS fetches. Keeping a single
/// instance lets a 50-channel fan-out share one connection pool (DNS
/// resolver, TLS keepalive) instead of paying 50 handshakes.
/// Round-67 audit rust-HIGH-4.
static RSS_CLIENT: OnceLock<Client> = OnceLock::new();

/// Return the shared RSS client, building it on first use.
///
/// The client carries the per-request timeout, a Straw user agent and
/// a three-hop redirect limit. If two callers race on the first call,
/// both may build a client; `get_or_init` keeps whichever lands first
/// and the other is dropped.
///
/// # Errors
///
/// Returns `StrawcoreError::Extractor` when reqwest fails to build the
/// client.
fn rss_client() -> Result<&'static Client, StrawcoreError> {
    match RSS_CLIENT.get() {
        Some(existing) => Ok(existing),
        None => {
            let request_timeout = Duration::from_secs(PER_CHANNEL_TIMEOUT_S);
            // Cap redirect chains so a misconfigured/hostile feed can't
            // burn the whole request budget bouncing between hosts.
            // Round-67 audit rust-LOW-8.
            let redirect_policy = reqwest::redirect::Policy::limited(3);
            let built = Client::builder()
                .timeout(request_timeout)
                .user_agent(concat!("Mozilla/5.0 (Android; Mobile; Straw/", env!("CARGO_PKG_VERSION"), ")"))
                .redirect(redirect_policy)
                .build()
                .map_err(|e| StrawcoreError::Extractor {
                    msg: format!("http client build: {e}"),
                })?;
            Ok(RSS_CLIENT.get_or_init(|| built))
        }
    }
}

/// Fetch the RSS feed of a single channel.
///
/// Kotlin keeps its per-channel cache + fan-out (parallelism cranked
/// to 50 in the wrapper). Each call is ~50-150ms instead of the ~500ms
/// channelInfo page-scrape, so a 50-sub refresh drops from ~5-8s to
/// ~1s.
///
/// Any fetch or parse failure — including a URL no channel ID can be
/// extracted from — yields an empty list rather than an error. The
/// only error path is a failure to build the shared HTTP client.
#[uniffi::export(async_runtime = "tokio")]
pub async fn channel_feed_rss(
    channel_url: String,
) -> Result<Vec<SearchItem>, StrawcoreError> {
    crate::runtime::ensure_initialized();
    log::info!("strawcore::channel_feed_rss url_len={}", channel_url.len());
    let http = rss_client()?;
    let fetched = fetch_channel_rss(http, &channel_url).await;
    Ok(fetched.unwrap_or_else(Vec::new))
}

/// Bulk subscription feed fan-out — for callers that want one
/// round-trip to Rust. Currently unused by the Android app (it sticks
/// with the per-channel cache), but exposed for future desktop / web
/// variants or for a "warm everything" background prefetch.
///
/// Up to `MAX_CONCURRENT` channels are fetched at once. A channel that
/// fails to fetch or parse contributes no items; the only error path
/// is a failure to build the shared HTTP client.
#[uniffi::export(async_runtime = "tokio")]
pub async fn subscription_feed(
    channel_urls: Vec<String>,
) -> Result<Vec<SearchItem>, StrawcoreError> {
    crate::runtime::ensure_initialized();
    log::info!("strawcore::subscription_feed channels={}", channel_urls.len());
    if channel_urls.is_empty() {
        return Ok(Vec::new());
    }
    let client = rss_client()?;

    // Per-channel ordering is RSS-served-newest-first. Cross-channel
    // interleave is the caller's responsibility — Kotlin's
    // mergeFromCache sorts by parsed recency, which is the source of
    // truth — so the per-channel batches are simply concatenated in
    // the order the fetches complete. (vc=66 prior code sorted
    // lexicographically on the relative-date STRING, which is wrong
    // because "10 hours ago" < "2 hours ago" in cmp order — round-67
    // audit rust-HIGH-6.)
    let merged: Vec<SearchItem> = stream::iter(channel_urls)
        .map(|url| async move {
            fetch_channel_rss(client, &url)
                .await
                .unwrap_or_else(Vec::new)
        })
        .buffer_unordered(MAX_CONCURRENT)
        .concat()
        .await;
    Ok(merged)
}

/// Download and parse the RSS feed for one channel.
///
/// Returns `None` when no channel ID can be extracted from
/// `channel_url`, when the request fails or answers with an error
/// status, or when the body cannot be read within the size cap.
async fn fetch_channel_rss(client: &Client, channel_url: &str) -> Option<Vec<SearchItem>> {
    let channel_id = extract_channel_id(channel_url)?;
    let feed_url = format!("{RSS_BASE}{channel_id}");
    let response = match client.get(&feed_url).send().await {
        Ok(raw) => raw.error_for_status().ok()?,
        Err(_) => return None,
    };
    // Streaming body read with a hard byte cap — `.text()` would read
    // an unbounded body into a String. Round-67 audit rust-HIGH-5.
    let xml = read_capped_body(response).await?;
    parse_rss(&xml, channel_id)
}

/// Read a response body chunk by chunk into a `String`, giving up
/// (returning `None`) as soon as more than `RSS_MAX_BYTES` have
/// arrived or the stream reports an error. Round-67 audit rust-HIGH-5.
async fn read_capped_body(resp: reqwest::Response) -> Option<String> {
    use futures::StreamExt;
    let mut chunks = resp.bytes_stream();
    let mut body: Vec<u8> = Vec::with_capacity(32 * 1024);
    let mut received = 0usize;
    loop {
        let piece = match chunks.next().await {
            None => break,
            Some(Err(_)) => return None,
            Some(Ok(bytes)) => bytes,
        };
        let piece_len = piece.len();
        // Defense-in-depth: a lone chunk that is already larger than
        // the whole-body cap is rejected up front, with its own log
        // line, before it is added to the running total. Round-68
        // audit rust-HIGH-1.
        if piece_len > RSS_MAX_BYTES {
            log::warn!("strawcore::rss single chunk {} exceeds cap; aborting", piece_len);
            return None;
        }
        received = received.saturating_add(piece_len);
        if received > RSS_MAX_BYTES {
            log::warn!("strawcore::rss body exceeded {RSS_MAX_BYTES} bytes; aborting");
            return None;
        }
        body.extend_from_slice(&piece);
    }
    // Lossy decode — round-68 audit rust-HIGH-2. A strict from_utf8
    // fails on any invalid byte, so a single mojibake title would
    // drop the entire channel from the feed. Invalid sequences become
    // U+FFFD instead, and the per-entry skip-on-empty check in the
    // parser deals with entries that end up broken.
    Some(String::from_utf8_lossy(&body).into_owned())
}

/// Extract the `UCxxx` channel ID from a channel URL. Accepts the
/// shapes the Android app actually has in Subscriptions plus the ones
/// users paste from share intents:
///   * `https://www.youtube.com/channel/UCxxx...`
///   * `https://youtube.com/channel/UCxxx...`
///   * `http(s)://m.youtube.com/channel/UCxxx...`
///   * trailing `/videos`, `?si=...`, `#...` — anything after the ID is dropped
///   * raw `UCxxx...` (already an ID)
///
/// Scheme and host are matched case-insensitively (ASCII only); the ID
/// itself is taken from the input unchanged and must pass
/// `validate_channel_id`. Surrounding whitespace is ignored.
///
/// `@handle` URLs are NOT supported here — RSS requires the channel ID.
/// Callers with @handles should resolve via channel_info() once and
/// cache the ID into Subscriptions.
///
/// Returns `None` for anything that is not one of the shapes above.
/// Never panics, whatever the input contains.
fn extract_channel_id(input: &str) -> Option<String> {
    // ANCHORED at the start of the string — round-68 audit rust-HIGH-3:
    // a substring match would let a pasted
    // `evil.com/?redir=https://www.youtube.com/channel/UCxxx` silently
    // resolve to the embedded channel.
    const PREFIXES: &[&str] = &[
        "https://www.youtube.com/channel/",
        "https://youtube.com/channel/",
        "https://m.youtube.com/channel/",
        "http://www.youtube.com/channel/",
        "http://youtube.com/channel/",
        "http://m.youtube.com/channel/",
    ];

    let trimmed = input.trim();

    // Compare against the ORIGINAL string with ASCII-only case folding.
    // Lower-casing a copy with `to_lowercase()` is Unicode-aware and can
    // change byte lengths (e.g. U+0130 grows, U+1E9E and U+212A shrink),
    // so offsets computed on the copy are not valid in the original and
    // slicing with them can panic. `get` also makes a prefix length that
    // falls inside a multi-byte character a plain non-match.
    let after_prefix = PREFIXES.iter().copied().find_map(|prefix| {
        let head = trimmed.get(..prefix.len())?;
        head.eq_ignore_ascii_case(prefix)
            .then(|| &trimmed[prefix.len()..])
    });

    match after_prefix {
        Some(rest) => {
            // Everything from the first path/query/fragment delimiter
            // onwards is not part of the ID.
            let id = rest.split(|c: char| matches!(c, '/' | '?' | '#')).next()?;
            validate_channel_id(id)
        }
        // No recognised URL prefix: the input may already be a bare ID.
        None => validate_channel_id(trimmed),
    }
}

/// Accept `id` only if it has the shape of a YouTube channel ID: the
/// literal `UC` followed by exactly 22 characters drawn from
/// `[A-Za-z0-9_-]`. Returns an owned copy on success, `None` otherwise.
/// Round-67 audit rust-HIGH-1.
fn validate_channel_id(id: &str) -> Option<String> {
    let tail = id.strip_prefix("UC")?;
    let well_formed = tail.len() == 22
        && tail
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-');
    well_formed.then(|| id.to_owned())
}

/// Parse a channel's Atom feed into feed items.
///
/// Walks the XML event stream, collecting per-`<entry>` text for the
/// video ID, entry-level title, uploader name/URI and publish
/// timestamp, plus the `url` attribute of a self-closing `thumbnail`
/// element. Element names are compared by local name only (namespace
/// prefix stripped).
///
/// * `body` — the feed XML.
/// * `channel_id` — used to synthesise the uploader URL when the entry
///   carries no `uri` text.
///
/// Entries missing a video ID, title or publish timestamp are skipped.
/// Parsing stops early once `RSS_MAX_ENTRIES` items have been
/// collected. On an XML error the items parsed so far are returned.
/// Every path in this function returns `Some`; the `Option` wrapper
/// only matches what the caller expects.
///
/// `duration_seconds` and `view_count` are always emitted as 0 — the
/// feed does not carry them.
fn parse_rss(body: &str, channel_id: String) -> Option<Vec<SearchItem>> {
    use quick_xml::events::Event;
    use quick_xml::Reader;

    let mut reader = Reader::from_str(body);
    reader.config_mut().trim_text(true);

    let mut buf = Vec::new();
    let mut items: Vec<SearchItem> = Vec::new();

    // Per-entry scratch.
    let mut in_entry = false;
    // Nesting level inside the current entry: the <entry> element itself
    // is 1, its direct children are 2.
    let mut depth = 0u8;
    let mut video_id = String::new();
    let mut title = String::new();
    let mut uploader = String::new();
    let mut uploader_url = String::new();
    let mut thumbnail: Option<String> = None;
    let mut published = String::new();

    // What text-collecting state we're in. Replaced per element open.
    let mut text_target: Option<TextTarget> = None;

    loop {
        // Clear the event buffer at the TOP of the loop so that every
        // `continue` below also releases the previous event's bytes;
        // clearing only at the bottom let skipped events accumulate.
        buf.clear();
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = e.name();
                let local = local_name(name.as_ref());
                if local == "entry" {
                    // New entry: reset all scratch state.
                    in_entry = true;
                    depth = 0;
                    video_id.clear();
                    title.clear();
                    uploader.clear();
                    uploader_url.clear();
                    thumbnail = None;
                    published.clear();
                }
                if !in_entry {
                    continue;
                }
                depth = depth.saturating_add(1);
                text_target = match local {
                    "videoId" => Some(TextTarget::VideoId),
                    // Only a direct child <title> of the entry counts;
                    // deeper <title> elements are ignored.
                    "title" if depth <= 2 => Some(TextTarget::Title),
                    "name" => Some(TextTarget::UploaderName),
                    "uri" => Some(TextTarget::UploaderUrl),
                    "published" => Some(TextTarget::Published),
                    _ => None,
                };
            }
            Ok(Event::Empty(e)) => {
                if !in_entry {
                    continue;
                }
                let name = e.name();
                let local = local_name(name.as_ref());
                // <media:thumbnail url="..."/> is self-closing.
                if local == "thumbnail" {
                    for attr in e.attributes().flatten() {
                        if attr.key.as_ref() == b"url" {
                            if let Ok(v) = attr.unescape_value() {
                                thumbnail = Some(v.into_owned());
                            }
                        }
                    }
                }
            }
            Ok(Event::Text(t)) => {
                if !in_entry {
                    continue;
                }
                // Text that fails to unescape is dropped, not fatal.
                let Ok(s) = t.unescape() else { continue };
                let s = s.as_ref();
                match text_target {
                    Some(TextTarget::VideoId) => video_id.push_str(s),
                    Some(TextTarget::Title) => title.push_str(s),
                    Some(TextTarget::UploaderName) => uploader.push_str(s),
                    Some(TextTarget::UploaderUrl) => uploader_url.push_str(s),
                    Some(TextTarget::Published) => published.push_str(s),
                    None => {}
                }
            }
            Ok(Event::End(e)) => {
                if !in_entry {
                    continue;
                }
                let name = e.name();
                let local = local_name(name.as_ref());
                if local == "entry" {
                    // Skip entries missing the load-bearing fields —
                    // an empty title renders as a blank card the user
                    // can't tap, and an empty published collapses the
                    // recency sort. Round-67 audit rust-HIGH-2.
                    if !video_id.is_empty() && !title.is_empty() && !published.is_empty() {
                        // The scratch strings are reset when the next
                        // <entry> opens, so they can be moved into the
                        // item instead of cloned.
                        items.push(SearchItem {
                            url: format!("https://www.youtube.com/watch?v={video_id}"),
                            title: std::mem::take(&mut title),
                            uploader: std::mem::take(&mut uploader),
                            uploader_url: if uploader_url.is_empty() {
                                Some(format!("https://www.youtube.com/channel/{channel_id}"))
                            } else {
                                Some(std::mem::take(&mut uploader_url))
                            },
                            thumbnail: thumbnail.take(),
                            duration_seconds: 0,
                            view_count: 0,
                            // RSS gives RFC3339 timestamps. Convert to
                            // the human-relative format Kotlin's
                            // recencyScore parser expects ("N units
                            // ago"). vc=56 passed the raw ISO string
                            // through, which broke the sort comparator
                            // — every item tied, so the feed order was
                            // effectively fan-out completion order.
                            upload_date_relative: iso_to_relative(&published),
                        });
                        if items.len() >= RSS_MAX_ENTRIES {
                            // Defense-in-depth against a feed that
                            // ships thousands of <entry> blocks.
                            // Round-67 audit rust-MED-6.
                            return Some(items);
                        }
                    }
                    in_entry = false;
                    depth = 0;
                } else {
                    depth = depth.saturating_sub(1);
                }
                text_target = None;
            }
            Ok(Event::Eof) => break,
            // Partial-parse on error: return whatever we've already
            // collected rather than throwing the whole batch away.
            // A truncated body (EOF mid-stream on a flaky network)
            // would otherwise silently disappear the channel.
            // Round-67 audit rust-CRIT-3.
            Err(e) => {
                log::warn!("strawcore::rss parse error after {} items: {e}", items.len());
                return Some(items);
            }
            _ => {}
        }
    }
    Some(items)
}

/// Which per-entry scratch field `parse_rss` appends the next text
/// event to. Set when an element opens, cleared when one closes.
enum TextTarget {
    /// Text of a `videoId` element.
    VideoId,
    /// Text of the entry-level `title` element.
    Title,
    /// Text of a `name` element (stored as the uploader name).
    UploaderName,
    /// Text of a `uri` element (stored as the uploader URL).
    UploaderUrl,
    /// Text of a `published` element.
    Published,
}

/// Turn an RFC3339 timestamp (`2026-05-25T15:00:00+00:00`) into an
/// "N units ago" string relative to the current system clock.
///
/// The timezone offset is ignored — YT RSS always serves UTC.
///
/// * Unparseable input is returned unchanged, which keeps the UI
///   readable even on a malformed feed (rare).
/// * A timestamp later than "now" yields `"just now"`.
fn iso_to_relative(iso: &str) -> String {
    let Some(published_secs) = parse_rfc3339_secs(iso) else {
        return iso.to_string();
    };
    // A system clock set before the Unix epoch is treated as epoch 0.
    let now_secs = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    };
    // A device with a skewed clock can see RSS timestamps as future-
    // dated. Collapse those to "just now" so a single skewed item
    // doesn't pin itself at the top of the feed. Round-67 audit
    // rust-HIGH-7.
    if published_secs > now_secs {
        "just now".to_string()
    } else {
        format_relative(now_secs - published_secs)
    }
}

/// Parse the `YYYY-MM-DDTHH:MM:SS` head of an RFC3339 timestamp into
/// seconds since the Unix epoch.
///
/// Anything after the seconds field (fractional seconds, `Z`, numeric
/// offset) is ignored, so the value is only a true UTC instant when
/// the input is already in UTC.
///
/// Returns `None` when the input is shorter than 19 bytes, the date
/// and time are not separated by `T`, a field is not numeric, the year
/// falls outside `YEAR_MIN..=YEAR_MAX`, or a field is out of range
/// (month 1-12, day 1-31, hour 0-23, minute 0-59, second 0-60). The
/// day is not checked against the length of the month.
fn parse_rfc3339_secs(s: &str) -> Option<i64> {
    if s.len() < 19 {
        return None;
    }
    // `get` yields None if a cut would land inside a multi-byte char.
    let (date, time) = (s.get(..10)?, s.get(11..19)?);
    if s.as_bytes()[10] != b'T' {
        return None;
    }

    let mut ymd = date.split('-');
    let year = ymd.next()?.parse::<i32>().ok()?;
    let month = ymd.next()?.parse::<u32>().ok()?;
    let day = ymd.next()?.parse::<u32>().ok()?;

    let mut hms = time.split(':');
    let hour = hms.next()?.parse::<u32>().ok()?;
    let minute = hms.next()?.parse::<u32>().ok()?;
    let second = hms.next()?.parse::<u32>().ok()?;

    // The year clamp must hold BEFORE civil_to_days runs — out-of-range
    // years overflow its era arithmetic. A hostile feed serving
    // year=2147483647 must not produce junk timestamps. Round-67 audit
    // rust-CRIT-1.
    let in_range = (YEAR_MIN..=YEAR_MAX).contains(&year)
        && (1..=12).contains(&month)
        && (1..=31).contains(&day)
        && hour <= 23
        && minute <= 59
        && second <= 60;
    if !in_range {
        return None;
    }

    let time_of_day = i64::from(hour) * 3_600 + i64::from(minute) * 60 + i64::from(second);
    Some(civil_to_days(year, month, day) * 86_400 + time_of_day)
}

/// Days between 1970-01-01 and the given proleptic-Gregorian date,
/// using Howard Hinnant's `days_from_civil` algorithm (from his chrono
/// date-algorithms write-up behind the C++20 calendar proposal).
///
/// The year is shifted to start on 1 March so the leap day falls at
/// the end of the year, then split into 400-year eras of 146 097 days
/// each. Handles negative years.
///
/// `m` must be 1-12 and `d` at least 1; callers are expected to have
/// range-checked the inputs, including the year.
fn civil_to_days(y: i32, m: u32, d: u32) -> i64 {
    // March-based year and month index (March = 0 ... February = 11).
    let (shifted_year, month_from_march) = if m > 2 { (y, m - 3) } else { (y - 1, m + 9) };
    // Floor division by 400, written out for negative years.
    let era = if shifted_year >= 0 {
        shifted_year / 400
    } else {
        (shifted_year - 399) / 400
    };
    let year_of_era = (shifted_year - era * 400) as u32;
    let day_of_year = (153 * month_from_march + 2) / 5 + d - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;
    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + day_of_era as i64 - 719_468
}

/// Render an age in seconds as "N unit(s) ago", using the largest unit
/// that fits. Negative ages are treated as zero.
///
/// Units are fixed-length: a month is 30 days and a year is 365 days.
/// The unit name is pluralised unless the count is exactly 1.
fn format_relative(age_secs: i64) -> String {
    // Largest unit first; the first span that fits wins.
    const LADDER: [(i64, &str); 7] = [
        (31_536_000, "year"),
        (2_592_000, "month"),
        (604_800, "week"),
        (86_400, "day"),
        (3_600, "hour"),
        (60, "minute"),
        (1, "second"),
    ];

    let age = if age_secs < 0 { 0 } else { age_secs };
    // An age of 0 fits no span, so it falls back to seconds.
    let (span, label) = LADDER
        .iter()
        .copied()
        .find(|&(span, _)| age >= span)
        .unwrap_or((1, "second"));
    let count = age / span;
    let plural = if count == 1 { "" } else { "s" };
    format!("{count} {label}{plural} ago")
}

/// Return the part of an XML element name after its last `:`, i.e. the
/// name without a namespace prefix (`yt:videoId` → `videoId`). Names
/// without a prefix come back unchanged.
///
/// YouTube's feed is heavily namespaced, but the parser only cares
/// about the local part. A name that is not valid UTF-8 yields `""`.
fn local_name(qualified: &[u8]) -> &str {
    match std::str::from_utf8(qualified) {
        Ok(text) => text.rsplit(':').next().unwrap_or(text),
        Err(_) => "",
    }
}