diff --git a/src/downloader/default_impl.rs b/src/downloader/default_impl.rs index 00a552c..69428dc 100644 --- a/src/downloader/default_impl.rs +++ b/src/downloader/default_impl.rs @@ -8,6 +8,7 @@ // * HTTP 429 → NetworkError::Recaptcha; all other status codes surface // as Ok(Response) +use std::io::Read; use std::time::Duration; use reqwest::blocking::Client; @@ -22,6 +23,12 @@ const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); const MAX_REDIRECTS: usize = 10; const USER_AGENT: &str = "Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36"; +/// Hard cap on response body size. Real YT player.js is ~1.5 MB, +/// InnerTube watch-page HTML is ~3 MB, channel/playlist responses +/// are well under 1 MB. 32 MB leaves a generous margin while +/// blocking a hostile / compromised host from blasting GB-scale +/// bodies into the JVM and OOM-killing the process. +const MAX_BODY_BYTES: u64 = 32 * 1024 * 1024; pub struct ReqwestDownloader { client: Client, @@ -90,7 +97,30 @@ impl Downloader for ReqwestDownloader { headers.entry(key).or_default().push(val); } - let body = resp.text()?; + // Fail fast when a known Content-Length already exceeds the cap. + if let Some(len) = resp.content_length() { + if len > MAX_BODY_BYTES { + return Err(NetworkError::Transport(format!( + "response body {len} bytes exceeds cap {MAX_BODY_BYTES}" + ))); + } + } + + // Streaming read with hard cap. take(N+1) so we can detect + // "exceeded the cap" rather than "exactly at the cap, maybe + // more truncated" — if read_to_end fills MAX+1 bytes we know + // the upstream had more. + let mut buf: Vec = Vec::new(); + let mut limited = resp.take(MAX_BODY_BYTES + 1); + limited + .read_to_end(&mut buf) + .map_err(|e| NetworkError::Transport(format!("body read: {e}")))?; + if buf.len() as u64 > MAX_BODY_BYTES { + return Err(NetworkError::Transport(format!( + "response body exceeded cap {MAX_BODY_BYTES}" + ))); + } + let body = String::from_utf8_lossy(&buf).into_owned(); Ok(Response::new(code, message, headers, body, url_after_redirects)) } diff --git a/src/youtube/linkhandler/stream.rs b/src/youtube/linkhandler/stream.rs index f140a24..9592138 100644 --- a/src/youtube/linkhandler/stream.rs +++ b/src/youtube/linkhandler/stream.rs @@ -30,6 +30,18 @@ pub fn is_valid_video_id(id: &str) -> bool { /// the URL doesn't look like a YT video URL (so search results / channel /// pages return None rather than Err — caller decides). pub fn extract_video_id(input_url: &str) -> Result { + extract_video_id_inner(input_url, 0) +} + +/// Internal recursion helper. `depth` is bumped on every +/// /attribution_link?u=... re-entry so a maliciously-nested +/// attribution chain can't blow the stack. +fn extract_video_id_inner(input_url: &str, depth: u8) -> Result { + // Cap attribution-link recursion. Real YT never nests these; a + // pasted-URL DoS that recurses ~10k levels would stack-overflow + // the JVM via UniFFI. One level is enough for the legitimate + // share-from-attribution-app case. + const MAX_ATTRIBUTION_DEPTH: u8 = 1; let url = Url::parse(input_url) .map_err(|e| LinkError::InvalidUrl(format!("{input_url}: {e}")))?; let host = url @@ -68,10 +80,15 @@ pub fn extract_video_id(input_url: &str) -> Result { // /attribution_link?u= if candidate.is_none() && path.starts_with("/attribution_link") { + if depth >= MAX_ATTRIBUTION_DEPTH { + return Err(LinkError::InvalidUrl( + "attribution_link recursion exceeded cap".into(), + )); + } if let Some((_, u_param)) = url.query_pairs().find(|(k, _)| k == "u") { - // Recurse on the decoded URL — but only one level deep. + // Recurse on the decoded URL with depth bump. let inner = format!("https://www.youtube.com{u_param}"); - return extract_video_id(&inner); + return extract_video_id_inner(&inner, depth + 1); } }