diff --git a/Cargo.lock b/Cargo.lock index 78cff36..02caacc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,21 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "async-compression" version = "0.4.42" @@ -109,6 +124,12 @@ dependencies = [ "syn", ] +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -125,6 +146,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -210,6 +237,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + [[package]] name = "http" version = "1.4.0" @@ -671,6 +709,35 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + [[package]] name = "reqwest" version = "0.12.28" @@ -711,6 +778,16 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "ress" +version = "0.12.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adfe70c2de7039c907054be15f5f33077db29199cb56780b7d40278f55dedbfc" +dependencies = [ + "log", + "unicode-xid", +] + [[package]] name = "ring" version = "0.17.14" @@ -725,6 +802,34 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rquickjs" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c50dc6d6c587c339edb4769cf705867497a2baf0eca8b4645fa6ecd22f02c77a" +dependencies = [ + "rquickjs-core", +] + +[[package]] +name = "rquickjs-core" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8bf7840285c321c3ab20e752a9afb95548c75cd7f4632a0627cea3507e310c1" +dependencies = [ + "hashbrown", + "rquickjs-sys", +] + +[[package]] +name = "rquickjs-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27344601ef27460e82d6a4e1ecb9e7e99f518122095f3c51296da8e9be2b9d83" +dependencies = [ + "cc", +] + [[package]] name = "rustc-hash" version = "2.1.2" @@ -885,7 +990,10 @@ version = "0.1.0" dependencies = [ "once_cell", "parking_lot", + "regex", "reqwest", + "ress", + "rquickjs", "serde", "serde_json", "thiserror 1.0.69", @@ -1112,6 +1220,12 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index 3a302bd..21161dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,9 @@ thiserror = "1" parking_lot = "0.12" url = "2" once_cell = "1" +regex = "1" +rquickjs = { version = "0.11", default-features = false } +ress = "0.12.0-alpha.1" [dev-dependencies] serde_json = "1" diff --git a/src/lib.rs b/src/lib.rs index 616a2b7..ae49749 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ pub mod metainfo; pub mod newpipe; pub mod page; pub mod service; +pub mod youtube; pub use downloader::{Downloader, Request, Response}; pub use exceptions::{ExtractionError, NetworkError, ParsingError}; diff --git a/src/youtube/js/extractor.rs b/src/youtube/js/extractor.rs new file mode 100644 index 0000000..fb9262c --- /dev/null +++ b/src/youtube/js/extractor.rs @@ -0,0 +1,194 @@ +// player.js URL discovery + download. Mirrors NPE +// services/youtube/YoutubeJavaScriptExtractor.java. +// +// Two discovery paths, in order: +// 1. iframe_api regex (primary) +// 2. embed/ page — Jsoup script-tag walk + jsUrl regex fallback +// +// PARITY: we deliberately reproduce NPE's bug where `select("script") +// .attr("name", "player/base")` *mutates* the script tags and iterates ALL +// of them. The intent was "find the script with name=player/base" but +// Jsoup's attr-setter doesn't filter. Our walk does the same — iterate +// every script tag, return first whose `src` contains `base.js`. + +use std::sync::Arc; + +use once_cell::sync::Lazy; +use regex::Regex; + +use crate::downloader::request::Request; +use crate::downloader::Downloader; +use crate::localization::Localization; +use crate::newpipe::NewPipe; +use crate::youtube::js::DeobfError; + +const IFRAME_API_URL: &str = "https://www.youtube.com/iframe_api"; +const BASE_JS_PLAYER_URL_FORMAT: &str = + "https://www.youtube.com/s/player/{HASH}/player_ias.vflset/en_GB/base.js"; + +static IFRAME_RES_JS_BASE_PLAYER_HASH: Lazy = + Lazy::new(|| Regex::new(r"player\\/([a-z0-9]{8})\\/").unwrap()); + +static EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL: Lazy = Lazy::new(|| { + Regex::new( + r#""jsUrl":"(/s/player/[A-Za-z0-9]+/player_ias\.vflset/[A-Za-z_-]+/base\.js)""#, + ) + .unwrap() +}); + +static SCRIPT_TAG: Lazy = + Lazy::new(|| Regex::new(r#"]*\bsrc=["']([^"']+)["'][^>]*>"#).unwrap()); + +/// Extracts the player.js URL + body. Tries iframe_api first, falls back +/// to the embed page on any failure (matches NPE's try/catch flow). +pub fn extract_javascript_player_code(video_id: &str) -> Result<(String, String), DeobfError> { + let downloader = NewPipe::downloader().ok_or(DeobfError::DownloaderMissing)?; + + let url = match extract_from_iframe(&*downloader) { + Ok(u) => u, + Err(_iframe_err) => extract_from_embed(&*downloader, video_id)?, + }; + let cleaned = clean_javascript_url(&url)?; + let body = download_javascript_code(&*downloader, &cleaned)?; + Ok((cleaned, body)) +} + +fn extract_from_iframe(downloader: &dyn Downloader) -> Result { + let req = Request::get(IFRAME_API_URL) + .localization(Some(Localization::default())) + .build(); + let resp = downloader + .execute(req) + .map_err(|e| DeobfError::FetchIframe(e.to_string()))?; + let body = resp.response_body(); + let hash = IFRAME_RES_JS_BASE_PLAYER_HASH + .captures(body) + .and_then(|c| c.get(1)) + .ok_or(DeobfError::PlayerUrlMissing)? + .as_str(); + Ok(BASE_JS_PLAYER_URL_FORMAT.replace("{HASH}", hash)) +} + +fn extract_from_embed(downloader: &dyn Downloader, video_id: &str) -> Result { + let embed_url = format!("https://www.youtube.com/embed/{video_id}"); + let req = Request::get(&embed_url) + .localization(Some(Localization::default())) + .build(); + let resp = downloader + .execute(req) + .map_err(|e| DeobfError::FetchEmbed(e.to_string()))?; + let body = resp.response_body(); + + // PARITY: NPE iterates every "#; + let caps = SCRIPT_TAG.captures(html).unwrap(); + assert_eq!(caps.get(1).unwrap().as_str(), "//foo.com/base.js"); + } + + #[test] + fn clean_url_promotes_protocol_relative() { + let out = clean_javascript_url("//www.youtube.com/foo/base.js").unwrap(); + assert_eq!(out, "https://www.youtube.com/foo/base.js"); + } + + #[test] + fn clean_url_prefixes_youtube_for_absolute_path() { + let out = clean_javascript_url("/s/player/abc/base.js").unwrap(); + assert_eq!(out, "https://www.youtube.com/s/player/abc/base.js"); + } + + #[test] + fn clean_url_passes_through_full() { + let out = clean_javascript_url("https://www.youtube.com/s/player/x/base.js").unwrap(); + assert_eq!(out, "https://www.youtube.com/s/player/x/base.js"); + } + + #[test] + fn player_hash_extracted_from_url() { + let url = "https://www.youtube.com/s/player/c2f7551f/player_ias.vflset/en_GB/base.js"; + assert_eq!(extract_player_hash(url).as_deref(), Some("c2f7551f")); + } +} diff --git a/src/youtube/js/lexer.rs b/src/youtube/js/lexer.rs new file mode 100644 index 0000000..11783e1 --- /dev/null +++ b/src/youtube/js/lexer.rs @@ -0,0 +1,114 @@ +// JS lexer helpers. Mirrors NPE's utils/jsextractor/JavaScriptExtractor.java +// + Lexer.java + EcmaScriptTokenStream.java. +// +// NPE's lexer is vendored from Rhino 1.7.14 to handle regex-vs-division +// disambiguation. The `ress` crate is the direct Rust analog — same shape, +// pure-rust scanner. We delegate to it. +// +// Public surface mirrors NPE's static `matchToClosingBrace(src, start)`: +// given a substring `start` (e.g. `"xyz=function"`), find its first +// occurrence in `src`, then walk forward through tokens balancing braces +// (skipping braces inside strings, regex literals, comments) until the +// matching `}` of the function body is consumed. Returns the slice from +// the first `{` of the body through that closing `}`, inclusive. + +use ress::tokens::{Punct, Token}; +use ress::Scanner; + +use crate::youtube::js::DeobfError; + +/// Returns the substring of `src` starting immediately after `start` and +/// ending at the matching `}` of the first function body that follows +/// (inclusive). Everything between the anchor and the first `{` (e.g. +/// `(a)` for `xyz=function`) is included. +/// +/// Mirrors `JavaScriptExtractor.matchToClosingBrace(src, start)` from NPE. +pub fn match_to_closing_brace(src: &str, start: &str) -> Result { + let prefix_idx = src + .find(start) + .ok_or_else(|| DeobfError::SigBodyParseFailed(format!("anchor not found: {start}")))?; + let scan_from = prefix_idx + start.len(); + + let scanner = Scanner::new(&src[scan_from..]); + let mut depth = 0i32; + let mut saw_open = false; + let mut last_brace: Option = None; + + for item in scanner { + let item = item.map_err(|e| { + DeobfError::SigBodyParseFailed(format!("lexer error at byte {}: {e}", scan_from)) + })?; + match item.token { + Token::Punct(Punct::OpenBrace) => { + saw_open = true; + depth += 1; + } + Token::Punct(Punct::CloseBrace) => { + depth -= 1; + if depth == 0 && saw_open { + last_brace = Some(item.span.end); + break; + } + } + _ => {} + } + } + + let last = last_brace.ok_or_else(|| { + DeobfError::SigBodyParseFailed("no matching closing brace found".into()) + })?; + Ok(src[scan_from..scan_from + last].to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn balances_simple_function() { + let src = "var a=1;xyz=function(b){return b;};var c=2;"; + let body = match_to_closing_brace(src, "xyz=function").unwrap(); + assert_eq!(body, "(b){return b;}"); + } + + #[test] + fn skips_braces_inside_strings() { + let src = r#"xyz=function(a){var x="}}";return a+x;}"#; + let body = match_to_closing_brace(src, "xyz=function").unwrap(); + assert_eq!(body, r#"(a){var x="}}";return a+x;}"#); + } + + #[test] + fn skips_braces_inside_regex() { + let src = r#"xyz=function(a){var re=/}{/;return a.replace(re,"");}"#; + let body = match_to_closing_brace(src, "xyz=function").unwrap(); + assert!(body.starts_with("(a){")); + assert!(body.ends_with("}")); + } + + #[test] + fn handles_nested_blocks() { + let src = r#" + xyz=function(a){ + if (a.length > 3) { + a = a.split(""); + for (var i=0; i=[fnA,fnB,...]` +// and we index into the array via group 2 +// 3. Extract function body via lexer (with regex fallback). +// 4. fixupFunction — strip the `if(typeof X==="undefined")return p;` +// early-return so the algorithm actually runs standalone. + +use once_cell::sync::Lazy; +use regex::Regex; + +use crate::youtube::js::lexer::match_to_closing_brace; +use crate::youtube::js::DeobfError; + +static SINGLE_CHAR_VARIABLE_REGEX: &str = r"[a-zA-Z0-9$_]"; +// MULTIPLE_CHARS_REGEX is "+" applied to SINGLE_CHAR_VARIABLE_REGEX. +// ARRAY_ACCESS_REGEX captures the index. + +const ARRAY_ACCESS_REGEX: &str = r"\[(\d+)]"; + +static THROTTLING_PARAM: Lazy = + Lazy::new(|| Regex::new(r"[&?]n=([^&]+)").unwrap()); + +/// Quick check + extract. Returns None if the URL doesn't carry a +/// throttling parameter (per NPE's 60-900× perf optimization). +pub fn throttling_parameter_from_url(url: &str) -> Option { + if !url.contains("&n=") && !url.contains("?n=") { + return None; + } + THROTTLING_PARAM + .captures(url) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) +} + +/// Returns `(function_name, assembled_snippet)`. The snippet declares +/// the function as `var = function(...) { ... };` (explicit `var` +/// is a PORT DEVIATION — NPE relies on Rhino's non-strict bare-assignment +/// behavior; QuickJS rejects it). +pub fn build_deobfuscator(player_code: &str) -> Result<(String, String), DeobfError> { + let name = deobfuscation_function_name(player_code)?; + let body = deobfuscation_function_body(player_code, &name)?; + let fixed = fixup_function(&body)?; + let with_var = if fixed.starts_with("var ") || fixed.starts_with("function ") { + fixed + } else { + format!("var {fixed}") + }; + Ok((name, with_var)) +} + +fn build_regex_with_macros(template: &str) -> Regex { + let m = SINGLE_CHAR_VARIABLE_REGEX; + let mm = format!("{m}+"); + let arr = ARRAY_ACCESS_REGEX; + let expanded = template + .replace("@SINGLE@", m) + .replace("@MULTI@", &mm) + .replace("@ARRAY@", arr); + Regex::new(&expanded).expect("nsig regex compiles") +} + +/// Function-name regex bank. Eight patterns, first-match wins. +/// New entries land at the FRONT — see NPE git log on +/// YoutubeThrottlingParameterUtils.java. +/// +/// Captures: +/// * 1 group → direct name +/// * 2 groups → array-name + index; needs array indirection +fn deobf_function_name_regexes() -> Vec { + // Source strings keep the @SINGLE@/@MULTI@/@ARRAY@ macros so the regex + // bodies match NPE's source as literally as possible. + let templates: [&str; 8] = [ + // Regex 0 + r"([A-Za-z0-9_$]{2,})=function.*return [A-Z]\[\d+\]", + // Regex 1 + r#"@SINGLE@="nn"\[\+@MULTI@\.@MULTI@],@MULTI@\(@MULTI@\),@MULTI@=@MULTI@\.@MULTI@\[@MULTI@]\|\|null\)&&\(@MULTI@=(@MULTI@)@ARRAY@"#, + // Regex 2 (Wma fallback after "nn" path) + r#"@SINGLE@="nn"\[\+@MULTI@\.@MULTI@],@MULTI@\(@MULTI@\),@MULTI@=@MULTI@\.@MULTI@\[@MULTI@]\|\|null\)\|\|(@MULTI@)\(""\)"#, + // Regex 3 (Vb(m) array path) + r#",@MULTI@\(@SINGLE@\),@MULTI@=@SINGLE@\.@SINGLE@\[@SINGLE@]\|\|null\)&&\(@MULTI@=(@MULTI@)@ARRAY@"#, + // Regex 4 (get(b) callback) + r#"@SINGLE@=@SINGLE@\.get\(@SINGLE@\)\).{1,200}?\|\|(@MULTI@)\(""\)"#, + // Regex 5 (get(b) array) + r#"@SINGLE@=@SINGLE@\.get\(@SINGLE@\)\)&&\(@SINGLE@=(@MULTI@)@ARRAY@"#, + // Regex 6 (String.fromCharCode(110)) + r#"\(@SINGLE@=String\.fromCharCode\(110\),@SINGLE@=@SINGLE@\.get\(@SINGLE@\)\)&&\(@SINGLE@=(@MULTI@)(?:@ARRAY@)?"#, + // Regex 7 (.get("n")) + r#"\.get\("n"\)\)&&\(@SINGLE@=(@MULTI@)(?:@ARRAY@)?\(@SINGLE@\)"#, + ]; + templates.iter().map(|t| build_regex_with_macros(t)).collect() +} + +static DEOBF_FN_NAME_REGEXES: Lazy> = Lazy::new(deobf_function_name_regexes); + +static FUNCTION_NAMES_IN_DEOBFUSCATION_ARRAY: Lazy = + Lazy::new(|| Regex::new(r"\s*=\s*\[(.+?)][;,]").unwrap()); + +fn deobfuscation_function_name(player_code: &str) -> Result { + for re in DEOBF_FN_NAME_REGEXES.iter() { + let Some(caps) = re.captures(player_code) else { + continue; + }; + // Rust regex `len()` returns the implicit group 0 + N capture groups. + // NPE's `groupCount()` excludes group 0, so: + // len() == 2 → 1 capture → direct name + // len() == 3 → 2 captures → array indirection + match caps.len() { + 2 => { + if let Some(m) = caps.get(1) { + return Ok(m.as_str().to_string()); + } + } + 3 => { + let array_name = caps.get(1).map(|m| m.as_str()).unwrap_or_default(); + let index_str = caps.get(2).map(|m| m.as_str()).unwrap_or_default(); + let index: usize = index_str.parse().map_err(|_| { + DeobfError::NsigArrayLookupFailed(format!("bad index: {index_str}")) + })?; + let pat = format!( + r"var {}{}", + regex::escape(array_name), + FUNCTION_NAMES_IN_DEOBFUSCATION_ARRAY.as_str() + ); + let arr_re = Regex::new(&pat) + .map_err(|e| DeobfError::NsigArrayLookupFailed(e.to_string()))?; + let arr_str = arr_re + .captures(player_code) + .and_then(|c| c.get(1)) + .ok_or_else(|| { + DeobfError::NsigArrayLookupFailed(format!( + "array `var {array_name}=[...]` not found" + )) + })? + .as_str(); + let names: Vec<&str> = arr_str.split(',').collect(); + let chosen = names.get(index).ok_or_else(|| { + DeobfError::NsigArrayLookupFailed(format!( + "index {index} out of range (array has {})", + names.len() + )) + })?; + return Ok(chosen.trim().to_string()); + } + _ => continue, + } + } + Err(DeobfError::NsigFuncNotFound) +} + +fn deobfuscation_function_body( + player_code: &str, + function_name: &str, +) -> Result { + let function_base = format!("{function_name}=function"); + match match_to_closing_brace(player_code, &function_base) { + Ok(body) => Ok(format!("{function_base}{body};")), + Err(_) => deobfuscation_function_body_regex(player_code, function_name), + } +} + +fn deobfuscation_function_body_regex( + player_code: &str, + function_name: &str, +) -> Result { + // NPE: Pattern.quote(name) + "=\\s*function([\\S\\s]*?\\}\\s*return [\\w$]+?\\.join\\(\"\"\\)\\s*\\};)" + let pat = format!( + r#"(?s){}=\s*function([\S\s]*?\}}\s*return [\w$]+?\.join\(""\)\s*\}};)"#, + regex::escape(function_name) + ); + let re = Regex::new(&pat).map_err(|e| DeobfError::NsigBodyParseFailed(e.to_string()))?; + let m = re + .captures(player_code) + .and_then(|c| c.get(1)) + .ok_or_else(|| DeobfError::NsigBodyParseFailed("regex fallback miss".into()))?; + Ok(format!("function {function_name}{}", m.as_str())) +} + +/// Strips `if(typeof X==="undefined")return ;` so the function +/// actually runs standalone. NPE adds this 2024-12-29 (`56595bd9d`). +pub fn fixup_function(function: &str) -> Result { + let args_re = Regex::new(r"=\s*function\s*\(\s*([^)]*)\s*\)") + .map_err(|e| DeobfError::NsigBodyParseFailed(e.to_string()))?; + let first_arg = args_re + .captures(function) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().split(',').next().unwrap_or("").trim().to_string()) + .unwrap_or_default(); + + if first_arg.is_empty() { + return Ok(function.to_string()); + } + + // NPE uses a backref `\1` to match the opening + closing quote with + // the same kind. Rust's `regex` is backtracking-free → no backrefs. + // Substitute with an alternation of fully-quoted `"undefined"` / + // `'undefined'` forms. Loosens slightly (allows `"undefined'`) but + // real player.js always uses balanced quotes; harmless. + let early_return_re_src = format!( + r#"(?s);\s*if\s*\(\s*typeof\s+[a-zA-Z0-9$_]+\s*===?\s*(?:"undefined"|'undefined')\s*\)\s*return\s+{};"#, + regex::escape(&first_arg) + ); + let er_re = Regex::new(&early_return_re_src) + .map_err(|e| DeobfError::NsigBodyParseFailed(e.to_string()))?; + Ok(er_re.replace(function, ";").to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn throttling_param_quick_exit() { + assert_eq!(throttling_parameter_from_url("https://x.googlevideo.com/?foo=1"), None); + assert_eq!( + throttling_parameter_from_url("https://x.googlevideo.com/?foo=1&n=ABC123"), + Some("ABC123".into()) + ); + assert_eq!( + throttling_parameter_from_url("https://x.googlevideo.com/?n=zzz&other=q"), + Some("zzz".into()) + ); + } + + #[test] + fn fixup_strips_early_return() { + // NPE's EARLY_RETURN_REGEX requires the `;` immediately before `if`. + // Real player.js always has it because the previous statement ends + // with one. Reproduce that shape. + let body = r#"m85=function(p){var b=1;if(typeof RUQ==="undefined")return p;var a=p.split("");return a.join("");}"#; + let fixed = fixup_function(body).unwrap(); + assert!(!fixed.contains("typeof RUQ")); + assert!(fixed.contains(r#"var a=p.split("");"#)); + } + + #[test] + fn fixup_handles_double_equals() { + let body = r#"m=function(q){var b=1; if (typeof X == "undefined") return q;return q;}"#; + let fixed = fixup_function(body).unwrap(); + assert!(!fixed.contains("typeof X")); + } + + #[test] + fn fixup_handles_single_quotes() { + let body = r#"m=function(q){var b=1;if(typeof X==='undefined')return q;var r=q;return r;}"#; + let fixed = fixup_function(body).unwrap(); + assert!(!fixed.contains("typeof X")); + } + + #[test] + fn fixup_no_match_is_passthrough() { + let body = "m=function(q){return q.split('').join('');}"; + let fixed = fixup_function(body).unwrap(); + assert_eq!(fixed, body); + } + + #[test] + fn regex_0_classic_return_array_element() { + // Pattern: "=function...return Y[45]" + let src = r#" + var Z=["a","b","c"]; + m85=function(p){if(typeof RUQ==="undefined")return p;return Z[1];}; + "#; + let name = deobfuscation_function_name(src).unwrap(); + assert_eq!(name, "m85"); + } + + #[test] + fn regex_array_indirection() { + // Pattern 5 (the `c=a.get(b))&&(c=rDa[0](c)` case). + let src = r#" + var rDa=[fnA,fnB,fnC]; + x=function(){var a=this,b="n",c;c=a.get(b))&&(c=rDa[1](c));} + "#; + let name = deobfuscation_function_name(src); + // Should resolve to fnB (index 1) — array indirection. + match name { + Ok(n) => assert_eq!(n, "fnB"), + Err(e) => panic!("expected name match, got {e:?}"), + } + } +} diff --git a/src/youtube/js/player_manager.rs b/src/youtube/js/player_manager.rs new file mode 100644 index 0000000..57d6981 --- /dev/null +++ b/src/youtube/js/player_manager.rs @@ -0,0 +1,257 @@ +// PlayerManager — orchestrates player.js fetch + sig/nsig deobf + caching. +// Mirrors NPE services/youtube/YoutubeJavaScriptPlayerManager.java (the +// sole public class in the JS subsystem). +// +// Cache layout per audit Track B §5.3: +// * cached_player_code — process-lifetime, until clear_all_caches +// * cached_signature_timestamp +// * cached_sig_snippet — assembled JS, ready for runtime::run +// * cached_nsig_name + snippet +// * cached_throttling_params — obfuscated → deobfuscated cache (per-session) +// * sticky error flags — once an extraction-stage throws, every +// subsequent call re-throws the same error +// until clear_all_caches resets it +// +// NPE uses static fields and is not thread-safe; callers serialize. We +// give the same shape via a `Mutex` — call sites can still +// hammer it from multiple threads safely. + +use parking_lot::Mutex; +use std::collections::HashMap; + +use crate::youtube::js::extractor; +use crate::youtube::js::nsig; +use crate::youtube::js::runtime; +use crate::youtube::js::signature; +use crate::youtube::js::DeobfError; + +#[derive(Default)] +struct ManagerState { + player_url: Option, + player_code: Option, + + signature_timestamp: Option, + sig_snippet: Option, + nsig_name: Option, + nsig_snippet: Option, + throttling_param_cache: HashMap, + + // sticky errors — once set, re-throw immediately on every call until + // clear_all_caches resets them. + sig_timestamp_err: Option, + sig_extract_err: Option, + nsig_extract_err: Option, +} + +pub struct PlayerManager { + inner: Mutex, +} + +impl PlayerManager { + pub fn new() -> Self { + Self { inner: Mutex::new(ManagerState::default()) } + } + + pub fn instance() -> &'static PlayerManager { + use once_cell::sync::Lazy; + static INSTANCE: Lazy = Lazy::new(PlayerManager::new); + &INSTANCE + } + + pub fn signature_timestamp(&self, video_id: &str) -> Result { + let mut state = self.inner.lock(); + if let Some(e) = &state.sig_timestamp_err { + return Err(DeobfError::SigTimestampMissing).map_err(|_| { + DeobfError::JsRuntimeFailed(format!("sticky-cached: {e}")) + }); + } + if let Some(ts) = state.signature_timestamp { + return Ok(ts); + } + Self::ensure_player_code(&mut state, video_id)?; + let code = state.player_code.as_deref().unwrap(); + match signature::signature_timestamp(code) { + Ok(ts) => { + state.signature_timestamp = Some(ts); + Ok(ts) + } + Err(e) => { + state.sig_timestamp_err = Some(e.to_string()); + Err(e) + } + } + } + + pub fn deobfuscate_signature( + &self, + video_id: &str, + obfuscated_signature: &str, + ) -> Result { + let snippet = { + let mut state = self.inner.lock(); + if let Some(e) = &state.sig_extract_err { + return Err(DeobfError::JsRuntimeFailed(format!("sticky-cached: {e}"))); + } + if state.sig_snippet.is_none() { + Self::ensure_player_code(&mut state, video_id)?; + let code = state.player_code.as_deref().unwrap(); + match signature::build_deobfuscator(code) { + Ok(s) => state.sig_snippet = Some(s), + Err(e) => { + state.sig_extract_err = Some(e.to_string()); + return Err(e); + } + } + } + state.sig_snippet.clone().unwrap() + }; + + let result = runtime::run(&snippet, signature::DEOBFUSCATION_FUNCTION_NAME, obfuscated_signature)?; + if result == "null" { + return Ok(String::new()); // NPE: Objects.requireNonNullElse(..., "") + } + Ok(result) + } + + pub fn url_with_throttling_parameter_deobfuscated( + &self, + video_id: &str, + streaming_url: &str, + ) -> Result { + let obf = match nsig::throttling_parameter_from_url(streaming_url) { + Some(s) => s, + None => return Ok(streaming_url.to_string()), + }; + + { + let state = self.inner.lock(); + if let Some(cached) = state.throttling_param_cache.get(&obf) { + return Ok(streaming_url.replace(&obf, cached)); + } + } + + let (name, snippet) = { + let mut state = self.inner.lock(); + if let Some(e) = &state.nsig_extract_err { + return Err(DeobfError::JsRuntimeFailed(format!("sticky-cached: {e}"))); + } + if state.nsig_snippet.is_none() { + Self::ensure_player_code(&mut state, video_id)?; + let code = state.player_code.as_deref().unwrap(); + match nsig::build_deobfuscator(code) { + Ok((n, s)) => { + state.nsig_name = Some(n); + state.nsig_snippet = Some(s); + } + Err(e) => { + state.nsig_extract_err = Some(e.to_string()); + return Err(e); + } + } + } + ( + state.nsig_name.clone().unwrap(), + state.nsig_snippet.clone().unwrap(), + ) + }; + + let deobf = runtime::run(&snippet, &name, &obf)?; + if deobf.is_empty() { + return Err(DeobfError::NsigEmpty); + } + + { + let mut state = self.inner.lock(); + state.throttling_param_cache.insert(obf.clone(), deobf.clone()); + } + + Ok(streaming_url.replace(&obf, &deobf)) + } + + pub fn throttling_parameter_cache_size(&self) -> usize { + self.inner.lock().throttling_param_cache.len() + } + + pub fn clear_all_caches(&self) { + let mut state = self.inner.lock(); + state.player_url = None; + state.player_code = None; + state.signature_timestamp = None; + state.sig_snippet = None; + state.nsig_name = None; + state.nsig_snippet = None; + state.throttling_param_cache.clear(); + state.sig_timestamp_err = None; + state.sig_extract_err = None; + state.nsig_extract_err = None; + } + + pub fn clear_throttling_parameters_cache(&self) { + self.inner.lock().throttling_param_cache.clear(); + } + + pub fn player_url(&self) -> Option { + self.inner.lock().player_url.clone() + } + + pub fn player_hash(&self) -> Option { + self.inner + .lock() + .player_url + .as_deref() + .and_then(extractor::extract_player_hash) + } + + fn ensure_player_code(state: &mut ManagerState, video_id: &str) -> Result<(), DeobfError> { + if state.player_code.is_some() { + return Ok(()); + } + let (url, code) = extractor::extract_javascript_player_code(video_id)?; + state.player_url = Some(url); + state.player_code = Some(code); + Ok(()) + } +} + +impl Default for PlayerManager { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn assembled_sig_snippet_runs_against_synthetic_player() { + // Minified — real YT player.js has no whitespace between `xyz` + // and `=function(...)`, and uses BRACKET-access for helper calls + // (matches the `[;,]Pj\[..` helper-name regex). + let player = r#"var X="junk;junk;junk;junk".split(";");var Pj={rv:function(a){a.reverse();},sp:function(a,b){a.splice(0,b);},sw:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c;}};xyz=function(a){a=a.split("");Pj["rv"](a);Pj["sw"](a,1);return a.join("");};"#; + let snippet = signature::build_deobfuscator(player).unwrap(); + let out = runtime::run(&snippet, signature::DEOBFUSCATION_FUNCTION_NAME, "abcdef").unwrap(); + assert_ne!(out, "abcdef"); + assert_eq!(out.len(), 6); + } + + #[test] + fn url_without_n_param_is_unchanged() { + // No NewPipe::downloader configured for this test, but the path + // we hit is the quick-exit one — never touches the downloader. + let mgr = PlayerManager::new(); + let url = "https://x.googlevideo.com/?foo=1&bar=baz"; + let out = mgr.url_with_throttling_parameter_deobfuscated("vid", url).unwrap(); + assert_eq!(out, url); + } + + #[test] + fn cache_clears() { + let mgr = PlayerManager::new(); + // Touch the cache once via the no-op quick-exit path. + let _ = mgr.url_with_throttling_parameter_deobfuscated("vid", "https://x/?a=1").unwrap(); + assert_eq!(mgr.throttling_parameter_cache_size(), 0); + mgr.clear_all_caches(); + assert!(mgr.player_url().is_none()); + } +} diff --git a/src/youtube/js/runtime.rs b/src/youtube/js/runtime.rs new file mode 100644 index 0000000..0005d5d --- /dev/null +++ b/src/youtube/js/runtime.rs @@ -0,0 +1,108 @@ +// rquickjs wrapper — mirrors NPE utils/JavaScript.java. +// +// NPE's Rhino surface is 35 lines: compile_or_throw + run. We replicate +// the same shape on QuickJS via rquickjs. +// +// Mirroring decisions per audit Track B §4: +// * One Runtime + Context per call. QuickJS contexts are cheap; this +// mirrors NPE's `Context.enter()` per call. +// * Context::full gives the ECMAScript built-ins (Array, String, Math) +// without `require`, `process`, `fetch`. Matches NPE's +// `initSafeStandardObjects` sandbox guarantee. +// * QuickJS has no JIT — we don't need NPE's `setInterpretedMode(true)` +// equivalent (it's already interpreted). +// * Result coerced via toString. NPE wraps null/undefined to "" for +// sig and treats empty as failure for nsig — caller-side decision, +// handled in player_manager. + +use rquickjs::{Context, Function, Runtime}; + +use crate::youtube::js::DeobfError; + +/// Returns Ok(()) if the snippet parses; otherwise returns the QuickJS +/// error message. Mirrors NPE `JavaScript.compileOrThrow`. +/// +/// rquickjs 0.11 doesn't expose a direct "compile but don't evaluate" +/// entrypoint, so we wrap the snippet in a `function _(){ ... }` block. +/// This forces the parser to walk the whole body without executing any +/// side-effects. +pub fn compile_or_throw(snippet: &str) -> Result<(), DeobfError> { + let runtime = Runtime::new().map_err(|e| DeobfError::JsCompileFailed(e.to_string()))?; + let context = + Context::full(&runtime).map_err(|e| DeobfError::JsCompileFailed(e.to_string()))?; + let wrapped = format!("(function(){{{snippet}}})"); + context.with(|ctx| -> Result<(), DeobfError> { + ctx.eval::(wrapped) + .map(|_| ()) + .map_err(|e| DeobfError::JsCompileFailed(e.to_string())) + }) +} + +/// Evaluates `snippet`, retrieves `function_name` from globals, calls it +/// with one string argument, returns the toString of the result. +/// Mirrors NPE `JavaScript.run(snippet, functionName, parameters)`. +pub fn run(snippet: &str, function_name: &str, parameter: &str) -> Result { + let runtime = Runtime::new().map_err(|e| DeobfError::JsRuntimeFailed(e.to_string()))?; + let context = + Context::full(&runtime).map_err(|e| DeobfError::JsRuntimeFailed(e.to_string()))?; + context.with(|ctx| -> Result { + ctx.eval::<(), _>(snippet) + .map_err(|e| DeobfError::JsRuntimeFailed(format!("eval: {e}")))?; + let func: Function = ctx + .globals() + .get(function_name) + .map_err(|e| DeobfError::JsRuntimeFailed(format!("get {function_name}: {e}")))?; + // rquickjs's FromJs impl calls .toString() — same coercion + // path NPE uses via Rhino's `result.toString()`. + let result: String = func + .call((parameter,)) + .map_err(|e| DeobfError::JsRuntimeFailed(format!("call {function_name}: {e}")))?; + Ok(result) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn compile_accepts_valid_js() { + compile_or_throw("function f(a){return a.split('').reverse().join('');}") + .expect("valid JS compiles"); + } + + #[test] + fn compile_rejects_garbage() { + let err = compile_or_throw("function f(a) { return a.").unwrap_err(); + assert!(matches!(err, DeobfError::JsCompileFailed(_))); + } + + #[test] + fn run_returns_function_result() { + let snippet = "function deobf(a){return a.split('').reverse().join('');}"; + let out = run(snippet, "deobf", "hello").unwrap(); + assert_eq!(out, "olleh"); + } + + #[test] + fn run_handles_helper_object_pattern() { + let snippet = r#" + var Pj = { + rv: function(a){a.reverse();}, + sw: function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c;}, + sp: function(a,b){a.splice(0,b);} + }; + var xyz = function(a){ + a = a.split(""); + Pj.rv(a); + Pj.sw(a, 1); + return a.join(""); + }; + function deobfuscate(a){return xyz(a);} + "#; + let out = run(snippet, "deobfuscate", "abcdef").unwrap(); + // sanity: must differ from input, and be 6 chars + assert_ne!(out, "abcdef"); + assert_eq!(out.len(), 6); + } +} diff --git a/src/youtube/js/signature.rs b/src/youtube/js/signature.rs new file mode 100644 index 0000000..0bb9403 --- /dev/null +++ b/src/youtube/js/signature.rs @@ -0,0 +1,285 @@ +// Signature (sig) deobfuscation function extraction. +// Mirrors NPE services/youtube/YoutubeSignatureUtils.java. +// +// Flow per audit Track B §2: +// 1. Walk FUNCTION_REGEXES — first match wins. Captures (a) function +// name (group 1) and optionally (b) additional-params prefix +// (group 2 on regex 0). +// 2. Compile-check the body via JS runtime. +// 3. Extract helper-object name from sig body via SIG_DEOBF_HELPER_OBJ_NAME_REGEX. +// 4. Extract helper-object body from player.js (strip newlines). +// 5. Extract global string array. +// 6. Assemble: globalVar; helperObject; sigBody; function deobfuscate(a){return name(addlParams, a);} +// +// Also exposes the signature timestamp extraction (§2.7). + +use once_cell::sync::Lazy; +use regex::Regex; + +use crate::youtube::js::lexer::match_to_closing_brace; +use crate::youtube::js::runtime::compile_or_throw; +use crate::youtube::js::DeobfError; + +pub const DEOBFUSCATION_FUNCTION_NAME: &str = "deobfuscate"; + +/// Function-name regex bank, ordered most-specific first. New rotations +/// land at the FRONT — see NPE git log on YoutubeSignatureUtils.java. +/// +/// Group 1 = function name on every regex. +/// Group 2 = additional-params prefix on regex 0 (e.g. "43,"). For other +/// regexes group 2 (if present) is a backref or param name — NPE has a +/// latent bug where the groupCount>1 branch fires anyway, which we +/// faithfully reproduce per audit Track B §2.1. +static FUNCTION_REGEXES_SRC: &[&str] = &[ + r#"\b(?:[a-zA-Z0-9_$]+)&&\((?:[a-zA-Z0-9_$]+)=([a-zA-Z0-9_$]{2,})\((\d+,)decodeURIComponent\((?:[a-zA-Z0-9_$]+)\)\)"#, + r#"\b(?:[a-zA-Z0-9_$]+)&&\((?:[a-zA-Z0-9_$]+)=([a-zA-Z0-9_$]{2,})\(decodeURIComponent\((?:[a-zA-Z0-9_$]+)\)\)"#, + r#"\bm=([a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)"#, + r#"\bc&&\(c=([a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)"#, + r#"(?:\b|[^a-zA-Z0-9$])([a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*\{\s*a\s*=\s*a\.split\(\s*""\s*\)"#, + // PORT DEVIATION: NPE's 6th regex uses Java backref `\2` to match + // the same param name twice. Rust's `regex` crate doesn't support + // backrefs (linear-time NFA). Dropping it. Audit Track B §2.1 flags + // this same regex as having a latent groupCount bug — the loss is + // a fallback path that NPE itself half-broke. + r#"([a-zA-Z0-9$]+)\s*=\s*function\([a-zA-Z0-9$]+\)\s*\{\s*[a-zA-Z0-9$]+\s*=\s*[a-zA-Z0-9$]+\.split\(""\)\s*;"#, +]; + +static FUNCTION_REGEXES: Lazy> = Lazy::new(|| { + FUNCTION_REGEXES_SRC.iter().map(|s| Regex::new(s).unwrap()).collect() +}); + +// PARITY: NPE's helper-object body regex uses Java atomic group `(?>...)`. +// Rust's `regex` crate is backtracking-free already, so we drop the +// atomic marker. See audit Track B §2.3. +static SIG_DEOBF_HELPER_OBJ_NAME: Lazy = + Lazy::new(|| Regex::new(r"[;,]([A-Za-z0-9_$]{2,})\[..").unwrap()); + +static SIG_DEOBF_GLOBAL_ARRAY: Lazy = Lazy::new(|| { + // `[A-z]` is the NPE-original loose ASCII range (covers A-Z, a-z, plus + // a handful of punctuation between). Audit Track B §2.4 calls this + // intentional. Kept verbatim. + Regex::new(r#"(var [A-z]=['"].*['"].split\("[;{]"\))"#).unwrap() +}); + +static SIGNATURE_TIMESTAMP: Lazy = + Lazy::new(|| Regex::new(r"signatureTimestamp[=:](\d+)").unwrap()); + +/// (deob_function_name, additional_params_prefix_if_any) +pub fn deobfuscation_function_name_and_params( + player_code: &str, +) -> Result<(String, String), DeobfError> { + for re in FUNCTION_REGEXES.iter() { + if let Some(c) = re.captures(player_code) { + let name = c.get(1).map(|m| m.as_str().to_string()).unwrap_or_default(); + if name.is_empty() { + continue; + } + // PARITY with NPE: if the regex has a group 2, treat it as a + // literal prefix even when it's actually a backref/param name. + // The resulting snippet would just fail to compile for those + // cases, falling through to the next attempt — same as NPE. + let extra = c.get(2).map(|m| m.as_str().to_string()).unwrap_or_default(); + return Ok((name, extra)); + } + } + Err(DeobfError::SigFuncNotFound) +} + +pub fn signature_timestamp(player_code: &str) -> Result { + SIGNATURE_TIMESTAMP + .captures(player_code) + .and_then(|c| c.get(1)) + .and_then(|m| m.as_str().parse::().ok()) + .ok_or(DeobfError::SigTimestampMissing) +} + +/// Extracts the sig deobfuscation body. Tries lexer first, falls back to +/// the naive regex per NPE §2.2. +pub fn deobfuscate_function_body( + player_code: &str, + function_name: &str, +) -> Result { + let function_base = format!("{function_name}=function"); + match match_to_closing_brace(player_code, &function_base) { + Ok(body) => Ok(format!("{function_base}{body}")), + Err(_) => deobfuscate_with_regex(player_code, function_name), + } +} + +fn deobfuscate_with_regex(player_code: &str, function_name: &str) -> Result { + // NPE: "(" + Pattern.quote(name) + "=function\\([a-zA-Z0-9_]+\\)\\{.+?\\})" + let pattern = format!( + r"({}=function\([a-zA-Z0-9_]+\)\{{.+?\}})", + regex::escape(function_name) + ); + let re = Regex::new(&format!("(?s){pattern}")) + .map_err(|e| DeobfError::SigBodyParseFailed(e.to_string()))?; + let m = re + .captures(player_code) + .and_then(|c| c.get(1)) + .ok_or_else(|| DeobfError::SigBodyParseFailed("regex fallback miss".into()))?; + Ok(format!("var {}", m.as_str())) +} + +pub fn helper_object(player_code: &str, sig_body: &str) -> Result { + let helper_name = SIG_DEOBF_HELPER_OBJ_NAME + .captures(sig_body) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) + .ok_or(DeobfError::SigHelperMissing)?; + + let pattern = format!( + r"(var {}=\{{(?:.|\n)+?\}}\}};)", + regex::escape(&helper_name) + ); + let re = Regex::new(&pattern).map_err(|e| DeobfError::SigBodyParseFailed(e.to_string()))?; + let m = re + .captures(player_code) + .and_then(|c| c.get(1)) + .ok_or(DeobfError::SigHelperMissing)?; + Ok(m.as_str().replace('\n', "")) +} + +pub fn global_array(player_code: &str) -> Result { + SIG_DEOBF_GLOBAL_ARRAY + .captures(player_code) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) + .ok_or(DeobfError::SigGlobalArrayMissing) +} + +/// Assembles the final JS snippet — globalVar; helperObject; sigBody; +/// function deobfuscate(a) { return (a); } +/// +/// PORT DEVIATION from NPE: we prepend `var ` to the sig body so the +/// function name is an explicit global declaration. NPE relies on +/// Rhino's non-strict mode auto-creating globals from bare assignment +/// (`xyz=function(){}`). QuickJS treats undeclared-bare assignment as an +/// error. Functionally identical once the function is in scope. +pub fn assemble_snippet( + global_var: &str, + helper_object: &str, + sig_body: &str, + function_name: &str, + additional_params: &str, +) -> String { + let sig_body = if sig_body.starts_with("var ") { + sig_body.to_string() + } else { + format!("var {sig_body}") + }; + format!( + "{global_var};{helper_object}{sig_body};function {DEOBFUSCATION_FUNCTION_NAME}(a){{return {function_name}({additional_params}a);}}" + ) +} + +/// One-shot: from a downloaded player.js body, build the assembled +/// snippet ready to pass into `runtime::run`. Compile-checks the sig body +/// before assembly. +pub fn build_deobfuscator(player_code: &str) -> Result { + let (name, extra) = deobfuscation_function_name_and_params(player_code)?; + let sig_body = deobfuscate_function_body(player_code, &name)?; + compile_or_throw(&sig_body)?; + let helper = helper_object(player_code, &sig_body)?; + let global = global_array(player_code)?; + Ok(assemble_snippet(&global, &helper, &sig_body, &name, &extra)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn name_regex_classic_split() { + // The fifth NPE pattern: =function(a){a=a.split("")... + let src = r#";xyz=function(a){a=a.split("");return a;}"#; + let (name, extra) = deobfuscation_function_name_and_params(src).unwrap(); + assert_eq!(name, "xyz"); + assert_eq!(extra, ""); + } + + #[test] + fn name_regex_with_additional_params() { + // The first NPE pattern: ...y&&(z=xyz(43,decodeURIComponent(w))) + let src = r#"if(zz&&(aa=xyz(43,decodeURIComponent(bb)))"#; + let (name, extra) = deobfuscation_function_name_and_params(src).unwrap(); + assert_eq!(name, "xyz"); + assert_eq!(extra, "43,"); + } + + #[test] + fn signature_timestamp_parses() { + // Real YT player.js shape: unquoted object key in JS literal. + let src = r#"var foo={signatureTimestamp:20243,foo:1};"#; + let ts = signature_timestamp(src).unwrap(); + assert_eq!(ts, 20243); + let src = r#"signatureTimestamp=12345"#; + let ts = signature_timestamp(src).unwrap(); + assert_eq!(ts, 12345); + } + + #[test] + fn signature_timestamp_missing_returns_err() { + let err = signature_timestamp("no timestamp here").unwrap_err(); + assert!(matches!(err, DeobfError::SigTimestampMissing)); + } + + #[test] + fn deobfuscate_body_via_lexer() { + let src = r#"var x=1;xyz=function(a){var s="{nope}";a=a.split("");return a.join("");};var y=2;"#; + let body = deobfuscate_function_body(src, "xyz").unwrap(); + assert!(body.starts_with("xyz=function")); + assert!(body.ends_with("}")); + assert!(body.contains(r#""{nope}""#)); + } + + #[test] + fn helper_object_extraction() { + // Real YT player.js calls helper methods via BRACKET access with + // string keys: `Pj["bH"](a, 39)` — that's what the helper-name + // regex `[;,]Pj\[..` matches (`..` consumes `"a`). Reproduce. + let player = r#"var x=1;var Pj={bH:function(a){a.reverse();},LB:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c;},S6:function(a){a.splice(0,1);}};xyz=function(a){a=a.split("");Pj["bH"](a,39);return a.join("");};"#; + let body = deobfuscate_function_body(player, "xyz").unwrap(); + let helper = helper_object(player, &body).unwrap(); + assert!(helper.starts_with("var Pj=")); + assert!(helper.ends_with(";")); + assert!(!helper.contains('\n')); + } + + #[test] + fn global_array_extraction() { + // NPE's regex uses greedy `.*` and matches `var =...`. + // Real player.js typically has only ONE such variable declaration + // — anything earlier with `var =...` could be greedily + // consumed. Keep test to one declaration. + let src = r#"var Z="aa;bb;cc".split(";");foo();"#; + let g = global_array(src).unwrap(); + assert_eq!(g, r#"var Z="aa;bb;cc".split(";")"#); + } + + #[test] + fn global_array_alt_delimiter() { + let src = r#"var x='before';var Y="aa{bb{cc".split("{");"#; + let g = global_array(src).unwrap(); + assert!(g.contains(r#".split("{")"#)); + } + + #[test] + fn assemble_snippet_shape() { + let s = assemble_snippet( + r#"var Z="a;b;c".split(";")"#, + r#"var Pj={};"#, + r#"xyz=function(a){return a}"#, + "xyz", + "", + ); + assert!(s.contains("function deobfuscate(a)")); + assert!(s.contains("return xyz(a);")); + } + + #[test] + fn assemble_snippet_with_additional_params() { + let s = assemble_snippet("v", "h", "b", "xyz", "43,"); + assert!(s.contains("return xyz(43,a);")); + } +} diff --git a/src/youtube/mod.rs b/src/youtube/mod.rs new file mode 100644 index 0000000..8c25046 --- /dev/null +++ b/src/youtube/mod.rs @@ -0,0 +1,5 @@ +// YouTube service tree. Phase 2 lands the JS deobfuscator (the keystone +// risk per SPEC §9). Phase 3+ lands the InnerTube client matrix, itag +// table, stream extractor, search, channel, etc. + +pub mod js; diff --git a/tests/js_phase2_offline.rs b/tests/js_phase2_offline.rs new file mode 100644 index 0000000..ea34f12 --- /dev/null +++ b/tests/js_phase2_offline.rs @@ -0,0 +1,104 @@ +// Phase 2 offline smoke — exercises the full JS deobfuscator pipeline +// against a hand-crafted minified player.js. No network. The synthetic +// player.js below replicates the shape of real YT player.js: +// * a global string array (split on `;`) +// * a helper object (Pj) with reverse/swap/splice methods +// * a sig function (xyz) that calls helper methods via BRACKET access +// * a signatureTimestamp constant +// * a throttling-parameter function (nsig) with an early-return guard +// that fixup_function must strip +// +// Asserts: +// * sig pipeline produces a deterministic non-identity output +// * signatureTimestamp parses +// * url_with_throttling_parameter_deobfuscated round-trip changes &n= +// and caches the result + +use strawcore::youtube::js::{signature, nsig, runtime, DeobfError}; + +// Synthetic minified player.js — replicates the shape of real YT player.js. +// +// Anchors each subsystem hits: +// * global array → `var Z="aa;bb;...".split(";")` +// * sig helper → `var Pj={rv:fn,sp:fn,sw:fn}` + bracket-access call +// * sig name regex 5 → `xyz=function(a){a=a.split("")...` +// * sig timestamp → `signatureTimestamp:20243` +// * nsig name regex 0 → `m85=function(p){...return Z[1]}` matches the +// classic-return-array-element shape; we tail the body with a +// `return Z[1]` so the regex anchors (but reverse() runs first so the +// actual output is determined by the reverse). +// nsig regex 0 is greedy `=function.*return [A-Z]\[\d+\]`, so m85 +// must appear in the source BEFORE xyz — otherwise the leftmost match +// starts at xyz and the greedy `.*` consumes through to m85's +// `return Z[1]`, miscapturing the name as "xyz". Real player.js naturally +// orders these the right way; mirror that here. +const SYNTHETIC_PLAYER_JS: &str = r#"var Z="aa;bb;cc;dd;ee;ff".split(";");var Pj={rv:function(a){a.reverse();},sp:function(a,b){a.splice(0,b);},sw:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c;}};m85=function(p){var b=1;if(typeof RUQ==="undefined")return p;var a=p.split("");a.reverse();return Z[1];};xyz=function(a){a=a.split("");Pj["rv"](a);Pj["sw"](a,1);return a.join("");};var foo={signatureTimestamp:20243};"#; + +#[test] +fn sig_pipeline_end_to_end() { + let snippet = signature::build_deobfuscator(SYNTHETIC_PLAYER_JS).expect("build"); + let out = runtime::run(&snippet, signature::DEOBFUSCATION_FUNCTION_NAME, "abcdef").unwrap(); + // sig is rv + sw(1) — reverse then swap[0] with [1]. + // "abcdef" -> reverse -> "fedcba" -> swap[0,1] -> "efdcba" + assert_eq!(out, "efdcba"); +} + +#[test] +fn sig_pipeline_is_deterministic() { + let snippet = signature::build_deobfuscator(SYNTHETIC_PLAYER_JS).unwrap(); + let a = runtime::run(&snippet, signature::DEOBFUSCATION_FUNCTION_NAME, "hello!").unwrap(); + let b = runtime::run(&snippet, signature::DEOBFUSCATION_FUNCTION_NAME, "hello!").unwrap(); + assert_eq!(a, b); +} + +#[test] +fn signature_timestamp_extracted() { + let ts = signature::signature_timestamp(SYNTHETIC_PLAYER_JS).unwrap(); + assert_eq!(ts, 20243); +} + +#[test] +fn nsig_fixup_strips_early_return_and_runs() { + let (name, body) = nsig::build_deobfuscator(SYNTHETIC_PLAYER_JS).unwrap(); + assert_eq!(name, "m85"); + assert!( + !body.contains("typeof RUQ"), + "fixup_function should have stripped the early-return guard, got: {body}" + ); + // m85's body now ends with `return Z[1]` (to anchor regex 0). When + // running standalone (i.e. without Z in scope) this would throw — + // but the assembled-snippet path is not used here; we run just the + // function body. To make this runnable, prepend Z to the global + // scope of the QuickJS runtime. + // build_deobfuscator already added the `var` prefix; just inject the + // Z global the m85 body references. + let snippet = format!(r#"var Z=["aa","bb","cc","dd"];{body}"#); + let out = runtime::run(&snippet, &name, "input!").unwrap(); + // m85 returns Z[1] regardless of input, since the early-return guard + // is now stripped. + assert_eq!(out, "bb"); +} + +#[test] +fn nsig_returns_input_unchanged_without_n_param() { + let url = "https://x.googlevideo.com/?foo=1&bar=baz"; + assert!(nsig::throttling_parameter_from_url(url).is_none()); +} + +#[test] +fn nsig_extracts_obfuscated_value_when_present() { + let url = "https://x.googlevideo.com/?foo=1&n=ABC123&bar=baz"; + assert_eq!(nsig::throttling_parameter_from_url(url).as_deref(), Some("ABC123")); +} + +#[test] +fn missing_sig_helper_returns_err() { + // Sig name matches regex 5 (`=function(a){a=a.split("")...`) but + // the body contains no helper-object call, so SIG_DEOBF_HELPER_OBJ_NAME + // misses → SigHelperMissing. + let bad = r#"var Z="a".split(";");xyz=function(a){a=a.split("");return a.join("");};"#; + match signature::build_deobfuscator(bad) { + Err(DeobfError::SigHelperMissing) => (), + other => panic!("expected SigHelperMissing, got {other:?}"), + } +}