strawcore/src/youtube/js/extractor.rs
Kayos d4000a9f9a Cleanup: drop playlist + suggestion + dead client constants + suppress_unused stubs
Round-2 cruft audit punch list — mechanical deletes, no behavior change.

Whole modules deleted (no wrapper consumer):
  * youtube/playlist_extractor.rs (297 LOC) — full playlist extraction
  * youtube/linkhandler/playlist.rs (81 LOC) — playlist URL parser
  * youtube/suggestion_extractor.rs (91 LOC) — search-as-you-type
  * tests/stream_phase4_offline.rs (186 LOC) — tautological test

Dead pub fns + enum variants + constants:
  * WEB_REMIX_* constants (3) + WEB_MUSIC_ANALYTICS_* constants (3)
  * InnertubeClientRequestInfo::of_web_music_analytics_charts_client
    factory + its charts_client_omits_platform_and_screen test
  * SearchFilter::Music{Songs,Videos,Albums,Playlists,Artists} variants
    (5 of 9 cases) + uses_music_endpoint helper + the search_extractor
    'music search not implemented' reject branch
  * Two #[allow(dead_code)] _suppress_unused stub fns and the imports
    they were keeping alive (std::sync::Arc in js/extractor.rs,
    NetworkError in stream_extractor.rs)

Renamed:
  * search_extractor::test_helpers -> renderer_helpers. Mis-named:
    it's production code called from channel.rs, not a test fixture.

potoken/ kept and documented as the designed Phase-5 extension point
for YouTube bot-detection — wrapper's Android side hasn't registered
a real provider yet, but the trait + global slot stay so when YT
forces po_token universally the integration is one Kotlin patch away,
not a Rust-side rewrite.

~580 LOC removed from production. Wrapper does not need to change.
2026-05-26 22:16:11 -07:00

189 lines
6.8 KiB
Rust

// player.js URL discovery + download. Mirrors NPE
// services/youtube/YoutubeJavaScriptExtractor.java.
//
// Two discovery paths, in order:
// 1. iframe_api regex (primary)
// 2. embed/<videoId> page — Jsoup script-tag walk + jsUrl regex fallback
//
// PARITY: we deliberately reproduce NPE's bug where `select("script")
// .attr("name", "player/base")` *mutates* the script tags and iterates ALL
// of them. The intent was "find the script with name=player/base" but
// Jsoup's attr-setter doesn't filter. Our walk does the same — iterate
// every script tag, return first whose `src` contains `base.js`.
use once_cell::sync::Lazy;
use regex::Regex;
use crate::downloader::request::Request;
use crate::downloader::Downloader;
use crate::localization::Localization;
use crate::newpipe::NewPipe;
use crate::youtube::js::DeobfError;
// Endpoint fetched by the primary (iframe_api) discovery path.
const IFRAME_API_URL: &str = "https://www.youtube.com/iframe_api";
// Template for the player URL; `{HASH}` is substituted with the hash
// captured from the iframe_api response.
const BASE_JS_PLAYER_URL_FORMAT: &str =
"https://www.youtube.com/s/player/{HASH}/player_ias.vflset/en_GB/base.js";
// Captures the 8-character lowercase-alphanumeric player hash from text of
// the form `player\/<hash>\/`. The backslashes are matched literally, i.e.
// the slashes in the input are expected to be backslash-escaped.
static IFRAME_RES_JS_BASE_PLAYER_HASH: Lazy<Regex> =
Lazy::new(|| Regex::new(r"player\\/([a-z0-9]{8})\\/").unwrap());
// Captures the relative `/s/player/.../base.js` path stored under a
// `"jsUrl"` key in the embed page source. Used as the fallback when no
// matching <script> tag is found.
static EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#""jsUrl":"(/s/player/[A-Za-z0-9]+/player_ias\.vflset/[A-Za-z_-]+/base\.js)""#,
)
.unwrap()
});
// Captures the quoted `src` attribute value of a `<script ...>` opening tag.
// This is a regex approximation of an HTML parser: it only sees tags whose
// `src` value is wrapped in single or double quotes.
static SCRIPT_TAG: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"<script[^>]*\bsrc=["']([^"']+)["'][^>]*>"#).unwrap());
/// Locates YouTube's player.js and downloads it.
///
/// Returns `(url, body)`, where `url` is the normalized absolute URL the
/// script was fetched from and `body` is the script source.
///
/// The iframe_api path is attempted end to end: URL discovery, URL
/// validation and the download itself. If *any* of those steps fails, the
/// same sequence is retried with the URL discovered on the
/// `embed/<video_id>` page. This matches NPE's try/catch flow, where the
/// whole iframe attempt (including the download) sits inside the `try`.
///
/// # Errors
///
/// * `DeobfError::DownloaderMissing` when no downloader is registered.
/// * Otherwise, when both paths fail, the error produced by the embed
///   path; the iframe path's error is intentionally discarded.
pub fn extract_javascript_player_code(video_id: &str) -> Result<(String, String), DeobfError> {
    let downloader = NewPipe::downloader().ok_or(DeobfError::DownloaderMissing)?;

    // Shared tail of both discovery paths: normalize + validate the
    // discovered URL, then download the script it points at.
    let fetch = |raw_url: &str| -> Result<(String, String), DeobfError> {
        let cleaned = clean_javascript_url(raw_url)?;
        let body = download_javascript_code(&*downloader, &cleaned)?;
        Ok((cleaned, body))
    };

    match extract_from_iframe(&*downloader).and_then(|url| fetch(&url)) {
        Ok(player) => Ok(player),
        // Any failure on the iframe path (discovery, validation or
        // download) falls through to the embed page.
        Err(_iframe_err) => {
            extract_from_embed(&*downloader, video_id).and_then(|url| fetch(&url))
        }
    }
}
/// Primary discovery path: fetches the iframe_api resource, pulls the
/// player hash out of it and substitutes it into the base.js URL template.
///
/// Fails with `DeobfError::FetchIframe` when the request cannot be
/// executed, and with `DeobfError::PlayerUrlMissing` when the response
/// contains no recognizable player hash.
fn extract_from_iframe(downloader: &dyn Downloader) -> Result<String, DeobfError> {
    let request = Request::get(IFRAME_API_URL)
        .localization(Some(Localization::default()))
        .build();
    let response = match downloader.execute(request) {
        Ok(response) => response,
        Err(e) => return Err(DeobfError::FetchIframe(e.to_string())),
    };
    let text = response.response_body();
    match IFRAME_RES_JS_BASE_PLAYER_HASH
        .captures(text)
        .and_then(|caps| caps.get(1))
    {
        Some(hash) => Ok(BASE_JS_PLAYER_URL_FORMAT.replace("{HASH}", hash.as_str())),
        None => Err(DeobfError::PlayerUrlMissing),
    }
}
/// Fallback discovery path: fetches the `embed/<video_id>` page and looks
/// for the player script URL in it.
///
/// Lookup order:
/// 1. the first `<script>` tag whose `src` contains `base.js`;
/// 2. the `"jsUrl"` value matched by the embedded-page regex.
///
/// The returned URL is whatever the page contained and may be relative.
/// Fails with `DeobfError::FetchEmbed` when the request cannot be executed
/// and with `DeobfError::PlayerUrlMissing` when neither lookup matches.
fn extract_from_embed(downloader: &dyn Downloader, video_id: &str) -> Result<String, DeobfError> {
    let page_url = format!("https://www.youtube.com/embed/{video_id}");
    let request = Request::get(&page_url)
        .localization(Some(Localization::default()))
        .build();
    let response = downloader
        .execute(request)
        .map_err(|e| DeobfError::FetchEmbed(e.to_string()))?;
    let html = response.response_body();

    // PARITY: NPE's `.attr("name","player/base")` assigns the attribute
    // instead of filtering on it, so every <script> tag is visited. The
    // walk below likewise considers every script tag with a `src`.
    let from_script_tag = SCRIPT_TAG
        .captures_iter(html)
        .filter_map(|caps| caps.get(1))
        .map(|m| m.as_str())
        .find(|src| src.contains("base.js"));

    from_script_tag
        // Only consulted when no script tag matched.
        .or_else(|| {
            EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL
                .captures(html)
                .and_then(|caps| caps.get(1))
                .map(|m| m.as_str())
        })
        .map(String::from)
        .ok_or(DeobfError::PlayerUrlMissing)
}
/// Turns a discovered player URL into an absolute one and validates it.
///
/// * `//host/path` (protocol-relative) gets an `https:` scheme.
/// * `/path` (host-relative) is anchored to `https://www.youtube.com`.
/// * anything else is passed through unchanged.
///
/// The result must parse as a URL; otherwise
/// `DeobfError::InvalidPlayerUrl` is returned with the parser's message.
fn clean_javascript_url(url: &str) -> Result<String, DeobfError> {
    let absolute = match url.strip_prefix("//") {
        Some(rest) => format!("https://{rest}"),
        None if url.starts_with('/') => format!("https://www.youtube.com{url}"),
        None => url.to_owned(),
    };
    // The parsed value is not needed; parsing is purely a validity check.
    match url::Url::parse(&absolute) {
        Ok(_) => Ok(absolute),
        Err(e) => Err(DeobfError::InvalidPlayerUrl(e.to_string())),
    }
}
/// Downloads the player script at `url` and returns its body.
///
/// Any response status other than 200 is treated as a failure. Both
/// transport errors and unexpected statuses are reported as
/// `DeobfError::FetchPlayerCode`; for the latter the message is
/// `HTTP <status>`.
fn download_javascript_code(downloader: &dyn Downloader, url: &str) -> Result<String, DeobfError> {
    let request = Request::get(url)
        .localization(Some(Localization::default()))
        .build();
    let response = match downloader.execute(request) {
        Ok(response) => response,
        Err(e) => return Err(DeobfError::FetchPlayerCode(e.to_string())),
    };
    if response.response_code() == 200 {
        return Ok(response.response_body().to_string());
    }
    Err(DeobfError::FetchPlayerCode(format!(
        "HTTP {}",
        response.response_code()
    )))
}
/// Pulls the player hash out of a player URL such as
/// `https://www.youtube.com/s/player/<hash>/player_ias.vflset/.../base.js`.
///
/// The hash is the 8 alphanumeric characters between `/s/player/` and the
/// next `/`. Returns `None` when the URL has no such segment. Used for
/// rotation detection.
pub fn extract_player_hash(url: &str) -> Option<String> {
    static PLAYER_HASH: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"/s/player/([A-Za-z0-9]{8})/").unwrap());
    let caps = PLAYER_HASH.captures(url)?;
    let hash = caps.get(1)?;
    Some(hash.as_str().to_owned())
}
#[cfg(test)]
mod tests {
    use super::*;

    // The sample carries backslash-escaped slashes around the hash; the
    // regex has to match that escaped form.
    #[test]
    fn iframe_hash_regex_matches_escaped_form() {
        let sample = r#"src:"https://www.youtube.com/s/player\/c2f7551f\/player_ias.vflset/en_US/www-embed.js""#;
        let hash = IFRAME_RES_JS_BASE_PLAYER_HASH
            .captures(sample)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str());
        assert_eq!(hash, Some("c2f7551f"));
    }

    // The `"jsUrl"` fallback must yield the relative base.js path.
    #[test]
    fn embedded_js_url_regex_matches() {
        let sample = r#"...,"jsUrl":"/s/player/abcdef12/player_ias.vflset/en_GB/base.js",..."#;
        let js_url = EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL
            .captures(sample)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str());
        assert_eq!(
            js_url,
            Some("/s/player/abcdef12/player_ias.vflset/en_GB/base.js")
        );
    }

    // The `src` value is captured even when other attributes follow it.
    #[test]
    fn script_tag_regex_finds_src() {
        let html = r#"<html><body><script src="//foo.com/base.js" name="x"></script></body></html>"#;
        let src = SCRIPT_TAG
            .captures(html)
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str());
        assert_eq!(src, Some("//foo.com/base.js"));
    }

    #[test]
    fn clean_url_promotes_protocol_relative() {
        let cleaned = clean_javascript_url("//www.youtube.com/foo/base.js");
        assert_eq!(cleaned.unwrap(), "https://www.youtube.com/foo/base.js");
    }

    #[test]
    fn clean_url_prefixes_youtube_for_absolute_path() {
        let cleaned = clean_javascript_url("/s/player/abc/base.js");
        assert_eq!(
            cleaned.unwrap(),
            "https://www.youtube.com/s/player/abc/base.js"
        );
    }

    #[test]
    fn clean_url_passes_through_full() {
        let input = "https://www.youtube.com/s/player/x/base.js";
        let cleaned = clean_javascript_url(input);
        assert_eq!(cleaned.unwrap(), "https://www.youtube.com/s/player/x/base.js");
    }

    #[test]
    fn player_hash_extracted_from_url() {
        let hash = extract_player_hash(
            "https://www.youtube.com/s/player/c2f7551f/player_ias.vflset/en_GB/base.js",
        );
        assert_eq!(hash, Some(String::from("c2f7551f")));
    }
}