strawcore/src/youtube/parsing.rs
Kayos cd98673684 Phase 4 (partial) — stream value types + InnerTube /player helpers
Lands the data shapes + the HTTP layer for stream extraction. The
extractor orchestrator + DASH manifest creator are deferred to the
next session — the parsing logic is dense enough to want a focused
pass.

src/stream/
  * mod.rs       — StreamInfo + StreamInfoItem (full + 'card' shapes)
                   mirroring NPE StreamInfo.java + StreamInfoItem.java
  * delivery.rs  — DeliveryMethod (Progressive/Dash/Hls/Torrent)
  * audio.rs     — AudioStream (itag, format, url, bitrate, codec,
                   audio_track_id, content_length, etc.)
  * video.rs     — VideoStream (itag, format, url, resolution, fps,
                   bandwidth, codec, video_only flag)
  * subtitles.rs — SubtitlesStream (url, lang, auto_generated, mime)

src/youtube/stream_helper.rs
  * generate_content_playback_nonce() — 16-char LCG-shuffled cpn
  * get_web_metadata_player_response       (microformat + thumbnails only)
  * get_web_embedded_player_response       (embed-url + signatureTimestamp)
  * get_android_player_response            (full Android /player + poToken)
  * get_android_reel_player_response       (no-poToken fallback)
  * get_ios_player_response                (iOS — flagged with 917 KiB cap
                                            warning in the doc comment)

Per-helper headers + URL shapes match audit Track C §2.7 verbatim:
Android/iOS hit gapis endpoint with mobile UA; WEB family hits
www.youtube.com with the WEB headers.

Tests: 64 lib unit tests pass (up from 62 in Phase 3).

Next session: full stream_extractor.rs orchestrator + dash_manifest/
creator + Phase 4 done-when smoke (extract NCS Spektrem).
2026-05-24 17:01:03 -07:00

231 lines
8 KiB
Rust

// YoutubeParsingHelper-shaped helpers — mirrors NPE
// services/youtube/YoutubeParsingHelper.java.
//
// Currently implements:
// * consent toggle + cookie generator (set_consent_accepted, consent_cookie)
// * client-version cache + sw.js fetch fallback (web_client_version, discover_web_client_version)
// * visitor-data bootstrap via /youtubei/v1/visitor_id
// * client/origin/referer header builder
//
// PoToken integration lands in Phase 5. po_token / DroidGuard / BotGuard
// machinery is host-provided (PoTokenProvider trait).
use once_cell::sync::Lazy;
use parking_lot::RwLock;
use regex::Regex;
use serde_json::Value;
use crate::downloader::request::Request;
use crate::exceptions::ParsingError;
use crate::localization::{ContentCountry, Localization};
use crate::newpipe::NewPipe;
use crate::youtube::client_request::{
build_envelope, InnertubeClientRequestInfo,
};
use crate::youtube::constants::*;
/// Process-wide consent flag. Starts as `false`; written by
/// `set_consent_accepted` and read by `is_consent_accepted`.
static CONSENT_ACCEPTED: Lazy<RwLock<bool>> = Lazy::new(|| RwLock::new(false));
/// Process-wide cache of the WEB client version. Starts as `None`; filled by
/// `discover_web_client_version` and cleared by
/// `reset_web_client_version_cache`.
static CACHED_WEB_CLIENT_VERSION: Lazy<RwLock<Option<String>>> = Lazy::new(|| RwLock::new(None));
/// Stores `accepted` as the process-wide consent flag, replacing whatever
/// value was there before.
pub fn set_consent_accepted(accepted: bool) {
    let mut flag = CONSENT_ACCEPTED.write();
    *flag = accepted;
}
/// Returns the current value of the process-wide consent flag.
pub fn is_consent_accepted() -> bool {
    let flag = CONSENT_ACCEPTED.read();
    *flag
}
/// Returns the `SOCS=` consent cookie matching the current consent flag.
///
/// Yields `SOCS=CAISAiAD` when consent has been accepted and `SOCS=CAE=`
/// otherwise. EU users need the accepted form to extract mix-playlist
/// continuations.
pub fn consent_cookie() -> &'static str {
    match is_consent_accepted() {
        true => "SOCS=CAISAiAD",
        false => "SOCS=CAE=",
    }
}
/// Returns the WEB client version to send with requests.
///
/// Prefers the value held in the version cache; when the cache is empty the
/// hardcoded `WEB_HARDCODED_CLIENT_VERSION` constant is returned instead.
/// This function never performs a network request.
pub fn web_client_version() -> String {
    let cached = CACHED_WEB_CLIENT_VERSION.read();
    match cached.as_deref() {
        Some(version) => version.to_owned(),
        None => WEB_HARDCODED_CLIENT_VERSION.to_string(),
    }
}
/// Empties the WEB client version cache so later lookups fall back to the
/// hardcoded constant (or trigger a fresh discovery).
pub fn reset_web_client_version_cache() {
    let mut slot = CACHED_WEB_CLIENT_VERSION.write();
    *slot = None;
}
/// Lazily compiled pattern used to pull the client version out of the sw.js
/// body: matches `INNERTUBE_CONTEXT_CLIENT_VERSION":`, optional whitespace,
/// then a double-quoted value whose contents are capture group 1.
static SW_JS_VERSION_RE: Lazy<Regex> = Lazy::new(|| {
// The pattern is a fixed literal; `unwrap` panics on first use if it ever
// fails to compile.
Regex::new(r#"INNERTUBE_CONTEXT_CLIENT_VERSION":\s*"([^"]+)""#).unwrap()
});
/// Resolves the live WEB client version, fetching `sw.js` when necessary.
///
/// If the version cache already holds a value it is returned without any
/// network traffic. Otherwise `https://www.youtube.com/sw.js` is downloaded,
/// the version is extracted with `SW_JS_VERSION_RE`, stored in the cache and
/// returned.
///
/// # Errors
///
/// * `ParsingError::Invalid` — no downloader is registered, the request
///   fails, or the response status is not 200.
/// * `ParsingError::RegexMiss` — the response body does not contain the
///   expected `INNERTUBE_CONTEXT_CLIENT_VERSION` entry.
pub fn discover_web_client_version() -> Result<String, ParsingError> {
    // Copy the cached value out so the read guard is released before any
    // network work or the later write.
    let cached = CACHED_WEB_CLIENT_VERSION.read().as_ref().cloned();
    if let Some(version) = cached {
        return Ok(version);
    }

    let downloader = match NewPipe::downloader() {
        Some(d) => d,
        None => return Err(ParsingError::Invalid("downloader not initialized".into())),
    };

    let request = Request::get("https://www.youtube.com/sw.js")
        .add_header("Origin", "https://www.youtube.com")
        .add_header("Referer", "https://www.youtube.com")
        .build();

    let response = match downloader.execute(request) {
        Ok(r) => r,
        Err(e) => return Err(ParsingError::Invalid(format!("sw.js fetch: {e}"))),
    };

    let status = response.response_code();
    if status != 200 {
        return Err(ParsingError::Invalid(format!("sw.js HTTP {}", status)));
    }

    let captures = SW_JS_VERSION_RE.captures(response.response_body());
    let version = match captures.as_ref().and_then(|c| c.get(1)) {
        Some(found) => found.as_str().to_string(),
        None => {
            return Err(ParsingError::RegexMiss(
                "INNERTUBE_CONTEXT_CLIENT_VERSION".into(),
            ))
        }
    };

    // Remember the discovered version for subsequent callers.
    {
        let mut slot = CACHED_WEB_CLIENT_VERSION.write();
        *slot = Some(version.clone());
    }
    Ok(version)
}
/// Builds the header list for a WEB-flavor POST.
///
/// The list contains, in order: the JSON content type, the client name and
/// client version headers, `Origin` and `Referer` (both
/// `https://www.youtube.com`), and the consent cookie from `consent_cookie`.
pub fn youtube_post_headers() -> Vec<(String, String)> {
    // Small constructor so each entry reads as `name, value`.
    let header = |name: &str, value: String| (name.to_owned(), value);

    vec![
        header("Content-Type", "application/json".into()),
        header("X-YouTube-Client-Name", WEB_CLIENT_ID.into()),
        header("X-YouTube-Client-Version", web_client_version()),
        header("Origin", "https://www.youtube.com".into()),
        header("Referer", "https://www.youtube.com".into()),
        header("Cookie", consent_cookie().into()),
    ]
}
/// Builds the header list for a mobile (Android/iOS) POST.
///
/// Only the JSON content type, the supplied `user_agent` and
/// `X-Goog-Api-Format-Version: 2` are sent. There is deliberately no
/// X-YouTube-Client-Name, no Origin/Referer and no Cookie
/// (audit Track A §6.2).
pub fn mobile_post_headers(user_agent: &str) -> Vec<(String, String)> {
    [
        ("Content-Type", "application/json"),
        ("User-Agent", user_agent),
        ("X-Goog-Api-Format-Version", "2"),
    ]
    .iter()
    .map(|&(name, value)| (name.to_owned(), value.to_owned()))
    .collect()
}
/// Formats the Android YouTube app User-Agent string.
///
/// The result embeds `ANDROID_CLIENT_VERSION`, a fixed `Android 15` platform
/// tag and the country code taken from `country`, and ends with ` gzip`.
pub fn android_user_agent(country: &ContentCountry) -> String {
    let region = country.country_code();
    format!(
        "com.google.android.youtube/{ANDROID_CLIENT_VERSION} (Linux; U; Android 15; {}) gzip",
        region
    )
}
/// Formats the iOS YouTube app User-Agent string.
///
/// The result embeds `IOS_CLIENT_VERSION`, `IOS_DEVICE_MODEL`,
/// `IOS_USER_AGENT_VERSION` and the country code taken from `country`.
pub fn ios_user_agent(country: &ContentCountry) -> String {
    let region = country.country_code();
    format!(
        "com.google.ios.youtube/{IOS_CLIENT_VERSION}({IOS_DEVICE_MODEL}; U; CPU iOS {IOS_USER_AGENT_VERSION} like Mac OS X; {})",
        region
    )
}
/// Bootstraps a visitor_data token via `/youtubei/v1/visitor_id`. Returns
/// the value of `responseContext.visitorData` from the response.
pub fn bootstrap_visitor_data(
info: &InnertubeClientRequestInfo,
localization: &Localization,
content_country: &ContentCountry,
use_gapis_endpoint: bool,
) -> Result<String, ParsingError> {
let downloader = NewPipe::downloader()
.ok_or_else(|| ParsingError::Invalid("downloader not initialized".into()))?;
let envelope = build_envelope(info, localization, content_country, None);
let body = serde_json::to_vec(&envelope)?;
let base = if use_gapis_endpoint {
YOUTUBEI_V1_GAPIS_URL
} else {
YOUTUBEI_V1_URL
};
let url = format!("{base}visitor_id{DISABLE_PRETTY_PRINT_PARAM}");
let mut req_builder = Request::post(&url, body);
for (k, v) in youtube_post_headers() {
req_builder = req_builder.add_header(&k, &v);
}
let resp = downloader
.execute(req_builder.build())
.map_err(|e| ParsingError::Invalid(format!("visitor_id POST: {e}")))?;
if resp.response_code() != 200 {
return Err(ParsingError::Invalid(format!(
"visitor_id HTTP {}",
resp.response_code()
)));
}
let parsed: Value = serde_json::from_str(resp.response_body())?;
parsed
.get("responseContext")
.and_then(|rc| rc.get("visitorData"))
.and_then(|v| v.as_str())
.map(|s| s.to_string())
.ok_or_else(|| ParsingError::MissingField("responseContext.visitorData".into()))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reports whether `headers` has an entry named exactly `name`.
    fn has_header(headers: &[(String, String)], name: &str) -> bool {
        headers.iter().any(|(key, _)| key.as_str() == name)
    }

    /// Toggling the consent flag switches between the two cookie values.
    #[test]
    fn consent_toggle_flips_cookie() {
        for &(accepted, expected) in &[(false, "SOCS=CAE="), (true, "SOCS=CAISAiAD")] {
            set_consent_accepted(accepted);
            assert_eq!(consent_cookie(), expected);
        }
        set_consent_accepted(false); // reset for other tests
    }

    /// With an empty cache the hardcoded version constant is returned.
    #[test]
    fn web_client_version_falls_back_to_hardcoded() {
        reset_web_client_version_cache();
        assert_eq!(web_client_version(), WEB_HARDCODED_CLIENT_VERSION);
    }

    /// Mobile headers carry the UA and format version but none of the
    /// WEB-only headers.
    #[test]
    fn mobile_headers_omit_client_name_and_referer() {
        let headers = mobile_post_headers("ua/1.0");
        for &present in &["User-Agent", "X-Goog-Api-Format-Version"] {
            assert!(has_header(&headers, present));
        }
        for &absent in &["X-YouTube-Client-Name", "Origin", "Referer", "Cookie"] {
            assert!(!has_header(&headers, absent));
        }
    }

    /// WEB headers carry the client name, origin and consent cookie.
    #[test]
    fn web_headers_include_consent_and_client_id() {
        let headers = youtube_post_headers();
        for &present in &["X-YouTube-Client-Name", "Origin", "Cookie"] {
            assert!(has_header(&headers, present));
        }
    }

    /// Android UA contains the pinned version, platform tag and country.
    #[test]
    fn android_ua_template() {
        let ua = android_user_agent(&ContentCountry::new("DE"));
        for &fragment in &[
            "com.google.android.youtube/21.03.36",
            "Android 15",
            "; DE)",
        ] {
            assert!(ua.contains(fragment));
        }
        assert!(ua.ends_with(" gzip"));
    }

    /// iOS UA contains the pinned version, device model, OS tag and country.
    #[test]
    fn ios_ua_template() {
        let ua = ios_user_agent(&ContentCountry::new("US"));
        for &fragment in &[
            "com.google.ios.youtube/21.03.2",
            "iPhone16,2",
            "CPU iOS 18_7_2",
            "; US)",
        ] {
            assert!(ua.contains(fragment));
        }
    }
}