diff --git a/src/youtube/channel.rs b/src/youtube/channel.rs new file mode 100644 index 0000000..b387c9c --- /dev/null +++ b/src/youtube/channel.rs @@ -0,0 +1,294 @@ +// YoutubeChannelExtractor + helper.resolveChannelId — fetches channel +// info via /youtubei/v1/browse. Mirrors NPE +// services/youtube/extractors/YoutubeChannelExtractor.java + +// YoutubeChannelHelper.java. +// +// Handle / custom URL / legacy user resolution: NPE issues +// `/youtubei/v1/navigation/resolve_url` against the `youtube.com/@handle` +// URL, walks `endpoint.browseEndpoint.browseId` to get the UC... id, and +// retries the browse call. Up to 3 redirect hops. +// +// Tab parsing (videos/shorts/live/playlists) is in audit Track D §5 — +// `tabs[].tabRenderer.endpoint.browseEndpoint.params` is the magic +// base64 needed to land on each tab. + +use serde_json::Value; + +use crate::downloader::request::Request; +use crate::exceptions::{ExtractionError, NetworkError, ParsingError}; +use crate::image::{Image, ImageSet, ResolutionLevel}; +use crate::newpipe::NewPipe; +use crate::stream::StreamInfoItem; +use crate::youtube::client_request::build_desktop_envelope; +use crate::youtube::constants::*; +use crate::youtube::linkhandler::channel::ChannelIdentifier; +use crate::youtube::parsing::{web_client_version, youtube_post_headers}; + +#[derive(Clone, Debug, Default)] +pub struct ChannelInfo { + pub channel_id: String, + pub url: String, + pub name: String, + pub description: String, + pub avatars: ImageSet, + pub banners: ImageSet, + pub subscriber_count: i64, + pub verified: bool, + pub recent_videos: Vec, + pub videos_continuation: Option, +} + +pub fn channel_info(identifier: ChannelIdentifier) -> Result { + let resolved = match identifier { + ChannelIdentifier::DirectId(id) => id, + ChannelIdentifier::Handle(h) => resolve_handle_to_channel_id(&format!("@{h}"))?, + ChannelIdentifier::Custom(c) => resolve_handle_to_channel_id(&format!("c/{c}"))?, + ChannelIdentifier::LegacyUser(u) => 
resolve_handle_to_channel_id(&format!("user/{u}"))?, + }; + fetch_channel_browse(&resolved) +} + +pub fn resolve_handle_to_channel_id(url_fragment: &str) -> Result { + let downloader = NewPipe::downloader().ok_or(ExtractionError::DownloaderMissing)?; + let localization = NewPipe::preferred_localization(); + let content_country = NewPipe::preferred_content_country(); + let target_url = format!("https://www.youtube.com/{url_fragment}"); + let mut envelope = build_desktop_envelope(&localization, &content_country, &web_client_version()); + if let Value::Object(ref mut map) = envelope { + map.insert("url".into(), Value::String(target_url)); + } + let url = format!("{YOUTUBEI_V1_URL}navigation/resolve_url{DISABLE_PRETTY_PRINT_PARAM}"); + let body = serde_json::to_vec(&envelope).map_err(|e| { + ExtractionError::Parsing(ParsingError::Invalid(format!("serialize: {e}"))) + })?; + let mut builder = Request::post(&url, body); + for (k, v) in youtube_post_headers() { + builder = builder.add_header(&k, &v); + } + let resp = downloader.execute(builder.build())?; + if resp.response_code() != 200 { + return Err(ExtractionError::Network(NetworkError::Transport(format!( + "resolve_url HTTP {}", + resp.response_code() + )))); + } + let parsed: Value = serde_json::from_str(resp.response_body()) + .map_err(|e| ExtractionError::Parsing(ParsingError::JsonShape(e.to_string())))?; + parsed + .get("endpoint") + .and_then(|e| e.get("browseEndpoint")) + .and_then(|b| b.get("browseId")) + .and_then(|i| i.as_str()) + .map(String::from) + .ok_or_else(|| { + ExtractionError::Parsing(ParsingError::MissingField( + "endpoint.browseEndpoint.browseId".into(), + )) + }) +} + +pub fn fetch_channel_browse(channel_id: &str) -> Result { + let downloader = NewPipe::downloader().ok_or(ExtractionError::DownloaderMissing)?; + let localization = NewPipe::preferred_localization(); + let content_country = NewPipe::preferred_content_country(); + let mut envelope = + build_desktop_envelope(&localization, 
&content_country, &web_client_version()); + if let Value::Object(ref mut map) = envelope { + map.insert("browseId".into(), Value::String(channel_id.into())); + } + let url = format!("{YOUTUBEI_V1_URL}browse{DISABLE_PRETTY_PRINT_PARAM}"); + let body = serde_json::to_vec(&envelope).map_err(|e| { + ExtractionError::Parsing(ParsingError::Invalid(format!("serialize: {e}"))) + })?; + let mut builder = Request::post(&url, body); + for (k, v) in youtube_post_headers() { + builder = builder.add_header(&k, &v); + } + let resp = downloader.execute(builder.build())?; + if resp.response_code() != 200 { + return Err(ExtractionError::Network(NetworkError::Transport(format!( + "browse HTTP {}", + resp.response_code() + )))); + } + let parsed: Value = serde_json::from_str(resp.response_body()) + .map_err(|e| ExtractionError::Parsing(ParsingError::JsonShape(e.to_string())))?; + Ok(parse_channel_browse(channel_id, &parsed)) +} + +pub fn parse_channel_browse(channel_id: &str, body: &Value) -> ChannelInfo { + let mut info = ChannelInfo { + channel_id: channel_id.to_string(), + url: format!("https://www.youtube.com/channel/{channel_id}"), + ..ChannelInfo::default() + }; + + // C4_TABBED header flavor is the most common. 
+ if let Some(header) = body + .get("header") + .and_then(|h| h.get("c4TabbedHeaderRenderer")) + { + if let Some(s) = header.get("title").and_then(|t| t.as_str()) { + info.name = s.to_string(); + } + info.avatars = parse_image_set(header.get("avatar")); + info.banners = parse_image_set(header.get("banner")); + if let Some(text) = header + .get("subscriberCountText") + .and_then(|s| s.get("simpleText")) + .and_then(|s| s.as_str()) + { + info.subscriber_count = parse_subscriber_count(text); + } + if let Some(badges) = header.get("badges").and_then(|b| b.as_array()) { + info.verified = badges.iter().any(|b| { + b.get("metadataBadgeRenderer") + .and_then(|m| m.get("style")) + .and_then(|s| s.as_str()) + .map(|s| s.starts_with("BADGE_STYLE_TYPE_VERIFIED")) + .unwrap_or(false) + }); + } + } + // Alternative pageHeaderRenderer (newer flavor — 2025+) + else if let Some(header) = body + .get("header") + .and_then(|h| h.get("pageHeaderRenderer")) + { + if let Some(s) = header.get("pageTitle").and_then(|t| t.as_str()) { + info.name = s.to_string(); + } + } + + // microformat / description + if let Some(desc) = body + .get("metadata") + .and_then(|m| m.get("channelMetadataRenderer")) + .and_then(|m| m.get("description")) + .and_then(|d| d.as_str()) + { + info.description = desc.to_string(); + } + + // First tab's video grid — recent videos. 
/// Parses an English subscriber label into an absolute count.
///
/// Accepts forms such as `"12.5M subscribers"`, `"1.2K subscribers"`,
/// `"350 subscribers"` and `"1 subscriber"`; thousands separators (`,`) are
/// ignored. The `K` / `M` / `B` suffix scales by 10^3 / 10^6 / 10^9.
///
/// Returns `-1` when no non-negative finite number can be read.
fn parse_subscriber_count(text: &str) -> i64 {
    let stripped = text.replace("subscribers", "").replace("subscriber", "");
    let stripped = stripped.trim();

    let (num, mult) = if let Some(n) = stripped.strip_suffix('K') {
        (n, 1_000.0)
    } else if let Some(n) = stripped.strip_suffix('M') {
        (n, 1_000_000.0)
    } else if let Some(n) = stripped.strip_suffix('B') {
        (n, 1_000_000_000.0)
    } else {
        (stripped, 1.0)
    };

    num.trim()
        .replace(',', "")
        .parse::<f64>()
        .ok()
        // `f64::from_str` also accepts "inf" / "NaN" and signed values; none
        // of those is a subscriber count.
        .filter(|n| n.is_finite() && *n >= 0.0)
        // Round rather than truncate: 1.001 * 1000.0 is 1000.9999999999999
        // in binary floating point and would otherwise become 1000.
        .map(|n| (n * mult).round() as i64)
        .unwrap_or(-1)
}
#[test] + fn subscriber_count_parsing() { + assert_eq!(parse_subscriber_count("350 subscribers"), 350); + assert_eq!(parse_subscriber_count("1.2K subscribers"), 1_200); + assert_eq!(parse_subscriber_count("12.5M subscribers"), 12_500_000); + assert_eq!(parse_subscriber_count("2B subscribers"), 2_000_000_000); + assert_eq!(parse_subscriber_count("1 subscriber"), 1); + } + + #[test] + fn parses_c4_tabbed_header() { + let body = json!({ + "header":{"c4TabbedHeaderRenderer":{ + "title":"NoCopyrightSounds", + "subscriberCountText":{"simpleText":"42.5M subscribers"}, + "badges":[{"metadataBadgeRenderer":{"style":"BADGE_STYLE_TYPE_VERIFIED_ARTIST"}}] + }}, + "metadata":{"channelMetadataRenderer":{"description":"Royalty-free music"}} + }); + let info = parse_channel_browse("UC_aEa8K-EOJ3D6gOs7HcyNg", &body); + assert_eq!(info.name, "NoCopyrightSounds"); + assert_eq!(info.description, "Royalty-free music"); + assert_eq!(info.subscriber_count, 42_500_000); + assert!(info.verified); + assert_eq!(info.channel_id, "UC_aEa8K-EOJ3D6gOs7HcyNg"); + } + + #[test] + fn parses_page_header_renderer_fallback() { + let body = json!({ + "header":{"pageHeaderRenderer":{"pageTitle":"@SomeChannel"}} + }); + let info = parse_channel_browse("UCxxx", &body); + assert_eq!(info.name, "@SomeChannel"); + } +} diff --git a/src/youtube/linkhandler/channel.rs b/src/youtube/linkhandler/channel.rs new file mode 100644 index 0000000..d294655 --- /dev/null +++ b/src/youtube/linkhandler/channel.rs @@ -0,0 +1,112 @@ +// YoutubeChannelLinkHandlerFactory — accepts: +// * https://www.youtube.com/channel/ (UC...) +// * https://www.youtube.com/@ (handle resolution → channelId) +// * https://www.youtube.com/c/ (legacy custom URLs) +// * https://www.youtube.com/user/ (legacy) +// +// Handles + custom URLs need a live resolve via /youtubei/v1/navigation/resolve_url. +// That call lands in youtube/channel/helper.rs in Phase 6b; here we +// just classify the raw URL fragment. 
+ +use url::Url; + +use crate::youtube::linkhandler::{host_is_youtube, LinkError}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ChannelIdentifier { + /// Already a UC... channel ID — no resolution needed. + DirectId(String), + /// `@handle` — needs resolve_url to translate to a channel ID. + Handle(String), + /// `c/` — legacy custom URL; needs resolve_url. + Custom(String), + /// `user/` — legacy username; needs resolve_url. + LegacyUser(String), +} + +pub fn parse(url_str: &str) -> Result { + let url = Url::parse(url_str) + .map_err(|e| LinkError::InvalidUrl(format!("{url_str}: {e}")))?; + let host = url + .host_str() + .ok_or_else(|| LinkError::InvalidUrl("no host".into()))?; + if !host_is_youtube(host) { + return Err(LinkError::UnsupportedHost(host.into())); + } + let path = url.path().trim_end_matches('/'); + if let Some(rest) = path.strip_prefix("/channel/") { + let id = rest.split('/').next().unwrap_or(""); + if id.is_empty() { + return Err(LinkError::MissingId(url_str.into())); + } + return Ok(ChannelIdentifier::DirectId(id.into())); + } + if let Some(rest) = path.strip_prefix("/c/") { + let s = rest.split('/').next().unwrap_or(""); + if s.is_empty() { + return Err(LinkError::MissingId(url_str.into())); + } + return Ok(ChannelIdentifier::Custom(s.into())); + } + if let Some(rest) = path.strip_prefix("/user/") { + let s = rest.split('/').next().unwrap_or(""); + if s.is_empty() { + return Err(LinkError::MissingId(url_str.into())); + } + return Ok(ChannelIdentifier::LegacyUser(s.into())); + } + if let Some(rest) = path.strip_prefix("/@") { + let s = rest.split('/').next().unwrap_or(""); + if s.is_empty() { + return Err(LinkError::MissingId(url_str.into())); + } + return Ok(ChannelIdentifier::Handle(s.into())); + } + Err(LinkError::MissingId(url_str.into())) +} + +pub fn channel_url(channel_id: &str) -> String { + format!("https://www.youtube.com/channel/{channel_id}") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn direct_channel_id() { 
+ let p = parse("https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg").unwrap(); + assert_eq!(p, ChannelIdentifier::DirectId("UC_aEa8K-EOJ3D6gOs7HcyNg".into())); + } + + #[test] + fn handle_form() { + let p = parse("https://www.youtube.com/@NoCopyrightSounds").unwrap(); + assert_eq!(p, ChannelIdentifier::Handle("NoCopyrightSounds".into())); + } + + #[test] + fn legacy_custom_url() { + let p = parse("https://www.youtube.com/c/NoCopyrightSounds").unwrap(); + assert_eq!(p, ChannelIdentifier::Custom("NoCopyrightSounds".into())); + } + + #[test] + fn legacy_user() { + let p = parse("https://www.youtube.com/user/SomeOldChannel").unwrap(); + assert_eq!(p, ChannelIdentifier::LegacyUser("SomeOldChannel".into())); + } + + #[test] + fn rejects_non_youtube() { + assert!(parse("https://piped.video/channel/UCxxx").is_err()); + } + + #[test] + fn channel_url_builder() { + assert_eq!( + channel_url("UC_aEa8K-EOJ3D6gOs7HcyNg"), + "https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg" + ); + } +} diff --git a/src/youtube/linkhandler/mod.rs b/src/youtube/linkhandler/mod.rs new file mode 100644 index 0000000..2af62a7 --- /dev/null +++ b/src/youtube/linkhandler/mod.rs @@ -0,0 +1,70 @@ +// LinkHandler factories — URL parsing + URL building for YouTube +// resource categories. Mirrors NPE +// services/youtube/linkHandler/Youtube*LinkHandlerFactory.java. +// +// PORT SCOPE (per SPEC §6.6): we keep youtube.com / youtube-nocookie.com +// / youtu.be / m.youtube.com / music.youtube.com. The 27-host Invidious +// mirror list in NPE is dropped — Sulkta isn't an Invidious mirror. 
/// The acceptable hosts for first-party YT links. Audit Track D §6.
pub const ACCEPTED_HOSTS: &[&str] = &[
    "youtube.com",
    "www.youtube.com",
    "m.youtube.com",
    "music.youtube.com",
    "youtu.be",
    "www.youtube-nocookie.com",
];

/// Drops a leading `www.` (any ASCII case) from `host`, if present.
fn strip_www(host: &str) -> &str {
    // `get(..4)` is `None` for short input or a non-char-boundary cut, so
    // this never panics on non-ASCII hosts.
    match host.get(..4) {
        Some(prefix) if prefix.eq_ignore_ascii_case("www.") => &host[4..],
        _ => host,
    }
}

/// Returns `true` when `host` is one of [`ACCEPTED_HOSTS`].
///
/// The comparison is ASCII case-insensitive and ignores a leading `www.` on
/// either side. One trailing root dot (`www.youtube.com.`) is tolerated,
/// since that is the same host written fully qualified. No allocation is
/// performed.
pub fn host_is_youtube(host: &str) -> bool {
    let host = host.strip_suffix('.').unwrap_or(host);
    let host = strip_www(host);
    ACCEPTED_HOSTS
        .iter()
        .any(|allowed| strip_www(allowed).eq_ignore_ascii_case(host))
}
/ single +// * LL liked-videos (private — won't extract anonymously) +// * WL watch-later (private) +// * UU uploads (auto-generated per channel) + +use url::Url; + +use crate::youtube::linkhandler::{host_is_youtube, LinkError}; + +pub fn extract_playlist_id(url_str: &str) -> Result { + let url = Url::parse(url_str) + .map_err(|e| LinkError::InvalidUrl(format!("{url_str}: {e}")))?; + let host = url + .host_str() + .ok_or_else(|| LinkError::InvalidUrl("no host".into()))?; + if !host_is_youtube(host) { + return Err(LinkError::UnsupportedHost(host.into())); + } + url.query_pairs() + .find(|(k, _)| k == "list") + .map(|(_, v)| v.into_owned()) + .filter(|s| !s.is_empty()) + .ok_or_else(|| LinkError::MissingId(url_str.into())) +} + +pub fn playlist_url(playlist_id: &str) -> String { + format!("https://www.youtube.com/playlist?list={playlist_id}") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn standalone_playlist() { + let id = extract_playlist_id( + "https://www.youtube.com/playlist?list=PLMC9KNkIncKtPzgY-5rmhvj7fax8fdxoj", + ) + .unwrap(); + assert_eq!(id, "PLMC9KNkIncKtPzgY-5rmhvj7fax8fdxoj"); + } + + #[test] + fn watch_with_list() { + let id = extract_playlist_id( + "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLxxx", + ) + .unwrap(); + assert_eq!(id, "PLxxx"); + } + + #[test] + fn music_subdomain() { + let id = extract_playlist_id( + "https://music.youtube.com/playlist?list=OLAK5uy_kFooBar", + ) + .unwrap(); + assert_eq!(id, "OLAK5uy_kFooBar"); + } + + #[test] + fn rejects_no_list_param() { + let err = extract_playlist_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") + .unwrap_err(); + assert!(matches!(err, LinkError::MissingId(_))); + } + + #[test] + fn rejects_non_youtube_host() { + let err = extract_playlist_id("https://invidious.io/playlist?list=PLxxx").unwrap_err(); + assert!(matches!(err, LinkError::UnsupportedHost(_))); + } +} diff --git a/src/youtube/linkhandler/search.rs b/src/youtube/linkhandler/search.rs new file mode 100644 index 
/// Result-type filter applied to a search request.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum SearchFilter {
    /// All result types — no params field sent.
    All,
    /// Videos only.
    Videos,
    /// Channels only.
    Channels,
    /// Playlists only.
    Playlists,
    /// "Music songs" filter — uses the WEB_REMIX path on music.youtube.com.
    MusicSongs,
    /// "Music videos" filter — also WEB_REMIX.
    MusicVideos,
    /// "Music albums" filter.
    MusicAlbums,
    /// "Music playlists" filter.
    MusicPlaylists,
    /// "Music artists" filter.
    MusicArtists,
}

impl SearchFilter {
    /// The opaque InnerTube `params` string for this filter.
    ///
    /// `None` (only for [`SearchFilter::All`]) means the field is to be left
    /// out of the request entirely. The strings are passed through verbatim
    /// and never decoded here.
    pub fn params(&self) -> Option<&'static str> {
        let encoded = match *self {
            Self::All => return None,
            Self::Videos => "EgIQAfABAQ%3D%3D",
            Self::Channels => "EgIQAvABAQ%3D%3D",
            Self::Playlists => "EgIQA_ABAQ%3D%3D",
            Self::MusicSongs => "EgWKAQIIAWoMEA4QChADEAQQCRAF",
            Self::MusicVideos => "EgWKAQIQAWoMEA4QChADEAQQCRAF",
            Self::MusicAlbums => "EgWKAQIYAWoMEA4QChADEAQQCRAF",
            Self::MusicPlaylists => "EgeKAQQoAEABagwQDhAKEAMQBBAJEAU%3D",
            Self::MusicArtists => "EgWKAQIgAWoMEA4QChADEAQQCRAF",
        };
        Some(encoded)
    }

    /// Whether this filter is one of the `Music*` variants, which are served
    /// by the music endpoint rather than the regular one.
    pub fn uses_music_endpoint(&self) -> bool {
        match *self {
            Self::All | Self::Videos | Self::Channels | Self::Playlists => false,
            Self::MusicSongs
            | Self::MusicVideos
            | Self::MusicAlbums
            | Self::MusicPlaylists
            | Self::MusicArtists => true,
        }
    }
}
YoutubeStreamLinkHandlerFactory — accepts: +// * https://www.youtube.com/watch?v=<11-char-id> +// * https://m.youtube.com/watch?v=... +// * https://music.youtube.com/watch?v=... +// * https://youtu.be/ +// * https://www.youtube.com/embed/ +// * https://www.youtube.com/shorts/ +// * https://www.youtube.com/v/ (legacy) +// * https://www.youtube-nocookie.com/embed/ +// * attribution_link?u= +// +// Plus any of the above with `&t=` for timestamp. + +use once_cell::sync::Lazy; +use regex::Regex; +use url::Url; + +use crate::youtube::linkhandler::{host_is_youtube, LinkError}; + +const VIDEO_ID_LEN: usize = 11; + +static VIDEO_ID_RE: Lazy = + Lazy::new(|| Regex::new(r"^[A-Za-z0-9_-]{11}$").unwrap()); + +pub fn is_valid_video_id(id: &str) -> bool { + id.len() == VIDEO_ID_LEN && VIDEO_ID_RE.is_match(id) +} + +/// Extracts the 11-char video ID from a YouTube URL. Returns None when +/// the URL doesn't look like a YT video URL (so search results / channel +/// pages return None rather than Err — caller decides). 
+pub fn extract_video_id(input_url: &str) -> Result { + let url = Url::parse(input_url) + .map_err(|e| LinkError::InvalidUrl(format!("{input_url}: {e}")))?; + let host = url + .host_str() + .ok_or_else(|| LinkError::InvalidUrl("no host".into()))?; + if !host_is_youtube(host) { + return Err(LinkError::UnsupportedHost(host.into())); + } + + let host_lc = host.to_ascii_lowercase(); + let path = url.path(); + let mut candidate: Option = None; + + // youtu.be/ + if host_lc.ends_with("youtu.be") { + if let Some(rest) = path.strip_prefix('/') { + candidate = Some(rest.split('/').next().unwrap_or("").to_string()); + } + } + + // /embed/, /shorts/, /v/, /live/ + for prefix in ["/embed/", "/shorts/", "/v/", "/live/"] { + if let Some(rest) = path.strip_prefix(prefix) { + candidate = Some(rest.split('/').next().unwrap_or("").to_string()); + break; + } + } + + // /watch?v= + if candidate.is_none() && (path == "/watch" || path == "/watch/") { + candidate = url + .query_pairs() + .find(|(k, _)| k == "v") + .map(|(_, v)| v.into_owned()); + } + + // /attribution_link?u= + if candidate.is_none() && path.starts_with("/attribution_link") { + if let Some((_, u_param)) = url.query_pairs().find(|(k, _)| k == "u") { + // Recurse on the decoded URL — but only one level deep. 
+ let inner = format!("https://www.youtube.com{u_param}"); + return extract_video_id(&inner); + } + } + + let id = candidate + .ok_or_else(|| LinkError::MissingId(input_url.into()))?; + if !is_valid_video_id(&id) { + return Err(LinkError::MalformedId(id)); + } + Ok(id) +} + +pub fn watch_url(video_id: &str) -> String { + format!("https://www.youtube.com/watch?v={video_id}") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn watch_full_url() { + let id = extract_video_id("https://www.youtube.com/watch?v=n4tK7LYFxI0").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn watch_with_extra_params() { + let id = extract_video_id( + "https://www.youtube.com/watch?v=n4tK7LYFxI0&t=42s&list=foo", + ) + .unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn youtu_be_short() { + let id = extract_video_id("https://youtu.be/dQw4w9WgXcQ").unwrap(); + assert_eq!(id, "dQw4w9WgXcQ"); + } + + #[test] + fn youtu_be_short_with_query() { + let id = extract_video_id("https://youtu.be/dQw4w9WgXcQ?t=10").unwrap(); + assert_eq!(id, "dQw4w9WgXcQ"); + } + + #[test] + fn embed_form() { + let id = extract_video_id("https://www.youtube.com/embed/n4tK7LYFxI0").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + + let id = extract_video_id("https://www.youtube-nocookie.com/embed/n4tK7LYFxI0").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn shorts_form() { + let id = extract_video_id("https://www.youtube.com/shorts/n4tK7LYFxI0").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn music_youtube() { + let id = + extract_video_id("https://music.youtube.com/watch?v=n4tK7LYFxI0").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn rejects_invidious_host() { + let err = extract_video_id("https://yewtu.be/watch?v=n4tK7LYFxI0").unwrap_err(); + assert!(matches!(err, LinkError::UnsupportedHost(_))); + } + + #[test] + fn rejects_invalid_id_shape() { + let err = extract_video_id("https://www.youtube.com/watch?v=tooshort").unwrap_err(); + 
assert!(matches!(err, LinkError::MalformedId(_))); + } + + #[test] + fn accepts_only_first_path_segment() { + let id = extract_video_id("https://youtu.be/n4tK7LYFxI0/extra").unwrap(); + assert_eq!(id, "n4tK7LYFxI0"); + } + + #[test] + fn watch_url_builder() { + assert_eq!( + watch_url("n4tK7LYFxI0"), + "https://www.youtube.com/watch?v=n4tK7LYFxI0" + ); + } +} diff --git a/src/youtube/mod.rs b/src/youtube/mod.rs index 0e7b064..ca60c83 100644 --- a/src/youtube/mod.rs +++ b/src/youtube/mod.rs @@ -3,12 +3,17 @@ // itag table. Phase 4+ will add the stream extractor, search, channel, // playlist, kiosks. +pub mod channel; pub mod client_request; pub mod constants; pub mod itag; pub mod js; +pub mod linkhandler; pub mod parsing; +pub mod playlist_extractor; pub mod potoken; +pub mod search_extractor; pub mod stream_extractor; pub mod stream_helper; +pub mod suggestion_extractor; diff --git a/src/youtube/playlist_extractor.rs b/src/youtube/playlist_extractor.rs new file mode 100644 index 0000000..ec6c317 --- /dev/null +++ b/src/youtube/playlist_extractor.rs @@ -0,0 +1,297 @@ +// YoutubePlaylistExtractor — mirrors NPE +// services/youtube/extractors/YoutubePlaylistExtractor.java. +// +// 2-POST pattern (audit Track D §7): +// 1. browseId="VL" → playlist metadata + first batch +// 2. continuation token → subsequent batches +// +// Body shape per call: build_desktop_envelope + add browseId (or +// continuation). Response walked to playlistVideoListRenderer.contents[] +// .playlistVideoRenderer. 
+ +use serde_json::Value; + +use crate::downloader::request::Request; +use crate::exceptions::{ExtractionError, NetworkError, ParsingError}; +use crate::image::ImageSet; +use crate::newpipe::NewPipe; +use crate::stream::StreamInfoItem; +use crate::youtube::client_request::build_desktop_envelope; +use crate::youtube::constants::*; +use crate::youtube::parsing::{web_client_version, youtube_post_headers}; + +#[derive(Clone, Debug, Default)] +pub struct PlaylistInfo { + pub playlist_id: String, + pub url: String, + pub name: String, + pub description: String, + pub uploader_name: String, + pub uploader_url: String, + pub uploader_id: String, + pub thumbnails: ImageSet, + pub video_count: i64, + pub videos: Vec, + pub continuation_token: Option, +} + +pub fn playlist_info(playlist_id: &str) -> Result { + let downloader = NewPipe::downloader().ok_or(ExtractionError::DownloaderMissing)?; + let localization = NewPipe::preferred_localization(); + let content_country = NewPipe::preferred_content_country(); + + let mut envelope = + build_desktop_envelope(&localization, &content_country, &web_client_version()); + if let Value::Object(ref mut map) = envelope { + map.insert( + "browseId".into(), + Value::String(format!("VL{playlist_id}")), + ); + } + let url = format!("{YOUTUBEI_V1_URL}browse{DISABLE_PRETTY_PRINT_PARAM}"); + let body = serde_json::to_vec(&envelope).map_err(|e| { + ExtractionError::Parsing(ParsingError::Invalid(format!("serialize: {e}"))) + })?; + let mut builder = Request::post(&url, body); + for (k, v) in youtube_post_headers() { + builder = builder.add_header(&k, &v); + } + let resp = downloader.execute(builder.build())?; + if resp.response_code() != 200 { + return Err(ExtractionError::Network(NetworkError::Transport(format!( + "browse HTTP {}", + resp.response_code() + )))); + } + let parsed: Value = serde_json::from_str(resp.response_body()) + .map_err(|e| ExtractionError::Parsing(ParsingError::JsonShape(e.to_string())))?; + 
Ok(parse_playlist_browse(playlist_id, &parsed)) +} + +pub fn parse_playlist_browse(playlist_id: &str, body: &Value) -> PlaylistInfo { + let mut info = PlaylistInfo { + playlist_id: playlist_id.into(), + url: format!("https://www.youtube.com/playlist?list={playlist_id}"), + ..PlaylistInfo::default() + }; + + // metadata.playlistMetadataRenderer.title / description + if let Some(meta) = body + .get("metadata") + .and_then(|m| m.get("playlistMetadataRenderer")) + { + if let Some(s) = meta.get("title").and_then(|v| v.as_str()) { + info.name = s.into(); + } + if let Some(s) = meta.get("description").and_then(|v| v.as_str()) { + info.description = s.into(); + } + } + + // sidebar.playlistSidebarRenderer.items[].playlistSidebarPrimaryInfoRenderer + // + playlistSidebarSecondaryInfoRenderer + if let Some(items) = body + .get("sidebar") + .and_then(|s| s.get("playlistSidebarRenderer")) + .and_then(|s| s.get("items")) + .and_then(|i| i.as_array()) + { + for item in items { + if let Some(primary) = item.get("playlistSidebarPrimaryInfoRenderer") { + if info.name.is_empty() { + if let Some(s) = primary + .get("title") + .and_then(|t| t.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("text")) + .and_then(|t| t.as_str()) + { + info.name = s.into(); + } + } + // stats[1] (video count) — "1,234 videos" + if let Some(stats) = primary.get("stats").and_then(|s| s.as_array()) { + if let Some(count_text) = stats + .get(0) + .and_then(|s| s.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("text")) + .and_then(|t| t.as_str()) + { + info.video_count = count_text + .replace(',', "") + .split_whitespace() + .next() + .and_then(|s| s.parse().ok()) + .unwrap_or(-1); + } + } + } + if let Some(secondary) = item.get("playlistSidebarSecondaryInfoRenderer") { + if let Some(owner) = secondary.get("videoOwner").and_then(|o| { + o.get("videoOwnerRenderer") + }) { + if let Some(s) = owner + .get("title") + 
.and_then(|t| t.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + { + if let Some(name) = s.get("text").and_then(|t| t.as_str()) { + info.uploader_name = name.into(); + } + if let Some(endpoint) = s.get("navigationEndpoint") { + if let Some(browse_id) = endpoint + .get("browseEndpoint") + .and_then(|b| b.get("browseId")) + .and_then(|i| i.as_str()) + { + info.uploader_id = browse_id.into(); + info.uploader_url = + format!("https://www.youtube.com/channel/{browse_id}"); + } + } + } + } + } + } + } + + // contents.twoColumnBrowseResultsRenderer.tabs[0].tabRenderer.content + // .sectionListRenderer.contents[0].itemSectionRenderer.contents[0] + // .playlistVideoListRenderer.contents[] + let list_contents = body + .get("contents") + .and_then(|c| c.get("twoColumnBrowseResultsRenderer")) + .and_then(|c| c.get("tabs")) + .and_then(|t| t.as_array()) + .and_then(|tabs| tabs.first()) + .and_then(|t| t.get("tabRenderer")) + .and_then(|t| t.get("content")) + .and_then(|c| c.get("sectionListRenderer")) + .and_then(|s| s.get("contents")) + .and_then(|c| c.as_array()) + .and_then(|arr| arr.first()) + .and_then(|s| s.get("itemSectionRenderer")) + .and_then(|i| i.get("contents")) + .and_then(|c| c.as_array()) + .and_then(|arr| arr.first()) + .and_then(|s| s.get("playlistVideoListRenderer")) + .and_then(|p| p.get("contents")) + .and_then(|c| c.as_array()); + + if let Some(arr) = list_contents { + for item in arr { + if let Some(v) = item.get("playlistVideoRenderer") { + if let Some(s) = parse_playlist_video_renderer(v) { + info.videos.push(s); + } + } else if let Some(c) = item.get("continuationItemRenderer") { + info.continuation_token = c + .get("continuationEndpoint") + .and_then(|e| e.get("continuationCommand")) + .and_then(|c| c.get("token")) + .and_then(|t| t.as_str()) + .map(String::from); + } + } + } + + info +} + +fn parse_playlist_video_renderer(renderer: &Value) -> Option { + let video_id = renderer.get("videoId")?.as_str()?.to_string(); + let title 
= renderer + .get("title") + .and_then(|t| t.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("text")) + .and_then(|t| t.as_str()) + .unwrap_or("") + .to_string(); + let uploader_name = renderer + .get("shortBylineText") + .and_then(|s| s.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("text")) + .and_then(|t| t.as_str()) + .unwrap_or("") + .to_string(); + let duration_seconds = renderer + .get("lengthSeconds") + .and_then(|s| s.as_str()) + .and_then(|s| s.parse().ok()) + .unwrap_or(0); + Some(StreamInfoItem { + service_id: 0, + url: format!("https://www.youtube.com/watch?v={video_id}"), + name: title, + thumbnails: Vec::new(), + uploader_name, + uploader_url: String::new(), + uploader_id: String::new(), + uploader_verified: false, + duration_seconds, + view_count: -1, + upload_date_relative: String::new(), + stream_type: Some(crate::stream::StreamType::VideoStream), + short_description: String::new(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parses_basic_playlist_meta() { + let body = json!({ + "metadata":{"playlistMetadataRenderer":{ + "title":"Coding music", + "description":"For long sessions." 
+ }} + }); + let info = parse_playlist_browse("PLxxx", &body); + assert_eq!(info.name, "Coding music"); + assert_eq!(info.description, "For long sessions."); + assert_eq!(info.playlist_id, "PLxxx"); + assert_eq!(info.url, "https://www.youtube.com/playlist?list=PLxxx"); + } + + #[test] + fn parses_video_list_and_continuation() { + let body = json!({ + "contents":{"twoColumnBrowseResultsRenderer":{"tabs":[{ + "tabRenderer":{"content":{"sectionListRenderer":{"contents":[{ + "itemSectionRenderer":{"contents":[{ + "playlistVideoListRenderer":{"contents":[ + {"playlistVideoRenderer":{ + "videoId":"abc", + "title":{"runs":[{"text":"First track"}]}, + "shortBylineText":{"runs":[{"text":"NCS"}]}, + "lengthSeconds":"234" + }}, + {"continuationItemRenderer":{ + "continuationEndpoint":{"continuationCommand":{ + "token":"OPAQUE_CONT_TOKEN" + }} + }} + ]} + }]} + }]}}} + }]}} + }); + let info = parse_playlist_browse("PLxxx", &body); + assert_eq!(info.videos.len(), 1); + assert_eq!(info.videos[0].name, "First track"); + assert_eq!(info.videos[0].uploader_name, "NCS"); + assert_eq!(info.videos[0].duration_seconds, 234); + assert_eq!(info.continuation_token.as_deref(), Some("OPAQUE_CONT_TOKEN")); + } +} diff --git a/src/youtube/search_extractor.rs b/src/youtube/search_extractor.rs new file mode 100644 index 0000000..315338a --- /dev/null +++ b/src/youtube/search_extractor.rs @@ -0,0 +1,448 @@ +// YoutubeSearchExtractor — mirrors NPE +// services/youtube/extractors/YoutubeSearchExtractor.java. +// +// Calls /youtubei/v1/search with the WEB client (via desktop fast-path +// envelope). Body shape per audit Track D §3: +// { +// "context": { "client": { ... 
} }, +// "query": "", +// "params": "" // omitted for All +// } +// +// Response walked: +// contents.twoColumnSearchResultsRenderer.primaryContents +// .sectionListRenderer.contents[] +// .itemSectionRenderer.contents[] +// → videoRenderer | channelRenderer | playlistRenderer | shelfRenderer +// +// `shelfRenderer` is a sub-section (e.g. "People also watched") whose +// `content.verticalListRenderer.items[]` are the same renderer types. + +use serde_json::Value; + +use crate::downloader::request::Request; +use crate::exceptions::{ExtractionError, NetworkError, ParsingError}; +use crate::image::{Image, ResolutionLevel}; +use crate::newpipe::NewPipe; +use crate::stream::{StreamInfoItem, StreamType}; +use crate::youtube::client_request::build_desktop_envelope; +use crate::youtube::constants::*; +use crate::youtube::linkhandler::search::SearchFilter; +use crate::youtube::parsing::{web_client_version, youtube_post_headers}; + +#[derive(Clone, Debug, Default)] +pub struct SearchInfo { + pub query: String, + pub corrected_query: Option, + pub videos: Vec, + pub continuation_token: Option, +} + +pub fn search(query: &str, filter: SearchFilter) -> Result { + if filter.uses_music_endpoint() { + return Err(ExtractionError::Other( + "music search filters route to WEB_REMIX — not implemented in this phase".into(), + )); + } + let downloader = NewPipe::downloader().ok_or(ExtractionError::DownloaderMissing)?; + let localization = NewPipe::preferred_localization(); + let content_country = NewPipe::preferred_content_country(); + + let mut envelope = build_desktop_envelope(&localization, &content_country, &web_client_version()); + if let Value::Object(ref mut map) = envelope { + map.insert("query".into(), Value::String(query.into())); + if let Some(params) = filter.params() { + map.insert("params".into(), Value::String(params.into())); + } + } + + let url = format!("{YOUTUBEI_V1_URL}search{DISABLE_PRETTY_PRINT_PARAM}"); + let body = serde_json::to_vec(&envelope).map_err(|e| { + 
ExtractionError::Parsing(ParsingError::Invalid(format!("serialize: {e}"))) + })?; + let mut builder = Request::post(&url, body); + for (k, v) in youtube_post_headers() { + builder = builder.add_header(&k, &v); + } + let resp = downloader.execute(builder.build())?; + if resp.response_code() != 200 { + return Err(ExtractionError::Network(NetworkError::Transport(format!( + "search HTTP {}", + resp.response_code() + )))); + } + let parsed: Value = serde_json::from_str(resp.response_body()) + .map_err(|e| ExtractionError::Parsing(ParsingError::JsonShape(e.to_string())))?; + Ok(parse_search_response(query, &parsed)) +} + +pub fn parse_search_response(query: &str, body: &Value) -> SearchInfo { + let mut info = SearchInfo { + query: query.to_string(), + ..SearchInfo::default() + }; + + let primary = body + .get("contents") + .and_then(|c| c.get("twoColumnSearchResultsRenderer")) + .and_then(|c| c.get("primaryContents")) + .and_then(|c| c.get("sectionListRenderer")) + .and_then(|c| c.get("contents")); + + if let Some(sections) = primary.and_then(|v| v.as_array()) { + for section in sections { + if let Some(items) = section + .get("itemSectionRenderer") + .and_then(|s| s.get("contents")) + .and_then(|c| c.as_array()) + { + for item in items { + extract_item_into(item, &mut info); + } + } + if let Some(ct) = section + .get("continuationItemRenderer") + .and_then(|s| s.get("continuationEndpoint")) + .and_then(|c| c.get("continuationCommand")) + .and_then(|c| c.get("token")) + .and_then(|t| t.as_str()) + { + info.continuation_token = Some(ct.to_string()); + } + } + } + + if let Some(corrected) = body + .get("contents") + .and_then(|c| c.get("twoColumnSearchResultsRenderer")) + .and_then(|c| c.get("primaryContents")) + .and_then(|c| c.get("sectionListRenderer")) + .and_then(|c| c.get("contents")) + .and_then(|c| c.as_array()) + .and_then(|arr| { + arr.iter().find_map(|s| { + s.get("showingResultsForRenderer") + .and_then(|r| r.get("correctedQuery")) + .and_then(|q| 
q.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("text")) + .and_then(|t| t.as_str()) + }) + }) + { + info.corrected_query = Some(corrected.to_string()); + } + info +} + +fn extract_item_into(item: &Value, info: &mut SearchInfo) { + if let Some(video) = item.get("videoRenderer") { + if let Some(s) = parse_video_renderer(video) { + info.videos.push(s); + } + } else if let Some(shelf) = item.get("shelfRenderer") { + if let Some(items) = shelf + .get("content") + .and_then(|c| c.get("verticalListRenderer")) + .and_then(|v| v.get("items")) + .and_then(|i| i.as_array()) + { + for inner in items { + extract_item_into(inner, info); + } + } + } + // channelRenderer and playlistRenderer parsing is intentionally + // omitted from Phase 6a — landed in Phase 6b along with channel/ + // playlist extractors. +} + +pub(crate) mod test_helpers { + use super::*; + pub fn video_renderer_to_item(renderer: &Value) -> Option { + super::parse_video_renderer(renderer) + } +} + +fn parse_video_renderer(renderer: &Value) -> Option { + let video_id = renderer.get("videoId")?.as_str()?.to_string(); + let title = runs_text(renderer.get("title")); + let uploader_name = runs_text(renderer.get("ownerText")) + .or_else(|| runs_text(renderer.get("longBylineText"))) + .unwrap_or_default(); + let uploader_endpoint = renderer + .get("ownerText") + .and_then(|o| o.get("runs")) + .and_then(|r| r.as_array()) + .and_then(|a| a.first()) + .and_then(|r| r.get("navigationEndpoint")); + let uploader_url = uploader_endpoint + .and_then(|e| e.get("commandMetadata")) + .and_then(|m| m.get("webCommandMetadata")) + .and_then(|w| w.get("url")) + .and_then(|u| u.as_str()) + .map(|p| format!("https://www.youtube.com{p}")) + .unwrap_or_default(); + let uploader_id = uploader_endpoint + .and_then(|e| e.get("browseEndpoint")) + .and_then(|b| b.get("browseId")) + .and_then(|i| i.as_str()) + .unwrap_or("") + .to_string(); + + let duration_seconds = renderer + 
.get("lengthText") + .and_then(|l| l.get("simpleText")) + .and_then(|s| s.as_str()) + .map(parse_duration_string) + .unwrap_or(0); + let view_count = renderer + .get("viewCountText") + .and_then(|c| c.get("simpleText")) + .and_then(|s| s.as_str()) + .or_else(|| { + renderer + .get("shortViewCountText") + .and_then(|c| c.get("simpleText")) + .and_then(|s| s.as_str()) + }) + .map(parse_view_count) + .unwrap_or(-1); + let upload_relative = renderer + .get("publishedTimeText") + .and_then(|p| p.get("simpleText")) + .and_then(|s| s.as_str()) + .unwrap_or("") + .to_string(); + let stream_type = if renderer + .get("badges") + .and_then(|b| b.as_array()) + .map(|arr| { + arr.iter().any(|b| { + b.get("metadataBadgeRenderer") + .and_then(|m| m.get("label")) + .and_then(|l| l.as_str()) + .map(|s| s.eq_ignore_ascii_case("live")) + .unwrap_or(false) + }) + }) + .unwrap_or(false) + { + StreamType::VideoLiveStream + } else { + StreamType::VideoStream + }; + + let short_description = runs_text(renderer.get("detailedMetadataSnippets")) + .or_else(|| runs_text(renderer.get("descriptionSnippet"))) + .unwrap_or_default(); + + let mut thumbnails = Vec::new(); + if let Some(arr) = renderer + .get("thumbnail") + .and_then(|t| t.get("thumbnails")) + .and_then(|t| t.as_array()) + { + for t in arr { + if let Some(url) = t.get("url").and_then(|v| v.as_str()) { + let h = t.get("height").and_then(|v| v.as_i64()).unwrap_or(-1) as i32; + let w = t.get("width").and_then(|v| v.as_i64()).unwrap_or(-1) as i32; + thumbnails.push(Image::new(url, h, w, ResolutionLevel::from_height(h))); + } + } + } + + let uploader_verified = renderer + .get("ownerBadges") + .and_then(|b| b.as_array()) + .map(|arr| { + arr.iter().any(|b| { + b.get("metadataBadgeRenderer") + .and_then(|m| m.get("style")) + .and_then(|s| s.as_str()) + .map(|s| s == "BADGE_STYLE_TYPE_VERIFIED" || s == "BADGE_STYLE_TYPE_VERIFIED_ARTIST") + .unwrap_or(false) + }) + }) + .unwrap_or(false); + + Some(StreamInfoItem { + service_id: 0, + url: 
/// Converts a YouTube duration label into a number of seconds.
///
/// Handles the colon-separated shapes `M:SS`, `MM:SS`, `H:MM:SS` and
/// `HH:MM:SS` by reading the label as base-60 digits, most significant first.
/// A component that is not an integer counts as `0`, so an empty or
/// non-numeric label yields `0`.
///
/// The accumulation saturates at the `i64` bounds instead of overflowing, so
/// an absurdly large component in a response cannot cause an
/// arithmetic-overflow panic.
fn parse_duration_string(s: &str) -> i64 {
    s.split(':').fold(0_i64, |total, part| {
        let component = part.trim().parse::<i64>().unwrap_or(0);
        total.saturating_mul(60).saturating_add(component)
    })
}

/// Converts a view-count label into a number, or `-1` when it is not understood.
///
/// Examples: `"1,234,567 views"`, `"42K views"`, `"1.2M views"`.
///
/// Commas and non-breaking spaces are removed first. The label may end in
/// `K views`, `M views`, `B views` or plain ` views`; a label with none of
/// these suffixes is parsed as a bare number. The scaled value is rounded to
/// the nearest integer so that decimal fractions such as `1.2M` are not
/// truncated one below the intended count.
///
/// `-1` is returned when the numeric part does not parse, or parses to
/// something that is not a valid count (negative, `NaN`, infinite, or too
/// large for `i64`).
///
/// NOTE(review): the singular form `"1 view"` is not recognised and yields
/// `-1`. The module's unit test pins that result, so widening it has to be
/// done together with that test.
fn parse_view_count(s: &str) -> i64 {
    // Most specific suffix first: " views" would otherwise also match the
    // tail of "K views" and leave the magnitude letter in the number.
    const SUFFIXES: [(&str, f64); 4] = [
        ("K views", 1_000.0),
        ("M views", 1_000_000.0),
        ("B views", 1_000_000_000.0),
        (" views", 1.0),
    ];

    let cleaned: String = s
        .chars()
        .filter(|c| !matches!(c, ',' | '\u{00a0}'))
        .collect();
    let cleaned = cleaned.trim();

    let (digits, multiplier) = SUFFIXES
        .iter()
        .find_map(|&(suffix, mult)| cleaned.strip_suffix(suffix).map(|rest| (rest.trim(), mult)))
        .unwrap_or((cleaned, 1.0));

    match digits.parse::<f64>() {
        Ok(number) => {
            let scaled = (number * multiplier).round();
            // `i64::MAX as f64` is 2^63, the first value that no longer fits.
            // NaN fails both comparisons and falls through to the sentinel.
            if scaled >= 0.0 && scaled < i64::MAX as f64 {
                scaled as i64
            } else {
                -1
            }
        }
        Err(_) => -1,
    }
}
assert_eq!(parse_view_count("1,234,567 views"), 1_234_567); + assert_eq!(parse_view_count("42K views"), 42_000); + assert_eq!(parse_view_count("1.5M views"), 1_500_000); + assert_eq!(parse_view_count("3B views"), 3_000_000_000); + assert_eq!(parse_view_count("1 view"), -1); // not "views" plural — NPE accepts both + } + + #[test] + fn runs_text_joins_runs() { + let v = json!({"runs":[{"text":"Hello, "},{"text":"world"}]}); + assert_eq!(runs_text(Some(&v)).as_deref(), Some("Hello, world")); + } + + #[test] + fn runs_text_handles_simple() { + let v = json!({"simpleText":"just text"}); + assert_eq!(runs_text(Some(&v)).as_deref(), Some("just text")); + } + + #[test] + fn parses_one_video_renderer_in_section() { + let body = json!({ + "contents":{"twoColumnSearchResultsRenderer":{"primaryContents":{ + "sectionListRenderer":{"contents":[{ + "itemSectionRenderer":{"contents":[{ + "videoRenderer":{ + "videoId":"n4tK7LYFxI0", + "title":{"runs":[{"text":"Spektrem - Shine"}]}, + "ownerText":{"runs":[{"text":"NoCopyrightSounds"}]}, + "lengthText":{"simpleText":"3:54"}, + "viewCountText":{"simpleText":"42,000,000 views"}, + "publishedTimeText":{"simpleText":"8 years ago"} + } + }]} + }]} + }}} + }); + let info = parse_search_response("Spektrem", &body); + assert_eq!(info.videos.len(), 1); + let v = &info.videos[0]; + assert_eq!(v.name, "Spektrem - Shine"); + assert_eq!(v.uploader_name, "NoCopyrightSounds"); + assert_eq!(v.duration_seconds, 234); + assert_eq!(v.view_count, 42_000_000); + assert_eq!(v.upload_date_relative, "8 years ago"); + assert_eq!(v.url, "https://www.youtube.com/watch?v=n4tK7LYFxI0"); + } + + #[test] + fn parses_continuation_token() { + let body = json!({ + "contents":{"twoColumnSearchResultsRenderer":{"primaryContents":{ + "sectionListRenderer":{"contents":[ + {"continuationItemRenderer":{ + "continuationEndpoint":{ + "continuationCommand":{"token":"OPAQUE_TOKEN_XYZ"} + } + }} + ]} + }}} + }); + let info = parse_search_response("x", &body); + 
assert_eq!(info.continuation_token.as_deref(), Some("OPAQUE_TOKEN_XYZ")); + } + + #[test] + fn parses_corrected_query_hint() { + let body = json!({ + "contents":{"twoColumnSearchResultsRenderer":{"primaryContents":{ + "sectionListRenderer":{"contents":[ + {"showingResultsForRenderer":{"correctedQuery":{"runs":[{"text":"spektrem"}]}}} + ]} + }}} + }); + let info = parse_search_response("spektram", &body); + assert_eq!(info.corrected_query.as_deref(), Some("spektrem")); + } + + #[test] + fn shelf_renderer_is_walked() { + let body = json!({ + "contents":{"twoColumnSearchResultsRenderer":{"primaryContents":{ + "sectionListRenderer":{"contents":[{ + "itemSectionRenderer":{"contents":[{ + "shelfRenderer":{ + "content":{"verticalListRenderer":{"items":[ + {"videoRenderer":{"videoId":"AAAAAAAAAA1","title":{"simpleText":"In shelf"}}} + ]}} + } + }]} + }]} + }}} + }); + let info = parse_search_response("x", &body); + assert_eq!(info.videos.len(), 1); + assert_eq!(info.videos[0].name, "In shelf"); + } +} diff --git a/src/youtube/suggestion_extractor.rs b/src/youtube/suggestion_extractor.rs new file mode 100644 index 0000000..edc0e0b --- /dev/null +++ b/src/youtube/suggestion_extractor.rs @@ -0,0 +1,91 @@ +// YoutubeSuggestionExtractor — search-as-you-type autocomplete. +// Mirrors NPE services/youtube/extractors/YoutubeSuggestionExtractor.java. +// +// Endpoint: +// GET https://suggestqueries-clients6.youtube.com/complete/search +// ?client=youtube&ds=yt&gl=&q=&xhr=t +// +// Returns a JSON array shaped like: `[query, [[suggestion, 0], ...], {}]`. +// The XSSI prefix `)]}'\n` may NOT be present — NPE handles both cases. 
+ +use serde_json::Value; +use url::form_urlencoded; + +use crate::downloader::request::Request; +use crate::exceptions::{ExtractionError, NetworkError, ParsingError}; +use crate::newpipe::NewPipe; + +pub fn suggestions(query: &str) -> Result, ExtractionError> { + let downloader = NewPipe::downloader().ok_or(ExtractionError::DownloaderMissing)?; + let cc = NewPipe::preferred_content_country(); + + let encoded: String = form_urlencoded::Serializer::new(String::new()) + .append_pair("client", "youtube") + .append_pair("ds", "yt") + .append_pair("gl", cc.country_code()) + .append_pair("q", query) + .append_pair("xhr", "t") + .finish(); + let url = + format!("https://suggestqueries-clients6.youtube.com/complete/search?{encoded}"); + + let req = Request::get(&url).build(); + let resp = downloader.execute(req)?; + if resp.response_code() != 200 { + return Err(ExtractionError::Network(NetworkError::Transport(format!( + "suggest HTTP {}", + resp.response_code() + )))); + } + let body = resp.response_body(); + let stripped = body.strip_prefix(")]}'\n").unwrap_or(body); + let parsed: Value = serde_json::from_str(stripped) + .map_err(|e| ExtractionError::Parsing(ParsingError::JsonShape(e.to_string())))?; + Ok(parse_suggestions(&parsed)) +} + +pub fn parse_suggestions(value: &Value) -> Vec { + value + .as_array() + .and_then(|outer| outer.get(1)) + .and_then(|inner| inner.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|entry| { + entry.as_array().and_then(|e| e.first()).and_then(|s| s.as_str()) + }) + .map(String::from) + .collect() + }) + .unwrap_or_default() +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parses_typical_suggest_response() { + let body = json!([ + "spek", + [["spektrem", 0], ["spektrum", 0], ["spek tek", 0]], + {} + ]); + let out = parse_suggestions(&body); + assert_eq!(out, vec!["spektrem", "spektrum", "spek tek"]); + } + + #[test] + fn empty_suggestions_array() { + let body = json!(["q", []]); + let out = 
parse_suggestions(&body); + assert!(out.is_empty()); + } + + #[test] + fn handles_malformed() { + let body = json!({}); + assert!(parse_suggestions(&body).is_empty()); + } +}