feat: add visitor data cache, remove random visitor data

Apparently YouTube can detect randomly generated visitor data and
prevents both the iOS and TV player from being fetched
(Error: Sign in to confirm you’re not a bot). Therefore the visitor
data generation code was removed and replaced with a cache that randomly
chooses from a selection of real visitor data.
This commit is contained in:
ThetaDev 2025-02-02 02:06:56 +01:00
parent 50ab1f7a5d
commit b12f4c5d82
No known key found for this signature in database
GPG key ID: E319D3C5148D65B6
3 changed files with 197 additions and 82 deletions

View file

@ -38,6 +38,7 @@ use time::OffsetDateTime;
use tokio::sync::RwLock as AsyncRwLock;
use crate::error::AuthError;
use crate::util::VisitorDataCache;
use crate::{
cache::{CacheStorage, FileStorage, DEFAULT_CACHE_FILE},
deobfuscate::DeobfData,
@ -312,9 +313,9 @@ impl AuthCookie {
}
}
const DEFAULT_UA: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0";
const MOBILE_UA: &str = "Mozilla/5.0 (Android 14; Mobile; rv:129.0) Gecko/129.0 Firefox/129.0";
const TV_UA: &str = "Mozilla/5.0 (SMART-TV; Linux; Tizen 5.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/5.0 NativeTVAds Safari/538.1";
pub(crate) const DEFAULT_UA: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
pub(crate) const MOBILE_UA: &str = "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.135 Mobile Safari/537.36";
pub(crate) const TV_UA: &str = "Mozilla/5.0 (SMART-TV; Linux; Tizen 5.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/5.0 NativeTVAds Safari/538.1";
const CONSENT_COOKIE: &str = "SOCS=CAISAiAD";
@ -323,7 +324,7 @@ const YOUTUBEI_V1_GAPIS_URL: &str = "https://youtubei.googleapis.com/youtubei/v1
const YOUTUBE_MUSIC_V1_URL: &str = "https://music.youtube.com/youtubei/v1/";
const YOUTUBEI_MOBILE_V1_URL: &str = "https://m.youtube.com/youtubei/v1/";
const YOUTUBE_HOME_URL: &str = "https://www.youtube.com";
const YOUTUBE_MUSIC_HOME_URL: &str = "https://music.youtube.com";
pub(crate) const YOUTUBE_MUSIC_HOME_URL: &str = "https://music.youtube.com";
const YOUTUBE_MOBILE_HOME_URL: &str = "https://m.youtube.com";
const YOUTUBE_TV_URL: &str = "https://www.youtube.com/tv";
@ -350,8 +351,6 @@ const OAUTH_SCOPES: &str = "http://gdata.youtube.com https://www.googleapis.com/
static CLIENT_VERSION_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap());
static VISITOR_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
/// Default order of client types when fetching player data
///
@ -378,6 +377,7 @@ struct RustyPipeRef {
cache: CacheHolder,
default_opts: RustyPipeOpts,
user_agent: Cow<'static, str>,
visitor_data_cache: VisitorDataCache,
}
#[derive(Clone)]
@ -688,6 +688,8 @@ impl RustyPipeBuilder {
})
.collect::<HashMap<_, _>>();
let visitor_data_cache = VisitorDataCache::new(http.clone());
Ok(RustyPipe {
inner: Arc::new(RustyPipeRef {
http,
@ -706,6 +708,7 @@ impl RustyPipeBuilder {
},
default_opts: self.default_opts,
user_agent,
visitor_data_cache,
}),
})
}
@ -1196,51 +1199,7 @@ impl RustyPipe {
/// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the
/// visitor data is extracted from the html page.
async fn get_visitor_data(&self) -> Result<String, Error> {
tracing::debug!("getting YT visitor data");
let resp = self
.inner
.http
.get(YOUTUBE_MUSIC_HOME_URL)
.header(header::ORIGIN, YOUTUBE_MUSIC_HOME_URL)
.header(header::REFERER, YOUTUBE_MUSIC_HOME_URL)
.send()
.await?;
let vdata = resp
.headers()
.get_all(header::SET_COOKIE)
.iter()
.find_map(|c| {
if let Ok(cookie) = c.to_str() {
if let Some(after) = cookie.strip_prefix("__Secure-YEC=") {
return after
.split_once(';')
.map(|s| s.0.to_owned())
.filter(|s| !s.is_empty());
}
}
None
});
match vdata {
Some(vdata) => Ok(vdata),
None => {
if resp.status().is_success() {
// Extract visitor data from html
let html = resp.text().await?;
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(
"Could not find visitor data on html page".into(),
),
))
} else {
Err(Error::Extraction(ExtractionError::InvalidData(
format!("Could not get visitor data, status: {}", resp.status()).into(),
)))
}
}
}
self.inner.visitor_data_cache.new_visitor_data().await
}
/// Get a new device code for logging into YouTube
@ -2147,11 +2106,14 @@ impl RustyPipeQuery {
) -> Result<M, Error> {
tracing::debug!("getting {}({})", operation, id);
let visitor_data = ctx_src
let visitor_data = match ctx_src
.visitor_data
.or(self.opts.visitor_data.as_deref())
.map(Cow::Borrowed)
.unwrap_or_else(|| util::random_visitor_data(self.opts.country).into());
{
Some(vd) => vd,
None => self.client.inner.visitor_data_cache.get().await?.into(),
};
let context = self
.get_context(ctype, !ctx_src.unlocalized, &visitor_data)
@ -2289,12 +2251,10 @@ impl RustyPipeQuery {
endpoint: &str,
body: &B,
) -> Result<String, Error> {
let visitor_data = self
.opts
.visitor_data
.as_deref()
.map(Cow::Borrowed)
.unwrap_or_else(|| util::random_visitor_data(self.opts.country).into());
let visitor_data = match self.opts.visitor_data.as_deref().map(Cow::Borrowed) {
Some(vd) => vd,
None => self.client.inner.visitor_data_cache.get().await?.into(),
};
let context = self.get_context(ctype, true, &visitor_data).await;
let req_body = QBody { context, body };

View file

@ -1,11 +1,13 @@
mod date;
mod protobuf;
mod visitor_data;
pub mod dictionary;
pub mod timeago;
pub use date::{now_sec, shift_months, shift_weeks_mo, shift_years};
pub use protobuf::{string_from_pb, ProtoBuilder};
pub use visitor_data::VisitorDataCache;
use std::{
collections::BTreeMap,
@ -99,29 +101,6 @@ pub fn random_uuid() -> String {
)
}
/// Generate a random visitor data cookie
pub fn random_visitor_data(country: Country) -> String {
let mut rng = rand::thread_rng();
let mut pb_e2 = ProtoBuilder::new();
pb_e2.string(2, "");
pb_e2.varint(4, rng.gen_range(1..256));
let mut pb_e = ProtoBuilder::new();
pb_e.string(1, &country.to_string());
pb_e.embedded(2, pb_e2);
let mut pb = ProtoBuilder::new();
pb.string(1, &random_string(CONTENT_PLAYBACK_NONCE_ALPHABET, 11));
pb.varint(
5,
(time::OffsetDateTime::now_utc().unix_timestamp() as u64)
.saturating_sub(rng.gen_range(0..600_000)),
);
pb.embedded(6, pb_e);
pb.to_base64()
}
/// Split an URL into its base string and parameter map
///
/// Example:

176
src/util/visitor_data.rs Normal file
View file

@ -0,0 +1,176 @@
use std::sync::{atomic::AtomicU32, Arc, RwLock};
use once_cell::sync::Lazy;
use rand::Rng;
use regex::Regex;
use reqwest::{header, Client};
use crate::{
client::YOUTUBE_MUSIC_HOME_URL,
error::{Error, ExtractionError},
util,
};
/// Shared cache of YouTube visitor data IDs.
///
/// To increase privacy and possibly circumvent rate limits, RustyPipe uses multiple
/// visitor data IDs. These are held in this cache object.
///
/// On instantiation, the cache is empty, so the first request has to fetch a new
/// visitor data ID. Subsequent requests pick a random ID from the cache.
/// Once `REQ_LIMIT` requests have been counted, a new ID is requested in a
/// background task and added to the cache to prevent the IDs from being overused.
///
/// The cache holds a maximum of 100 visitor data IDs. If more are added, the oldest ones
/// are evicted.
///
/// Cloning is cheap: all clones share the same underlying state through an `Arc`.
#[derive(Clone)]
pub struct VisitorDataCache {
// Shared state (request counter, cached IDs, HTTP client)
inner: Arc<VisitorDataCacheRef>,
}
/// Shared inner state of a [`VisitorDataCache`].
struct VisitorDataCacheRef {
/// Number of `get` calls counted since the counter was last reset;
/// it is reset to 0 whenever new visitor data is requested
req_counter: AtomicU32,
/// Cached visitor data IDs, oldest first (new IDs are appended at the end)
visitor_data: RwLock<Vec<String>>,
/// HTTP client used to request new visitor data IDs
http: Client,
}
/// Matches a `"visitorData":"<value>"` entry and captures the value in group 1.
///
/// Used to extract the visitor data from the HTML page when the response
/// did not set the `__Secure-YEC` cookie.
static VISITOR_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
/// Number of counted `get` requests after which a new visitor data ID is
/// requested in the background
const REQ_LIMIT: u32 = 10;
/// Number of cached IDs that are kept when a new one is added.
///
/// The cache is trimmed to this size before the new ID is appended, so it
/// holds at most `MAX_SIZE + 1` (= 100) visitor data IDs.
const MAX_SIZE: usize = 99;
impl VisitorDataCache {
pub fn new(http: Client) -> Self {
Self {
inner: VisitorDataCacheRef {
req_counter: Default::default(),
visitor_data: Default::default(),
http,
}
.into(),
}
}
async fn get_visitor_data(&self) -> Result<String, Error> {
tracing::debug!("getting YT visitor data");
let resp = self
.inner
.http
.get(YOUTUBE_MUSIC_HOME_URL)
.header(header::ORIGIN, YOUTUBE_MUSIC_HOME_URL)
.header(header::REFERER, YOUTUBE_MUSIC_HOME_URL)
.send()
.await?;
let vdata = resp
.headers()
.get_all(header::SET_COOKIE)
.iter()
.find_map(|c| {
if let Ok(cookie) = c.to_str() {
if let Some(after) = cookie.strip_prefix("__Secure-YEC=") {
return after
.split_once(';')
.map(|s| s.0.to_owned())
.filter(|s| !s.is_empty());
}
}
None
});
match vdata {
Some(vdata) => Ok(vdata),
None => {
if resp.status().is_success() {
// Extract visitor data from html
let html = resp.text().await?;
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(
"Could not find visitor data on html page".into(),
),
))
} else {
Err(Error::Extraction(ExtractionError::InvalidData(
format!("Could not get visitor data, status: {}", resp.status()).into(),
)))
}
}
}
}
pub async fn new_visitor_data(&self) -> Result<String, Error> {
self.inner
.req_counter
.store(0, std::sync::atomic::Ordering::SeqCst);
let vd = self.get_visitor_data().await.unwrap();
let mut vds = self.inner.visitor_data.write().unwrap();
for _ in 0..(vds.len().saturating_sub(MAX_SIZE)) {
let rem = vds.remove(0);
tracing::debug!("visitor data {rem} removed from cache");
}
vds.push(vd.to_owned());
tracing::debug!("visitor data {} added to cache ({} ids)", vd, vds.len());
Ok(vd)
}
pub async fn get(&self) -> Result<String, Error> {
// Request new visitor data in the background every 10 requests
if self
.inner
.req_counter
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
>= REQ_LIMIT
{
let nc = self.clone();
tokio::spawn(async move { nc.new_visitor_data().await });
}
{
let vds = self.inner.visitor_data.read().unwrap();
if !vds.is_empty() {
let mut rng = rand::thread_rng();
let vd = vds[rng.gen_range(0..vds.len())].to_owned();
tracing::debug!("visitor data {vd} picked from cache");
return Ok(vd);
}
}
// Fetch new visitor data if the cache is empty
self.new_visitor_data().await
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use crate::client::DEFAULT_UA;
use super::*;
use tracing_test::traced_test;
/// Checks that the cache serves the first visitor data ID until the
/// background refresh has added a second one.
///
/// This test builds a real HTTP client and fetches visitor data through it,
/// so it needs network access. It also assumes the background request
/// finishes within one second.
#[tokio::test]
#[traced_test]
async fn get_visitor_data() {
let cache =
VisitorDataCache::new(Client::builder().user_agent(DEFAULT_UA).build().unwrap());
// Get initial visitor data (the cache is empty, so this fetches a new ID)
let v1 = cache.get().await.unwrap();
// Run as many requests as necessary to trigger fetching the second visitor data;
// all of them are still served the first ID
for _ in 0..=REQ_LIMIT {
let got = cache.get().await.unwrap();
assert_eq!(got, v1);
}
// The second visitor data does not arrive instantly, so right after the
// requests the cache still contains only the first ID
let vds_len = cache.inner.visitor_data.read().unwrap().len();
assert_eq!(vds_len, 1);
// Wait for the second visitor data to arrive
tokio::time::sleep(Duration::from_millis(1000)).await;
let vds_len = cache.inner.visitor_data.read().unwrap().len();
assert_eq!(vds_len, 2);
}
}