feat: add visitor data cache, remove random visitor data
Apparently YouTube can detect randomly generated visitor data and prevents both the iOS and TV player from being fetched (Error: Sign in to confirm you’re not a bot). Therefore the visitor data generation code was removed and replaced with a cache that randomly chooses from a selection of real visitor data.
This commit is contained in:
parent
50ab1f7a5d
commit
b12f4c5d82
3 changed files with 197 additions and 82 deletions
|
|
@ -38,6 +38,7 @@ use time::OffsetDateTime;
|
|||
use tokio::sync::RwLock as AsyncRwLock;
|
||||
|
||||
use crate::error::AuthError;
|
||||
use crate::util::VisitorDataCache;
|
||||
use crate::{
|
||||
cache::{CacheStorage, FileStorage, DEFAULT_CACHE_FILE},
|
||||
deobfuscate::DeobfData,
|
||||
|
|
@ -312,9 +313,9 @@ impl AuthCookie {
|
|||
}
|
||||
}
|
||||
|
||||
const DEFAULT_UA: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0";
|
||||
const MOBILE_UA: &str = "Mozilla/5.0 (Android 14; Mobile; rv:129.0) Gecko/129.0 Firefox/129.0";
|
||||
const TV_UA: &str = "Mozilla/5.0 (SMART-TV; Linux; Tizen 5.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/5.0 NativeTVAds Safari/538.1";
|
||||
pub(crate) const DEFAULT_UA: &str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
||||
pub(crate) const MOBILE_UA: &str = "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.135 Mobile Safari/537.36";
|
||||
pub(crate) const TV_UA: &str = "Mozilla/5.0 (SMART-TV; Linux; Tizen 5.0) AppleWebKit/538.1 (KHTML, like Gecko) Version/5.0 NativeTVAds Safari/538.1";
|
||||
|
||||
const CONSENT_COOKIE: &str = "SOCS=CAISAiAD";
|
||||
|
||||
|
|
@ -323,7 +324,7 @@ const YOUTUBEI_V1_GAPIS_URL: &str = "https://youtubei.googleapis.com/youtubei/v1
|
|||
const YOUTUBE_MUSIC_V1_URL: &str = "https://music.youtube.com/youtubei/v1/";
|
||||
const YOUTUBEI_MOBILE_V1_URL: &str = "https://m.youtube.com/youtubei/v1/";
|
||||
const YOUTUBE_HOME_URL: &str = "https://www.youtube.com";
|
||||
const YOUTUBE_MUSIC_HOME_URL: &str = "https://music.youtube.com";
|
||||
pub(crate) const YOUTUBE_MUSIC_HOME_URL: &str = "https://music.youtube.com";
|
||||
const YOUTUBE_MOBILE_HOME_URL: &str = "https://m.youtube.com";
|
||||
const YOUTUBE_TV_URL: &str = "https://www.youtube.com/tv";
|
||||
|
||||
|
|
@ -350,8 +351,6 @@ const OAUTH_SCOPES: &str = "http://gdata.youtube.com https://www.googleapis.com/
|
|||
|
||||
static CLIENT_VERSION_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap());
|
||||
static VISITOR_DATA_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
|
||||
|
||||
/// Default order of client types when fetching player data
|
||||
///
|
||||
|
|
@ -378,6 +377,7 @@ struct RustyPipeRef {
|
|||
cache: CacheHolder,
|
||||
default_opts: RustyPipeOpts,
|
||||
user_agent: Cow<'static, str>,
|
||||
visitor_data_cache: VisitorDataCache,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
|
|
@ -688,6 +688,8 @@ impl RustyPipeBuilder {
|
|||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let visitor_data_cache = VisitorDataCache::new(http.clone());
|
||||
|
||||
Ok(RustyPipe {
|
||||
inner: Arc::new(RustyPipeRef {
|
||||
http,
|
||||
|
|
@ -706,6 +708,7 @@ impl RustyPipeBuilder {
|
|||
},
|
||||
default_opts: self.default_opts,
|
||||
user_agent,
|
||||
visitor_data_cache,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
|
@ -1196,51 +1199,7 @@ impl RustyPipe {
|
|||
/// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the
|
||||
/// visitor data is extracted from the html page.
|
||||
async fn get_visitor_data(&self) -> Result<String, Error> {
|
||||
tracing::debug!("getting YT visitor data");
|
||||
let resp = self
|
||||
.inner
|
||||
.http
|
||||
.get(YOUTUBE_MUSIC_HOME_URL)
|
||||
.header(header::ORIGIN, YOUTUBE_MUSIC_HOME_URL)
|
||||
.header(header::REFERER, YOUTUBE_MUSIC_HOME_URL)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let vdata = resp
|
||||
.headers()
|
||||
.get_all(header::SET_COOKIE)
|
||||
.iter()
|
||||
.find_map(|c| {
|
||||
if let Ok(cookie) = c.to_str() {
|
||||
if let Some(after) = cookie.strip_prefix("__Secure-YEC=") {
|
||||
return after
|
||||
.split_once(';')
|
||||
.map(|s| s.0.to_owned())
|
||||
.filter(|s| !s.is_empty());
|
||||
}
|
||||
}
|
||||
None
|
||||
});
|
||||
|
||||
match vdata {
|
||||
Some(vdata) => Ok(vdata),
|
||||
None => {
|
||||
if resp.status().is_success() {
|
||||
// Extract visitor data from html
|
||||
let html = resp.text().await?;
|
||||
|
||||
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
|
||||
ExtractionError::InvalidData(
|
||||
"Could not find visitor data on html page".into(),
|
||||
),
|
||||
))
|
||||
} else {
|
||||
Err(Error::Extraction(ExtractionError::InvalidData(
|
||||
format!("Could not get visitor data, status: {}", resp.status()).into(),
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
self.inner.visitor_data_cache.new_visitor_data().await
|
||||
}
|
||||
|
||||
/// Get a new device code for logging into YouTube
|
||||
|
|
@ -2147,11 +2106,14 @@ impl RustyPipeQuery {
|
|||
) -> Result<M, Error> {
|
||||
tracing::debug!("getting {}({})", operation, id);
|
||||
|
||||
let visitor_data = ctx_src
|
||||
let visitor_data = match ctx_src
|
||||
.visitor_data
|
||||
.or(self.opts.visitor_data.as_deref())
|
||||
.map(Cow::Borrowed)
|
||||
.unwrap_or_else(|| util::random_visitor_data(self.opts.country).into());
|
||||
{
|
||||
Some(vd) => vd,
|
||||
None => self.client.inner.visitor_data_cache.get().await?.into(),
|
||||
};
|
||||
|
||||
let context = self
|
||||
.get_context(ctype, !ctx_src.unlocalized, &visitor_data)
|
||||
|
|
@ -2289,12 +2251,10 @@ impl RustyPipeQuery {
|
|||
endpoint: &str,
|
||||
body: &B,
|
||||
) -> Result<String, Error> {
|
||||
let visitor_data = self
|
||||
.opts
|
||||
.visitor_data
|
||||
.as_deref()
|
||||
.map(Cow::Borrowed)
|
||||
.unwrap_or_else(|| util::random_visitor_data(self.opts.country).into());
|
||||
let visitor_data = match self.opts.visitor_data.as_deref().map(Cow::Borrowed) {
|
||||
Some(vd) => vd,
|
||||
None => self.client.inner.visitor_data_cache.get().await?.into(),
|
||||
};
|
||||
|
||||
let context = self.get_context(ctype, true, &visitor_data).await;
|
||||
let req_body = QBody { context, body };
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
mod date;
|
||||
mod protobuf;
|
||||
mod visitor_data;
|
||||
|
||||
pub mod dictionary;
|
||||
pub mod timeago;
|
||||
|
||||
pub use date::{now_sec, shift_months, shift_weeks_mo, shift_years};
|
||||
pub use protobuf::{string_from_pb, ProtoBuilder};
|
||||
pub use visitor_data::VisitorDataCache;
|
||||
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
|
|
@ -99,29 +101,6 @@ pub fn random_uuid() -> String {
|
|||
)
|
||||
}
|
||||
|
||||
/// Generate a random visitor data cookie
|
||||
pub fn random_visitor_data(country: Country) -> String {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
let mut pb_e2 = ProtoBuilder::new();
|
||||
pb_e2.string(2, "");
|
||||
pb_e2.varint(4, rng.gen_range(1..256));
|
||||
|
||||
let mut pb_e = ProtoBuilder::new();
|
||||
pb_e.string(1, &country.to_string());
|
||||
pb_e.embedded(2, pb_e2);
|
||||
|
||||
let mut pb = ProtoBuilder::new();
|
||||
pb.string(1, &random_string(CONTENT_PLAYBACK_NONCE_ALPHABET, 11));
|
||||
pb.varint(
|
||||
5,
|
||||
(time::OffsetDateTime::now_utc().unix_timestamp() as u64)
|
||||
.saturating_sub(rng.gen_range(0..600_000)),
|
||||
);
|
||||
pb.embedded(6, pb_e);
|
||||
pb.to_base64()
|
||||
}
|
||||
|
||||
/// Split an URL into its base string and parameter map
|
||||
///
|
||||
/// Example:
|
||||
|
|
|
|||
176
src/util/visitor_data.rs
Normal file
176
src/util/visitor_data.rs
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
use std::sync::{atomic::AtomicU32, Arc, RwLock};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::Rng;
|
||||
use regex::Regex;
|
||||
use reqwest::{header, Client};
|
||||
|
||||
use crate::{
|
||||
client::YOUTUBE_MUSIC_HOME_URL,
|
||||
error::{Error, ExtractionError},
|
||||
util,
|
||||
};
|
||||
|
||||
/// To increase privacy and possibly circumvent rate limits, RustyPipe uses multiple
|
||||
/// visitor data IDs. These are held in this cache object.
|
||||
///
|
||||
/// On instantiation, the cache is empty, so for the first requests new visitor data IDs
|
||||
/// have to be requested. For subsequent requests a random ID from the cache is picked.
|
||||
/// After req_limit requests, a new token is requested asynchronously and added to the cache
|
||||
/// to prevent the IDs from being overused.
|
||||
///
|
||||
/// The cache holds a maximum of 100 visitor data IDs. If more are added, the oldest ones
|
||||
/// are evicted.
|
||||
#[derive(Clone)]
|
||||
pub struct VisitorDataCache {
|
||||
/// Reference-counted shared state; cloning the cache clones only this `Arc`,
/// so every clone reads and updates the same list of IDs.
inner: Arc<VisitorDataCacheRef>,
|
||||
}
|
||||
|
||||
/// State shared between all clones of a [`VisitorDataCache`].
struct VisitorDataCacheRef {
|
||||
/// Incremented on every `get()` call and reset to 0 whenever
/// `new_visitor_data()` starts fetching a new ID.
req_counter: AtomicU32,
|
||||
/// Cached visitor data IDs. New IDs are pushed to the end and evicted
/// from the front, so the vector is ordered oldest first.
visitor_data: RwLock<Vec<String>>,
|
||||
/// HTTP client used to request new visitor data.
http: Client,
|
||||
}
|
||||
|
||||
/// Matches a `"visitorData":"…"` JSON entry and captures its value (group 1).
/// Used to extract the visitor data from the HTML page when no cookie was set.
static VISITOR_DATA_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
|
||||
/// Number of `get()` requests after which a new visitor data ID is requested
/// in the background
|
||||
const REQ_LIMIT: u32 = 10;
|
||||
/// Number of IDs that may remain in the cache before a new one is pushed,
/// i.e. the maximum cache size (100) minus one
|
||||
const MAX_SIZE: usize = 99;
|
||||
|
||||
impl VisitorDataCache {
|
||||
pub fn new(http: Client) -> Self {
|
||||
Self {
|
||||
inner: VisitorDataCacheRef {
|
||||
req_counter: Default::default(),
|
||||
visitor_data: Default::default(),
|
||||
http,
|
||||
}
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_visitor_data(&self) -> Result<String, Error> {
|
||||
tracing::debug!("getting YT visitor data");
|
||||
let resp = self
|
||||
.inner
|
||||
.http
|
||||
.get(YOUTUBE_MUSIC_HOME_URL)
|
||||
.header(header::ORIGIN, YOUTUBE_MUSIC_HOME_URL)
|
||||
.header(header::REFERER, YOUTUBE_MUSIC_HOME_URL)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let vdata = resp
|
||||
.headers()
|
||||
.get_all(header::SET_COOKIE)
|
||||
.iter()
|
||||
.find_map(|c| {
|
||||
if let Ok(cookie) = c.to_str() {
|
||||
if let Some(after) = cookie.strip_prefix("__Secure-YEC=") {
|
||||
return after
|
||||
.split_once(';')
|
||||
.map(|s| s.0.to_owned())
|
||||
.filter(|s| !s.is_empty());
|
||||
}
|
||||
}
|
||||
None
|
||||
});
|
||||
|
||||
match vdata {
|
||||
Some(vdata) => Ok(vdata),
|
||||
None => {
|
||||
if resp.status().is_success() {
|
||||
// Extract visitor data from html
|
||||
let html = resp.text().await?;
|
||||
|
||||
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
|
||||
ExtractionError::InvalidData(
|
||||
"Could not find visitor data on html page".into(),
|
||||
),
|
||||
))
|
||||
} else {
|
||||
Err(Error::Extraction(ExtractionError::InvalidData(
|
||||
format!("Could not get visitor data, status: {}", resp.status()).into(),
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn new_visitor_data(&self) -> Result<String, Error> {
|
||||
self.inner
|
||||
.req_counter
|
||||
.store(0, std::sync::atomic::Ordering::SeqCst);
|
||||
let vd = self.get_visitor_data().await.unwrap();
|
||||
let mut vds = self.inner.visitor_data.write().unwrap();
|
||||
for _ in 0..(vds.len().saturating_sub(MAX_SIZE)) {
|
||||
let rem = vds.remove(0);
|
||||
tracing::debug!("visitor data {rem} removed from cache");
|
||||
}
|
||||
vds.push(vd.to_owned());
|
||||
tracing::debug!("visitor data {} added to cache ({} ids)", vd, vds.len());
|
||||
Ok(vd)
|
||||
}
|
||||
|
||||
pub async fn get(&self) -> Result<String, Error> {
|
||||
// Request new visitor data in the background every 10 requests
|
||||
if self
|
||||
.inner
|
||||
.req_counter
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
|
||||
>= REQ_LIMIT
|
||||
{
|
||||
let nc = self.clone();
|
||||
tokio::spawn(async move { nc.new_visitor_data().await });
|
||||
}
|
||||
|
||||
{
|
||||
let vds = self.inner.visitor_data.read().unwrap();
|
||||
if !vds.is_empty() {
|
||||
let mut rng = rand::thread_rng();
|
||||
let vd = vds[rng.gen_range(0..vds.len())].to_owned();
|
||||
tracing::debug!("visitor data {vd} picked from cache");
|
||||
return Ok(vd);
|
||||
}
|
||||
}
|
||||
// Fetch new visitor data if the cache is empty
|
||||
self.new_visitor_data().await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::time::{Duration, Instant};

    use crate::client::DEFAULT_UA;

    use super::*;

    use tracing_test::traced_test;

    /// Checks that the first ID is fetched on demand and reused, and that a
    /// second ID is added in the background once `REQ_LIMIT` requests have
    /// been counted. This test performs real network requests.
    #[tokio::test]
    #[traced_test]
    async fn get_visitor_data() {
        let cache =
            VisitorDataCache::new(Client::builder().user_agent(DEFAULT_UA).build().unwrap());
        let cached_ids = || cache.inner.visitor_data.read().unwrap().len();

        // Get initial visitor data
        let v1 = cache.get().await.unwrap();

        // Run as many requests as necessary to trigger fetching the second visitor data
        for _ in 0..=REQ_LIMIT {
            let got = cache.get().await.unwrap();
            assert_eq!(got, v1);
        }

        // Second visitor data does not arrive instantly, so the cache still
        // holds only the first ID right after the refresh was triggered
        assert_eq!(cached_ids(), 1);

        // Wait for the second visitor data to arrive. Poll with a deadline
        // instead of a fixed sleep so a slow response does not fail the test.
        let deadline = Instant::now() + Duration::from_secs(10);
        while cached_ids() < 2 {
            assert!(
                Instant::now() < deadline,
                "second visitor data did not arrive in time"
            );
            tokio::time::sleep(Duration::from_millis(50)).await;
        }
        assert_eq!(cached_ids(), 2);
    }
}
|
||||
Reference in a new issue