fix: extract visitor data from html page

This commit is contained in:
ThetaDev 2023-08-03 19:31:34 +02:00
parent ed84f72ace
commit e5c51fe995
2 changed files with 44 additions and 24 deletions

View file

@ -209,8 +209,10 @@ const ANDROID_API_KEY: &str = "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w";
const IOS_API_KEY: &str = "AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc";
const IOS_DEVICE_MODEL: &str = "iPhone14,5";
/// One-element pattern list, compiled on first use. Capture group 1 is the
/// quoted value that follows `INNERTUBE_CONTEXT_CLIENT_VERSION":"`; note that
/// this pattern does not require the key's opening quote.
static CLIENT_VERSION_REGEXES: Lazy<[Regex; 1]> =
Lazy::new(|| [Regex::new(r#"INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap()]);
/// Pattern compiled on first use. Capture group 1 is the quoted value that
/// follows `"INNERTUBE_CONTEXT_CLIENT_VERSION":"` (word characters, `.`, `_`
/// and `-`, matched lazily up to the closing quote).
static CLIENT_VERSION_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap());
/// Pattern compiled on first use. Capture group 1 is the quoted value that
/// follows `"visitorData":"` (word characters, `-` and `%`, matched lazily up
/// to the closing quote).
static VISITOR_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
/// The RustyPipe client used to access YouTube's API
///
@ -814,11 +816,11 @@ impl RustyPipe {
)
.await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &swjs, 1).ok_or(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed(
util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &swjs, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version in sw.js",
))),
)
)),
))
});
let from_html = async {
@ -829,11 +831,11 @@ impl RustyPipe {
let html = self.http_request_txt(&builder.build().unwrap()).await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &html, 1).ok_or(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed(
util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version on html page",
))),
)
)),
))
};
if let Some(from_swjs) = from_swjs {
@ -965,11 +967,15 @@ impl RustyPipe {
///
/// Since the cookie is shared between YT and YTM and the YTM page loads faster,
/// we request that.
///
/// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the
/// visitor data is extracted from the html page.
async fn get_visitor_data(&self) -> Result<String, Error> {
log::debug!("getting YT visitor data");
let resp = self.inner.http.get(YOUTUBE_MUSIC_HOME_URL).send().await?;
resp.headers()
let vdata = resp
.headers()
.get_all(header::SET_COOKIE)
.iter()
.find_map(|c| {
@ -979,10 +985,27 @@ impl RustyPipe {
}
}
None
})
.ok_or(Error::Extraction(ExtractionError::InvalidData(
Cow::Borrowed("could not get YTM cookies"),
)))
});
match vdata {
Some(vdata) => Ok(vdata),
None => {
if resp.status().is_success() {
// Extract visitor data from html
let html = resp.text().await?;
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find visitor data on html page",
)),
))
} else {
Err(Error::Extraction(ExtractionError::InvalidData(
format!("Could not get visitor data, status: {}", resp.status()).into(),
)))
}
}
}
}
}

View file

@ -41,14 +41,11 @@ pub const ARTIST_DISCOGRAPHY_PREFIX: &str = "MPAD";
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
/// Return the given capture group that matches first in a list of regexes
pub fn get_cg_from_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String>
where
I: Iterator<Item = &'a Regex>,
{
regexes
.find_map(|pattern| pattern.captures(text))
.map(|c| c.get(cg).unwrap().as_str().to_owned())
/// Extract capture group `cg` from the first match of `regex` in `text`.
///
/// Returns `None` when the regex does not match `text`, or when it matches
/// but group `cg` is absent from the match. On success the matched text of
/// the group is returned as an owned `String`.
pub fn get_cg_from_regex(regex: &Regex, text: &str, cg: usize) -> Option<String> {
    // No match at all: nothing to extract.
    let captures = regex.captures(text)?;
    // The group may be missing from the match even though the regex matched.
    let group = captures.get(cg)?;
    Some(group.as_str().to_owned())
}
/// Return the given capture group that matches first in a list of fancy regexes
@ -58,7 +55,7 @@ where
{
regexes
.find_map(|pattern| pattern.captures(text).ok().flatten())
.map(|c| c.get(cg).unwrap().as_str().to_owned())
.and_then(|c| c.get(cg).map(|c| c.as_str().to_owned()))
}
/// Generate a random string with given length and byte charset.