This repository has been archived on 2026-05-27. You can view files and clone it, but you cannot make any changes to its state, such as pushing and creating new issues, pull requests or comments.
rustypipe/src/util/mod.rs
2023-05-11 17:18:58 +02:00

633 lines
20 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

mod date;
mod protobuf;
pub mod dictionary;
pub mod timeago;
pub use date::{now_sec, shift_months, shift_years};
pub use protobuf::{string_from_pb, ProtoBuilder};
use std::{
borrow::{Borrow, Cow},
collections::BTreeMap,
str::{FromStr, SplitWhitespace},
};
use base64::Engine;
use fancy_regex::Regex as FancyRegex;
use once_cell::sync::Lazy;
use rand::Rng;
use regex::Regex;
use url::Url;
use crate::{error::Error, param::Language, serializer::text::TextComponent};
/// Matches a video ID: exactly 11 characters from `[A-Za-z0-9_-]`.
pub static VIDEO_ID_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Za-z0-9_-]{11}$").unwrap());
/// Matches a channel ID: the prefix `UC` followed by exactly 22 characters
/// from `[A-Za-z0-9_-]`.
pub static CHANNEL_ID_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^UC[A-Za-z0-9_-]{22}$").unwrap());
/// Matches a playlist ID: one of the prefixes `PL`, `RDCLAK` or `OLAK`
/// followed by 16 to 50 characters from `[A-Za-z0-9_-]`.
pub static PLAYLIST_ID_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(?:PL|RDCLAK|OLAK)[A-Za-z0-9_-]{16,50}$").unwrap());
/// Matches an album ID: the prefix `MPREb_` followed by exactly 11 characters
/// from `[A-Za-z0-9_-]`.
pub static ALBUM_ID_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^MPREb_[A-Za-z0-9_-]{11}$").unwrap());
/// Matches URL paths that look like a vanity name (optionally prefixed with
/// `c/` or `user/`) or like an `@handle`.
///
/// NOTE(review): `|` binds loosest in this pattern, so `^` anchors only the first
/// alternative and `$` only the second; the first alternative therefore accepts any
/// path that merely *starts* with a vanity-like segment. In addition, the range `A-z`
/// also covers the ASCII punctuation between `Z` and `a` (`[`, `\`, `]`, `^`, `_` and
/// the backtick). Confirm against the callers whether this leniency is intended.
pub static VANITY_PATH_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^/?(?:(?:c/|user/)?[A-z0-9]{1,100})|(?:@[A-z0-9-_.]{1,100})$").unwrap()
});
/// Separator string for YouTube Music subtitles
///
/// NOTE(review): the literal is empty in this copy of the file; the doc comment
/// suggests a visible separator was intended — confirm the value against the
/// upstream source (it may have been lost to an encoding/export issue).
pub const DOT_SEPARATOR: &str = "";
/// The literal artist name `Various Artists`.
/// (Presumably used to recognise compilation albums — verify against callers.)
pub const VARIOUS_ARTISTS: &str = "Various Artists";
/// Playlist ID prefix `OLAK`; it is also one of the prefixes accepted by
/// [`PLAYLIST_ID_REGEX`].
pub const PLAYLIST_ID_ALBUM_PREFIX: &str = "OLAK";
/// The 64-byte alphabet (`A-Z`, `a-z`, `0-9`, `-`, `_`) from which
/// [`generate_content_playback_nonce`] draws its characters.
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
/// Internal error carrying a human-readable message.
///
/// Its `Display` output is `mapping error: <message>`. The message is stored as a
/// `Cow<'static, str>`, so it can hold either a string literal or an owned string;
/// the field is only accessible from within this crate.
#[derive(thiserror::Error, Debug)]
#[error("mapping error: {0}")]
pub struct MappingError(pub(crate) Cow<'static, str>);
/// Return the capture group `cg` from the first regex in `regexes` that matches
/// `text` and in which that group took part in the match.
///
/// The regexes are tried in iteration order. A regex that matches but whose group
/// `cg` does not exist or did not participate in the match is skipped (previously
/// this case caused a panic).
///
/// Returns `None` if no regex yields the requested group.
pub fn get_cg_from_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String>
where
    I: Iterator<Item = &'a Regex>,
{
    regexes.find_map(|pattern| {
        pattern
            .captures(text)?
            // `get` returns `None` for a missing/non-participating group
            .get(cg)
            .map(|m| m.as_str().to_owned())
    })
}
/// Return the capture group `cg` from the first fancy regex in `regexes` that
/// matches `text` and in which that group took part in the match.
///
/// The regexes are tried in iteration order. A regex is skipped if matching fails
/// with an error, if it does not match, or if group `cg` does not exist or did not
/// participate in the match (previously the latter case caused a panic).
///
/// Returns `None` if no regex yields the requested group.
pub fn get_cg_from_fancy_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String>
where
    I: Iterator<Item = &'a FancyRegex>,
{
    regexes.find_map(|pattern| {
        pattern
            .captures(text)
            // runtime matching errors are treated like "no match"
            .ok()
            .flatten()?
            .get(cg)
            .map(|m| m.as_str().to_owned())
    })
}
/// Build a string of `length` characters, each picked at random from `charset`.
///
/// Every byte of `charset` is converted to a `char` via `char::from`.
/// The thread-local random number generator is used.
fn random_string(charset: &[u8], length: usize) -> String {
    let mut rng = rand::thread_rng();
    (0..length)
        .map(|_| char::from(charset[rng.gen_range(0..charset.len())]))
        .collect()
}
/// Create a random CPN (Content Playback Nonce): a 16 character string drawn
/// from `CONTENT_PLAYBACK_NONCE_ALPHABET`.
pub fn generate_content_playback_nonce() -> String {
    /// Number of characters in a CPN
    const CPN_LENGTH: usize = 16;

    random_string(CONTENT_PLAYBACK_NONCE_ALPHABET, CPN_LENGTH)
}
/// Generate a random string in the textual UUID layout
/// (`xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, lowercase hexadecimal).
///
/// All 128 bits are random; no version or variant bits are set.
pub fn random_uuid() -> String {
    let mut rng = rand::thread_rng();

    // Drawn in the same order as they appear in the output
    let time_low = rng.gen::<u32>();
    let time_mid = rng.gen::<u16>();
    let time_high = rng.gen::<u16>();
    let clock_seq = rng.gen::<u16>();
    // Only the lower 48 bits fit into the last group
    let node = rng.gen::<u64>() & 0xffffffffffff;

    format!("{time_low:08x}-{time_mid:04x}-{time_high:04x}-{clock_seq:04x}-{node:012x}")
}
/// Split an URL into its base string and parameter map
///
/// Example:
///
/// `example.com/api?k1=v1&k2=v2 => example.com/api; {k1: v1, k2: v2}`
pub fn url_to_params(url: &str) -> Result<(Url, BTreeMap<String, String>), Error> {
let mut parsed_url = Url::parse(url)
.map_err(|e| Error::Other(format!("could not parse url `{url}` err: {e}").into()))?;
let url_params: BTreeMap<String, String> = parsed_url
.query_pairs()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
parsed_url.set_query(None);
Ok((parsed_url, url_params))
}
/// Keep only the ASCII digits of `string` and parse the remainder as `F`.
///
/// # Errors
///
/// Returns the parse error of `F` if the remaining digits do not form a valid
/// value (e.g. there are no digits at all, or the number is out of range).
pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
where
    F: FromStr,
{
    string
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>()
        .parse()
}
/// Parse all numbers occurring in a string and return them as a vec.
///
/// A number is a maximal run of consecutive ASCII digits. Runs that cannot be
/// parsed as `F` (e.g. because they are out of range) are skipped.
pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
where
    F: FromStr,
{
    string
        .split(|c: char| !c.is_ascii_digit())
        .filter(|run| !run.is_empty())
        .filter_map(|run| run.parse::<F>().ok())
        .collect()
}
/// Parse textual video length (e.g. `0:49`, `2:02` or `1:48:18`)
/// and return the duration in seconds.
///
/// The first occurrence of `[hours:]minutes:seconds` in `text` is used; `.` is
/// accepted as a separator as well as `:`.
///
/// Returns `None` if
/// - `text` contains no such pattern,
/// - a matched field does not fit into a `u32`, or
/// - the total number of seconds does not fit into a `u32`.
pub fn parse_video_length(text: &str) -> Option<u32> {
    static VIDEO_LENGTH_REGEX: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})"#).unwrap());

    let cap = VIDEO_LENGTH_REGEX.captures(text)?;

    // An absent group (only possible for the hours) counts as 0;
    // a present but unparseable group makes the whole length invalid.
    let field = |group: usize| -> Option<u32> {
        match cap.get(group) {
            Some(m) => m.as_str().parse::<u32>().ok(),
            None => Some(0),
        }
    };

    let hrs = field(1)?;
    let min = field(2)?;
    let sec = field(3)?;

    // Checked arithmetic: an absurdly large hour count must not overflow
    hrs.checked_mul(3600)?
        .checked_add(min.checked_mul(60)?)?
        .checked_add(sec)
}
/// Like [`parse_numeric`], but report a failure as a warning instead of an error.
///
/// On failure, the message ``could not parse number `<string>` `` is appended to
/// `warnings` and `None` is returned.
pub fn parse_numeric_or_warn<F>(string: &str, warnings: &mut Vec<String>) -> Option<F>
where
    F: FromStr,
{
    match parse_numeric::<F>(string) {
        Ok(number) => Some(number),
        Err(_) => {
            warnings.push(format!("could not parse number `{string}`"));
            None
        }
    }
}
/// Calculate the delay before the next retry using exponential backoff with jitter.
///
/// The delay is `backoff_base ^ n_past_retries`, multiplied by a random factor
/// from the range `800..1500`. Both steps saturate at `u32::MAX` instead of
/// overflowing. The result is then capped at `max_retry_interval` and finally
/// raised to at least `min_retry_interval`.
///
/// NOTE(review): the intervals are presumably given in milliseconds (the jitter
/// factor would then scale a delay in seconds by 0.8 to 1.5) — confirm with callers.
pub fn retry_delay(
    n_past_retries: u32,
    min_retry_interval: u32,
    max_retry_interval: u32,
    backoff_base: u32,
) -> u32 {
    let exponential = backoff_base.saturating_pow(n_past_retries);
    let jitter = rand::thread_rng().gen_range(800..1500);
    let delay = exponential.saturating_mul(jitter);

    // The lower bound is applied last, so it wins if the bounds contradict each other
    delay.min(max_retry_interval).max(min_retry_interval)
}
/// Turn YouTube redirect URLs (`https://www.youtube.com/redirect?`) into the URL
/// they point to.
///
/// Additionally, the google analytics tracking parameters
/// (`utm_source`, `utm_medium`, `utm_campaign`, `utm_content`) are removed
/// because google analytics is bad.
///
/// If the URL (or the target of a redirect URL) cannot be parsed,
/// the input is returned unchanged.
pub fn sanitize_yt_url(url: &str) -> String {
    /// Fallible part of the sanitizer; `None` means "could not parse"
    fn try_sanitize(raw: &str) -> Option<String> {
        let mut target = Url::parse(raw).ok()?;

        // Unwrap redirect URLs: the destination is stored in the `q` parameter
        let is_yt_redirect = target.host_str().unwrap_or_default() == "www.youtube.com"
            && target.path() == "/redirect";
        if is_yt_redirect {
            let destination = target
                .query_pairs()
                .find(|(key, _)| key == "q")
                .map(|(_, value)| value.into_owned());
            if let Some(destination) = destination {
                target = Url::parse(&destination).ok()?;
            }
        }

        // Nothing to strip if there is no query string
        if target.query().is_none() {
            return Some(target.to_string());
        }

        // Keep everything except the GA tracking parameters
        let kept = target
            .query_pairs()
            .filter_map(|(key, value)| match key.borrow() {
                "utm_source" | "utm_medium" | "utm_campaign" | "utm_content" => None,
                _ => Some((key.to_string(), value.to_string())),
            })
            .collect::<Vec<_>>();

        if kept.is_empty() {
            // Drop the query entirely, otherwise the URL would end with `/?`
            target.set_query(None);
        } else {
            target.query_pairs_mut().clear().extend_pairs(kept).finish();
        }
        Some(target.to_string())
    }

    match try_sanitize(url) {
        Some(sanitized) => sanitized,
        None => url.to_string(),
    }
}
/// Non-panicking variants of the element removal methods of a vector.
pub trait TryRemove<T> {
    /// Take the element at position `index` out of the vector and return it.
    /// All elements behind it are moved one position to the left.
    ///
    /// Returns None if the index is out of bounds.
    ///
    /// Note: Moving the remaining elements makes this *O*(*n*) in the worst case.
    /// If the order of the elements does not matter,
    /// use [`TryRemove::try_swap_remove`] instead.
    fn try_remove(&mut self, index: usize) -> Option<T>;

    /// Take the element at position `index` out of the vector and return it.
    /// The last element of the vector is moved into the gap.
    ///
    /// Returns None if the index is out of bounds.
    ///
    /// This is *O*(1), but the element order is not preserved.
    /// If the order has to stay intact, use [`TryRemove::try_remove`] instead.
    fn try_swap_remove(&mut self, index: usize) -> Option<T>;
}

impl<T> TryRemove<T> for Vec<T> {
    fn try_remove(&mut self, index: usize) -> Option<T> {
        (index < self.len()).then(|| self.remove(index))
    }

    fn try_swap_remove(&mut self, index: usize) -> Option<T> {
        (index < self.len()).then(|| self.swap_remove(index))
    }
}
/// Check if a channel name belongs to "YouTube Music"
/// (the author of original YouTube music playlists).
///
/// Only plain text components are considered, and the check is a prefix match:
/// any text starting with `YouTube` is accepted.
pub(crate) fn is_ytm(text: &TextComponent) -> bool {
    matches!(text, TextComponent::Text { text } if text.starts_with("YouTube"))
}
/// Tell whether text in the given language is tokenized character by character
/// (as opposed to word by word).
///
/// This is the case for Japanese and the three Chinese variants.
pub fn lang_by_char(lang: Language) -> bool {
    /// Languages that are parsed by character
    const BY_CHAR_LANGS: [Language; 4] = [
        Language::Ja,
        Language::ZhCn,
        Language::ZhHk,
        Language::ZhTw,
    ];

    BY_CHAR_LANGS.contains(&lang)
}
/// Parse a large, textual number (e.g. `1.4M subscribers`, `22K views`)
///
/// The ASCII digits of the string form the mantissa. Digits behind the decimal
/// point of the language lower the decimal exponent, magnitude tokens
/// (`k` and the `number_tokens` of the language dictionary) raise it.
///
/// If the string contains no digits at all, the first token found in the
/// `number_nd_tokens` of the language dictionary determines the result.
///
/// Returns `None` if the text cannot be interpreted, the resulting exponent is
/// negative or the number does not fit into a `u64` or the target type `F`.
pub fn parse_large_numstr<F>(string: &str, lang: Language) -> Option<F>
where
    F: TryFrom<u64>,
{
    // Special case for Gujarati: the "no views" text does not contain
    // any parseable tokens: the 2 words occur in any view count text.
    // This may be a translation error.
    if lang == Language::Gu && string == "જોવાયાની સંખ્યા" {
        return 0.try_into().ok();
    }

    let entry = dictionary::entry(lang);
    let split_by_char = lang_by_char(lang) || lang == Language::Ko;
    let decimal_sep = if entry.comma_decimal { ',' } else { '.' };

    let mut mantissa = String::new();
    let mut words = String::new();
    let mut exponent = 0;
    let mut behind_decimal_sep = false;

    for ch in string.chars() {
        match ch {
            _ if ch.is_ascii_digit() => {
                mantissa.push(ch);
                // Every fractional digit shifts the number by one decimal place
                if behind_decimal_sep {
                    exponent -= 1;
                }
            }
            _ if ch == decimal_sep => behind_decimal_sep = true,
            // Invisible formatting/directional characters and the remaining
            // (thousands) separator are dropped
            '\u{200b}' | '\u{202b}' | '\u{202c}' | '\u{202e}' | '\u{200e}' | '\u{200f}' | '.'
            | ',' => {}
            _ => words.extend(ch.to_lowercase()),
        }
    }

    if mantissa.is_empty() {
        // No digits: look for a token that stands for a number by itself
        return SplitTokens::new(&words, split_by_char)
            .find_map(|token| entry.number_nd_tokens.get(token))
            .and_then(|n| (*n as u64).try_into().ok());
    }

    let base = mantissa.parse::<u64>().ok()?;
    exponent += SplitTokens::new(&words, split_by_char)
        .filter_map(|token| match token {
            "k" => Some(3),
            _ => entry.number_tokens.get(token).map(|t| *t as i32),
        })
        .sum::<i32>();

    let scale = 10_u64.checked_pow(exponent.try_into().ok()?)?;
    F::try_from(base.checked_mul(scale)?).ok()
}
/// Like [`parse_large_numstr`], but record a warning if the text cannot be parsed.
///
/// On failure, the message ``could not parse numstr `<string>` `` is appended to
/// `warnings` and `None` is returned.
pub fn parse_large_numstr_or_warn<F>(
    string: &str,
    lang: Language,
    warnings: &mut Vec<String>,
) -> Option<F>
where
    F: TryFrom<u64>,
{
    match parse_large_numstr::<F>(string, lang) {
        Some(number) => Some(number),
        None => {
            warnings.push(format!("could not parse numstr `{string}`"));
            None
        }
    }
}
/// Escape a string so it can be inserted into HTML safely.
///
/// The characters `<`, `>`, `&`, `"` and `'` are replaced with their HTML entities.
/// Line feeds are converted to `<br>` tags.
pub fn escape_html(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());

    for ch in input.chars() {
        let replacement = match ch {
            '<' => "&lt;",
            '>' => "&gt;",
            '&' => "&amp;",
            '"' => "&quot;",
            '\'' => "&#x27;",
            '\n' => "<br>",
            other => {
                // Everything else is copied verbatim
                escaped.push(other);
                continue;
            }
        };
        escaped.push_str(replacement);
    }

    escaped
}
/// Extract the video ID from a thumbnail URL of the form
/// `https://i.ytimg.com/vi/<video id>/...`.
///
/// Returns `None` if the URL does not start with that prefix followed by an
/// 11 character video ID and a slash.
pub fn video_id_from_thumbnail_url(url: &str) -> Option<String> {
    // The dots of the host name are escaped so they only match a literal `.`
    // (unescaped they would accept any character in that position).
    static URL_REGEX: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"^https://i\.ytimg\.com/vi/([A-Za-z0-9_-]{11})/").unwrap());

    URL_REGEX
        .captures(url)?
        .get(1)
        .map(|id| id.as_str().to_owned())
}
/// Encode bytes as Base64 using the `STANDARD` engine of the `base64` crate.
pub fn b64_encode<T: AsRef<[u8]>>(input: T) -> String {
    use base64::engine::general_purpose::STANDARD;

    STANDARD.encode(input)
}
/// Decode Base64 data using the `STANDARD` engine of the `base64` crate.
///
/// # Errors
///
/// Returns a `base64::DecodeError` if the input is not valid for that engine.
pub fn b64_decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, base64::DecodeError> {
    use base64::engine::general_purpose::STANDARD;

    STANDARD.decode(input)
}
/// Iterator that yields every character of a string as a string slice
pub struct SplitChar<'a> {
    /// The text being iterated over
    txt: &'a str,
    /// Byte offset of the next character to yield
    index: usize,
}

impl<'a> From<&'a str> for SplitChar<'a> {
    fn from(value: &'a str) -> Self {
        // Iteration starts at the beginning of the text
        Self {
            txt: value,
            index: 0,
        }
    }
}

impl<'a> Iterator for SplitChar<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let remaining = self.txt.get(self.index..)?;
        let ch = remaining.chars().next()?;

        // Advance by the encoded width of the character so that the
        // offset always stays on a character boundary
        let end = self.index + ch.len_utf8();
        let token = &self.txt[self.index..end];
        self.index = end;
        Some(token)
    }
}
/// An iterator for parsing strings. It can either iterate over words or characters.
pub enum SplitTokens<'a> {
Word(SplitWhitespace<'a>),
Char(SplitChar<'a>),
}
impl<'a> SplitTokens<'a> {
pub fn new(s: &'a str, by_char: bool) -> Self {
match by_char {
true => Self::Char(SplitChar::from(s)),
false => Self::Word(s.split_whitespace()),
}
}
}
impl<'a> Iterator for SplitTokens<'a> {
type Item = &'a str;
fn next(&mut self) -> Option<Self::Item> {
match self {
SplitTokens::Word(iter) => iter.next(),
SplitTokens::Char(iter) => iter.next(),
}
}
}
// Unit tests for the utility functions of this module.
// `TESTFILES` is `pub` inside this `pub(crate)` module.
#[cfg(test)]
pub(crate) mod tests {
use std::{fs::File, io::BufReader, path::PathBuf};
use path_macro::path;
use rstest::rstest;
use super::*;
/// Get the path of the `testfiles` directory
pub static TESTFILES: Lazy<PathBuf> =
Lazy::new(|| path!(env!("CARGO_MANIFEST_DIR") / "testfiles"));
// parse_numeric: all non-digit characters are dropped before parsing
#[rstest]
#[case("1.000", 1000)]
#[case("4 Hello World 2", 42)]
fn t_parse_num(#[case] string: &str, #[case] expect: u32) {
let n = parse_numeric::<u32>(string).unwrap();
assert_eq!(n, expect);
}
// parse_numeric_vec: every run of digits becomes its own number
#[rstest]
#[case("15.03.2022", vec![15, 3, 2022])]
#[case("4 Hello World 2", vec![4, 2])]
#[case("最后更新时间2020年1月3日", vec![2020, 1, 3])]
fn t_parse_numeric_vec(#[case] string: &str, #[case] expect: Vec<u32>) {
let n = parse_numeric_vec::<u32>(string);
assert_eq!(n, expect);
}
// parse_video_length: the length may be embedded in other text;
// a number without separator is not a video length
#[rstest]
#[case("0:49", Some(49))]
#[case("bla 2:02 h3llo w0rld", Some(122))]
#[case("18:22", Some(1102))]
#[case("1:48:18", Some(6498))]
#[case("102:12:39", Some(367959))]
#[case("42", None)]
fn t_parse_video_length(#[case] text: &str, #[case] expect: Option<u32>) {
let n = parse_video_length(text);
assert_eq!(n, expect);
}
// retry_delay: cases are (number of past retries, lowest accepted result,
// highest accepted result). The result is random, so only a range is checked.
#[rstest]
#[case(0, 800, 1500)]
#[case(1, 2400, 4500)]
#[case(2, 7200, 13500)]
#[case(100, 60000, 60000)]
fn t_retry_delay(#[case] n: u32, #[case] expect_min: u32, #[case] expect_max: u32) {
let res = retry_delay(n, 1000, 60000, 3);
assert!(
res >= expect_min && res <= expect_max,
"res: {res} not within {expect_min} and {expect_max}"
);
}
// TryRemove::try_remove: the order of the remaining elements is preserved
#[test]
fn t_vec_try_remove() {
let mut v = vec![1, 2, 3];
assert_eq!(v.try_remove(0).unwrap(), 1);
assert_eq!(v.try_remove(1).unwrap(), 3);
assert_eq!(v.try_remove(1), None);
}
// TryRemove::try_swap_remove: the last element is moved into the gap
#[test]
fn t_vec_try_swap_remove() {
let mut v = vec![1, 2, 3];
assert_eq!(v.try_swap_remove(0).unwrap(), 1);
assert_eq!(v.try_swap_remove(1).unwrap(), 2);
assert_eq!(v.try_swap_remove(1), None);
}
// sanitize_yt_url: redirect URLs are unwrapped, tracking parameters are removed
// and regular URLs are only normalized
#[rstest]
#[case(
"https://www.youtube.com/redirect?event=video_description&redir_token=QUFFLUhqbXFjbjZ6bWdHc1VFLVNBN1NiRGR1QmRuR0lGZ3xBQ3Jtc0trcG1fWHpRNlE2eGNER0ZGczFlZXM5ZlctZzFSbl8wcHdieTlTb1ktSUc5OTZxVDVQamcxdS0yRjJJelFWTGdOS09nUk8xRExqbWhOSG5MTm83WG1QQzJqZTJuT2d6cGp0cEZTWmdsal80ODk0WkNESQ&q=http%3A%2F%2Fincompetech.com%2Fmusic%2Froyalty-free%2F&v=86YLFOog4GM",
"http://incompetech.com/music/royalty-free/",
)]
#[case("https://www.gnu.org", "https://www.gnu.org/")]
#[case(
"https://www.youtube.com/watch?v=Rp2V7d69hyM",
"https://www.youtube.com/watch?v=Rp2V7d69hyM"
)]
#[case(
"https://www.youtube.com/redirect?event=product_shelf&redir_token=QUFFLUhqbDVUMUF3SndkcDFJbzMxYkNIMDRWSzRVQU84QXxBQ3Jtc0tsQWdpaUlaMzFUQmQwSGYwR3dDRDhHWld1bFFtUmlmMng0MmxtN19iVW1EeV9oSk1Xb1VlQ1UyT2xUOWhPdUZvVEZ6UWE4Unlia3pwZXhpUmd4RVg4eWZtcHFId2RJVkMyMUFIMDhiUVUzc2x6ZVNxbw&q=https%3A%2F%2Flttstore.com%2F%3Futm_medium%3Dproduct_shelf%26utm_source%3Dyoutube%26utm_content%3DYT-AERwsnLS3vZeiqL7_mR16DPg7FPBWvP7OW-zX2M1UIPlexPS8-gpk-2c3epSZ8lJ5NYbLof0MXDKhRLCSyfOn9BYJrcG8YtpTA9VU2VXUVhhl9AKi87G_-vFhj6jcGN1CWcYYvmZYbIqA93kwkeFuUh46ntDZR1Y8p5WygwVlhfxy_BZiNbzkWw%253D&v=nFDBxBUfE74",
"https://lttstore.com/",
)]
fn t_sanitize_yt_url(#[case] url: &str, #[case] expect: &str) {
let res = sanitize_yt_url(url);
assert_eq!(res, expect);
}
// parse_large_numstr: text with directional formatting characters and
// text with non-ASCII digits
#[rstest]
#[case(
Language::Iw,
"\u{200f}\u{202b}3.36M\u{200f}\u{202c}\u{200f} \u{200f}מנויים\u{200f}",
3_360_000
)]
#[case(Language::As, "১ জন গ্ৰাহক", 1)]
fn t_parse_large_numstr(#[case] lang: Language, #[case] string: &str, #[case] expect: u64) {
let res = parse_large_numstr::<u64>(string, lang).unwrap();
assert_eq!(res, expect);
}
// parse_large_numstr: run all samples from `dict/large_number_samples.json`,
// which maps each language to pairs of number text and exact number
#[test]
fn t_parse_large_numstr_samples() {
let json_path = path!(*TESTFILES / "dict" / "large_number_samples.json");
let json_file = File::open(json_path).unwrap();
let number_samples: BTreeMap<Language, BTreeMap<String, u64>> =
serde_json::from_reader(BufReader::new(json_file)).unwrap();
number_samples.iter().for_each(|(lang, entry)| {
entry.iter().for_each(|(txt, expect)| {
testcase_parse_large_numstr(txt, *lang, *expect);
});
});
}
// Helper: parse `string` and compare the result with `expect` after
// truncating `expect` to the precision the text can express.
// Note: the exponent subtraction below assumes that the text does not show
// more ASCII digits than `expect` has; otherwise it would underflow.
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
// Round the expected number to the amount of significant digits included
// in the string.
let rounded = {
let n_significant_d = string.chars().filter(char::is_ascii_digit).count();
if n_significant_d == 0 {
expect
} else {
let mag = (expect as f64).log10().floor();
let factor = 10_u64.pow(1 + mag as u32 - n_significant_d as u32);
(((expect as f64) / factor as f64).floor() as u64) * factor
}
};
let emsg = format!("{string} (lang: {lang}, exact: {expect})");
let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
assert_eq!(res, rounded, "{emsg}");
}
// SplitTokens in character mode: one token per character, nothing is lost
#[test]
fn split_char() {
let teststr = "abc今天更新def";
let res = SplitTokens::new(teststr, true).collect::<Vec<_>>();
assert_eq!(res.len(), 10);
let res_str = res.into_iter().collect::<String>();
assert_eq!(res_str, teststr)
}
// SplitTokens in word mode: tokens are separated by whitespace
#[test]
fn split_words() {
let teststr = "abc 今天更新 ghi";
let res = SplitTokens::new(teststr, false).collect::<Vec<_>>();
assert_eq!(res.len(), 3);
let res_str = res.join(" ");
assert_eq!(res_str, teststr)
}
// Language::from_str: per the cases below, region codes without a variant of
// their own resolve to the base language and unknown languages fail to parse
#[rstest]
#[case("en", Some(Language::En))]
#[case("en-GB", Some(Language::EnGb))]
#[case("en-US", Some(Language::En))]
#[case("en-ZZ", Some(Language::En))]
#[case("xy", None)]
#[case("xy-ZZ", None)]
fn parse_language(#[case] s: &str, #[case] expect: Option<Language>) {
let res = Language::from_str(s).ok();
assert_eq!(res, expect);
}
}