use std::{ collections::{BTreeMap, HashMap}, fs::File, io::BufReader, }; use anyhow::Result; use futures::{stream, StreamExt}; use path_macro::path; use rustypipe::{ client::{ClientType, RustyPipe, RustyPipeQuery}, param::{Language, LANGUAGES}, }; use crate::{ model::{Channel, QBrowse, TimeAgo, TimeUnit}, util::{self, DICT_DIR}, }; type CollectedDurations = BTreeMap>; /// Collect the video duration texts in every supported language /// and write them to `testfiles/dict/video_duration_samples.json`. /// /// The length of YouTube short videos is only available in textual form. /// To parse it correctly, we need to collect samples of this text in every /// language. We collect these samples from regular channel videos because these /// include a textual duration in addition to the easy to parse "mm:ss" /// duration format. pub async fn collect_video_durations(concurrency: usize) { let json_path = path!(*DICT_DIR / "video_duration_samples.json"); let rp = RustyPipe::new(); let channels = [ "UCq-Fj5jknLsUf-MWSy4_brA", "UCMcS5ITpSohfr8Ppzlo4vKw", "UCXuqSBlHAE6Xw-yeJA0Tunw", ]; let durations: CollectedDurations = stream::iter(LANGUAGES) .map(|lang| { let rp = rp.query().lang(lang); async move { let mut map = BTreeMap::new(); for (n, ch_id) in channels.iter().enumerate() { get_channel_vlengths(&rp, ch_id, &mut map).await.unwrap(); println!("collected {lang}-{n}"); } // Since we are only parsing shorts durations, we do not need durations >= 1h let map = map.into_iter().filter(|(_, v)| v < &3600).collect(); (lang, map) } }) .buffer_unordered(concurrency) .collect() .await; let file = File::create(json_path).unwrap(); serde_json::to_writer_pretty(file, &durations).unwrap(); } pub fn parse_video_durations() { let json_path = path!(*DICT_DIR / "video_duration_samples.json"); let json_file = File::open(json_path).unwrap(); let durations: CollectedDurations = serde_json::from_reader(BufReader::new(json_file)).unwrap(); let mut dict = util::read_dict(); let langs = dict.keys().copied().collect::>(); for lang in langs { let dict_entry = dict.entry(lang).or_default(); let mut e_langs = dict_entry.equivalent.clone(); e_langs.push(lang); for lang in e_langs { let mut words = HashMap::new(); fn check_add_word( words: &mut HashMap>, by_char: bool, val: u32, expect: u32, w: &str, unit: TimeUnit, ) -> bool { let ok = val == expect || val * 2 == expect; if ok { let mut ins = |w: &str, val: &mut TimeAgo| { // Filter stop words if matches!( w, "na" | "y" | "و" | "ja" | "et" | "e" | "i" | "և" | "og" | "en" | "и" | "a" | "és" | "ir" | "un" | "și" | "in" | "และ" | "\u{0456}" | "鐘" | "eta" | "અને" | "और" | "കൂടാതെ" | "සහ" ) { return; } let entry = words.entry(w.to_owned()).or_insert(Some(*val)); if let Some(e) = entry { if e != val { *entry = None; } } }; let mut val = TimeAgo { n: (expect / val).try_into().unwrap(), unit, }; if by_char { w.chars().for_each(|c| { if !c.is_whitespace() { ins(&c.to_string(), &mut val); } }); } else { w.split_whitespace().for_each(|w| ins(w, &mut val)); } } ok } fn parse( words: &mut HashMap>, lang: Language, by_char: bool, txt: &str, d: u32, ) { let (m, s) = split_duration(d); let mut parts = split_duration_txt(txt, matches!(lang, Language::Si | Language::Sw)) .into_iter(); let p1 = parts.next().unwrap(); let p1_n = p1.digits.parse::().unwrap_or(1); let p2: Option = parts.next(); match p2 { Some(p2) => { let p2_n = p2.digits.parse::().unwrap_or(1); assert!( check_add_word(words, by_char, p1_n, m, &p1.word, TimeUnit::Minute), "{txt}: min parse error" ); assert!( check_add_word(words, by_char, p2_n, s, &p2.word, TimeUnit::Second), "{txt}: sec parse error" ); } None => { if s == 0 { assert!( check_add_word(words, by_char, p1_n, m, &p1.word, TimeUnit::Minute), "{txt}: min parse error" ); } else if m == 0 { assert!( check_add_word(words, by_char, p1_n, s, &p1.word, TimeUnit::Second), "{txt}: sec parse error" ); } else { let p = txt .find([',', 'و']) .unwrap_or_else(|| panic!("`{txt}`: only 1 part")); parse(words, lang, by_char, &txt[0..p], m); parse(words, lang, by_char, &txt[p..], s); } } } assert!(parts.next().is_none(), "`{txt}`: more than 2 parts"); } for (txt, d) in &durations[&lang] { parse(&mut words, lang, dict_entry.by_char, txt, *d); } // dbg!(&words); for (k, v) in words { if let Some(v) = v { dict_entry.timeago_tokens.insert(k, v.to_string()); } } } } util::write_dict(dict); } fn split_duration(d: u32) -> (u32, u32) { (d / 60, d % 60) } #[derive(Debug, Default)] struct DurationTxtSegment { digits: String, word: String, } fn split_duration_txt(txt: &str, start_c: bool) -> Vec { let mut segments = Vec::new(); // 1: parse digits, 2: parse word let mut state: u8 = 0; let mut seg = DurationTxtSegment::default(); for c in txt.chars() { if c.is_ascii_digit() { if state == 2 && (!seg.digits.is_empty() || (!start_c && segments.is_empty())) { segments.push(seg); seg = DurationTxtSegment::default(); } seg.digits.push(c); state = 1; } else { if (state == 1) && (!seg.word.is_empty() || (start_c && segments.is_empty())) { segments.push(seg); seg = DurationTxtSegment::default(); } if c != ',' { c.to_lowercase().for_each(|c| seg.word.push(c)); } state = 2; } } if !seg.word.is_empty() || !seg.digits.is_empty() { segments.push(seg); } segments } async fn get_channel_vlengths( query: &RustyPipeQuery, channel_id: &str, map: &mut BTreeMap, ) -> Result<()> { let resp = query .raw( ClientType::Desktop, "browse", &QBrowse { context: query.get_context(ClientType::Desktop, true, None).await, browse_id: channel_id, params: Some("EgZ2aWRlb3MYASAAMAE"), }, ) .await?; let channel = serde_json::from_str::(&resp)?; let tab = channel .contents .two_column_browse_results_renderer .tabs .into_iter() .next() .unwrap() .tab_renderer .content .rich_grid_renderer; tab.contents.into_iter().for_each(|c| { let lt = c.rich_item_renderer.content.video_renderer.length_text; let duration = util::parse_video_length(<.simple_text).unwrap(); map.insert(lt.accessibility.accessibility_data.label, duration); }); Ok(()) } #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] enum PluralCategory { Zero, One, Two, Few, Many, Other, } impl From for PluralCategory { fn from(value: intl_pluralrules::PluralCategory) -> Self { match value { intl_pluralrules::PluralCategory::ZERO => Self::Zero, intl_pluralrules::PluralCategory::ONE => Self::One, intl_pluralrules::PluralCategory::TWO => Self::Two, intl_pluralrules::PluralCategory::FEW => Self::Few, intl_pluralrules::PluralCategory::MANY => Self::Many, intl_pluralrules::PluralCategory::OTHER => Self::Other, } } } #[cfg(test)] mod tests { use super::*; use std::collections::HashSet; use std::io::BufReader; use intl_pluralrules::{PluralRuleType, PluralRules}; use unic_langid::LanguageIdentifier; /// Verify that the duration sample set covers all pluralization variants of the languages #[test] fn check_video_duration_samples() { let json_path = path!(*DICT_DIR / "video_duration_samples.json"); let json_file = File::open(json_path).unwrap(); let durations: CollectedDurations = serde_json::from_reader(BufReader::new(json_file)).unwrap(); let mut failed = false; for (lang, durations) in durations { let ul: LanguageIdentifier = lang.to_string().split('-').next().unwrap().parse().unwrap(); let pr = PluralRules::create(ul, PluralRuleType::CARDINAL) .unwrap_or_else(|_| panic!("{}", lang.to_string())); let mut plurals_m: HashSet = HashSet::new(); for n in 1..60 { plurals_m.insert(pr.select(n).unwrap().into()); } let mut plurals_s = plurals_m.clone(); for v in durations.values() { let (m, s) = split_duration(*v); plurals_m.remove(&pr.select(m).unwrap().into()); plurals_s.remove(&pr.select(s).unwrap().into()); } if !plurals_m.is_empty() { println!("{lang}: missing minutes {plurals_m:?}"); failed = true; } if !plurals_s.is_empty() { println!("{lang}: missing seconds {plurals_m:?}"); failed = true; } } assert!(!failed); } }