rustypipe/codegen/src/collect_video_durations.rs

use std::{
    collections::{BTreeMap, HashMap},
    fs::File,
    io::BufReader,
};

use anyhow::Result;
use futures::{stream, StreamExt};
use path_macro::path;
use rustypipe::{
    client::{ClientType, RustyPipe, RustyPipeQuery},
    param::{Language, LANGUAGES},
};

use crate::{
    model::{Channel, QBrowse, TimeAgo, TimeUnit},
    util::{self, DICT_DIR},
};

/// Duration samples grouped by language: each textual duration is mapped to
/// the duration in seconds it describes.
type CollectedDurations = BTreeMap<Language, BTreeMap<String, u32>>;

/// Collect the video duration texts in every supported language
/// and write them to `testfiles/dict/video_duration_samples.json`.
///
/// The length of YouTube short videos is only available in textual form.
/// To parse it correctly, we need to collect samples of this text in every
/// language. We collect these samples from regular channel videos because these
/// include a textual duration in addition to the easy to parse "mm:ss"
/// duration format.
///
/// `concurrency` is the maximum number of languages that are fetched at the
/// same time. A value of `0` is treated like `1`.
///
/// # Panics
///
/// Panics if fetching a channel fails or if the sample file cannot be written.
pub async fn collect_video_durations(concurrency: usize) {
    let json_path = path!(*DICT_DIR / "video_duration_samples.json");
    let rp = RustyPipe::new();

    let channels = [
        "UCq-Fj5jknLsUf-MWSy4_brA",
        "UCMcS5ITpSohfr8Ppzlo4vKw",
        "UCXuqSBlHAE6Xw-yeJA0Tunw",
    ];

    let durations: CollectedDurations = stream::iter(LANGUAGES)
        .map(|lang| {
            let rp = rp.query().lang(lang);
            async move {
                let mut map = BTreeMap::new();

                for (n, ch_id) in channels.iter().enumerate() {
                    get_channel_vlengths(&rp, ch_id, &mut map).await.unwrap();
                    println!("collected {lang}-{n}");
                }

                // Since we are only parsing shorts durations, we do not need durations >= 1h
                let map = map.into_iter().filter(|(_, v)| v < &3600).collect();
                (lang, map)
            }
        })
        // A limit of 0 would leave no capacity to start any request, so the
        // stream would never make progress; always allow at least one.
        .buffer_unordered(concurrency.max(1))
        .collect()
        .await;

    // The JSON serializer emits many small writes; buffer them instead of
    // hitting the file with one syscall each.
    let mut writer = std::io::BufWriter::new(File::create(json_path).unwrap());
    serde_json::to_writer_pretty(&mut writer, &durations).unwrap();
    // Flush explicitly so that a write error is reported instead of being
    // swallowed when the buffer is dropped.
    std::io::Write::flush(&mut writer).unwrap();
}

/// Derive time unit tokens from the collected duration samples and add them
/// to the dictionary.
///
/// Reads `video_duration_samples.json` from `DICT_DIR`. For every dictionary
/// language (together with the languages listed as `equivalent` to it), each
/// sample text is split into number/word pairs, the numbers are checked against
/// the known duration and the words are recorded together with the `TimeAgo`
/// value they stand for. Words that show up with conflicting values are
/// discarded; the remaining ones are inserted into the `timeago_tokens` of the
/// dictionary entry. Finally the dictionary is written back with
/// `util::write_dict`.
///
/// # Panics
///
/// Panics if the sample file cannot be read, if a language has no samples or
/// if a sample text cannot be matched against its numeric duration.
pub fn parse_video_durations() {
    let json_path = path!(*DICT_DIR / "video_duration_samples.json");
    let json_file = File::open(json_path).unwrap();
    let durations: CollectedDurations = serde_json::from_reader(BufReader::new(json_file)).unwrap();

    let mut dict = util::read_dict();
    let langs = dict.keys().copied().collect::<Vec<_>>();

    for lang in langs {
        let dict_entry = dict.entry(lang).or_default();

        // The tokens of equivalent languages end up in the same dictionary entry.
        let mut e_langs = dict_entry.equivalent.clone();
        e_langs.push(lang);

        for lang in e_langs {
            // Word -> value it stands for; `None` marks a word that was seen
            // with conflicting values and must not be emitted.
            let mut words = HashMap::new();

            /// Check the number `val` read from the text against the expected
            /// amount `expect` and, if it matches, register the words of `w`.
            ///
            /// The number matches if it equals `expect` or is half of it (the
            /// latter presumably covers dual word forms, where the text holds
            /// no digit and the number defaults to 1 - TODO confirm).
            /// With `by_char` every non-whitespace character of `w` is treated
            /// as a word, otherwise `w` is split at whitespace.
            ///
            /// Returns whether the number matched.
            fn check_add_word(
                words: &mut HashMap<String, Option<TimeAgo>>,
                by_char: bool,
                val: u32,
                expect: u32,
                w: &str,
                unit: TimeUnit,
            ) -> bool {
                let ok = val == expect || val * 2 == expect;
                if ok {
                    let mut ins = |w: &str, val: &mut TimeAgo| {
                        // Filter stop words
                        if matches!(
                            w,
                            "na" | "y"
                                | "و"
                                | "ja"
                                | "et"
                                | "e"
                                | "i"
                                | "և"
                                | "og"
                                | "en"
                                | "и"
                                | "a"
                                | "és"
                                | "ir"
                                | "un"
                                | "și"
                                | "in"
                                | "และ"
                                | "\u{0456}"
                                | "鐘"
                                | "eta"
                                | "અને"
                                | "और"
                                | "കൂടാതെ"
                                | "සහ"
                        ) {
                            return;
                        }

                        // A word that was already recorded with a different
                        // value is ambiguous and gets invalidated.
                        let entry = words.entry(w.to_owned()).or_insert(Some(*val));
                        if let Some(e) = entry {
                            if e != val {
                                *entry = None;
                            }
                        }
                    };

                    // The factor is 1 for a plain match and 2 if the text
                    // number was half of the expected amount.
                    let mut val = TimeAgo {
                        n: (expect / val).try_into().unwrap(),
                        unit,
                    };

                    if by_char {
                        w.chars().for_each(|c| {
                            if !c.is_whitespace() {
                                ins(&c.to_string(), &mut val);
                            }
                        });
                    } else {
                        w.split_whitespace().for_each(|w| ins(w, &mut val));
                    }
                }
                ok
            }

            /// Parse one sample text `txt` describing a duration of `d` seconds
            /// and register its words as minute/second tokens.
            ///
            /// Panics if the numbers found in the text do not match `d` or if
            /// the text consists of more than two number/word pairs.
            fn parse(
                words: &mut HashMap<String, Option<TimeAgo>>,
                lang: Language,
                by_char: bool,
                txt: &str,
                d: u32,
            ) {
                let (m, s) = split_duration(d);

                let mut parts =
                    split_duration_txt(txt, matches!(lang, Language::Si | Language::Sw))
                        .into_iter();

                let p1 = parts.next().unwrap();
                // A segment without digits counts as 1.
                let p1_n = p1.digits.parse::<u32>().unwrap_or(1);
                let p2: Option<DurationTxtSegment> = parts.next();

                match p2 {
                    Some(p2) => {
                        let p2_n = p2.digits.parse::<u32>().unwrap_or(1);

                        assert!(
                            check_add_word(words, by_char, p1_n, m, &p1.word, TimeUnit::Minute),
                            "{txt}: min parse error"
                        );
                        assert!(
                            check_add_word(words, by_char, p2_n, s, &p2.word, TimeUnit::Second),
                            "{txt}: sec parse error"
                        );
                    }
                    None => {
                        if s == 0 {
                            assert!(
                                check_add_word(words, by_char, p1_n, m, &p1.word, TimeUnit::Minute),
                                "{txt}: min parse error"
                            );
                        } else if m == 0 {
                            assert!(
                                check_add_word(words, by_char, p1_n, s, &p1.word, TimeUnit::Second),
                                "{txt}: sec parse error"
                            );
                        } else {
                            // Minutes and seconds ended up in a single segment:
                            // split the text at the separator and parse both
                            // halves on their own.
                            let p = txt
                                .find([',', 'و'])
                                .unwrap_or_else(|| panic!("`{txt}`: only 1 part"));
                            // `parse` takes a duration in seconds, so the minute
                            // count has to be converted back. Passing `m` itself
                            // would register the minute words as seconds.
                            parse(words, lang, by_char, &txt[0..p], m * 60);
                            parse(words, lang, by_char, &txt[p..], s);
                        }
                    }
                }

                assert!(parts.next().is_none(), "`{txt}`: more than 2 parts");
            }

            for (txt, d) in &durations[&lang] {
                parse(&mut words, lang, dict_entry.by_char, txt, *d);
            }

            // Only unambiguous words make it into the dictionary.
            for (k, v) in words {
                if let Some(v) = v {
                    dict_entry.timeago_tokens.insert(k, v.to_string());
                }
            }
        }
    }

    util::write_dict(dict);
}

/// Split a duration given in seconds into whole minutes and the remaining seconds.
fn split_duration(d: u32) -> (u32, u32) {
    let minutes = d / 60;
    let seconds = d % 60;
    (minutes, seconds)
}

/// One number/word pair of a textual duration.
#[derive(Debug, Default)]
struct DurationTxtSegment {
    // ASCII digits belonging to the segment (may be empty)
    digits: String,
    // Lowercased non-digit characters of the segment, commas removed
    word: String,
}

/// Split a textual duration into its number/word segments.
///
/// ASCII digits are collected into `digits`, every other character (lowercased,
/// commas dropped) into `word`. With `start_c` set, a text that begins with a
/// word keeps that word together with the number following it; otherwise a
/// leading word forms a segment of its own and each number is grouped with the
/// word that follows it.
fn split_duration_txt(txt: &str, start_c: bool) -> Vec<DurationTxtSegment> {
    // Kind of the character that was consumed last
    #[derive(Clone, Copy)]
    enum Last {
        Nothing,
        Digit,
        Letter,
    }

    let mut out: Vec<DurationTxtSegment> = Vec::new();
    let mut cur = DurationTxtSegment::default();
    let mut last = Last::Nothing;

    for ch in txt.chars() {
        let is_digit = ch.is_ascii_digit();

        // A segment ends on a switch between digits and word characters, but
        // only once the current segment already holds the part that is about
        // to start again (or at the very first switch, depending on `start_c`).
        let boundary = match (last, is_digit) {
            (Last::Letter, true) => !cur.digits.is_empty() || (!start_c && out.is_empty()),
            (Last::Digit, false) => !cur.word.is_empty() || (start_c && out.is_empty()),
            _ => false,
        };
        if boundary {
            out.push(cur);
            cur = DurationTxtSegment::default();
        }

        if is_digit {
            cur.digits.push(ch);
            last = Last::Digit;
        } else {
            if ch != ',' {
                cur.word.extend(ch.to_lowercase());
            }
            last = Last::Letter;
        }
    }

    let has_leftover = !(cur.word.is_empty() && cur.digits.is_empty());
    if has_leftover {
        out.push(cur);
    }

    out
}

/// Request a channel page and record the length of every video listed on it.
///
/// For each video the accessibility label of its length text is inserted into
/// `map` as the key, with the duration parsed from the plain length text as
/// the value. The `params` value presumably selects the channel's video tab
/// (TODO confirm).
///
/// # Errors
///
/// Returns an error if the request fails or the response cannot be deserialized.
///
/// # Panics
///
/// Panics if the response contains no tab or a length text cannot be parsed.
async fn get_channel_vlengths(
    query: &RustyPipeQuery,
    channel_id: &str,
    map: &mut BTreeMap<String, u32>,
) -> Result<()> {
    let context = query.get_context(ClientType::Desktop, true, None).await;
    let request = QBrowse {
        context,
        browse_id: channel_id,
        params: Some("EgZ2aWRlb3MYASAAMAE"),
    };
    let body = query.raw(ClientType::Desktop, "browse", &request).await?;
    let channel: Channel = serde_json::from_str(&body)?;

    // Only the first tab of the response is evaluated.
    let first_tab = channel
        .contents
        .two_column_browse_results_renderer
        .tabs
        .into_iter()
        .next()
        .unwrap();
    let grid = first_tab.tab_renderer.content.rich_grid_renderer;

    for item in grid.contents.into_iter() {
        let length_text = item.rich_item_renderer.content.video_renderer.length_text;
        let seconds = util::parse_video_length(&length_text.simple_text).unwrap();
        map.insert(length_text.accessibility.accessibility_data.label, seconds);
    }

    Ok(())
}

/// Local counterpart of `intl_pluralrules::PluralCategory` with one variant per
/// category of the external enum.
///
/// It derives `Hash`, `Eq` and `Ord`, which allows the categories to be kept in
/// a `HashSet` (presumably the external type cannot be used for this directly).
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum PluralCategory {
    Zero,
    One,
    Two,
    Few,
    Many,
    Other,
}

impl From<intl_pluralrules::PluralCategory> for PluralCategory {
    fn from(value: intl_pluralrules::PluralCategory) -> Self {
        match value {
            intl_pluralrules::PluralCategory::ZERO => Self::Zero,
            intl_pluralrules::PluralCategory::ONE => Self::One,
            intl_pluralrules::PluralCategory::TWO => Self::Two,
            intl_pluralrules::PluralCategory::FEW => Self::Few,
            intl_pluralrules::PluralCategory::MANY => Self::Many,
            intl_pluralrules::PluralCategory::OTHER => Self::Other,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::collections::HashSet;
    use std::io::BufReader;

    use intl_pluralrules::{PluralRuleType, PluralRules};
    use unic_langid::LanguageIdentifier;

    /// Verify that the duration sample set covers all pluralization variants of the languages
    ///
    /// For every language the plural categories of the numbers 1 to 59 are
    /// determined. Each category has to occur at least once as a minute count
    /// and at least once as a second count among the collected samples.
    #[test]
    fn check_video_duration_samples() {
        let json_path = path!(*DICT_DIR / "video_duration_samples.json");
        let json_file = File::open(json_path).unwrap();
        let durations: CollectedDurations =
            serde_json::from_reader(BufReader::new(json_file)).unwrap();
        let mut failed = false;

        for (lang, durations) in durations {
            // Only the primary language subtag is used to look up the plural rules.
            let ul: LanguageIdentifier =
                lang.to_string().split('-').next().unwrap().parse().unwrap();

            let pr = PluralRules::create(ul, PluralRuleType::CARDINAL)
                .unwrap_or_else(|_| panic!("{lang}"));

            // Start with every category that can occur and strike out the ones
            // that are covered by a sample.
            let mut plurals_m: HashSet<PluralCategory> = HashSet::new();
            for n in 1..60 {
                plurals_m.insert(pr.select(n).unwrap().into());
            }
            let mut plurals_s = plurals_m.clone();

            for v in durations.values() {
                let (m, s) = split_duration(*v);
                plurals_m.remove(&pr.select(m).unwrap().into());
                plurals_s.remove(&pr.select(s).unwrap().into());
            }

            if !plurals_m.is_empty() {
                println!("{lang}: missing minutes {plurals_m:?}");
                failed = true;
            }

            if !plurals_s.is_empty() {
                // Report the uncovered *second* categories (not the minute set).
                println!("{lang}: missing seconds {plurals_s:?}");
                failed = true;
            }
        }

        assert!(
            !failed,
            "duration samples do not cover all plural categories"
        );
    }
}