fix: add dictionary support for short timeago strings

2023-05-31 01:41:46 +02:00 · 2023-05-31 01:41:46 +02:00 · 0cd018e37a
commit 0cd018e37a
parent cc2cadc309
10 changed files with 6308 additions and 1694 deletions
--- a/codegen/src/collect_video_dates.rs
+++ b/codegen/src/collect_video_dates.rs
@ -0,0 +1,83 @@
 use std::{
    collections::{BTreeMap, HashSet},
    fs::File,
 };
 use futures::{stream, StreamExt};
 use path_macro::path;
 use rustypipe::{
    client::{RustyPipe, RustyPipeQuery},
    param::{Language, LANGUAGES},
 };
 use crate::util::DICT_DIR;
 /// Collect the textual publish dates of channel videos in every language and
 /// store them as samples in `timeago_samples_short.json` inside `DICT_DIR`.
 ///
 /// For each entry of `LANGUAGES` the video lists of a fixed set of channels are
 /// fetched (at most `concurrency` requests in flight) and their date strings
 /// are concatenated in channel order. Only the positions at which the English
 /// list shows a string for the first time are kept, for every language.
 ///
 /// # Panics
 ///
 /// Panics if a channel request fails, if no English strings were collected, or
 /// if the output file cannot be created or written.
 pub async fn collect_video_dates(concurrency: usize) {
    let json_path = path!(*DICT_DIR / "timeago_samples_short.json");
    let rp = RustyPipe::builder()
        .visitor_data("Cgtwel9tMkh2eHh0USiyzc6jBg%3D%3D")
        .build();
    let channels = [
        "UCeY0bbntWzzVIaj2z3QigXg",
        "UCcmpeVbSSQlZRvHfdC-CRwg",
        "UC65afEgL62PGFWXY7n6CUbA",
        "UCEOXxzW2vU0P-0THehuIIeg",
    ];
    let mut lang_strings: BTreeMap<Language, Vec<String>> = BTreeMap::new();
    for lang in LANGUAGES {
        println!("{lang}");
        let query = rp.query().lang(lang);
        // `buffered` keeps the results in channel order, so index `n` refers to
        // the same list position in every language.
        let strings = stream::iter(channels)
            .map(|id| get_channel_datestrings(&query, id))
            .buffered(concurrency)
            .collect::<Vec<_>>()
            .await
            .into_iter()
            .flatten()
            .collect::<Vec<_>>();
        lang_strings.insert(lang, strings);
    }
    // Remember the index of the first occurrence of every distinct English string.
    let mut en_strings_uniq: HashSet<&str> = HashSet::new();
    let mut uniq_ids: HashSet<usize> = HashSet::new();
    lang_strings[&Language::En]
        .iter()
        .enumerate()
        .for_each(|(n, s)| {
            if en_strings_uniq.insert(s) {
                uniq_ids.insert(n);
            }
        });
    // Keep only those indices in every language.
    // NOTE(review): this assumes every language returned the same videos in the
    // same order as the English request — confirm.
    let strings_map = lang_strings
        .iter()
        .map(|(lang, strings)| {
            (
                lang,
                strings
                    .iter()
                    .enumerate()
                    .filter(|(n, _)| uniq_ids.contains(n))
                    .map(|(_, s)| s)
                    .collect::<Vec<_>>(),
            )
        })
        .collect::<BTreeMap<_, _>>();
    let file = File::create(json_path).unwrap();
    // The serializer emits many small writes; buffer them instead of passing
    // each one directly to the file.
    let mut writer = std::io::BufWriter::new(file);
    serde_json::to_writer_pretty(&mut writer, &strings_map).unwrap();
    // Flush explicitly: dropping a `BufWriter` would silently ignore write errors.
    std::io::Write::flush(&mut writer).unwrap();
 }
 /// Fetch the videos of the channel `id` and return the textual publish date
 /// of every returned item that has one.
 ///
 /// # Panics
 ///
 /// Panics if the channel request fails.
 async fn get_channel_datestrings(rp: &RustyPipeQuery, id: &str) -> Vec<String> {
    let videos = rp.channel_videos(id).await.unwrap().content.items;
    let mut date_strings = Vec::new();
    for video in videos {
        // Items without a textual publish date are skipped.
        if let Some(txt) = video.publish_date_txt {
            date_strings.push(txt);
        }
    }
    date_strings
 }
--- a/codegen/src/main.rs
+++ b/codegen/src/main.rs
@ -4,6 +4,7 @@ mod abtest;
 mod collect_album_types;
 mod collect_large_numbers;
 mod collect_playlist_dates;
 mod collect_video_dates;
 mod collect_video_durations;
 mod download_testfiles;
 mod gen_dictionary;
@ -27,6 +28,7 @@ enum Commands {
    CollectLargeNumbers,
    CollectAlbumTypes,
    CollectVideoDurations,
    CollectVideoDates,
    ParsePlaylistDates,
    ParseLargeNumbers,
    ParseAlbumTypes,
@ -60,6 +62,9 @@ async fn main() {
        Commands::CollectVideoDurations => {
            collect_video_durations::collect_video_durations(cli.concurrency).await;
        }
        Commands::CollectVideoDates => {
            collect_video_dates::collect_video_dates(cli.concurrency).await;
        }
        Commands::ParsePlaylistDates => collect_playlist_dates::write_samples_to_dict(),
        Commands::ParseLargeNumbers => collect_large_numbers::write_samples_to_dict(),
        Commands::ParseAlbumTypes => collect_album_types::write_samples_to_dict(),
--- a/src/util/dictionary.rs
+++ b/src/util/dictionary.rs
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@ -128,7 +128,35 @@ where
    buf.parse()
 }
-/// Parse all numbers occurring in a string and reurn them as a vec
+/// Parse all numbers occurring in a string.
 ///
 /// If the string contains multiple numbers, it returns the product of them.
 pub fn parse_numeric_prod<F>(string: &str) -> Option<F>
 where
    F: FromStr + Copy + std::ops::Mul<Output = F>,
 {
    // Every maximal run of ASCII digits is one number. Runs that do not parse
    // as `F` (e.g. because they are out of range) are skipped. The remaining
    // numbers are multiplied from left to right; `None` means no number was
    // parsed at all.
    // NOTE(review): the product itself is a plain `*`, so it can overflow for
    // small integer types.
    string
        .split(|c: char| !c.is_ascii_digit())
        .filter(|digits| !digits.is_empty())
        .filter_map(|digits| digits.parse::<F>().ok())
        .reduce(|product, x| product * x)
 }
 /// Parse all numbers occurring in a string and return them as a vec
 pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
 where
    F: FromStr,
--- a/src/util/timeago.rs
+++ b/src/util/timeago.rs
@ -199,7 +199,20 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option<TimeAgo> {
    let entry = dictionary::entry(lang);
    let filtered_str = filter_str(textual_date);
-    let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
+    let qu: u8 = util::parse_numeric_prod(textual_date).unwrap_or(1);
    // French and Spanish use 'a' as a short form of years.
    // Since 'a' is also a word in these languages, it cannot be parsed as a token.
    if matches!(
        lang,
        Language::Fr | Language::FrCa | Language::Es | Language::Es419 | Language::EsUs
    ) && textual_date.ends_with(" a")
    {
        return Some(TimeAgo {
            n: qu,
            unit: TimeUnit::Year,
        });
    }
    TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str)
        .next()
@ -403,10 +416,10 @@ mod tests {
    use crate::util::tests::TESTFILES;
    #[rstest]
-    #[case(Language::De, "vor 1 Sekunde", Some(TimeAgo { n: 1, unit: TimeUnit::Second }))]
+    #[case::de(Language::De, "vor 1 Sekunde", Some(TimeAgo { n: 1, unit: TimeUnit::Second }))]
-    #[case(Language::Ar, "قبل ساعة واحدة", Some(TimeAgo { n: 1, unit: TimeUnit::Hour }))]
+    #[case::ar(Language::Ar, "قبل ساعة واحدة", Some(TimeAgo { n: 1, unit: TimeUnit::Hour }))]
    // No-break space
-    #[case(Language::De, "Vor 3\u{a0}Tagen aktualisiert", Some(TimeAgo { n: 3, unit: TimeUnit::Day }))]
+    #[case::nbsp(Language::De, "Vor 3\u{a0}Tagen aktualisiert", Some(TimeAgo { n: 3, unit: TimeUnit::Day }))]
    fn t_parse(
        #[case] lang: Language,
        #[case] textual_date: &str,
@ -581,7 +594,196 @@ mod tests {
                assert_eq!(
                    parse_timeago(*lang, s),
                    Some(expect[n]),
-                    "Language: {lang}, n: {n}"
+                    "Language: {lang}, txt: `{s}`"
                );
            });
        })
    }
    #[test]
    fn t_testfile_short() {
        let json_path = path!(*TESTFILES / "dict" / "timeago_samples_short.json");
        let expect = [
            TimeAgo {
                n: 35,
                unit: TimeUnit::Minute,
            },
            TimeAgo {
                n: 50,
                unit: TimeUnit::Minute,
            },
            TimeAgo {
                n: 1,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 2,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 3,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 4,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 5,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 6,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 7,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 8,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 9,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 12,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 17,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 18,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 19,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 20,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 10,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 11,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 13,
                unit: TimeUnit::Hour,
            },
            TimeAgo {
                n: 1,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 2,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 3,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 4,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 6,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 8,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 10,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 11,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 12,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 13,
                unit: TimeUnit::Day,
            },
            TimeAgo {
                n: 2,
                unit: TimeUnit::Week,
            },
            TimeAgo {
                n: 3,
                unit: TimeUnit::Week,
            },
            TimeAgo {
                n: 1,
                unit: TimeUnit::Month,
            },
            TimeAgo {
                n: 4,
                unit: TimeUnit::Week,
            },
            TimeAgo {
                n: 7,
                unit: TimeUnit::Month,
            },
            TimeAgo {
                n: 10,
                unit: TimeUnit::Month,
            },
            TimeAgo {
                n: 1,
                unit: TimeUnit::Year,
            },
            TimeAgo {
                n: 2,
                unit: TimeUnit::Year,
            },
            TimeAgo {
                n: 3,
                unit: TimeUnit::Year,
            },
            TimeAgo {
                n: 4,
                unit: TimeUnit::Year,
            },
            TimeAgo {
                n: 5,
                unit: TimeUnit::Year,
            },
        ];
        let json_file = File::open(json_path).unwrap();
        let strings_map: BTreeMap<Language, Vec<String>> =
            serde_json::from_reader(BufReader::new(json_file)).unwrap();
        strings_map.iter().for_each(|(lang, strings)| {
            assert_eq!(strings.len(), expect.len(), "Language: {lang}");
            strings.iter().enumerate().for_each(|(n, s)| {
                let mut exp = expect[n];
                if *lang == Language::Mn && exp.unit == TimeUnit::Week {
                    exp.unit = TimeUnit::Day;
                    exp.n *= 7;
                }
                assert_eq!(
                    parse_timeago(*lang, s),
                    Some(exp),
                    "Language: {lang}, txt: `{s}`"
                );
            });
        })
--- a/testfiles/dict/cldr_data/.gitignore
+++ b/testfiles/dict/cldr_data/.gitignore
@ -0,0 +1,2 @@
 node_modules
 package-lock.json
--- a/testfiles/dict/cldr_data/collect_ta_tokens.js
+++ b/testfiles/dict/cldr_data/collect_ta_tokens.js
@ -0,0 +1,162 @@
 const fs = require("fs");
 const DICT_PATH = "../dictionary.json";
 /**
  * Map a dictionary language code to the locale name used for the CLDR lookup.
  * Codes without a special mapping are returned unchanged.
  */
 function translateLang(lang) {
  const cldrNames = new Map([
    ["iw", "he"], // Hebrew
    ["zh-CN", "zh-Hans"], // Simplified Chinese
    ["zh-HK", "zh-Hant-HK"],
    ["zh-TW", "zh-Hant"],
  ]);
  return cldrNames.has(lang) ? cldrNames.get(lang) : lang;
 }
 /**
  * Split a CLDR relative-time pattern into lowercase tokens.
  *
  * The `{0}` placeholder is removed and hyphens are turned into spaces. With
  * `by_char` set, all whitespace is dropped and every character becomes a
  * token; otherwise the string is split on whitespace (the result may contain
  * empty strings at the start or end).
  *
  * Global regexes are used on purpose: `String.prototype.replace` with a string
  * or a non-global regex only touches the first match, which left later spaces
  * in the by-character token list and later hyphens in the words.
  *
  * @param {string} s pattern string
  * @param {boolean} by_char split into single characters instead of words
  * @returns {string[]} tokens
  */
 function prepString(s, by_char) {
  const replaced = s.toLowerCase().replace(/\{0\}/g, "").replace(/-/g, " ");
  if (by_char) {
    return replaced.replace(/\s/g, "").split("");
  } else {
    return replaced.split(/\s+/);
  }
 }
 /**
  * Record `word` as a token for `unit`. Empty words are ignored.
  *
  * A word that is already stored with a different value is set to `null`,
  * which marks it as ambiguous.
  */
 function storeToken(tokens, word, unit) {
  if (!word) {
    return;
  }
  const ambiguous = word in tokens && tokens[word] != unit;
  tokens[word] = ambiguous ? null : unit;
 }
 /**
  * Clean up the collected timeago tokens of one language in place.
  *
  * Adds a hard-coded extra token for Hebrew ("iw"), removes every token whose
  * value is `null`, and logs the time units for which no token is left.
  */
 function validateTokens(tokens, lang) {
  if (lang === "iw") {
    tokens["שתי"] = "2";
  }
  // Units that still need at least one token, in reporting order.
  const missing = new Set(["Y", "M", "W", "D", "h", "m", "s"]);
  Object.keys(tokens).forEach((word) => {
    const unit = tokens[word];
    if (unit === null) {
      delete tokens[word];
    } else {
      missing.delete(unit);
    }
  });
  if (missing.size > 0) {
    console.log(
      `missing units ${JSON.stringify([
        ...missing,
      ])} for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
    );
  }
 }
 /**
  * Clean up the collected "0D"/"1D" day tokens of one language in place.
  *
  * Removes every token whose value is `null`, then logs either the day values
  * that have no token left or, if none is missing, a warning when more than
  * two tokens remain.
  */
 function validateNdTokens(tokens, lang) {
  const missing = new Set(["0D", "1D"]);
  Object.keys(tokens).forEach((word) => {
    const value = tokens[word];
    if (value === null) {
      delete tokens[word];
    } else {
      missing.delete(value);
    }
  });
  if (missing.size > 0) {
    console.log(
      `missing nd tokens ${JSON.stringify([
        ...missing,
      ])} for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
    );
    return;
  }
  if (Object.keys(tokens).length > 2) {
    console.log(
      `too many nd tokens for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
    );
  }
 }
 /** Return a new object holding the entries of `obj`, inserted in sorted key order. */
 const sortObject = (obj) => {
  const sorted = {};
  for (const key of Object.keys(obj).sort()) {
    sorted[key] = obj[key];
  }
  return sorted;
 };
 /**
  * Collect the timeago tokens of one language from its CLDR date fields.
  *
  * For every entry of `units`, the past relative-time patterns of the regular
  * and the `-short` field are split into words and stored in `timeagoTokens`.
  * Patterns without a `{0}` placeholder get the count implied by their key
  * (`-zero`, `-one`, `-two`) prefixed to the unit; any other key throws.
  * The words of the day field's `relative-type-0` and `relative-type--1`
  * strings are stored in `timeagoNdTokens` as "0D" and "1D".
  */
 function collectTimeago(lang, by_char, timeagoTokens, timeagoNdTokens) {
  const cldrLang = translateLang(lang);
  const dateData = require(`cldr-dates-modern/main/${cldrLang}/dateFields.json`);
  const fields = dateData.main[cldrLang].dates.fields;
  // Key suffix of a placeholder-free pattern -> the count it stands for.
  const impliedCounts = [
    ["-zero", "0"],
    ["-one", "1"],
    ["-two", "2"],
  ];

  Object.entries(units).forEach(([unitName, unitCode]) => {
    [unitName, `${unitName}-short`].forEach((fieldName) => {
      const pastPatterns = fields[fieldName]["relativeTime-type-past"];
      Object.entries(pastPatterns).forEach(([patternKey, pattern]) => {
        let value = unitCode;
        if (!pattern.includes("{0}")) {
          const implied = impliedCounts.find(([suffix]) =>
            patternKey.endsWith(suffix)
          );
          if (!implied) {
            throw new Error(
              `Invalid time pattern. lang: ${lang} key: ${patternKey}`
            );
          }
          value = implied[1] + unitCode;
        }
        prepString(pattern, by_char).forEach((word) =>
          storeToken(timeagoTokens, word, value)
        );
      });
    });
  });

  const dayNames = [
    ["relative-type-0", "0D"],
    ["relative-type--1", "1D"],
  ];
  for (const [fieldKey, value] of dayNames) {
    const text = fields.day[fieldKey];
    if (text) {
      prepString(text, by_char).forEach((word) =>
        storeToken(timeagoNdTokens, word, value)
      );
    }
  }
 }
 // Load the dictionary, rebuild the timeago tokens of every entry from the
 // CLDR data and write the dictionary back to the same file.
 const dict = JSON.parse(fs.readFileSync(DICT_PATH));
 // CLDR date field name -> unit code stored in the dictionary.
 // Read by collectTimeago().
 const units = {
  second: "s",
  minute: "m",
  hour: "h",
  day: "D",
  week: "W",
  month: "M",
  year: "Y",
 };
 for (const [mainLang, entry] of Object.entries(dict)) {
  // Tokens of the main language and its equivalent languages are merged.
  const langs = [mainLang, ...entry["equivalent"]];
  const timeagoTokens = {};
  const timeagoNdTokens = {};
  // `const` keeps the loop variable local; without a declaration the loop
  // assigned to an implicit global (an error in strict mode).
  for (const lang of langs) {
    collectTimeago(lang, entry["by_char"], timeagoTokens, timeagoNdTokens);
  }
  validateTokens(timeagoTokens, mainLang);
  // validateNdTokens(timeagoNdTokens, mainLang);
  dict[mainLang]["timeago_tokens"] = timeagoTokens;
  // dict[mainLang]["timeago_nd_tokens"] = timeagoNdTokens;
 }
 fs.writeFileSync(DICT_PATH, JSON.stringify(dict, null, 2));
--- a/testfiles/dict/cldr_data/package.json
+++ b/testfiles/dict/cldr_data/package.json
@ -0,0 +1,12 @@
 {
  "name": "cldr_data",
  "version": "1.0.0",
  "description": "Build the RustyPipe parsing dictionary using CLDR data",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "dependencies": {
    "cldr-dates-modern": "^43.0.0",
    "cldr-numbers-modern": "^43.0.0"
  }
 }
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
--- a/testfiles/dict/timeago_samples_short.json
+++ b/testfiles/dict/timeago_samples_short.json