fix: add dictionary support for short timeago strings

2023-05-31 01:41:46 +02:00 · 2023-05-31 01:41:46 +02:00 · 0cd018e37a
commit 0cd018e37a
parent cc2cadc309
10 changed files with 6308 additions and 1694 deletions
--- a/codegen/src/collect_video_dates.rs
+++ b/codegen/src/collect_video_dates.rs
@ -0,0 +1,83 @@
+use std::{
+    collections::{BTreeMap, HashSet},
+    fs::File,
+};
+
+use futures::{stream, StreamExt};
+use path_macro::path;
+use rustypipe::{
+    client::{RustyPipe, RustyPipeQuery},
+    param::{Language, LANGUAGES},
+};
+
+use crate::util::DICT_DIR;
+
+/// Collects the short "time ago" strings of channel videos for every language.
+///
+/// The video lists of a fixed set of channels are fetched once per entry of
+/// `LANGUAGES`, running up to `concurrency` channel requests at the same time.
+/// For every language only the positions at which the English list contains the
+/// first occurrence of a string are kept. The result is written as pretty-printed
+/// JSON to `timeago_samples_short.json` inside `DICT_DIR`.
+///
+/// # Panics
+///
+/// Panics if a channel request fails, if no strings were collected for
+/// `Language::En`, or if the output file cannot be created or written.
+pub async fn collect_video_dates(concurrency: usize) {
+    let json_path = path!(*DICT_DIR / "timeago_samples_short.json");
+    let rp = RustyPipe::builder()
+        .visitor_data("Cgtwel9tMkh2eHh0USiyzc6jBg%3D%3D")
+        .build();
+
+    let channels = [
+        "UCeY0bbntWzzVIaj2z3QigXg",
+        "UCcmpeVbSSQlZRvHfdC-CRwg",
+        "UC65afEgL62PGFWXY7n6CUbA",
+        "UCEOXxzW2vU0P-0THehuIIeg",
+    ];
+
+    // One flat list of date strings per language, in channel order.
+    let mut lang_strings: BTreeMap<Language, Vec<String>> = BTreeMap::new();
+    for lang in LANGUAGES {
+        println!("{lang}");
+        let query = rp.query().lang(lang);
+        let per_channel: Vec<Vec<String>> = stream::iter(channels)
+            .map(|id| get_channel_datestrings(&query, id))
+            .buffered(concurrency)
+            .collect()
+            .await;
+        lang_strings.insert(lang, per_channel.into_iter().flatten().collect());
+    }
+
+    // Remember the index of the first occurrence of every distinct English string.
+    // The same indices are then used to pick the samples of all other languages.
+    let mut seen_en: HashSet<&str> = HashSet::new();
+    let mut keep_idx: HashSet<usize> = HashSet::new();
+    for (idx, text) in lang_strings[&Language::En].iter().enumerate() {
+        if seen_en.insert(text.as_str()) {
+            keep_idx.insert(idx);
+        }
+    }
+
+    let mut strings_map = BTreeMap::new();
+    for (lang, strings) in &lang_strings {
+        let kept: Vec<&String> = strings
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, text)| keep_idx.contains(&idx).then(|| text))
+            .collect();
+        strings_map.insert(lang, kept);
+    }
+
+    let file = File::create(json_path).unwrap();
+    serde_json::to_writer_pretty(file, &strings_map).unwrap();
+}
+
+/// Fetches the videos of the channel `id` and returns the textual publish date
+/// of every listed item that has one.
+///
+/// # Panics
+///
+/// Panics if the channel request fails.
+async fn get_channel_datestrings(rp: &RustyPipeQuery, id: &str) -> Vec<String> {
+    let videos = rp.channel_videos(id).await.unwrap();
+
+    let mut dates = Vec::new();
+    for item in videos.content.items {
+        // Items without a textual publish date are skipped.
+        if let Some(txt) = item.publish_date_txt {
+            dates.push(txt);
+        }
+    }
+    dates
+}
--- a/codegen/src/main.rs
+++ b/codegen/src/main.rs
@ -4,6 +4,7 @@ mod abtest;
 mod collect_album_types;
 mod collect_large_numbers;
 mod collect_playlist_dates;
+mod collect_video_dates;
 mod collect_video_durations;
 mod download_testfiles;
 mod gen_dictionary;
@ -27,6 +28,7 @@ enum Commands {
    CollectLargeNumbers,
    CollectAlbumTypes,
    CollectVideoDurations,
+    CollectVideoDates,
    ParsePlaylistDates,
    ParseLargeNumbers,
    ParseAlbumTypes,
@ -60,6 +62,9 @@ async fn main() {
        Commands::CollectVideoDurations => {
            collect_video_durations::collect_video_durations(cli.concurrency).await;
        }
+        Commands::CollectVideoDates => {
+            collect_video_dates::collect_video_dates(cli.concurrency).await;
+        }
        Commands::ParsePlaylistDates => collect_playlist_dates::write_samples_to_dict(),
        Commands::ParseLargeNumbers => collect_large_numbers::write_samples_to_dict(),
        Commands::ParseAlbumTypes => collect_album_types::write_samples_to_dict(),
--- a/src/util/dictionary.rs
+++ b/src/util/dictionary.rs
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@ -128,7 +128,35 @@ where
    buf.parse()
 }

-/// Parse all numbers occurring in a string and reurn them as a vec
+/// Multiply all numbers occurring in a string.
+///
+/// Every maximal run of ASCII digits is parsed as one number; all other
+/// characters only act as separators. Runs that cannot be parsed as `F`
+/// (for example because they do not fit into the type) are ignored.
+///
+/// Returns `None` if the string does not contain any parseable number,
+/// otherwise the product of all parsed numbers (a single number is returned
+/// unchanged).
+pub fn parse_numeric_prod<F>(string: &str) -> Option<F>
+where
+    F: FromStr + Copy + std::ops::Mul<Output = F>,
+{
+    string
+        .split(|c: char| !c.is_ascii_digit())
+        .filter(|digits| !digits.is_empty())
+        .filter_map(|digits| digits.parse::<F>().ok())
+        .reduce(|product, x| product * x)
+}
+
+/// Parse all numbers occurring in a string and return them as a vec
 pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
 where
    F: FromStr,
--- a/src/util/timeago.rs
+++ b/src/util/timeago.rs
@ -199,7 +199,20 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option<TimeAgo> {
    let entry = dictionary::entry(lang);
    let filtered_str = filter_str(textual_date);

-    let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
+    let qu: u8 = util::parse_numeric_prod(textual_date).unwrap_or(1);
+
+    // French and Spanish use 'a' as a short form of years.
+    // Since 'a' is also a regular word in these languages, it cannot be parsed as a token.
+    if matches!(
+        lang,
+        Language::Fr | Language::FrCa | Language::Es | Language::Es419 | Language::EsUs
+    ) && textual_date.ends_with(" a")
+    {
+        return Some(TimeAgo {
+            n: qu,
+            unit: TimeUnit::Year,
+        });
+    }

    TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str)
        .next()
@ -403,10 +416,10 @@ mod tests {
    use crate::util::tests::TESTFILES;

    #[rstest]
-    #[case(Language::De, "vor 1 Sekunde", Some(TimeAgo { n: 1, unit: TimeUnit::Second }))]
-    #[case(Language::Ar, "قبل ساعة واحدة", Some(TimeAgo { n: 1, unit: TimeUnit::Hour }))]
+    #[case::de(Language::De, "vor 1 Sekunde", Some(TimeAgo { n: 1, unit: TimeUnit::Second }))]
+    #[case::ar(Language::Ar, "قبل ساعة واحدة", Some(TimeAgo { n: 1, unit: TimeUnit::Hour }))]
    // No-break space
-    #[case(Language::De, "Vor 3\u{a0}Tagen aktualisiert", Some(TimeAgo { n: 3, unit: TimeUnit::Day }))]
+    #[case::nbsp(Language::De, "Vor 3\u{a0}Tagen aktualisiert", Some(TimeAgo { n: 3, unit: TimeUnit::Day }))]
    fn t_parse(
        #[case] lang: Language,
        #[case] textual_date: &str,
@ -581,7 +594,196 @@ mod tests {
                assert_eq!(
                    parse_timeago(*lang, s),
                    Some(expect[n]),
-                    "Language: {lang}, n: {n}"
+                    "Language: {lang}, txt: `{s}`"
+                );
+            });
+        })
+    }
+
+    #[test]
+    fn t_testfile_short() {
+        let json_path = path!(*TESTFILES / "dict" / "timeago_samples_short.json");
+
+        let expect = [
+            TimeAgo {
+                n: 35,
+                unit: TimeUnit::Minute,
+            },
+            TimeAgo {
+                n: 50,
+                unit: TimeUnit::Minute,
+            },
+            TimeAgo {
+                n: 1,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 2,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 3,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 4,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 5,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 6,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 7,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 8,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 9,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 12,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 17,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 18,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 19,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 20,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 10,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 11,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 13,
+                unit: TimeUnit::Hour,
+            },
+            TimeAgo {
+                n: 1,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 2,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 3,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 4,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 6,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 8,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 10,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 11,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 12,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 13,
+                unit: TimeUnit::Day,
+            },
+            TimeAgo {
+                n: 2,
+                unit: TimeUnit::Week,
+            },
+            TimeAgo {
+                n: 3,
+                unit: TimeUnit::Week,
+            },
+            TimeAgo {
+                n: 1,
+                unit: TimeUnit::Month,
+            },
+            TimeAgo {
+                n: 4,
+                unit: TimeUnit::Week,
+            },
+            TimeAgo {
+                n: 7,
+                unit: TimeUnit::Month,
+            },
+            TimeAgo {
+                n: 10,
+                unit: TimeUnit::Month,
+            },
+            TimeAgo {
+                n: 1,
+                unit: TimeUnit::Year,
+            },
+            TimeAgo {
+                n: 2,
+                unit: TimeUnit::Year,
+            },
+            TimeAgo {
+                n: 3,
+                unit: TimeUnit::Year,
+            },
+            TimeAgo {
+                n: 4,
+                unit: TimeUnit::Year,
+            },
+            TimeAgo {
+                n: 5,
+                unit: TimeUnit::Year,
+            },
+        ];
+
+        let json_file = File::open(json_path).unwrap();
+        let strings_map: BTreeMap<Language, Vec<String>> =
+            serde_json::from_reader(BufReader::new(json_file)).unwrap();
+
+        strings_map.iter().for_each(|(lang, strings)| {
+            assert_eq!(strings.len(), expect.len(), "Language: {lang}");
+            strings.iter().enumerate().for_each(|(n, s)| {
+                let mut exp = expect[n];
+                if *lang == Language::Mn && exp.unit == TimeUnit::Week {
+                    exp.unit = TimeUnit::Day;
+                    exp.n *= 7;
+                }
+
+                assert_eq!(
+                    parse_timeago(*lang, s),
+                    Some(exp),
+                    "Language: {lang}, txt: `{s}`"
                );
            });
        })
--- a/testfiles/dict/cldr_data/.gitignore
+++ b/testfiles/dict/cldr_data/.gitignore
@ -0,0 +1,2 @@
+node_modules
+package-lock.json
--- a/testfiles/dict/cldr_data/collect_ta_tokens.js
+++ b/testfiles/dict/cldr_data/collect_ta_tokens.js
@ -0,0 +1,162 @@
+const fs = require("fs");
+
+// Location of the dictionary JSON that this script reads and rewrites.
+// The path is relative, so it is resolved against the current working directory.
+const DICT_PATH = "../dictionary.json";
+
+function translateLang(lang) {
+  switch (lang) {
+    case "iw": // Hebrew
+      return "he";
+    case "zh-CN": // Simplified Chinese
+      return "zh-Hans";
+    case "zh-HK":
+      return "zh-Hant-HK";
+    case "zh-TW":
+      return "zh-Hant";
+    default:
+      return lang;
+  }
+}
+
+function prepString(s, by_char) {
+  const replaced = s.toLowerCase().replace("{0}", "").replace("-", " ");
+  if (by_char) {
+    return replaced.replace(/\s/, "").split("");
+  } else {
+    return replaced.split(/\s+/);
+  }
+}
+
+function storeToken(tokens, word, unit) {
+  if (word) {
+    if (word in tokens && tokens[word] != unit) {
+      tokens[word] = null;
+    } else {
+      tokens[word] = unit;
+    }
+  }
+}
+
+function validateTokens(tokens, lang) {
+  const units = { Y: 1, M: 1, W: 1, D: 1, h: 1, m: 1, s: 1 };
+
+  if (lang === "iw") {
+    tokens["שתי"] = "2";
+  }
+
+  for (const [key, val] of Object.entries(tokens)) {
+    if (val === null) {
+      delete tokens[key];
+    } else {
+      delete units[val];
+    }
+  }
+  if (Object.keys(units).length > 0) {
+    console.log(
+      `missing units ${JSON.stringify(
+        Object.keys(units)
+      )} for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
+    );
+  }
+}
+
+function validateNdTokens(tokens, lang) {
+  const units = { "0D": 1, "1D": 1 };
+
+  for (const [key, val] of Object.entries(tokens)) {
+    if (val === null) {
+      delete tokens[key];
+    } else {
+      delete units[val];
+    }
+  }
+
+  if (Object.keys(units).length > 0) {
+    console.log(
+      `missing nd tokens ${JSON.stringify(
+        Object.keys(units)
+      )} for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
+    );
+  } else if (Object.keys(tokens).length > 2) {
+    console.log(
+      `too many nd tokens for lang: ${lang}; tokens: ${JSON.stringify(tokens)}`
+    );
+  }
+}
+
+const sortObject = (obj) =>
+  Object.keys(obj)
+    .sort()
+    .reduce((res, key) => ((res[key] = obj[key]), res), {});
+
+function collectTimeago(lang, by_char, timeagoTokens, timeagoNdTokens) {
+  const cldrLang = translateLang(lang);
+  const dates = require(`cldr-dates-modern/main/${cldrLang}/dateFields.json`);
+  const dateFields = dates.main[cldrLang].dates.fields;
+
+  for (const [unitStr, unit] of Object.entries(units)) {
+    for (const unitFields of [dateFields[unitStr], dateFields[`${unitStr}-short`]]) {
+      for (const [sKey, s] of Object.entries(unitFields["relativeTime-type-past"])) {
+        let u = unit;
+        if (s.indexOf("{0}") === -1) {
+          if (sKey.endsWith("-zero")) {
+            u = "0" + u;
+          } else if (sKey.endsWith("-one")) {
+            u = "1" + u;
+          } else if (sKey.endsWith("-two")) {
+            u = "2" + u;
+          } else {
+            throw new Error(`Invalid time pattern. lang: ${lang} key: ${sKey}`);
+          }
+        }
+
+        const words = prepString(s, by_char);
+        for (const word of words) {
+          storeToken(timeagoTokens, word, u);
+        }
+      }
+    }
+  }
+
+  if (dateFields.day["relative-type-0"]) {
+    const words = prepString(dateFields.day["relative-type-0"], by_char);
+    for (const word of words) {
+      storeToken(timeagoNdTokens, word, "0D");
+    }
+  }
+  if (dateFields.day["relative-type--1"]) {
+    const words = prepString(dateFields.day["relative-type--1"], by_char);
+    for (const word of words) {
+      storeToken(timeagoNdTokens, word, "1D");
+    }
+  }
+}
+
+const dict = JSON.parse(fs.readFileSync(DICT_PATH));
+
+const units = {
+  second: "s",
+  minute: "m",
+  hour: "h",
+  day: "D",
+  week: "W",
+  month: "M",
+  year: "Y",
+};
+
+for (const [mainLang, entry] of Object.entries(dict)) {
+  const langs = [mainLang, ...entry["equivalent"]];
+
+  const timeagoTokens = {};
+  const timeagoNdTokens = {};
+
+  for (lang of langs) {
+    collectTimeago(lang, entry["by_char"], timeagoTokens, timeagoNdTokens);
+  }
+  validateTokens(timeagoTokens, mainLang);
+  // validateNdTokens(timeagoNdTokens, mainLang);
+
+  dict[mainLang]["timeago_tokens"] = timeagoTokens;
+  // dict[mainLang]["timeago_nd_tokens"] = timeagoNdTokens;
+}
+
+fs.writeFileSync(DICT_PATH, JSON.stringify(dict, null, 2));
--- a/testfiles/dict/cldr_data/package.json
+++ b/testfiles/dict/cldr_data/package.json
@ -0,0 +1,12 @@
+{
+  "name": "cldr_data",
+  "version": "1.0.0",
+  "description": "Build the RustyPipe parsing dictionary using CLDR data",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "dependencies": {
+    "cldr-dates-modern": "^43.0.0",
+    "cldr-numbers-modern": "^43.0.0"
+  }
+}
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
--- a/testfiles/dict/timeago_samples_short.json
+++ b/testfiles/dict/timeago_samples_short.json