fix: large number parser (ar)

2022-09-23 18:52:35 +02:00 · 2022-09-23 18:52:35 +02:00 · f66dce9d3d
commit f66dce9d3d
parent fc7655093b
5 changed files with 15809 additions and 24 deletions
--- a/codegen/src/collect_large_numbers.rs
+++ b/codegen/src/collect_large_numbers.rs
@ -33,6 +33,9 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
    let mut json_path = project_root.to_path_buf();
    json_path.push("testfiles/dict/large_number_samples.json");
    let mut json_path_all = project_root.to_path_buf();
    json_path_all.push("testfiles/dict/large_number_samples_all.json");
    let channels = [
        "UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
        "UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
@ -40,10 +43,11 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
        "UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
        "UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
        "UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
-        "UCGiJh0NZ52wRhYKYnuZI08Q", // 10e1 (37)
+        "UCXvtcj9xUQhaqPaitFf2DqA", // (170)
        "UCq-XMc01T641v-4P3hQYJWg", // (636)
    ];
-    let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
+    let collected_numbers_all: BTreeMap<Language, BTreeMap<String, u64>> = stream::iter(LANGUAGES)
        .map(|lang| async move {
            let mut entry = BTreeMap::new();
@ -54,7 +58,7 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
                    .unwrap();
                channel.view_counts.iter().for_each(|(num, txt)| {
-                    entry.insert(get_mag(*num), (txt.to_owned(), *num));
+                    entry.insert(txt.to_owned(), *num);
                });
                println!("collected {}-{}", lang, n);
@ -66,32 +70,53 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
        .collect()
        .await;
    let collected_numbers: CollectedNumbers = collected_numbers_all
        .iter()
        .map(|(lang, entry)| {
            let mut e2 = BTreeMap::new();
            entry.iter().for_each(|(txt, num)| {
                e2.insert(get_mag(*num), (txt.to_owned(), *num));
            });
            (*lang, e2)
        })
        .collect();
    let file = File::create(json_path).unwrap();
    serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
    let file = File::create(json_path_all).unwrap();
    serde_json::to_writer_pretty(file, &collected_numbers_all).unwrap();
 }
 /// Attempt to parse the numbers collected by `collect-large-numbers`
 /// and write the results to `dictionary.json`.
 ///
 /// Manual corrections:
 /// as
 /// "কোঃটা": 9,
 /// "নিঃটা": 6,
 /// "নিযুতটা": 6,
 /// "লাখটা": 5,
 /// "হাজাৰটা": 3
 ///
 /// bn
 /// "লাটি": 5,
 /// "শত": 2,
 /// "হাটি": 3,
 /// "কোটি": 7
 ///
 /// es/es-US
 /// "mil": 3,
 /// "M": 6
 ///
 pub fn write_samples_to_dict(project_root: &Path) {
    /*
    Manual corrections:
    as
    "কোঃটা": 9,
    "নিঃটা": 6,
    "নিযুতটা": 6,
    "লাখটা": 5,
    "হাজাৰটা": 3
    ar
    "ألف": 3,
    "آلاف": 3,
    "مليار": 9,
    "مليون": 6
    bn
    "লাটি": 5,
    "শত": 2,
    "হাটি": 3,
    "কোটি": 7
    es/es-US
    "mil": 3,
    "M": 6
    */
    let mut json_path = project_root.to_path_buf();
    json_path.push("testfiles/dict/large_number_samples.json");
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
@ -245,14 +245,15 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: false,
            number_tokens: ::phf::Map {
-                key: 12913932095322966823,
+                key: 7485420634051515786,
                disps: &[
                    (0, 0),
                ],
                entries: &[
-                    ("مليار", 9),
+                    ("آلاف", 3),
                    ("مليون", 6),
                    ("ألف", 3),
                    ("مليار", 9),
                ],
            },
        },
--- a/src/util.rs
+++ b/src/util.rs
@ -383,6 +383,20 @@ mod tests {
        });
    }
    /// Runs `testcase_parse_large_numstr` on every `(text, number)` pair stored
    /// per language in `testfiles/dict/large_number_samples_all.json`.
    ///
    /// Panics if the sample file cannot be opened or deserialized.
    #[test]
    fn t_parse_large_numstr_samples2() {
        let samples_file =
            File::open(Path::new("testfiles/dict/large_number_samples_all.json")).unwrap();
        let reader = BufReader::new(samples_file);
        let samples_by_lang: BTreeMap<Language, BTreeMap<String, u64>> =
            serde_json::from_reader(reader).unwrap();

        // Language -> (displayed text -> exact number), visited in map order.
        for (lang, samples) in &samples_by_lang {
            for (text, expected) in samples {
                testcase_parse_large_numstr(text, *lang, *expected);
            }
        }
    }
    fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
        // Round the expected number to the amount of significant digits included
        // in the string.
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
@ -125,6 +125,7 @@
    "comma_decimal": false,
    "number_tokens": {
      "ألف": 3,
      "آلاف": 3,
      "مليار": 9,
      "مليون": 6
    }
--- a/testfiles/dict/large_number_samples_all.json
+++ b/testfiles/dict/large_number_samples_all.json