feat: add large number parser

2022-09-23 18:19:24 +02:00 · 2022-09-23 18:19:24 +02:00 · fc7655093b
commit fc7655093b
parent 5d19259a14
5 changed files with 192 additions and 68 deletions
--- a/codegen/src/collect_large_numbers.rs
+++ b/codegen/src/collect_large_numbers.rs
@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};

 use anyhow::{Context, Result};
@ -72,6 +72,25 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {

 /// Attempt to parse the numbers collected by `collect-large-numbers`
 /// and write the results to `dictionary.json`.
+///
+/// Manual corrections:
+/// as
+/// "কোঃটা": 9,
+/// "নিঃটা": 6,
+/// "নিযুতটা": 6,
+/// "লাখটা": 5,
+/// "হাজাৰটা": 3
+///
+/// bn
+/// "লাটি": 5,
+/// "শত": 2,
+/// "হাটি": 3,
+/// "কোটি": 7
+///
+/// es/es-US
+/// "mil": 3,
+/// "M": 6
+///
 pub fn write_samples_to_dict(project_root: &Path) {
    let mut json_path = project_root.to_path_buf();
    json_path.push("testfiles/dict/large_number_samples.json");
@ -160,8 +179,8 @@ pub fn write_samples_to_dict(project_root: &Path) {
                    let known_tmag: u8 = if t.len() == 1 {
                        match t.as_str() {
                            "K" | "k" => 3,
-                            "M" => 6,
                            // 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
+                            // 'M' means 10^9 in Indonesian
                            _ => 0,
                        }
                    } else {
@ -186,6 +205,12 @@ pub fn write_samples_to_dict(project_root: &Path) {
            .filter_map(|(k, v)| v.map(|v| (k, v)))
            .collect();
        dict_entry.comma_decimal = comma_decimal;
+
+        // Check for duplicate values, i.e. several tokens mapped to the same magnitude
+        let mut uniq = HashSet::new();
+        if !dict_entry.number_tokens.values().all(|x| uniq.insert(x)) {
+            println!("Warning: collected duplicate tokens for {}", lang);
+        }
    }

    util::write_dict(project_root, &dict);
@ -340,19 +365,3 @@ async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
            .unwrap_or_default(),
    })
 }
-
-#[tokio::test]
-async fn test() {
-    let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
-        .await
-        .unwrap();
-
-    dbg!(channel);
-}
-
-#[test]
-fn test2() {
-    write_samples_to_dict(Path::new(
-        "/home/thetadev/Documents/Programmieren/Rust/rustypipe",
-    ));
-}
--- a/src/client/player.rs
+++ b/src/client/player.rs
@ -298,20 +298,17 @@ fn deobf_nsig(
    last_nsig: &mut [String; 2],
 ) -> Result<()> {
    let nsig: String;
-    match url_params.get("n") {
-        Some(n) => {
-            nsig = if n == &last_nsig[0] {
-                last_nsig[1].to_owned()
-            } else {
-                let nsig = deobf.deobfuscate_nsig(n)?;
-                last_nsig[0] = n.to_string();
-                last_nsig[1] = nsig.to_owned();
-                nsig
-            };
+    if let Some(n) = url_params.get("n") {
+        nsig = if n == &last_nsig[0] {
+            last_nsig[1].to_owned()
+        } else {
+            let nsig = deobf.deobfuscate_nsig(n)?;
+            last_nsig[0] = n.to_string();
+            last_nsig[1] = nsig.to_owned();
+            nsig
+        };

-            url_params.insert("n".to_owned(), nsig);
-        }
-        None => {}
+        url_params.insert("n".to_owned(), nsig);
    };
    Ok(())
 }
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
@ -294,18 +294,16 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: false,
            number_tokens: ::phf::Map {
-                key: 15467950696543387533,
+                key: 12913932095322966823,
                disps: &[
-                    (1, 0),
-                    (4, 5),
+                    (3, 0),
                ],
                entries: &[
                    ("নিয\u{9c1}তট\u{9be}", 6),
-                    ("নিঃট\u{9be}", 6),
-                    ("ল\u{9be}খট\u{9be}", 5),
-                    ("শঃ", 9),
-                    ("কোঃট\u{9be}", 9),
                    ("হ\u{9be}জ\u{9be}ৰট\u{9be}", 3),
+                    ("নিঃট\u{9be}", 6),
+                    ("কোঃট\u{9be}", 9),
+                    ("ল\u{9be}খট\u{9be}", 5),
                ],
            },
        },
@ -568,12 +566,13 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 15467950696543387533,
                disps: &[
-                    (1, 0),
+                    (0, 0),
                ],
                entries: &[
                    ("ল\u{9be}টি", 5),
-                    ("শত", 9),
+                    ("শত", 2),
                    ("হ\u{9be}টি", 3),
+                    ("কোটি", 7),
                ],
            },
        },
@ -716,12 +715,13 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: true,
            number_tokens: ::phf::Map {
-                key: 15467950696543387533,
+                key: 12913932095322966823,
                disps: &[
-                    (0, 0),
+                    (2, 0),
                ],
                entries: &[
                    ("mM", 9),
+                    ("M", 6),
                    ("m", 3),
                ],
            },
@ -1044,14 +1044,15 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: false,
            number_tokens: ::phf::Map {
-                key: 15467950696543387533,
+                key: 14108922650502679131,
                disps: &[
                    (1, 0),
                ],
                entries: &[
-                    ("crore", 7),
-                    ("B", 9),
                    ("lakh", 5),
+                    ("crore", 7),
+                    ("M", 6),
+                    ("B", 9),
                ],
            },
        },
@ -1118,10 +1119,11 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
-                    (0, 0),
+                    (1, 0),
                ],
                entries: &[
-                    ("mil", 9),
+                    ("mil", 3),
+                    ("M", 6),
                ],
            },
        },
@ -1188,10 +1190,11 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
-                    (0, 0),
+                    (1, 0),
                ],
                entries: &[
-                    ("mil", 9),
+                    ("mil", 3),
+                    ("M", 6),
                ],
            },
        },
@ -1328,8 +1331,10 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
+                    (0, 0),
                ],
                entries: &[
+                    ("M", 6),
                ],
            },
        },
@ -1512,6 +1517,7 @@ pub fn entry(lang: Language) -> Entry {
                    (0, 0),
                ],
                entries: &[
+                    ("M", 6),
                    ("B", 9),
                ],
            },
@ -1579,10 +1585,11 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
-                    (1, 0),
+                    (2, 0),
                ],
                entries: &[
                    ("G", 9),
+                    ("M", 6),
                    ("Md", 9),
                ],
            },
@ -1650,8 +1657,10 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
+                    (0, 0),
                ],
                entries: &[
+                    ("M", 6),
                ],
            },
        },
@ -1924,13 +1933,14 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: true,
            number_tokens: ::phf::Map {
-                key: 12913932095322966823,
+                key: 15467950696543387533,
                disps: &[
-                    (0, 0),
+                    (2, 0),
                ],
                entries: &[
-                    ("Mrd", 9),
                    ("E", 3),
+                    ("Mrd", 9),
+                    ("M", 6),
                ],
            },
        },
@ -2051,12 +2061,13 @@ pub fn entry(lang: Language) -> Entry {
            },
            comma_decimal: true,
            number_tokens: ::phf::Map {
-                key: 12913932095322966823,
+                key: 15467950696543387533,
                disps: &[
                    (0, 0),
                ],
                entries: &[
                    ("jt", 6),
+                    ("M", 9),
                    ("rb", 3),
                ],
            },
@ -3820,11 +3831,12 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
-                    (0, 0),
+                    (2, 0),
                ],
                entries: &[
-                    ("mil", 3),
                    ("mM", 9),
+                    ("M", 6),
+                    ("mil", 3),
                ],
            },
        },
@ -4497,11 +4509,12 @@ pub fn entry(lang: Language) -> Entry {
            number_tokens: ::phf::Map {
                key: 12913932095322966823,
                disps: &[
-                    (0, 0),
+                    (1, 0),
                ],
                entries: &[
-                    ("elfu", 3),
                    ("B", 9),
+                    ("elfu", 3),
+                    ("M", 6),
                ],
            },
        },
@ -5136,6 +5149,7 @@ pub fn entry(lang: Language) -> Entry {
                    (0, 0),
                ],
                entries: &[
+                    ("M", 6),
                    ("B", 9),
                ],
            },
@ -5254,6 +5268,7 @@ pub fn entry(lang: Language) -> Entry {
                    (0, 0),
                ],
                entries: &[
+                    ("M", 6),
                    ("B", 9),
                ],
            },
--- a/src/util.rs
+++ b/src/util.rs
@ -6,6 +6,8 @@ use once_cell::sync::Lazy;
 use rand::Rng;
 use url::Url;

+use crate::{dictionary, model::Language};
+
 const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

@ -228,8 +230,61 @@ impl<T> TryRemove<T> for Vec<T> {
    }
 }

+fn parse_large_numstr(string: &str, lang: Language) -> Option<u64> { // Parse a localized, abbreviated number string (digits plus magnitude tokens) into a u64
+    let dict_entry = dictionary::entry(lang); // per-language entry; supplies comma_decimal, by_char and number_tokens below
+    let decimal_point = match dict_entry.comma_decimal { // which character acts as the decimal separator for this language
+        true => ',',
+        false => '.',
+    };
+
+    let (num, mut exp, filtered) = { // num: all digits read as one integer; exp: power-of-ten exponent; filtered: leftover non-digit text
+        let mut buf = String::new();
+        let mut filtered = String::new();
+        let mut exp = 0;
+        let mut after_point = false;
+        for c in string.chars() {
+            if c.is_ascii_digit() {
+                buf.push(c);
+
+                if after_point { // every digit behind the decimal separator shifts the value down by one power of ten
+                    exp -= 1;
+                }
+            } else if c == decimal_point { // the separator itself is dropped; only its position matters
+                after_point = true;
+            } else if !matches!(c, '\u{200b}' | '.' | ',') { // drop zero-width spaces and the non-decimal '.'/',' separator; keep the rest for token lookup
+                filtered.push(c);
+            }
+        }
+        (ok_or_bail!(buf.parse::<u64>(), None), exp, filtered) // ok_or_bail! is defined elsewhere — presumably returns None from the function on Err (e.g. no digits); TODO confirm
+    };
+
+    let lookup_token = |token: &str| match token { // token -> power-of-ten exponent, or None if the token is unknown
+        "K" | "k" => Some(3), // handled here for every language instead of per-language dictionary entries
+        _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
+    };
+
+    if dict_entry.by_char { // by_char languages: every single character is looked up as its own token
+        exp += filtered
+            .chars()
+            .filter_map(|token| lookup_token(&token.to_string()))
+            .sum::<i32>();
+    } else { // otherwise tokens are whitespace-separated words; unknown words contribute nothing
+        exp += filtered
+            .split_whitespace()
+            .filter_map(lookup_token)
+            .sum::<i32>();
+    }
+
+    num.checked_mul(some_or_bail!( // num * 10^exp; checked_mul yields None on u64 overflow
+        (10_u64).checked_pow(ok_or_bail!(exp.try_into(), None)), // a negative exp fails the conversion to the unsigned exponent type; presumably bails with None
+        None
+    ))
+}
+
 #[cfg(test)]
 mod tests {
+    use std::{fs::File, io::BufReader, path::Path};
+
    use super::*;

    use rstest::rstest;
@ -313,4 +368,36 @@ mod tests {
        let res = sanitize_yt_url(url);
        assert_eq!(res, expect);
    }
+
+    #[test]
+    fn t_parse_large_numstr_samples() { // Runs parse_large_numstr against every sample stored in the JSON fixture
+        let json_path = Path::new("testfiles/dict/large_number_samples.json"); // relative path, resolved against the test's working directory
+        let json_file = File::open(json_path).unwrap();
+        let number_samples: BTreeMap<Language, BTreeMap<u8, (String, u64)>> = // language -> (u8 key -> (sample text, exact number)); the u8 key is presumably the magnitude — TODO confirm
+            serde_json::from_reader(BufReader::new(json_file)).unwrap();
+
+        number_samples.iter().for_each(|(lang, entry)| {
+            entry.iter().for_each(|(_, (txt, expect))| { // the inner map key is not needed for the check
+                testcase_parse_large_numstr(txt, *lang, *expect);
+            });
+        });
+    }
+
+    fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) { // Asserts that parsing `string` yields `expect` truncated to the precision shown in `string`
+        // Round the expected number down to the number of significant digits
+        // included in the string.
+        let rounded = {
+            let n_significant_d = string.chars().filter(char::is_ascii_digit).count(); // ASCII digits present in the sample text
+            let mag = (expect as f64).log10().floor(); // one less than the decimal digit count of expect (for expect >= 1)
+            let factor = 10_u64.pow(1 + mag as u32 - n_significant_d as u32); // 10^(digits of expect - digits shown); assumes the text never shows more digits than expect has — TODO confirm
+            (((expect as f64) / factor as f64).floor() as u64) * factor // truncate expect down to a multiple of factor
+        };
+
+        let res = parse_large_numstr(string, lang).expect(string); // panics with the sample text as message when the parser returns None
+        assert_eq!(
+            res, rounded,
+            "{} (lang: {}, exact: {})",
+            string, lang, expect
+        );
+    }
 }
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
@ -153,7 +153,6 @@
      "নিঃটা": 6,
      "নিযুতটা": 6,
      "লাখটা": 5,
-      "শঃ": 9,
      "হাজাৰটা": 3
    }
  },
@ -315,8 +314,9 @@
    "comma_decimal": false,
    "number_tokens": {
      "লাটি": 5,
-      "শত": 9,
-      "হাটি": 3
+      "শত": 2,
+      "হাটি": 3,
+      "কোটি": 7
    }
  },
  "bs": {
@ -409,6 +409,7 @@
    },
    "comma_decimal": true,
    "number_tokens": {
+      "M": 6,
      "m": 3,
      "mM": 9
    }
@ -610,6 +611,7 @@
    "comma_decimal": false,
    "number_tokens": {
      "B": 9,
+      "M": 6,
      "crore": 7,
      "lakh": 5
    }
@ -654,7 +656,8 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "mil": 9
+      "M": 6,
+      "mil": 3
    }
  },
  "es-US": {
@ -699,7 +702,8 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "mil": 9
+      "M": 6,
+      "mil": 3
    }
  },
  "et": {
@ -784,7 +788,9 @@
      "gaur": "0D"
    },
    "comma_decimal": true,
-    "number_tokens": {}
+    "number_tokens": {
+      "M": 6
+    }
  },
  "fa": {
    "equivalent": [],
@ -889,7 +895,8 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9
+      "B": 9,
+      "M": 6
    }
  },
  "fr": {
@ -935,6 +942,7 @@
    "comma_decimal": true,
    "number_tokens": {
      "G": 9,
+      "M": 6,
      "Md": 9
    }
  },
@ -977,7 +985,9 @@
      "onte": "1D"
    },
    "comma_decimal": true,
-    "number_tokens": {}
+    "number_tokens": {
+      "M": 6
+    }
  },
  "gu": {
    "equivalent": [],
@ -1148,6 +1158,7 @@
    "comma_decimal": true,
    "number_tokens": {
      "E": 3,
+      "M": 6,
      "Mrd": 9
    }
  },
@ -1222,6 +1233,7 @@
    },
    "comma_decimal": true,
    "number_tokens": {
+      "M": 9,
      "jt": 6,
      "rb": 3
    }
@ -2281,6 +2293,7 @@
    },
    "comma_decimal": true,
    "number_tokens": {
+      "M": 6,
      "mM": 9,
      "mil": 3
    }
@ -2693,6 +2706,7 @@
    "comma_decimal": false,
    "number_tokens": {
      "B": 9,
+      "M": 6,
      "elfu": 3
    }
  },
@ -3068,7 +3082,8 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9
+      "B": 9,
+      "M": 6
    }
  },
  "zh-TW": {
@ -3135,7 +3150,8 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9
+      "B": 9,
+      "M": 6
    }
  }
 }