diff --git a/codegen/src/collect_large_numbers.rs b/codegen/src/collect_large_numbers.rs index c340b12..a7ee294 100644 --- a/codegen/src/collect_large_numbers.rs +++ b/codegen/src/collect_large_numbers.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path}; use anyhow::{Context, Result}; @@ -72,6 +72,25 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) { /// Attempt to parse the numbers collected by `collect-large-numbers` /// and write the results to `dictionary.json`. +/// +/// Manual corrections: +/// as +/// "কোঃটা": 9, +/// "নিঃটা": 6, +/// "নিযুতটা": 6, +/// "লাখটা": 5, +/// "হাজাৰটা": 3 +/// +/// bn +/// "লাটি": 5, +/// "শত": 2, +/// "হাটি": 3, +/// "কোটি": 7 +/// +/// es/es-US +/// "mil": 3, +/// "M": 6 +/// pub fn write_samples_to_dict(project_root: &Path) { let mut json_path = project_root.to_path_buf(); json_path.push("testfiles/dict/large_number_samples.json"); @@ -160,8 +179,8 @@ pub fn write_samples_to_dict(project_root: &Path) { let known_tmag: u8 = if t.len() == 1 { match t.as_str() { "K" | "k" => 3, - "M" => 6, // 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish + // 'M' means 10^9 in Indonesian _ => 0, } } else { @@ -186,6 +205,12 @@ pub fn write_samples_to_dict(project_root: &Path) { .filter_map(|(k, v)| v.map(|v| (k, v))) .collect(); dict_entry.comma_decimal = comma_decimal; + + // Check for duplicates + let mut uniq = HashSet::new(); + if !dict_entry.number_tokens.values().all(|x| uniq.insert(x)) { + println!("Warning: collected duplicate tokens for {}", lang); + } } util::write_dict(project_root, &dict); @@ -340,19 +365,3 @@ async fn get_channel(channel_id: &str, lang: Language) -> Result { .unwrap_or_default(), }) } - -#[tokio::test] -async fn test() { - let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az) - .await - .unwrap(); - - dbg!(channel); -} - -#[test] -fn test2() { - write_samples_to_dict(Path::new( - "/home/thetadev/Documents/Programmieren/Rust/rustypipe", - )); -} diff --git a/src/client/player.rs b/src/client/player.rs index c01bd04..e5e0df8 100644 --- a/src/client/player.rs +++ b/src/client/player.rs @@ -298,20 +298,17 @@ fn deobf_nsig( last_nsig: &mut [String; 2], ) -> Result<()> { let nsig: String; - match url_params.get("n") { - Some(n) => { - nsig = if n == &last_nsig[0] { - last_nsig[1].to_owned() - } else { - let nsig = deobf.deobfuscate_nsig(n)?; - last_nsig[0] = n.to_string(); - last_nsig[1] = nsig.to_owned(); - nsig - }; + if let Some(n) = url_params.get("n") { + nsig = if n == &last_nsig[0] { + last_nsig[1].to_owned() + } else { + let nsig = deobf.deobfuscate_nsig(n)?; + last_nsig[0] = n.to_string(); + last_nsig[1] = nsig.to_owned(); + nsig + }; - url_params.insert("n".to_owned(), nsig); - } - None => {} + url_params.insert("n".to_owned(), nsig); }; Ok(()) } diff --git a/src/dictionary.rs b/src/dictionary.rs index 84d4465..cb715b9 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -294,18 +294,16 @@ pub fn entry(lang: Language) -> Entry { }, comma_decimal: false, number_tokens: ::phf::Map { - key: 15467950696543387533, + key: 12913932095322966823, disps: &[ - (1, 0), - (4, 5), + (3, 0), ], entries: &[ ("নিয\u{9c1}তট\u{9be}", 6), - ("নিঃট\u{9be}", 6), - ("ল\u{9be}খট\u{9be}", 5), - ("শঃ", 9), - ("কোঃট\u{9be}", 9), ("হ\u{9be}জ\u{9be}ৰট\u{9be}", 3), + ("নিঃট\u{9be}", 6), + ("কোঃট\u{9be}", 9), + ("ল\u{9be}খট\u{9be}", 5), ], }, }, @@ -568,12 +566,13 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 15467950696543387533, disps: &[ - (1, 0), + (0, 0), ], entries: &[ ("ল\u{9be}টি", 5), - ("শত", 9), + ("শত", 2), ("হ\u{9be}টি", 3), + ("কোটি", 7), ], }, }, @@ -716,12 +715,13 @@ pub fn entry(lang: Language) -> Entry { }, comma_decimal: true, number_tokens: ::phf::Map { - key: 15467950696543387533, + key: 12913932095322966823, disps: &[ - (0, 0), + (2, 0), ], entries: &[ ("mM", 9), + ("M", 6), ("m", 3), ], }, @@ -1044,14 +1044,15 @@ pub fn entry(lang: Language) -> Entry { }, comma_decimal: false, number_tokens: ::phf::Map { - key: 15467950696543387533, + key: 14108922650502679131, disps: &[ (1, 0), ], entries: &[ - ("crore", 7), - ("B", 9), ("lakh", 5), + ("crore", 7), + ("M", 6), + ("B", 9), ], }, }, @@ -1118,10 +1119,11 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ - (0, 0), + (1, 0), ], entries: &[ - ("mil", 9), + ("mil", 3), + ("M", 6), ], }, }, @@ -1188,10 +1190,11 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ - (0, 0), + (1, 0), ], entries: &[ - ("mil", 9), + ("mil", 3), + ("M", 6), ], }, }, @@ -1328,8 +1331,10 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ + (0, 0), ], entries: &[ + ("M", 6), ], }, }, @@ -1512,6 +1517,7 @@ pub fn entry(lang: Language) -> Entry { (0, 0), ], entries: &[ + ("M", 6), ("B", 9), ], }, @@ -1579,10 +1585,11 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ - (1, 0), + (2, 0), ], entries: &[ ("G", 9), + ("M", 6), ("Md", 9), ], }, @@ -1650,8 +1657,10 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ + (0, 0), ], entries: &[ + ("M", 6), ], }, }, @@ -1924,13 +1933,14 @@ pub fn entry(lang: Language) -> Entry { }, comma_decimal: true, number_tokens: ::phf::Map { - key: 12913932095322966823, + key: 15467950696543387533, disps: &[ - (0, 0), + (2, 0), ], entries: &[ - ("Mrd", 9), ("E", 3), + ("Mrd", 9), + ("M", 6), ], }, }, @@ -2051,12 +2061,13 @@ pub fn entry(lang: Language) -> Entry { }, comma_decimal: true, number_tokens: ::phf::Map { - key: 12913932095322966823, + key: 15467950696543387533, disps: &[ (0, 0), ], entries: &[ ("jt", 6), + ("M", 9), ("rb", 3), ], }, @@ -3820,11 +3831,12 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ - (0, 0), + (2, 0), ], entries: &[ - ("mil", 3), ("mM", 9), + ("M", 6), + ("mil", 3), ], }, }, @@ -4497,11 +4509,12 @@ pub fn entry(lang: Language) -> Entry { number_tokens: ::phf::Map { key: 12913932095322966823, disps: &[ - (0, 0), + (1, 0), ], entries: &[ - ("elfu", 3), ("B", 9), + ("elfu", 3), + ("M", 6), ], }, }, @@ -5136,6 +5149,7 @@ pub fn entry(lang: Language) -> Entry { (0, 0), ], entries: &[ + ("M", 6), ("B", 9), ], }, @@ -5254,6 +5268,7 @@ pub fn entry(lang: Language) -> Entry { (0, 0), ], entries: &[ + ("M", 6), ("B", 9), ], }, diff --git a/src/util.rs b/src/util.rs index 8276f9f..14d332e 100644 --- a/src/util.rs +++ b/src/util.rs @@ -6,6 +6,8 @@ use once_cell::sync::Lazy; use rand::Rng; use url::Url; +use crate::{dictionary, model::Language}; + const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; @@ -228,8 +230,61 @@ impl TryRemove for Vec { } } +fn parse_large_numstr(string: &str, lang: Language) -> Option { + let dict_entry = dictionary::entry(lang); + let decimal_point = match dict_entry.comma_decimal { + true => ',', + false => '.', + }; + + let (num, mut exp, filtered) = { + let mut buf = String::new(); + let mut filtered = String::new(); + let mut exp = 0; + let mut after_point = false; + for c in string.chars() { + if c.is_ascii_digit() { + buf.push(c); + + if after_point { + exp -= 1; + } + } else if c == decimal_point { + after_point = true; + } else if !matches!(c, '\u{200b}' | '.' | ',') { + filtered.push(c); + } + } + (ok_or_bail!(buf.parse::(), None), exp, filtered) + }; + + let lookup_token = |token: &str| match token { + "K" | "k" => Some(3), + _ => dict_entry.number_tokens.get(token).map(|t| *t as i32), + }; + + if dict_entry.by_char { + exp += filtered + .chars() + .filter_map(|token| lookup_token(&token.to_string())) + .sum::(); + } else { + exp += filtered + .split_whitespace() + .filter_map(lookup_token) + .sum::(); + } + + num.checked_mul(some_or_bail!( + (10_u64).checked_pow(ok_or_bail!(exp.try_into(), None)), + None + )) +} + #[cfg(test)] mod tests { + use std::{fs::File, io::BufReader, path::Path}; + use super::*; use rstest::rstest; @@ -313,4 +368,36 @@ mod tests { let res = sanitize_yt_url(url); assert_eq!(res, expect); } + + #[test] + fn t_parse_large_numstr_samples() { + let json_path = Path::new("testfiles/dict/large_number_samples.json"); + let json_file = File::open(json_path).unwrap(); + let number_samples: BTreeMap> = + serde_json::from_reader(BufReader::new(json_file)).unwrap(); + + number_samples.iter().for_each(|(lang, entry)| { + entry.iter().for_each(|(_, (txt, expect))| { + testcase_parse_large_numstr(txt, *lang, *expect); + }); + }); + } + + fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) { + // Round the expected number to the amount of significant digits included + // in the string. + let rounded = { + let n_significant_d = string.chars().filter(char::is_ascii_digit).count(); + let mag = (expect as f64).log10().floor(); + let factor = 10_u64.pow(1 + mag as u32 - n_significant_d as u32); + (((expect as f64) / factor as f64).floor() as u64) * factor + }; + + let res = parse_large_numstr(string, lang).expect(string); + assert_eq!( + res, rounded, + "{} (lang: {}, exact: {})", + string, lang, expect + ); + } } diff --git a/testfiles/dict/dictionary.json b/testfiles/dict/dictionary.json index 6ac29e1..1a5db07 100644 --- a/testfiles/dict/dictionary.json +++ b/testfiles/dict/dictionary.json @@ -153,7 +153,6 @@ "নিঃটা": 6, "নিযুতটা": 6, "লাখটা": 5, - "শঃ": 9, "হাজাৰটা": 3 } }, @@ -315,8 +314,9 @@ "comma_decimal": false, "number_tokens": { "লাটি": 5, - "শত": 9, - "হাটি": 3 + "শত": 2, + "হাটি": 3, + "কোটি": 7 } }, "bs": { @@ -409,6 +409,7 @@ }, "comma_decimal": true, "number_tokens": { + "M": 6, "m": 3, "mM": 9 } @@ -610,6 +611,7 @@ "comma_decimal": false, "number_tokens": { "B": 9, + "M": 6, "crore": 7, "lakh": 5 } @@ -654,7 +656,8 @@ }, "comma_decimal": true, "number_tokens": { - "mil": 9 + "M": 6, + "mil": 3 } }, "es-US": { @@ -699,7 +702,8 @@ }, "comma_decimal": false, "number_tokens": { - "mil": 9 + "M": 6, + "mil": 3 } }, "et": { @@ -784,7 +788,9 @@ "gaur": "0D" }, "comma_decimal": true, - "number_tokens": {} + "number_tokens": { + "M": 6 + } }, "fa": { "equivalent": [], @@ -889,7 +895,8 @@ }, "comma_decimal": false, "number_tokens": { - "B": 9 + "B": 9, + "M": 6 } }, "fr": { @@ -935,6 +942,7 @@ "comma_decimal": true, "number_tokens": { "G": 9, + "M": 6, "Md": 9 } }, @@ -977,7 +985,9 @@ "onte": "1D" }, "comma_decimal": true, - "number_tokens": {} + "number_tokens": { + "M": 6 + } }, "gu": { "equivalent": [], @@ -1148,6 +1158,7 @@ "comma_decimal": true, "number_tokens": { "E": 3, + "M": 6, "Mrd": 9 } }, @@ -1222,6 +1233,7 @@ }, "comma_decimal": true, "number_tokens": { + "M": 9, "jt": 6, "rb": 3 } @@ -2281,6 +2293,7 @@ }, "comma_decimal": true, "number_tokens": { + "M": 6, "mM": 9, "mil": 3 } @@ -2693,6 +2706,7 @@ "comma_decimal": false, "number_tokens": { "B": 9, + "M": 6, "elfu": 3 } }, @@ -3068,7 +3082,8 @@ }, "comma_decimal": false, "number_tokens": { - "B": 9 + "B": 9, + "M": 6 } }, "zh-TW": { @@ -3135,7 +3150,8 @@ }, "comma_decimal": false, "number_tokens": { - "B": 9 + "B": 9, + "M": 6 } } }