feat: add large number parser
This commit is contained in:
parent
5d19259a14
commit
fc7655093b
5 changed files with 192 additions and 68 deletions
|
|
@ -1,4 +1,4 @@
|
|||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
|
@ -72,6 +72,25 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
|||
|
||||
/// Attempt to parse the numbers collected by `collect-large-numbers`
|
||||
/// and write the results to `dictionary.json`.
|
||||
///
|
||||
/// Manual corrections:
|
||||
/// as
|
||||
/// "কোঃটা": 9,
|
||||
/// "নিঃটা": 6,
|
||||
/// "নিযুতটা": 6,
|
||||
/// "লাখটা": 5,
|
||||
/// "হাজাৰটা": 3
|
||||
///
|
||||
/// bn
|
||||
/// "লাটি": 5,
|
||||
/// "শত": 2,
|
||||
/// "হাটি": 3,
|
||||
/// "কোটি": 7
|
||||
///
|
||||
/// es/es-US
|
||||
/// "mil": 3,
|
||||
/// "M": 6
|
||||
///
|
||||
pub fn write_samples_to_dict(project_root: &Path) {
|
||||
let mut json_path = project_root.to_path_buf();
|
||||
json_path.push("testfiles/dict/large_number_samples.json");
|
||||
|
|
@ -160,8 +179,8 @@ pub fn write_samples_to_dict(project_root: &Path) {
|
|||
let known_tmag: u8 = if t.len() == 1 {
|
||||
match t.as_str() {
|
||||
"K" | "k" => 3,
|
||||
"M" => 6,
|
||||
// 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
|
||||
// 'M' means 10^9 in Indonesian
|
||||
_ => 0,
|
||||
}
|
||||
} else {
|
||||
|
|
@ -186,6 +205,12 @@ pub fn write_samples_to_dict(project_root: &Path) {
|
|||
.filter_map(|(k, v)| v.map(|v| (k, v)))
|
||||
.collect();
|
||||
dict_entry.comma_decimal = comma_decimal;
|
||||
|
||||
// Check for duplicates
|
||||
let mut uniq = HashSet::new();
|
||||
if !dict_entry.number_tokens.values().all(|x| uniq.insert(x)) {
|
||||
println!("Warning: collected duplicate tokens for {}", lang);
|
||||
}
|
||||
}
|
||||
|
||||
util::write_dict(project_root, &dict);
|
||||
|
|
@ -340,19 +365,3 @@ async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
|
|||
.unwrap_or_default(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test() {
|
||||
let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
dbg!(channel);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test2() {
|
||||
write_samples_to_dict(Path::new(
|
||||
"/home/thetadev/Documents/Programmieren/Rust/rustypipe",
|
||||
));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -298,20 +298,17 @@ fn deobf_nsig(
|
|||
last_nsig: &mut [String; 2],
|
||||
) -> Result<()> {
|
||||
let nsig: String;
|
||||
match url_params.get("n") {
|
||||
Some(n) => {
|
||||
nsig = if n == &last_nsig[0] {
|
||||
last_nsig[1].to_owned()
|
||||
} else {
|
||||
let nsig = deobf.deobfuscate_nsig(n)?;
|
||||
last_nsig[0] = n.to_string();
|
||||
last_nsig[1] = nsig.to_owned();
|
||||
nsig
|
||||
};
|
||||
if let Some(n) = url_params.get("n") {
|
||||
nsig = if n == &last_nsig[0] {
|
||||
last_nsig[1].to_owned()
|
||||
} else {
|
||||
let nsig = deobf.deobfuscate_nsig(n)?;
|
||||
last_nsig[0] = n.to_string();
|
||||
last_nsig[1] = nsig.to_owned();
|
||||
nsig
|
||||
};
|
||||
|
||||
url_params.insert("n".to_owned(), nsig);
|
||||
}
|
||||
None => {}
|
||||
url_params.insert("n".to_owned(), nsig);
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -294,18 +294,16 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
comma_decimal: false,
|
||||
number_tokens: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(1, 0),
|
||||
(4, 5),
|
||||
(3, 0),
|
||||
],
|
||||
entries: &[
|
||||
("নিয\u{9c1}তট\u{9be}", 6),
|
||||
("নিঃট\u{9be}", 6),
|
||||
("ল\u{9be}খট\u{9be}", 5),
|
||||
("শঃ", 9),
|
||||
("কোঃট\u{9be}", 9),
|
||||
("হ\u{9be}জ\u{9be}ৰট\u{9be}", 3),
|
||||
("নিঃট\u{9be}", 6),
|
||||
("কোঃট\u{9be}", 9),
|
||||
("ল\u{9be}খট\u{9be}", 5),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -568,12 +566,13 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
disps: &[
|
||||
(1, 0),
|
||||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("ল\u{9be}টি", 5),
|
||||
("শত", 9),
|
||||
("শত", 2),
|
||||
("হ\u{9be}টি", 3),
|
||||
("কোটি", 7),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -716,12 +715,13 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
comma_decimal: true,
|
||||
number_tokens: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(2, 0),
|
||||
],
|
||||
entries: &[
|
||||
("mM", 9),
|
||||
("M", 6),
|
||||
("m", 3),
|
||||
],
|
||||
},
|
||||
|
|
@ -1044,14 +1044,15 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
comma_decimal: false,
|
||||
number_tokens: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
key: 14108922650502679131,
|
||||
disps: &[
|
||||
(1, 0),
|
||||
],
|
||||
entries: &[
|
||||
("crore", 7),
|
||||
("B", 9),
|
||||
("lakh", 5),
|
||||
("crore", 7),
|
||||
("M", 6),
|
||||
("B", 9),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -1118,10 +1119,11 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(1, 0),
|
||||
],
|
||||
entries: &[
|
||||
("mil", 9),
|
||||
("mil", 3),
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -1188,10 +1190,11 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(1, 0),
|
||||
],
|
||||
entries: &[
|
||||
("mil", 9),
|
||||
("mil", 3),
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -1328,8 +1331,10 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -1512,6 +1517,7 @@ pub fn entry(lang: Language) -> Entry {
|
|||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("M", 6),
|
||||
("B", 9),
|
||||
],
|
||||
},
|
||||
|
|
@ -1579,10 +1585,11 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(1, 0),
|
||||
(2, 0),
|
||||
],
|
||||
entries: &[
|
||||
("G", 9),
|
||||
("M", 6),
|
||||
("Md", 9),
|
||||
],
|
||||
},
|
||||
|
|
@ -1650,8 +1657,10 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -1924,13 +1933,14 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
comma_decimal: true,
|
||||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
key: 15467950696543387533,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(2, 0),
|
||||
],
|
||||
entries: &[
|
||||
("Mrd", 9),
|
||||
("E", 3),
|
||||
("Mrd", 9),
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -2051,12 +2061,13 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
comma_decimal: true,
|
||||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
key: 15467950696543387533,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("jt", 6),
|
||||
("M", 9),
|
||||
("rb", 3),
|
||||
],
|
||||
},
|
||||
|
|
@ -3820,11 +3831,12 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(2, 0),
|
||||
],
|
||||
entries: &[
|
||||
("mil", 3),
|
||||
("mM", 9),
|
||||
("M", 6),
|
||||
("mil", 3),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -4497,11 +4509,12 @@ pub fn entry(lang: Language) -> Entry {
|
|||
number_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(1, 0),
|
||||
],
|
||||
entries: &[
|
||||
("elfu", 3),
|
||||
("B", 9),
|
||||
("elfu", 3),
|
||||
("M", 6),
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
@ -5136,6 +5149,7 @@ pub fn entry(lang: Language) -> Entry {
|
|||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("M", 6),
|
||||
("B", 9),
|
||||
],
|
||||
},
|
||||
|
|
@ -5254,6 +5268,7 @@ pub fn entry(lang: Language) -> Entry {
|
|||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("M", 6),
|
||||
("B", 9),
|
||||
],
|
||||
},
|
||||
|
|
|
|||
87
src/util.rs
87
src/util.rs
|
|
@ -6,6 +6,8 @@ use once_cell::sync::Lazy;
|
|||
use rand::Rng;
|
||||
use url::Url;
|
||||
|
||||
use crate::{dictionary, model::Language};
|
||||
|
||||
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
|
||||
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
|
||||
|
||||
|
|
@ -228,8 +230,61 @@ impl<T> TryRemove<T> for Vec<T> {
|
|||
}
|
||||
}
|
||||
|
||||
fn parse_large_numstr(string: &str, lang: Language) -> Option<u64> {
|
||||
let dict_entry = dictionary::entry(lang);
|
||||
let decimal_point = match dict_entry.comma_decimal {
|
||||
true => ',',
|
||||
false => '.',
|
||||
};
|
||||
|
||||
let (num, mut exp, filtered) = {
|
||||
let mut buf = String::new();
|
||||
let mut filtered = String::new();
|
||||
let mut exp = 0;
|
||||
let mut after_point = false;
|
||||
for c in string.chars() {
|
||||
if c.is_ascii_digit() {
|
||||
buf.push(c);
|
||||
|
||||
if after_point {
|
||||
exp -= 1;
|
||||
}
|
||||
} else if c == decimal_point {
|
||||
after_point = true;
|
||||
} else if !matches!(c, '\u{200b}' | '.' | ',') {
|
||||
filtered.push(c);
|
||||
}
|
||||
}
|
||||
(ok_or_bail!(buf.parse::<u64>(), None), exp, filtered)
|
||||
};
|
||||
|
||||
let lookup_token = |token: &str| match token {
|
||||
"K" | "k" => Some(3),
|
||||
_ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
|
||||
};
|
||||
|
||||
if dict_entry.by_char {
|
||||
exp += filtered
|
||||
.chars()
|
||||
.filter_map(|token| lookup_token(&token.to_string()))
|
||||
.sum::<i32>();
|
||||
} else {
|
||||
exp += filtered
|
||||
.split_whitespace()
|
||||
.filter_map(lookup_token)
|
||||
.sum::<i32>();
|
||||
}
|
||||
|
||||
num.checked_mul(some_or_bail!(
|
||||
(10_u64).checked_pow(ok_or_bail!(exp.try_into(), None)),
|
||||
None
|
||||
))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{fs::File, io::BufReader, path::Path};
|
||||
|
||||
use super::*;
|
||||
|
||||
use rstest::rstest;
|
||||
|
|
@ -313,4 +368,36 @@ mod tests {
|
|||
let res = sanitize_yt_url(url);
|
||||
assert_eq!(res, expect);
|
||||
}
|
||||
|
||||
/// Runs `testcase_parse_large_numstr` on every sample stored in
/// `testfiles/dict/large_number_samples.json` (a map of language to
/// numbered `(text, expected value)` pairs).
#[test]
fn t_parse_large_numstr_samples() {
    let json_file = File::open(Path::new("testfiles/dict/large_number_samples.json")).unwrap();
    let reader = BufReader::new(json_file);
    let number_samples: BTreeMap<Language, BTreeMap<u8, (String, u64)>> =
        serde_json::from_reader(reader).unwrap();

    for (lang, samples) in &number_samples {
        // The numeric map key is not needed, only the sample itself.
        for (txt, expect) in samples.values() {
            testcase_parse_large_numstr(txt, *lang, *expect);
        }
    }
}
|
||||
|
||||
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
|
||||
// Round the expected number to the amount of significant digits included
|
||||
// in the string.
|
||||
let rounded = {
|
||||
let n_significant_d = string.chars().filter(char::is_ascii_digit).count();
|
||||
let mag = (expect as f64).log10().floor();
|
||||
let factor = 10_u64.pow(1 + mag as u32 - n_significant_d as u32);
|
||||
(((expect as f64) / factor as f64).floor() as u64) * factor
|
||||
};
|
||||
|
||||
let res = parse_large_numstr(string, lang).expect(string);
|
||||
assert_eq!(
|
||||
res, rounded,
|
||||
"{} (lang: {}, exact: {})",
|
||||
string, lang, expect
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -153,7 +153,6 @@
|
|||
"নিঃটা": 6,
|
||||
"নিযুতটা": 6,
|
||||
"লাখটা": 5,
|
||||
"শঃ": 9,
|
||||
"হাজাৰটা": 3
|
||||
}
|
||||
},
|
||||
|
|
@ -315,8 +314,9 @@
|
|||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"লাটি": 5,
|
||||
"শত": 9,
|
||||
"হাটি": 3
|
||||
"শত": 2,
|
||||
"হাটি": 3,
|
||||
"কোটি": 7
|
||||
}
|
||||
},
|
||||
"bs": {
|
||||
|
|
@ -409,6 +409,7 @@
|
|||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"M": 6,
|
||||
"m": 3,
|
||||
"mM": 9
|
||||
}
|
||||
|
|
@ -610,6 +611,7 @@
|
|||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"M": 6,
|
||||
"crore": 7,
|
||||
"lakh": 5
|
||||
}
|
||||
|
|
@ -654,7 +656,8 @@
|
|||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 9
|
||||
"M": 6,
|
||||
"mil": 3
|
||||
}
|
||||
},
|
||||
"es-US": {
|
||||
|
|
@ -699,7 +702,8 @@
|
|||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"mil": 9
|
||||
"M": 6,
|
||||
"mil": 3
|
||||
}
|
||||
},
|
||||
"et": {
|
||||
|
|
@ -784,7 +788,9 @@
|
|||
"gaur": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {}
|
||||
"number_tokens": {
|
||||
"M": 6
|
||||
}
|
||||
},
|
||||
"fa": {
|
||||
"equivalent": [],
|
||||
|
|
@ -889,7 +895,8 @@
|
|||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
"B": 9,
|
||||
"M": 6
|
||||
}
|
||||
},
|
||||
"fr": {
|
||||
|
|
@ -935,6 +942,7 @@
|
|||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"G": 9,
|
||||
"M": 6,
|
||||
"Md": 9
|
||||
}
|
||||
},
|
||||
|
|
@ -977,7 +985,9 @@
|
|||
"onte": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {}
|
||||
"number_tokens": {
|
||||
"M": 6
|
||||
}
|
||||
},
|
||||
"gu": {
|
||||
"equivalent": [],
|
||||
|
|
@ -1148,6 +1158,7 @@
|
|||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"E": 3,
|
||||
"M": 6,
|
||||
"Mrd": 9
|
||||
}
|
||||
},
|
||||
|
|
@ -1222,6 +1233,7 @@
|
|||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"M": 9,
|
||||
"jt": 6,
|
||||
"rb": 3
|
||||
}
|
||||
|
|
@ -2281,6 +2293,7 @@
|
|||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"M": 6,
|
||||
"mM": 9,
|
||||
"mil": 3
|
||||
}
|
||||
|
|
@ -2693,6 +2706,7 @@
|
|||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"M": 6,
|
||||
"elfu": 3
|
||||
}
|
||||
},
|
||||
|
|
@ -3068,7 +3082,8 @@
|
|||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
"B": 9,
|
||||
"M": 6
|
||||
}
|
||||
},
|
||||
"zh-TW": {
|
||||
|
|
@ -3135,7 +3150,8 @@
|
|||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
"B": 9,
|
||||
"M": 6
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Reference in a new issue