feat: add large number parser

This commit is contained in:
ThetaDev 2022-09-23 18:19:24 +02:00
parent 5d19259a14
commit fc7655093b
5 changed files with 192 additions and 68 deletions

View file

@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
use anyhow::{Context, Result};
@ -72,6 +72,25 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
/// Attempt to parse the numbers collected by `collect-large-numbers`
/// and write the results to `dictionary.json`.
///
/// Manual corrections:
/// as
/// "কোঃটা": 9,
/// "নিঃটা": 6,
/// "নিযুতটা": 6,
/// "লাখটা": 5,
/// "হাজাৰটা": 3
///
/// bn
/// "লাটি": 5,
/// "শত": 2,
/// "হাটি": 3,
/// "কোটি": 7
///
/// es/es-US
/// "mil": 3,
/// "M": 6
///
pub fn write_samples_to_dict(project_root: &Path) {
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/dict/large_number_samples.json");
@ -160,8 +179,8 @@ pub fn write_samples_to_dict(project_root: &Path) {
let known_tmag: u8 = if t.len() == 1 {
match t.as_str() {
"K" | "k" => 3,
"M" => 6,
// 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
// 'M' means 10^9 in Indonesian
_ => 0,
}
} else {
@ -186,6 +205,12 @@ pub fn write_samples_to_dict(project_root: &Path) {
.filter_map(|(k, v)| v.map(|v| (k, v)))
.collect();
dict_entry.comma_decimal = comma_decimal;
// Check for duplicates
let mut uniq = HashSet::new();
if !dict_entry.number_tokens.values().all(|x| uniq.insert(x)) {
println!("Warning: collected duplicate tokens for {}", lang);
}
}
util::write_dict(project_root, &dict);
@ -340,19 +365,3 @@ async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
.unwrap_or_default(),
})
}
/// Manual smoke test: requests the channel `UCcdwLMPsaU2ezNSJU1nFoBQ` through
/// `get_channel` with `Language::Az` and prints the result with `dbg!`.
///
/// The test only fails if `get_channel` returns an error (the `unwrap`
/// panics); nothing about the returned data is asserted.
// NOTE(review): presumably this performs a live network request - confirm
// before relying on it in automated test runs.
#[tokio::test]
async fn test() {
let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
.await
.unwrap();
dbg!(channel);
}
/// Manual driver that runs `write_samples_to_dict` against a fixed project
/// root directory.
// NOTE(review): the project root is a hard-coded absolute path inside a
// specific user's home directory, so this presumably only works on the
// original author's machine - confirm before keeping it in the test suite.
#[test]
fn test2() {
write_samples_to_dict(Path::new(
"/home/thetadev/Documents/Programmieren/Rust/rustypipe",
));
}

View file

@ -298,20 +298,17 @@ fn deobf_nsig(
last_nsig: &mut [String; 2],
) -> Result<()> {
let nsig: String;
match url_params.get("n") {
Some(n) => {
nsig = if n == &last_nsig[0] {
last_nsig[1].to_owned()
} else {
let nsig = deobf.deobfuscate_nsig(n)?;
last_nsig[0] = n.to_string();
last_nsig[1] = nsig.to_owned();
nsig
};
if let Some(n) = url_params.get("n") {
nsig = if n == &last_nsig[0] {
last_nsig[1].to_owned()
} else {
let nsig = deobf.deobfuscate_nsig(n)?;
last_nsig[0] = n.to_string();
last_nsig[1] = nsig.to_owned();
nsig
};
url_params.insert("n".to_owned(), nsig);
}
None => {}
url_params.insert("n".to_owned(), nsig);
};
Ok(())
}

View file

@ -294,18 +294,16 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: false,
number_tokens: ::phf::Map {
key: 15467950696543387533,
key: 12913932095322966823,
disps: &[
(1, 0),
(4, 5),
(3, 0),
],
entries: &[
("নিয\u{9c1}তট\u{9be}", 6),
("নিঃট\u{9be}", 6),
("\u{9be}খট\u{9be}", 5),
("শঃ", 9),
("কোঃট\u{9be}", 9),
("\u{9be}\u{9be}ৰট\u{9be}", 3),
("নিঃট\u{9be}", 6),
("কোঃট\u{9be}", 9),
("\u{9be}খট\u{9be}", 5),
],
},
},
@ -568,12 +566,13 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 15467950696543387533,
disps: &[
(1, 0),
(0, 0),
],
entries: &[
("\u{9be}টি", 5),
("শত", 9),
("শত", 2),
("\u{9be}টি", 3),
("কোটি", 7),
],
},
},
@ -716,12 +715,13 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: true,
number_tokens: ::phf::Map {
key: 15467950696543387533,
key: 12913932095322966823,
disps: &[
(0, 0),
(2, 0),
],
entries: &[
("mM", 9),
("M", 6),
("m", 3),
],
},
@ -1044,14 +1044,15 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: false,
number_tokens: ::phf::Map {
key: 15467950696543387533,
key: 14108922650502679131,
disps: &[
(1, 0),
],
entries: &[
("crore", 7),
("B", 9),
("lakh", 5),
("crore", 7),
("M", 6),
("B", 9),
],
},
},
@ -1118,10 +1119,11 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
(1, 0),
],
entries: &[
("mil", 9),
("mil", 3),
("M", 6),
],
},
},
@ -1188,10 +1190,11 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
(1, 0),
],
entries: &[
("mil", 9),
("mil", 3),
("M", 6),
],
},
},
@ -1328,8 +1331,10 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
],
entries: &[
("M", 6),
],
},
},
@ -1512,6 +1517,7 @@ pub fn entry(lang: Language) -> Entry {
(0, 0),
],
entries: &[
("M", 6),
("B", 9),
],
},
@ -1579,10 +1585,11 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(1, 0),
(2, 0),
],
entries: &[
("G", 9),
("M", 6),
("Md", 9),
],
},
@ -1650,8 +1657,10 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
],
entries: &[
("M", 6),
],
},
},
@ -1924,13 +1933,14 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: true,
number_tokens: ::phf::Map {
key: 12913932095322966823,
key: 15467950696543387533,
disps: &[
(0, 0),
(2, 0),
],
entries: &[
("Mrd", 9),
("E", 3),
("Mrd", 9),
("M", 6),
],
},
},
@ -2051,12 +2061,13 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: true,
number_tokens: ::phf::Map {
key: 12913932095322966823,
key: 15467950696543387533,
disps: &[
(0, 0),
],
entries: &[
("jt", 6),
("M", 9),
("rb", 3),
],
},
@ -3820,11 +3831,12 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
(2, 0),
],
entries: &[
("mil", 3),
("mM", 9),
("M", 6),
("mil", 3),
],
},
},
@ -4497,11 +4509,12 @@ pub fn entry(lang: Language) -> Entry {
number_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(0, 0),
(1, 0),
],
entries: &[
("elfu", 3),
("B", 9),
("elfu", 3),
("M", 6),
],
},
},
@ -5136,6 +5149,7 @@ pub fn entry(lang: Language) -> Entry {
(0, 0),
],
entries: &[
("M", 6),
("B", 9),
],
},
@ -5254,6 +5268,7 @@ pub fn entry(lang: Language) -> Entry {
(0, 0),
],
entries: &[
("M", 6),
("B", 9),
],
},

View file

@ -6,6 +6,8 @@ use once_cell::sync::Lazy;
use rand::Rng;
use url::Url;
use crate::{dictionary, model::Language};
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
@ -228,8 +230,61 @@ impl<T> TryRemove<T> for Vec<T> {
}
}
/// Parse an abbreviated number string (e.g. `1.5K`) into an integer, using the
/// dictionary entry of `lang` to interpret separators and magnitude tokens.
///
/// How the input is read:
/// - ASCII digits are collected into the mantissa. Every digit that follows
///   the language's decimal separator (`,` if `comma_decimal` is set,
///   otherwise `.`) lowers the decimal exponent by one.
/// - Zero-width spaces and the remaining `.` / `,` characters are dropped.
/// - All other characters form the token text. If the entry is `by_char`,
///   each character is looked up as its own token; otherwise the text is
///   split on whitespace. `K` / `k` always count as 10^3, any other token is
///   looked up in `number_tokens`; unknown tokens contribute nothing.
///
/// The result is `mantissa * 10^exponent`, computed with checked arithmetic,
/// so an overflow yields `None`.
// NOTE(review): `ok_or_bail!` / `some_or_bail!` are defined elsewhere;
// presumably they return the given fallback (`None`) when the digits do not
// parse as `u64`, when the final exponent is negative, or when the power of
// ten overflows - confirm against the macro definitions.
fn parse_large_numstr(string: &str, lang: Language) -> Option<u64> {
    let dict_entry = dictionary::entry(lang);
    let decimal_point = if dict_entry.comma_decimal { ',' } else { '.' };

    // Split the input into mantissa digits and leftover token text while
    // counting the digits that come after the decimal separator.
    let mut digits = String::new();
    let mut token_text = String::new();
    let mut exp = 0;
    let mut after_point = false;

    for c in string.chars() {
        if c.is_ascii_digit() {
            digits.push(c);
            if after_point {
                exp -= 1;
            }
            continue;
        }
        if c == decimal_point {
            after_point = true;
            continue;
        }
        if matches!(c, '\u{200b}' | '.' | ',') {
            // Grouping separators and zero-width spaces carry no meaning.
            continue;
        }
        token_text.push(c);
    }

    let num = ok_or_bail!(digits.parse::<u64>(), None);

    let lookup_token = |token: &str| match token {
        "K" | "k" => Some(3),
        _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
    };

    let token_exp = if dict_entry.by_char {
        // Scratch buffer so single characters can be looked up as `&str`.
        let mut utf8 = [0_u8; 4];
        token_text
            .chars()
            .filter_map(|c| lookup_token(&*c.encode_utf8(&mut utf8)))
            .sum::<i32>()
    } else {
        token_text
            .split_whitespace()
            .filter_map(lookup_token)
            .sum::<i32>()
    };
    exp += token_exp;

    let pow10_exp: u32 = ok_or_bail!(exp.try_into(), None);
    let factor = some_or_bail!((10_u64).checked_pow(pow10_exp), None);
    num.checked_mul(factor)
}
#[cfg(test)]
mod tests {
use std::{fs::File, io::BufReader, path::Path};
use super::*;
use rstest::rstest;
@ -313,4 +368,36 @@ mod tests {
let res = sanitize_yt_url(url);
assert_eq!(res, expect);
}
/// Runs `testcase_parse_large_numstr` for every sample stored in
/// `testfiles/dict/large_number_samples.json`.
///
/// The file is deserialized as a map from language to numbered samples, each
/// sample being a `(text, expected value)` pair. Panics if the file cannot be
/// opened or deserialized.
#[test]
fn t_parse_large_numstr_samples() {
    let json_file = File::open(Path::new("testfiles/dict/large_number_samples.json")).unwrap();
    let reader = BufReader::new(json_file);
    let number_samples: BTreeMap<Language, BTreeMap<u8, (String, u64)>> =
        serde_json::from_reader(reader).unwrap();

    for (lang, entry) in number_samples.iter() {
        for (txt, expect) in entry.values() {
            testcase_parse_large_numstr(txt, *lang, *expect);
        }
    }
}
/// Checks that `parse_large_numstr(string, lang)` equals `expect` rounded
/// down to the precision the string can express.
///
/// The number of ASCII digits in `string` is taken as the number of
/// significant digits; `expect` is truncated to that many leading digits
/// before the comparison. Panics (with `string` as the message) if parsing
/// returns `None`.
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
    // Number of significant digits present in the sample text.
    let digit_count = string.chars().filter(char::is_ascii_digit).count();
    // Decimal order of magnitude of the exact value.
    let magnitude = (expect as f64).log10().floor();
    // Everything below this power of ten cannot be represented by the text.
    let factor = 10_u64.pow(1 + magnitude as u32 - digit_count as u32);
    let rounded = (((expect as f64) / factor as f64).floor() as u64) * factor;

    let res = parse_large_numstr(string, lang).expect(string);
    assert_eq!(
        res, rounded,
        "{} (lang: {}, exact: {})",
        string, lang, expect
    );
}
}

View file

@ -153,7 +153,6 @@
"নিঃটা": 6,
"নিযুতটা": 6,
"লাখটা": 5,
"শঃ": 9,
"হাজাৰটা": 3
}
},
@ -315,8 +314,9 @@
"comma_decimal": false,
"number_tokens": {
"লাটি": 5,
"শত": 9,
"হাটি": 3
"শত": 2,
"হাটি": 3,
"কোটি": 7
}
},
"bs": {
@ -409,6 +409,7 @@
},
"comma_decimal": true,
"number_tokens": {
"M": 6,
"m": 3,
"mM": 9
}
@ -610,6 +611,7 @@
"comma_decimal": false,
"number_tokens": {
"B": 9,
"M": 6,
"crore": 7,
"lakh": 5
}
@ -654,7 +656,8 @@
},
"comma_decimal": true,
"number_tokens": {
"mil": 9
"M": 6,
"mil": 3
}
},
"es-US": {
@ -699,7 +702,8 @@
},
"comma_decimal": false,
"number_tokens": {
"mil": 9
"M": 6,
"mil": 3
}
},
"et": {
@ -784,7 +788,9 @@
"gaur": "0D"
},
"comma_decimal": true,
"number_tokens": {}
"number_tokens": {
"M": 6
}
},
"fa": {
"equivalent": [],
@ -889,7 +895,8 @@
},
"comma_decimal": false,
"number_tokens": {
"B": 9
"B": 9,
"M": 6
}
},
"fr": {
@ -935,6 +942,7 @@
"comma_decimal": true,
"number_tokens": {
"G": 9,
"M": 6,
"Md": 9
}
},
@ -977,7 +985,9 @@
"onte": "1D"
},
"comma_decimal": true,
"number_tokens": {}
"number_tokens": {
"M": 6
}
},
"gu": {
"equivalent": [],
@ -1148,6 +1158,7 @@
"comma_decimal": true,
"number_tokens": {
"E": 3,
"M": 6,
"Mrd": 9
}
},
@ -1222,6 +1233,7 @@
},
"comma_decimal": true,
"number_tokens": {
"M": 9,
"jt": 6,
"rb": 3
}
@ -2281,6 +2293,7 @@
},
"comma_decimal": true,
"number_tokens": {
"M": 6,
"mM": 9,
"mil": 3
}
@ -2693,6 +2706,7 @@
"comma_decimal": false,
"number_tokens": {
"B": 9,
"M": 6,
"elfu": 3
}
},
@ -3068,7 +3082,8 @@
},
"comma_decimal": false,
"number_tokens": {
"B": 9
"B": 9,
"M": 6
}
},
"zh-TW": {
@ -3135,7 +3150,8 @@
},
"comma_decimal": false,
"number_tokens": {
"B": 9
"B": 9,
"M": 6
}
}
}