fix: large number parser (ar)

This commit is contained in:
ThetaDev 2022-09-23 18:52:35 +02:00
parent fc7655093b
commit f66dce9d3d
5 changed files with 15809 additions and 24 deletions

View file

@ -33,6 +33,9 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/dict/large_number_samples.json");
let mut json_path_all = project_root.to_path_buf();
json_path_all.push("testfiles/dict/large_number_samples_all.json");
let channels = [
"UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
"UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
@ -40,10 +43,11 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
"UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
"UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
"UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
"UCGiJh0NZ52wRhYKYnuZI08Q", // 10e1 (37)
"UCXvtcj9xUQhaqPaitFf2DqA", // (170)
"UCq-XMc01T641v-4P3hQYJWg", // (636)
];
let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
let collected_numbers_all: BTreeMap<Language, BTreeMap<String, u64>> = stream::iter(LANGUAGES)
.map(|lang| async move {
let mut entry = BTreeMap::new();
@ -54,7 +58,7 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
.unwrap();
channel.view_counts.iter().for_each(|(num, txt)| {
entry.insert(get_mag(*num), (txt.to_owned(), *num));
entry.insert(txt.to_owned(), *num);
});
println!("collected {}-{}", lang, n);
@ -66,32 +70,53 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
.collect()
.await;
let collected_numbers: CollectedNumbers = collected_numbers_all
.iter()
.map(|(lang, entry)| {
let mut e2 = BTreeMap::new();
entry.iter().for_each(|(txt, num)| {
e2.insert(get_mag(*num), (txt.to_owned(), *num));
});
(*lang, e2)
})
.collect();
let file = File::create(json_path).unwrap();
serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
let file = File::create(json_path_all).unwrap();
serde_json::to_writer_pretty(file, &collected_numbers_all).unwrap();
}
/// Attempt to parse the numbers collected by `collect-large-numbers`
/// and write the results to `dictionary.json`.
///
/// Manual corrections:
/// as
/// "কোঃটা": 9,
/// "নিঃটা": 6,
/// "নিযুতটা": 6,
/// "লাখটা": 5,
/// "হাজাৰটা": 3
///
/// bn
/// "লাটি": 5,
/// "শত": 2,
/// "হাটি": 3,
/// "কোটি": 7
///
/// es/es-US
/// "mil": 3,
/// "M": 6
///
pub fn write_samples_to_dict(project_root: &Path) {
/*
Manual corrections:
as
"কোঃটা": 9,
"নিঃটা": 6,
"নিযুতটা": 6,
"লাখটা": 5,
"হাজাৰটা": 3
ar
"ألف": 3,
"آلاف": 3,
"مليار": 9,
"مليون": 6
bn
"লাটি": 5,
"শত": 2,
"হাটি": 3,
"কোটি": 7
es/es-US
"mil": 3,
"M": 6
*/
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/dict/large_number_samples.json");

View file

@ -245,14 +245,15 @@ pub fn entry(lang: Language) -> Entry {
},
comma_decimal: false,
number_tokens: ::phf::Map {
key: 12913932095322966823,
key: 7485420634051515786,
disps: &[
(0, 0),
],
entries: &[
("مليار", 9),
("آلاف", 3),
("مليون", 6),
("ألف", 3),
("مليار", 9),
],
},
},

View file

@ -383,6 +383,20 @@ mod tests {
});
}
/// Runs `testcase_parse_large_numstr` against every sample stored in
/// `testfiles/dict/large_number_samples_all.json`.
///
/// The file is deserialized as a map from language to a map of
/// number text -> numeric value; each `(text, value)` pair is checked
/// for the language it is filed under.
///
/// Panics if the sample file cannot be opened or deserialized.
#[test]
fn t_parse_large_numstr_samples2() {
    let samples_file =
        File::open(Path::new("testfiles/dict/large_number_samples_all.json")).unwrap();
    let samples_by_lang: BTreeMap<Language, BTreeMap<String, u64>> =
        serde_json::from_reader(BufReader::new(samples_file)).unwrap();

    // BTreeMap iteration is key-ordered, so the samples are visited in the
    // same order as the nested `for_each` calls would visit them.
    for (lang, samples) in &samples_by_lang {
        for (text, expected) in samples {
            testcase_parse_large_numstr(text, *lang, *expected);
        }
    }
}
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
// Round the expected number to the amount of significant digits included
// in the string.

View file

@ -125,6 +125,7 @@
"comma_decimal": false,
"number_tokens": {
"ألف": 3,
"آلاف": 3,
"مليار": 9,
"مليون": 6
}

File diff suppressed because it is too large Load diff