fix: large number parser (ar)
This commit is contained in:
parent
fc7655093b
commit
f66dce9d3d
5 changed files with 15809 additions and 24 deletions
|
|
@ -33,6 +33,9 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
||||||
let mut json_path = project_root.to_path_buf();
|
let mut json_path = project_root.to_path_buf();
|
||||||
json_path.push("testfiles/dict/large_number_samples.json");
|
json_path.push("testfiles/dict/large_number_samples.json");
|
||||||
|
|
||||||
|
let mut json_path_all = project_root.to_path_buf();
|
||||||
|
json_path_all.push("testfiles/dict/large_number_samples_all.json");
|
||||||
|
|
||||||
let channels = [
|
let channels = [
|
||||||
"UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
|
"UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
|
||||||
"UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
|
"UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
|
||||||
|
|
@ -40,10 +43,11 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
||||||
"UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
|
"UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
|
||||||
"UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
|
"UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
|
||||||
"UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
|
"UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
|
||||||
"UCGiJh0NZ52wRhYKYnuZI08Q", // 10e1 (37)
|
"UCXvtcj9xUQhaqPaitFf2DqA", // (170)
|
||||||
|
"UCq-XMc01T641v-4P3hQYJWg", // (636)
|
||||||
];
|
];
|
||||||
|
|
||||||
let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
|
let collected_numbers_all: BTreeMap<Language, BTreeMap<String, u64>> = stream::iter(LANGUAGES)
|
||||||
.map(|lang| async move {
|
.map(|lang| async move {
|
||||||
let mut entry = BTreeMap::new();
|
let mut entry = BTreeMap::new();
|
||||||
|
|
||||||
|
|
@ -54,7 +58,7 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
channel.view_counts.iter().for_each(|(num, txt)| {
|
channel.view_counts.iter().for_each(|(num, txt)| {
|
||||||
entry.insert(get_mag(*num), (txt.to_owned(), *num));
|
entry.insert(txt.to_owned(), *num);
|
||||||
});
|
});
|
||||||
|
|
||||||
println!("collected {}-{}", lang, n);
|
println!("collected {}-{}", lang, n);
|
||||||
|
|
@ -66,32 +70,53 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
||||||
.collect()
|
.collect()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
let collected_numbers: CollectedNumbers = collected_numbers_all
|
||||||
|
.iter()
|
||||||
|
.map(|(lang, entry)| {
|
||||||
|
let mut e2 = BTreeMap::new();
|
||||||
|
entry.iter().for_each(|(txt, num)| {
|
||||||
|
e2.insert(get_mag(*num), (txt.to_owned(), *num));
|
||||||
|
});
|
||||||
|
(*lang, e2)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
let file = File::create(json_path).unwrap();
|
let file = File::create(json_path).unwrap();
|
||||||
serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
|
serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
|
||||||
|
|
||||||
|
let file = File::create(json_path_all).unwrap();
|
||||||
|
serde_json::to_writer_pretty(file, &collected_numbers_all).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Attempt to parse the numbers collected by `collect-large-numbers`
|
/// Attempt to parse the numbers collected by `collect-large-numbers`
|
||||||
/// and write the results to `dictionary.json`.
|
/// and write the results to `dictionary.json`.
|
||||||
///
|
|
||||||
/// Manual corrections:
|
|
||||||
/// as
|
|
||||||
/// "কোঃটা": 9,
|
|
||||||
/// "নিঃটা": 6,
|
|
||||||
/// "নিযুতটা": 6,
|
|
||||||
/// "লাখটা": 5,
|
|
||||||
/// "হাজাৰটা": 3
|
|
||||||
///
|
|
||||||
/// bn
|
|
||||||
/// "লাটি": 5,
|
|
||||||
/// "শত": 2,
|
|
||||||
/// "হাটি": 3,
|
|
||||||
/// "কোটি": 7
|
|
||||||
///
|
|
||||||
/// es/es-US
|
|
||||||
/// "mil": 3,
|
|
||||||
/// "M": 6
|
|
||||||
///
|
|
||||||
pub fn write_samples_to_dict(project_root: &Path) {
|
pub fn write_samples_to_dict(project_root: &Path) {
|
||||||
|
/*
|
||||||
|
Manual corrections:
|
||||||
|
as
|
||||||
|
"কোঃটা": 9,
|
||||||
|
"নিঃটা": 6,
|
||||||
|
"নিযুতটা": 6,
|
||||||
|
"লাখটা": 5,
|
||||||
|
"হাজাৰটা": 3
|
||||||
|
|
||||||
|
ar
|
||||||
|
"ألف": 3,
|
||||||
|
"آلاف": 3,
|
||||||
|
"مليار": 9,
|
||||||
|
"مليون": 6
|
||||||
|
|
||||||
|
bn
|
||||||
|
"লাটি": 5,
|
||||||
|
"শত": 2,
|
||||||
|
"হাটি": 3,
|
||||||
|
"কোটি": 7
|
||||||
|
|
||||||
|
es/es-US
|
||||||
|
"mil": 3,
|
||||||
|
"M": 6
|
||||||
|
*/
|
||||||
|
|
||||||
let mut json_path = project_root.to_path_buf();
|
let mut json_path = project_root.to_path_buf();
|
||||||
json_path.push("testfiles/dict/large_number_samples.json");
|
json_path.push("testfiles/dict/large_number_samples.json");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -245,14 +245,15 @@ pub fn entry(lang: Language) -> Entry {
|
||||||
},
|
},
|
||||||
comma_decimal: false,
|
comma_decimal: false,
|
||||||
number_tokens: ::phf::Map {
|
number_tokens: ::phf::Map {
|
||||||
key: 12913932095322966823,
|
key: 7485420634051515786,
|
||||||
disps: &[
|
disps: &[
|
||||||
(0, 0),
|
(0, 0),
|
||||||
],
|
],
|
||||||
entries: &[
|
entries: &[
|
||||||
("مليار", 9),
|
("آلاف", 3),
|
||||||
("مليون", 6),
|
("مليون", 6),
|
||||||
("ألف", 3),
|
("ألف", 3),
|
||||||
|
("مليار", 9),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
||||||
14
src/util.rs
14
src/util.rs
|
|
@ -383,6 +383,20 @@ mod tests {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn t_parse_large_numstr_samples2() {
|
||||||
|
let json_path = Path::new("testfiles/dict/large_number_samples_all.json");
|
||||||
|
let json_file = File::open(json_path).unwrap();
|
||||||
|
let number_samples: BTreeMap<Language, BTreeMap<String, u64>> =
|
||||||
|
serde_json::from_reader(BufReader::new(json_file)).unwrap();
|
||||||
|
|
||||||
|
number_samples.iter().for_each(|(lang, entry)| {
|
||||||
|
entry.iter().for_each(|(txt, expect)| {
|
||||||
|
testcase_parse_large_numstr(txt, *lang, *expect);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
|
fn testcase_parse_large_numstr(string: &str, lang: Language, expect: u64) {
|
||||||
// Round the expected number to the amount of significant digits included
|
// Round the expected number to the amount of significant digits included
|
||||||
// in the string.
|
// in the string.
|
||||||
|
|
|
||||||
|
|
@ -125,6 +125,7 @@
|
||||||
"comma_decimal": false,
|
"comma_decimal": false,
|
||||||
"number_tokens": {
|
"number_tokens": {
|
||||||
"ألف": 3,
|
"ألف": 3,
|
||||||
|
"آلاف": 3,
|
||||||
"مليار": 9,
|
"مليار": 9,
|
||||||
"مليون": 6
|
"مليون": 6
|
||||||
}
|
}
|
||||||
|
|
|
||||||
15744
testfiles/dict/large_number_samples_all.json
Normal file
15744
testfiles/dict/large_number_samples_all.json
Normal file
File diff suppressed because it is too large
Load diff
Reference in a new issue