fix date parsing

This commit is contained in:
ThetaDev 2022-09-07 18:09:43 +02:00
parent a992495b2b
commit 9ddf9a3ac4
5 changed files with 332 additions and 224 deletions

View file

@ -84,15 +84,7 @@ async fn collect_dates() {
serde_json::to_writer_pretty(file, &collected_dates).unwrap();
}
fn filter_str(string: &str) -> String {
string
.to_lowercase()
.chars()
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
.collect()
}
#[test]
// #[test]
fn write_samples_to_dict() {
let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf();
let json_file = File::open(json_path).unwrap();
@ -132,96 +124,17 @@ fn write_samples_to_dict() {
];
for lang in langs {
let datestr_table = collected_dates.get(&lang).unwrap();
let mut month_words: HashMap<String, usize> = HashMap::new();
let mut num_order = "".to_owned();
// Today/Yesterday
let mut td_words: HashMap<String, i8> = HashMap::new();
{
let mut parse = |string: &str, n: i8| {
filter_str(string).split_whitespace().for_each(|word| {
td_words
.entry(word.to_owned())
.and_modify(|e| *e = 0)
.or_insert(n);
});
};
parse(datestr_table.get(&DateCase::Today).unwrap(), 1);
parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2);
parse(datestr_table.get(&DateCase::Ago).unwrap(), 0);
parse(datestr_table.get(&DateCase::Jan).unwrap(), 0);
}
// n days ago
{
let datestr = datestr_table.get(&DateCase::Ago).unwrap();
let tago = timeago::parse(lang, &datestr);
assert_eq!(
tago,
Some(TimeAgo {
n: 3,
unit: timeago::TimeUnit::Day
}),
"lang: {}, txt: {}",
lang,
datestr
);
}
// Absolute dates (Jan 3, 2020)
months.iter().enumerate().for_each(|(n, m)| {
let datestr = datestr_table.get(m).unwrap();
// Get order of numbers
let nums = util::parse_numeric_vec::<u32>(&datestr);
let date = dates[n];
let this_num_order = nums
.iter()
.map(|n| {
if n == &date.0 {
"Y"
} else if n == &date.1 {
"M"
} else if n == &date.2 {
"D"
} else {
panic!("invalid number {} in {}", n, datestr);
}
})
.collect::<String>();
if num_order == "" {
num_order = this_num_order;
} else {
assert_eq!(this_num_order, num_order);
}
// Insert words into the map
filter_str(&datestr).split_whitespace().for_each(|word| {
month_words
.entry(word.to_owned())
.and_modify(|e| *e = 0)
.or_insert(n + 1);
});
});
let mut datestr_tables = vec![collected_dates.get(&lang).unwrap()];
dict.get(&lang)
.unwrap()
.equivalent
.iter()
.for_each(|l| datestr_tables.push(collected_dates.get(l).unwrap()));
let dict_entry = dict.entry(lang).or_default();
dict_entry.date_order = num_order;
dict_entry.months = month_words
.iter()
.filter_map(|(word, m)| {
if *m == 0 {
None
} else {
Some((word.to_owned(), *m as u8))
}
})
.collect();
let mut num_order = "".to_owned();
match lang {
let collect_nd_tokens = match lang {
Language::Ja
| Language::ZhCn
| Language::ZhHk
@ -232,25 +145,134 @@ fn write_samples_to_dict() {
| Language::Ur
| Language::Uz
| Language::Te
| Language::PtPt
// Singhalese YT translation is broken (today == yesterday)
| Language::Si => {}
_ => {
dict_entry.timeago_nd_tokens = td_words
| Language::Si => false,
_ => true,
};
dict_entry.months = BTreeMap::new();
if collect_nd_tokens {
dict_entry.timeago_nd_tokens = BTreeMap::new();
}
for datestr_table in &datestr_tables {
let mut month_words: HashMap<String, usize> = HashMap::new();
let mut td_words: HashMap<String, i8> = HashMap::new();
// Today/Yesterday
{
let mut parse = |string: &str, n: i8| {
timeago::filter_str(string)
.split_whitespace()
.for_each(|word| {
td_words
.entry(word.to_owned())
.and_modify(|e| *e = 0)
.or_insert(n);
});
};
parse(datestr_table.get(&DateCase::Today).unwrap(), 1);
parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2);
parse(datestr_table.get(&DateCase::Ago).unwrap(), 0);
parse(datestr_table.get(&DateCase::Jan).unwrap(), 0);
}
// n days ago
{
let datestr = datestr_table.get(&DateCase::Ago).unwrap();
let tago = timeago::parse(lang, &datestr);
assert_eq!(
tago,
Some(TimeAgo {
n: 3,
unit: timeago::TimeUnit::Day
}),
"lang: {}, txt: {}",
lang,
datestr
);
}
// Absolute dates (Jan 3, 2020)
months.iter().enumerate().for_each(|(n, m)| {
let datestr = datestr_table.get(m).unwrap();
// Get order of numbers
let nums = util::parse_numeric_vec::<u32>(&datestr);
let date = dates[n];
let this_num_order = nums
.iter()
.filter_map(|(word, n)| {
match n {
// Today
1 => Some((word.to_owned(), "0D".to_owned())),
// Yesterday
2 => Some((word.to_owned(), "1D".to_owned())),
_ => None,
.map(|n| {
if n == &date.0 {
"Y"
} else if n == &date.1 {
"M"
} else if n == &date.2 {
"D"
} else {
panic!("invalid number {} in {}", n, datestr);
}
})
.collect();
.collect::<String>();
assert_eq!(dict_entry.timeago_nd_tokens.len(), 2, "lang: {}, nd_tokens: {:?}", lang, &dict_entry.timeago_nd_tokens);
if num_order == "" {
num_order = this_num_order;
} else {
assert_eq!(this_num_order, num_order, "lang: {}", lang);
}
// Insert words into the map
timeago::filter_str(&datestr)
.split_whitespace()
.for_each(|word| {
month_words
.entry(word.to_owned())
.and_modify(|e| *e = 0)
.or_insert(n + 1);
});
});
month_words.iter().for_each(|(word, m)| {
if *m != 0 {
dict_entry.months.insert(word.to_owned(), *m as u8);
};
});
if collect_nd_tokens {
td_words.iter().for_each(|(word, n)| {
match n {
// Today
1 => {
dict_entry
.timeago_nd_tokens
.insert(word.to_owned(), "0D".to_owned());
}
// Yesterday
2 => {
dict_entry
.timeago_nd_tokens
.insert(word.to_owned(), "1D".to_owned());
}
_ => {}
};
});
if datestr_tables.len() == 1 {
assert_eq!(
dict_entry.timeago_nd_tokens.len(),
2,
"lang: {}, nd_tokens: {:?}",
lang,
&dict_entry.timeago_nd_tokens
);
}
}
}
dict_entry.date_order = num_order;
}
super::write_dict(&dict);

View file

@ -27,7 +27,7 @@ fn parse_tu(tu: &str) -> (u8, Option<TimeUnit>) {
}
}
#[test]
// #[test]
fn generate_dictionary() {
let dict = super::read_dict();

View file

@ -815,23 +815,24 @@ pub fn entry(lang: Language) -> Entry {
months: ::phf::Map {
key: 12913932095322966823,
disps: &[
(11, 10),
(0, 0),
(0, 0),
(8, 0),
(5, 8),
(4, 0),
],
entries: &[
("oct", 10),
("sep", 9),
("jul", 7),
("jun", 6),
("may", 5),
("nov", 11),
("sept", 9),
("apr", 4),
("mar", 3),
("jan", 1),
("aug", 8),
("feb", 2),
("dec", 12),
("mar", 3),
("jun", 6),
("sep", 9),
("may", 5),
("jul", 7),
("jan", 1),
("oct", 10),
("feb", 2),
("aug", 8),
],
},
timeago_nd_tokens: ::phf::Map {
@ -1199,25 +1200,26 @@ pub fn entry(lang: Language) -> Entry {
},
date_order: &[DateCmp::D, DateCmp::Y],
months: ::phf::Map {
key: 15467950696543387533,
key: 12913932095322966823,
disps: &[
(0, 0),
(5, 2),
(2, 1),
(9, 0),
(7, 6),
(1, 5),
],
entries: &[
("déc.", 12),
("mai", 5),
("sept.", 9),
("nov.", 11),
("mars", 3),
("févr.", 2),
("avr.", 4),
("janv.", 1),
("août", 8),
("oct.", 10),
("avr.", 4),
("juil.", 7),
("févr.", 2),
("oct.", 10),
("déc.", 12),
("janv.", 1),
("mars", 3),
("juin", 6),
("juill.", 7),
("nov.", 11),
("sept.", 9),
("mai", 5),
],
},
timeago_nd_tokens: ::phf::Map {
@ -2121,25 +2123,25 @@ pub fn entry(lang: Language) -> Entry {
},
date_order: &[DateCmp::Y, DateCmp::D],
months: ::phf::Map {
key: 12913932095322966823,
key: 2980949210194914378,
disps: &[
(0, 0),
(4, 0),
(0, 10),
(5, 0),
(0, 0),
],
entries: &[
("-янв.", 1),
("-июн.", 6),
("-апр.", 4),
("-мар.", 3),
("-дек.", 12),
("-май", 5),
("-июл.", 7),
("-авг.", 8),
("-ноя.", 11),
("-фев.", 2),
("-сен.", 9),
("-окт.", 10),
("фев.", 2),
("ноя.", 11),
("авг.", 8),
("май", 5),
("янв.", 1),
("окт.", 10),
("июл.", 7),
("сен.", 9),
("июн.", 6),
("мар.", 3),
("дек.", 12),
("апр.", 4),
],
},
timeago_nd_tokens: ::phf::Map {
@ -2961,7 +2963,7 @@ pub fn entry(lang: Language) -> Entry {
],
},
},
Language::Pt | Language::PtPt => Entry {
Language::Pt => Entry {
by_char: false,
timeago_tokens: ::phf::Map {
key: 10121458955350035957,
@ -3021,6 +3023,51 @@ pub fn entry(lang: Language) -> Entry {
],
},
},
Language::PtPt => Entry {
by_char: false,
timeago_tokens: ::phf::Map {
key: 10121458955350035957,
disps: &[
(6, 9),
(0, 0),
(2, 6),
],
entries: &[
("segundos", TaToken { n: 1, unit: Some(TimeUnit::Second) }),
("dia", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
("ano", TaToken { n: 1, unit: Some(TimeUnit::Year) }),
("meses", TaToken { n: 1, unit: Some(TimeUnit::Month) }),
("anos", TaToken { n: 1, unit: Some(TimeUnit::Year) }),
("minuto", TaToken { n: 1, unit: Some(TimeUnit::Minute) }),
("semana", TaToken { n: 1, unit: Some(TimeUnit::Week) }),
("hora", TaToken { n: 1, unit: Some(TimeUnit::Hour) }),
("semanas", TaToken { n: 1, unit: Some(TimeUnit::Week) }),
("segundo", TaToken { n: 1, unit: Some(TimeUnit::Second) }),
("minutos", TaToken { n: 1, unit: Some(TimeUnit::Minute) }),
("horas", TaToken { n: 1, unit: Some(TimeUnit::Hour) }),
("mês", TaToken { n: 1, unit: Some(TimeUnit::Month) }),
("dias", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
],
},
date_order: &[DateCmp::D, DateCmp::M, DateCmp::Y],
months: ::phf::Map {
key: 12913932095322966823,
disps: &[
],
entries: &[
],
},
timeago_nd_tokens: ::phf::Map {
key: 12913932095322966823,
disps: &[
(1, 0),
],
entries: &[
("hoje", TaToken { n: 0, unit: Some(TimeUnit::Day) }),
("ontem", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
],
},
},
Language::Ro => Entry {
by_char: false,
timeago_tokens: ::phf::Map {
@ -3950,25 +3997,25 @@ pub fn entry(lang: Language) -> Entry {
},
date_order: &[DateCmp::D, DateCmp::Y],
months: ::phf::Map {
key: 10121458955350035957,
key: 12913932095322966823,
disps: &[
(2, 5),
(0, 0),
(5, 9),
(0, 1),
(9, 6),
],
entries: &[
("-avg,", 8),
("-iyn,", 6),
("-sen,", 9),
("-noy,", 11),
("-fev,", 2),
("-apr,", 4),
("-yan,", 1),
("-may,", 5),
("-mar,", 3),
("-dek,", 12),
("-okt,", 10),
("-iyl,", 7),
("mar,", 3),
("may,", 5),
("noy,", 11),
("avg,", 8),
("iyl,", 7),
("iyn,", 6),
("yan,", 1),
("fev,", 2),
("dek,", 12),
("okt,", 10),
("sen,", 9),
("apr,", 4),
],
},
timeago_nd_tokens: ::phf::Map {
@ -4160,25 +4207,25 @@ pub fn entry(lang: Language) -> Entry {
},
date_order: &[DateCmp::D, DateCmp::Y],
months: ::phf::Map {
key: 15467950696543387533,
key: 7485420634051515786,
disps: &[
(6, 0),
(0, 0),
(2, 1),
(2, 0),
(1, 9),
(1, 10),
],
entries: &[
("ka-jun", 6),
("ka-eph", 4),
("ka-sep", 9),
("ka-dis", 12),
("ka-jan", 1),
("ka-nov", 11),
("ka-mey", 5),
("ka-mas", 3),
("ka-feb", 2),
("ka-okt", 10),
("ka-aga", 8),
("ka-jul", 7),
("dis", 12),
("jul", 7),
("aga", 8),
("okt", 10),
("feb", 2),
("nov", 11),
("sep", 9),
("mas", 3),
("eph", 4),
("mey", 5),
("jan", 1),
("jun", 6),
],
},
timeago_nd_tokens: ::phf::Map {

View file

@ -90,11 +90,19 @@ impl Mul<u8> for TimeAgo {
}
}
fn filter_str(string: &str) -> String {
pub fn filter_str(string: &str) -> String {
string
.to_lowercase()
.chars()
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
.filter_map(|c| {
if c == '\u{200b}' || c.is_ascii_digit() {
None
} else if c == '-' {
Some(' ')
} else {
Some(c)
}
})
.collect()
}
@ -439,6 +447,7 @@ mod tests {
#[case(Language::En, "Updated today", Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })))]
#[case(Language::En, "Updated yesterday", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))]
#[case(Language::En, "Updated 2 days ago", Some(ParsedDate::Relative(TimeAgo { n: 2, unit: TimeUnit::Day })))]
#[case(Language::Si, "ඊයේ යාවත්කාලීන කරන ලදී", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))]
#[case(
Language::En,
"Last updated on Jun 04, 2003",
@ -473,7 +482,11 @@ mod tests {
assert_eq!(
parse_date(*lang, samples.get("Yesterday").unwrap()),
Some(ParsedDate::Relative(TimeAgo {
n: 1,
// YT's Singhalese translation has an error (yesterday == today)
n: match lang {
Language::Si => 0,
_ => 1,
},
unit: TimeUnit::Day
})),
"lang: {}",

View file

@ -518,7 +518,8 @@
"may": 5,
"nov": 11,
"oct": 10,
"sep": 9
"sep": 9,
"sept": 9
},
"timeago_nd_tokens": {
"today": "0D",
@ -761,6 +762,7 @@
"févr.": 2,
"janv.": 1,
"juil.": 7,
"juill.": 7,
"juin": 6,
"mai": 5,
"mars": 3,
@ -1332,18 +1334,18 @@
},
"date_order": "YD",
"months": {
"-авг.": 8,
"-апр.": 4,
"-дек.": 12,
"-июл.": 7,
"-июн.": 6,
"-май": 5,
"-мар.": 3,
"-ноя.": 11,
"-окт.": 10,
"-сен.": 9,
"-фев.": 2,
"-янв.": 1
"авг.": 8,
"апр.": 4,
"дек.": 12,
"июл.": 7,
"июн.": 6,
"май": 5,
"мар.": 3,
"ноя.": 11,
"окт.": 10,
"сен.": 9,
"фев.": 2,
"янв.": 1
},
"timeago_nd_tokens": {
"бүгүн": "0D",
@ -1855,9 +1857,7 @@
}
},
"pt": {
"equivalent": [
"pt-PT"
],
"equivalent": [],
"by_char": false,
"timeago_tokens": {
"ano": "Y",
@ -1895,6 +1895,32 @@
"ontem": "1D"
}
},
"pt-PT": {
"equivalent": [],
"by_char": false,
"timeago_tokens": {
"ano": "Y",
"anos": "Y",
"dia": "D",
"dias": "D",
"hora": "h",
"horas": "h",
"meses": "M",
"minuto": "m",
"minutos": "m",
"mês": "M",
"segundo": "s",
"segundos": "s",
"semana": "W",
"semanas": "W"
},
"date_order": "DMY",
"months": {},
"timeago_nd_tokens": {
"hoje": "0D",
"ontem": "1D"
}
},
"ro": {
"equivalent": [],
"by_char": false,
@ -2487,18 +2513,18 @@
},
"date_order": "DY",
"months": {
"-apr,": 4,
"-avg,": 8,
"-dek,": 12,
"-fev,": 2,
"-iyl,": 7,
"-iyn,": 6,
"-mar,": 3,
"-may,": 5,
"-noy,": 11,
"-okt,": 10,
"-sen,": 9,
"-yan,": 1
"apr,": 4,
"avg,": 8,
"dek,": 12,
"fev,": 2,
"iyl,": 7,
"iyn,": 6,
"mar,": 3,
"may,": 5,
"noy,": 11,
"okt,": 10,
"sen,": 9,
"yan,": 1
},
"timeago_nd_tokens": {
"bugun": "0D",
@ -2604,18 +2630,18 @@
},
"date_order": "DY",
"months": {
"ka-aga": 8,
"ka-dis": 12,
"ka-eph": 4,
"ka-feb": 2,
"ka-jan": 1,
"ka-jul": 7,
"ka-jun": 6,
"ka-mas": 3,
"ka-mey": 5,
"ka-nov": 11,
"ka-okt": 10,
"ka-sep": 9
"aga": 8,
"dis": 12,
"eph": 4,
"feb": 2,
"jan": 1,
"jul": 7,
"jun": 6,
"mas": 3,
"mey": 5,
"nov": 11,
"okt": 10,
"sep": 9
},
"timeago_nd_tokens": {
"izolo": "1D",