From 9ddf9a3ac4a9cabc055c568f0d7c6a3d0c547c75 Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Wed, 7 Sep 2022 18:09:43 +0200 Subject: [PATCH] fix date parsing --- src/codegen/collect_playlist_dates.rs | 238 ++++++++++++++------------ src/codegen/gen_dictionary.rs | 2 +- src/dictionary.rs | 191 +++++++++++++-------- src/timeago.rs | 19 +- testfiles/date/dictionary.json | 106 +++++++----- 5 files changed, 332 insertions(+), 224 deletions(-) diff --git a/src/codegen/collect_playlist_dates.rs b/src/codegen/collect_playlist_dates.rs index 6801a29..fcf1f4b 100644 --- a/src/codegen/collect_playlist_dates.rs +++ b/src/codegen/collect_playlist_dates.rs @@ -84,15 +84,7 @@ async fn collect_dates() { serde_json::to_writer_pretty(file, &collected_dates).unwrap(); } -fn filter_str(string: &str) -> String { - string - .to_lowercase() - .chars() - .filter(|c| c != &'\u{200b}' && !c.is_ascii_digit()) - .collect() -} - -#[test] +// #[test] fn write_samples_to_dict() { let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf(); let json_file = File::open(json_path).unwrap(); @@ -132,96 +124,17 @@ fn write_samples_to_dict() { ]; for lang in langs { - let datestr_table = collected_dates.get(&lang).unwrap(); - let mut month_words: HashMap = HashMap::new(); - let mut num_order = "".to_owned(); - - // Today/Yesterday - let mut td_words: HashMap = HashMap::new(); - { - let mut parse = |string: &str, n: i8| { - filter_str(string).split_whitespace().for_each(|word| { - td_words - .entry(word.to_owned()) - .and_modify(|e| *e = 0) - .or_insert(n); - }); - }; - - parse(datestr_table.get(&DateCase::Today).unwrap(), 1); - parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2); - parse(datestr_table.get(&DateCase::Ago).unwrap(), 0); - parse(datestr_table.get(&DateCase::Jan).unwrap(), 0); - } - - // n days ago - { - let datestr = datestr_table.get(&DateCase::Ago).unwrap(); - let tago = timeago::parse(lang, &datestr); - assert_eq!( - tago, - Some(TimeAgo { - n: 3, - unit: timeago::TimeUnit::Day - }), - "lang: {}, txt: {}", - lang, - datestr - ); - } - - // Absolute dates (Jan 3, 2020) - months.iter().enumerate().for_each(|(n, m)| { - let datestr = datestr_table.get(m).unwrap(); - - // Get order of numbers - let nums = util::parse_numeric_vec::(&datestr); - let date = dates[n]; - - let this_num_order = nums - .iter() - .map(|n| { - if n == &date.0 { - "Y" - } else if n == &date.1 { - "M" - } else if n == &date.2 { - "D" - } else { - panic!("invalid number {} in {}", n, datestr); - } - }) - .collect::(); - - if num_order == "" { - num_order = this_num_order; - } else { - assert_eq!(this_num_order, num_order); - } - - // Insert words into the map - filter_str(&datestr).split_whitespace().for_each(|word| { - month_words - .entry(word.to_owned()) - .and_modify(|e| *e = 0) - .or_insert(n + 1); - }); - }); + let mut datestr_tables = vec![collected_dates.get(&lang).unwrap()]; + dict.get(&lang) + .unwrap() + .equivalent + .iter() + .for_each(|l| datestr_tables.push(collected_dates.get(l).unwrap())); let dict_entry = dict.entry(lang).or_default(); - dict_entry.date_order = num_order; - dict_entry.months = month_words - .iter() - .filter_map(|(word, m)| { - if *m == 0 { - None - } else { - Some((word.to_owned(), *m as u8)) - } - }) - .collect(); + let mut num_order = "".to_owned(); - match lang { + let collect_nd_tokens = match lang { Language::Ja | Language::ZhCn | Language::ZhHk @@ -232,25 +145,134 @@ fn write_samples_to_dict() { | Language::Ur | Language::Uz | Language::Te + | Language::PtPt // Singhalese YT translation is broken (today == tomorrow) - | Language::Si => {} - _ => { - dict_entry.timeago_nd_tokens = td_words + | Language::Si => false, + _ => true, + }; + + dict_entry.months = BTreeMap::new(); + + if collect_nd_tokens { + dict_entry.timeago_nd_tokens = BTreeMap::new(); + } + + for datestr_table in &datestr_tables { + let mut month_words: HashMap = HashMap::new(); + let mut td_words: HashMap = HashMap::new(); + + // Today/Yesterday + { + let mut parse = |string: &str, n: i8| { + timeago::filter_str(string) + .split_whitespace() + .for_each(|word| { + td_words + .entry(word.to_owned()) + .and_modify(|e| *e = 0) + .or_insert(n); + }); + }; + + parse(datestr_table.get(&DateCase::Today).unwrap(), 1); + parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2); + parse(datestr_table.get(&DateCase::Ago).unwrap(), 0); + parse(datestr_table.get(&DateCase::Jan).unwrap(), 0); + } + + // n days ago + { + let datestr = datestr_table.get(&DateCase::Ago).unwrap(); + let tago = timeago::parse(lang, &datestr); + assert_eq!( + tago, + Some(TimeAgo { + n: 3, + unit: timeago::TimeUnit::Day + }), + "lang: {}, txt: {}", + lang, + datestr + ); + } + + // Absolute dates (Jan 3, 2020) + months.iter().enumerate().for_each(|(n, m)| { + let datestr = datestr_table.get(m).unwrap(); + + // Get order of numbers + let nums = util::parse_numeric_vec::(&datestr); + let date = dates[n]; + + let this_num_order = nums .iter() - .filter_map(|(word, n)| { - match n { - // Today - 1 => Some((word.to_owned(), "0D".to_owned())), - // Yesterday - 2 => Some((word.to_owned(), "1D".to_owned())), - _ => None, + .map(|n| { + if n == &date.0 { + "Y" + } else if n == &date.1 { + "M" + } else if n == &date.2 { + "D" + } else { + panic!("invalid number {} in {}", n, datestr); } }) - .collect(); + .collect::(); - assert_eq!(dict_entry.timeago_nd_tokens.len(), 2, "lang: {}, nd_tokens: {:?}", lang, &dict_entry.timeago_nd_tokens); + if num_order == "" { + num_order = this_num_order; + } else { + assert_eq!(this_num_order, num_order, "lang: {}", lang); + } + + // Insert words into the map + timeago::filter_str(&datestr) + .split_whitespace() + .for_each(|word| { + month_words + .entry(word.to_owned()) + .and_modify(|e| *e = 0) + .or_insert(n + 1); + }); + }); + + month_words.iter().for_each(|(word, m)| { + if *m != 0 { + dict_entry.months.insert(word.to_owned(), *m as u8); + }; + }); + + if collect_nd_tokens { + td_words.iter().for_each(|(word, n)| { + match n { + // Today + 1 => { + dict_entry + .timeago_nd_tokens + .insert(word.to_owned(), "0D".to_owned()); + } + // Yesterday + 2 => { + dict_entry + .timeago_nd_tokens + .insert(word.to_owned(), "1D".to_owned()); + } + _ => {} + }; + }); + + if datestr_tables.len() == 1 { + assert_eq!( + dict_entry.timeago_nd_tokens.len(), + 2, + "lang: {}, nd_tokens: {:?}", + lang, + &dict_entry.timeago_nd_tokens + ); + } } } + dict_entry.date_order = num_order; } super::write_dict(&dict); diff --git a/src/codegen/gen_dictionary.rs b/src/codegen/gen_dictionary.rs index 143c950..be66a7c 100644 --- a/src/codegen/gen_dictionary.rs +++ b/src/codegen/gen_dictionary.rs @@ -27,7 +27,7 @@ fn parse_tu(tu: &str) -> (u8, Option) { } } -#[test] +// #[test] fn generate_dictionary() { let dict = super::read_dict(); diff --git a/src/dictionary.rs b/src/dictionary.rs index bf94de8..520ff7c 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -815,23 +815,24 @@ pub fn entry(lang: Language) -> Entry { months: ::phf::Map { key: 12913932095322966823, disps: &[ - (11, 10), - (0, 0), - (0, 0), + (8, 0), + (5, 8), + (4, 0), ], entries: &[ - ("oct", 10), - ("sep", 9), - ("jul", 7), - ("jun", 6), - ("may", 5), ("nov", 11), + ("sept", 9), ("apr", 4), - ("mar", 3), - ("jan", 1), - ("aug", 8), - ("feb", 2), ("dec", 12), + ("mar", 3), + ("jun", 6), + ("sep", 9), + ("may", 5), + ("jul", 7), + ("jan", 1), + ("oct", 10), + ("feb", 2), + ("aug", 8), ], }, timeago_nd_tokens: ::phf::Map { @@ -1199,25 +1200,26 @@ pub fn entry(lang: Language) -> Entry { }, date_order: &[DateCmp::D, DateCmp::Y], months: ::phf::Map { - key: 15467950696543387533, + key: 12913932095322966823, disps: &[ - (0, 0), - (5, 2), - (2, 1), + (9, 0), + (7, 6), + (1, 5), ], entries: &[ - ("déc.", 12), - ("mai", 5), - ("sept.", 9), - ("nov.", 11), - ("mars", 3), - ("févr.", 2), - ("avr.", 4), - ("janv.", 1), ("août", 8), - ("oct.", 10), + ("avr.", 4), ("juil.", 7), + ("févr.", 2), + ("oct.", 10), + ("déc.", 12), + ("janv.", 1), + ("mars", 3), ("juin", 6), + ("juill.", 7), + ("nov.", 11), + ("sept.", 9), + ("mai", 5), ], }, timeago_nd_tokens: ::phf::Map { @@ -2121,25 +2123,25 @@ pub fn entry(lang: Language) -> Entry { }, date_order: &[DateCmp::Y, DateCmp::D], months: ::phf::Map { - key: 12913932095322966823, + key: 2980949210194914378, disps: &[ (0, 0), - (4, 0), - (0, 10), + (5, 0), + (0, 0), ], entries: &[ - ("-янв.", 1), - ("-июн.", 6), - ("-апр.", 4), - ("-мар.", 3), - ("-дек.", 12), - ("-май", 5), - ("-июл.", 7), - ("-авг.", 8), - ("-ноя.", 11), - ("-фев.", 2), - ("-сен.", 9), - ("-окт.", 10), + ("фев.", 2), + ("ноя.", 11), + ("авг.", 8), + ("май", 5), + ("янв.", 1), + ("окт.", 10), + ("июл.", 7), + ("сен.", 9), + ("июн.", 6), + ("мар.", 3), + ("дек.", 12), + ("апр.", 4), ], }, timeago_nd_tokens: ::phf::Map { @@ -2961,7 +2963,7 @@ pub fn entry(lang: Language) -> Entry { ], }, }, - Language::Pt | Language::PtPt => Entry { + Language::Pt => Entry { by_char: false, timeago_tokens: ::phf::Map { key: 10121458955350035957, @@ -3021,6 +3023,51 @@ pub fn entry(lang: Language) -> Entry { ], }, }, + Language::PtPt => Entry { + by_char: false, + timeago_tokens: ::phf::Map { + key: 10121458955350035957, + disps: &[ + (6, 9), + (0, 0), + (2, 6), + ], + entries: &[ + ("segundos", TaToken { n: 1, unit: Some(TimeUnit::Second) }), + ("dia", TaToken { n: 1, unit: Some(TimeUnit::Day) }), + ("ano", TaToken { n: 1, unit: Some(TimeUnit::Year) }), + ("meses", TaToken { n: 1, unit: Some(TimeUnit::Month) }), + ("anos", TaToken { n: 1, unit: Some(TimeUnit::Year) }), + ("minuto", TaToken { n: 1, unit: Some(TimeUnit::Minute) }), + ("semana", TaToken { n: 1, unit: Some(TimeUnit::Week) }), + ("hora", TaToken { n: 1, unit: Some(TimeUnit::Hour) }), + ("semanas", TaToken { n: 1, unit: Some(TimeUnit::Week) }), + ("segundo", TaToken { n: 1, unit: Some(TimeUnit::Second) }), + ("minutos", TaToken { n: 1, unit: Some(TimeUnit::Minute) }), + ("horas", TaToken { n: 1, unit: Some(TimeUnit::Hour) }), + ("mês", TaToken { n: 1, unit: Some(TimeUnit::Month) }), + ("dias", TaToken { n: 1, unit: Some(TimeUnit::Day) }), + ], + }, + date_order: &[DateCmp::D, DateCmp::M, DateCmp::Y], + months: ::phf::Map { + key: 12913932095322966823, + disps: &[ + ], + entries: &[ + ], + }, + timeago_nd_tokens: ::phf::Map { + key: 12913932095322966823, + disps: &[ + (1, 0), + ], + entries: &[ + ("hoje", TaToken { n: 0, unit: Some(TimeUnit::Day) }), + ("ontem", TaToken { n: 1, unit: Some(TimeUnit::Day) }), + ], + }, + }, Language::Ro => Entry { by_char: false, timeago_tokens: ::phf::Map { @@ -3950,25 +3997,25 @@ pub fn entry(lang: Language) -> Entry { }, date_order: &[DateCmp::D, DateCmp::Y], months: ::phf::Map { - key: 10121458955350035957, + key: 12913932095322966823, disps: &[ + (2, 5), (0, 0), - (5, 9), - (0, 1), + (9, 6), ], entries: &[ - ("-avg,", 8), - ("-iyn,", 6), - ("-sen,", 9), - ("-noy,", 11), - ("-fev,", 2), - ("-apr,", 4), - ("-yan,", 1), - ("-may,", 5), - ("-mar,", 3), - ("-dek,", 12), - ("-okt,", 10), - ("-iyl,", 7), + ("mar,", 3), + ("may,", 5), + ("noy,", 11), + ("avg,", 8), + ("iyl,", 7), + ("iyn,", 6), + ("yan,", 1), + ("fev,", 2), + ("dek,", 12), + ("okt,", 10), + ("sen,", 9), + ("apr,", 4), ], }, timeago_nd_tokens: ::phf::Map { @@ -4160,25 +4207,25 @@ pub fn entry(lang: Language) -> Entry { }, date_order: &[DateCmp::D, DateCmp::Y], months: ::phf::Map { - key: 15467950696543387533, + key: 7485420634051515786, disps: &[ - (6, 0), - (0, 0), - (2, 1), + (2, 0), + (1, 9), + (1, 10), ], entries: &[ - ("ka-jun", 6), - ("ka-eph", 4), - ("ka-sep", 9), - ("ka-dis", 12), - ("ka-jan", 1), - ("ka-nov", 11), - ("ka-mey", 5), - ("ka-mas", 3), - ("ka-feb", 2), - ("ka-okt", 10), - ("ka-aga", 8), - ("ka-jul", 7), + ("dis", 12), + ("jul", 7), + ("aga", 8), + ("okt", 10), + ("feb", 2), + ("nov", 11), + ("sep", 9), + ("mas", 3), + ("eph", 4), + ("mey", 5), + ("jan", 1), + ("jun", 6), ], }, timeago_nd_tokens: ::phf::Map { diff --git a/src/timeago.rs b/src/timeago.rs index df87dc0..71ed3d7 100644 --- a/src/timeago.rs +++ b/src/timeago.rs @@ -90,11 +90,19 @@ impl Mul for TimeAgo { } } -fn filter_str(string: &str) -> String { +pub fn filter_str(string: &str) -> String { string .to_lowercase() .chars() - .filter(|c| c != &'\u{200b}' && !c.is_ascii_digit()) + .filter_map(|c| { + if c == '\u{200b}' || c.is_ascii_digit() { + None + } else if c == '-' { + Some(' ') + } else { + Some(c) + } + }) .collect() } @@ -439,6 +447,7 @@ mod tests { #[case(Language::En, "Updated today", Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })))] #[case(Language::En, "Updated yesterday", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))] #[case(Language::En, "Updated 2 days ago", Some(ParsedDate::Relative(TimeAgo { n: 2, unit: TimeUnit::Day })))] + #[case(Language::Si, "ඊයේ යාවත්කාලීන කරන ලදී", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))] #[case( Language::En, "Last updated on Jun 04, 2003", @@ -473,7 +482,11 @@ mod tests { assert_eq!( parse_date(*lang, samples.get("Yesterday").unwrap()), Some(ParsedDate::Relative(TimeAgo { - n: 1, + // YT's Singhalese translation has an error (yesterday == today) + n: match lang { + Language::Si => 0, + _ => 1, + }, unit: TimeUnit::Day })), "lang: {}", diff --git a/testfiles/date/dictionary.json b/testfiles/date/dictionary.json index 7a04e9b..9e7513a 100644 --- a/testfiles/date/dictionary.json +++ b/testfiles/date/dictionary.json @@ -518,7 +518,8 @@ "may": 5, "nov": 11, "oct": 10, - "sep": 9 + "sep": 9, + "sept": 9 }, "timeago_nd_tokens": { "today": "0D", @@ -761,6 +762,7 @@ "févr.": 2, "janv.": 1, "juil.": 7, + "juill.": 7, "juin": 6, "mai": 5, "mars": 3, @@ -1332,18 +1334,18 @@ }, "date_order": "YD", "months": { - "-авг.": 8, - "-апр.": 4, - "-дек.": 12, - "-июл.": 7, - "-июн.": 6, - "-май": 5, - "-мар.": 3, - "-ноя.": 11, - "-окт.": 10, - "-сен.": 9, - "-фев.": 2, - "-янв.": 1 + "авг.": 8, + "апр.": 4, + "дек.": 12, + "июл.": 7, + "июн.": 6, + "май": 5, + "мар.": 3, + "ноя.": 11, + "окт.": 10, + "сен.": 9, + "фев.": 2, + "янв.": 1 }, "timeago_nd_tokens": { "бүгүн": "0D", @@ -1855,9 +1857,7 @@ } }, "pt": { - "equivalent": [ - "pt-PT" - ], + "equivalent": [], "by_char": false, "timeago_tokens": { "ano": "Y", @@ -1895,6 +1895,32 @@ "ontem": "1D" } }, + "pt-PT": { + "equivalent": [], + "by_char": false, + "timeago_tokens": { + "ano": "Y", + "anos": "Y", + "dia": "D", + "dias": "D", + "hora": "h", + "horas": "h", + "meses": "M", + "minuto": "m", + "minutos": "m", + "mês": "M", + "segundo": "s", + "segundos": "s", + "semana": "W", + "semanas": "W" + }, + "date_order": "DMY", + "months": {}, + "timeago_nd_tokens": { + "hoje": "0D", + "ontem": "1D" + } + }, "ro": { "equivalent": [], "by_char": false, @@ -2487,18 +2513,18 @@ }, "date_order": "DY", "months": { - "-apr,": 4, - "-avg,": 8, - "-dek,": 12, - "-fev,": 2, - "-iyl,": 7, - "-iyn,": 6, - "-mar,": 3, - "-may,": 5, - "-noy,": 11, - "-okt,": 10, - "-sen,": 9, - "-yan,": 1 + "apr,": 4, + "avg,": 8, + "dek,": 12, + "fev,": 2, + "iyl,": 7, + "iyn,": 6, + "mar,": 3, + "may,": 5, + "noy,": 11, + "okt,": 10, + "sen,": 9, + "yan,": 1 }, "timeago_nd_tokens": { "bugun": "0D", @@ -2604,18 +2630,18 @@ }, "date_order": "DY", "months": { - "ka-aga": 8, - "ka-dis": 12, - "ka-eph": 4, - "ka-feb": 2, - "ka-jan": 1, - "ka-jul": 7, - "ka-jun": 6, - "ka-mas": 3, - "ka-mey": 5, - "ka-nov": 11, - "ka-okt": 10, - "ka-sep": 9 + "aga": 8, + "dis": 12, + "eph": 4, + "feb": 2, + "jan": 1, + "jul": 7, + "jun": 6, + "mas": 3, + "mey": 5, + "nov": 11, + "okt": 10, + "sep": 9 }, "timeago_nd_tokens": { "izolo": "1D",