fix date parsing
This commit is contained in:
parent
a992495b2b
commit
9ddf9a3ac4
5 changed files with 332 additions and 224 deletions
|
|
@ -84,15 +84,7 @@ async fn collect_dates() {
|
|||
serde_json::to_writer_pretty(file, &collected_dates).unwrap();
|
||||
}
|
||||
|
||||
/// Normalizes a date string for word extraction: lowercases it and drops
/// every zero-width space (U+200B) and every ASCII digit, returning the
/// remaining characters as a new `String`.
fn filter_str(string: &str) -> String {
|
||||
string
|
||||
.to_lowercase()
|
||||
.chars()
|
||||
// Keep a char only if it is neither a zero-width space nor an ASCII digit.
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[test]
|
||||
// #[test]
|
||||
fn write_samples_to_dict() {
|
||||
let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf();
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
|
|
@ -132,96 +124,17 @@ fn write_samples_to_dict() {
|
|||
];
|
||||
|
||||
for lang in langs {
|
||||
let datestr_table = collected_dates.get(&lang).unwrap();
|
||||
let mut month_words: HashMap<String, usize> = HashMap::new();
|
||||
let mut num_order = "".to_owned();
|
||||
|
||||
// Today/Yesterday
|
||||
let mut td_words: HashMap<String, i8> = HashMap::new();
|
||||
{
|
||||
let mut parse = |string: &str, n: i8| {
|
||||
filter_str(string).split_whitespace().for_each(|word| {
|
||||
td_words
|
||||
.entry(word.to_owned())
|
||||
.and_modify(|e| *e = 0)
|
||||
.or_insert(n);
|
||||
});
|
||||
};
|
||||
|
||||
parse(datestr_table.get(&DateCase::Today).unwrap(), 1);
|
||||
parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2);
|
||||
parse(datestr_table.get(&DateCase::Ago).unwrap(), 0);
|
||||
parse(datestr_table.get(&DateCase::Jan).unwrap(), 0);
|
||||
}
|
||||
|
||||
// n days ago
|
||||
{
|
||||
let datestr = datestr_table.get(&DateCase::Ago).unwrap();
|
||||
let tago = timeago::parse(lang, &datestr);
|
||||
assert_eq!(
|
||||
tago,
|
||||
Some(TimeAgo {
|
||||
n: 3,
|
||||
unit: timeago::TimeUnit::Day
|
||||
}),
|
||||
"lang: {}, txt: {}",
|
||||
lang,
|
||||
datestr
|
||||
);
|
||||
}
|
||||
|
||||
// Absolute dates (Jan 3, 2020)
|
||||
months.iter().enumerate().for_each(|(n, m)| {
|
||||
let datestr = datestr_table.get(m).unwrap();
|
||||
|
||||
// Get order of numbers
|
||||
let nums = util::parse_numeric_vec::<u32>(&datestr);
|
||||
let date = dates[n];
|
||||
|
||||
let this_num_order = nums
|
||||
.iter()
|
||||
.map(|n| {
|
||||
if n == &date.0 {
|
||||
"Y"
|
||||
} else if n == &date.1 {
|
||||
"M"
|
||||
} else if n == &date.2 {
|
||||
"D"
|
||||
} else {
|
||||
panic!("invalid number {} in {}", n, datestr);
|
||||
}
|
||||
})
|
||||
.collect::<String>();
|
||||
|
||||
if num_order == "" {
|
||||
num_order = this_num_order;
|
||||
} else {
|
||||
assert_eq!(this_num_order, num_order);
|
||||
}
|
||||
|
||||
// Insert words into the map
|
||||
filter_str(&datestr).split_whitespace().for_each(|word| {
|
||||
month_words
|
||||
.entry(word.to_owned())
|
||||
.and_modify(|e| *e = 0)
|
||||
.or_insert(n + 1);
|
||||
});
|
||||
});
|
||||
let mut datestr_tables = vec![collected_dates.get(&lang).unwrap()];
|
||||
dict.get(&lang)
|
||||
.unwrap()
|
||||
.equivalent
|
||||
.iter()
|
||||
.for_each(|l| datestr_tables.push(collected_dates.get(l).unwrap()));
|
||||
|
||||
let dict_entry = dict.entry(lang).or_default();
|
||||
dict_entry.date_order = num_order;
|
||||
dict_entry.months = month_words
|
||||
.iter()
|
||||
.filter_map(|(word, m)| {
|
||||
if *m == 0 {
|
||||
None
|
||||
} else {
|
||||
Some((word.to_owned(), *m as u8))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut num_order = "".to_owned();
|
||||
|
||||
match lang {
|
||||
let collect_nd_tokens = match lang {
|
||||
Language::Ja
|
||||
| Language::ZhCn
|
||||
| Language::ZhHk
|
||||
|
|
@ -232,25 +145,134 @@ fn write_samples_to_dict() {
|
|||
| Language::Ur
|
||||
| Language::Uz
|
||||
| Language::Te
|
||||
| Language::PtPt
|
||||
// Singhalese YT translation is broken (yesterday == today)
|
||||
| Language::Si => {}
|
||||
_ => {
|
||||
dict_entry.timeago_nd_tokens = td_words
|
||||
| Language::Si => false,
|
||||
_ => true,
|
||||
};
|
||||
|
||||
dict_entry.months = BTreeMap::new();
|
||||
|
||||
if collect_nd_tokens {
|
||||
dict_entry.timeago_nd_tokens = BTreeMap::new();
|
||||
}
|
||||
|
||||
for datestr_table in &datestr_tables {
|
||||
let mut month_words: HashMap<String, usize> = HashMap::new();
|
||||
let mut td_words: HashMap<String, i8> = HashMap::new();
|
||||
|
||||
// Today/Yesterday
|
||||
{
|
||||
let mut parse = |string: &str, n: i8| {
|
||||
timeago::filter_str(string)
|
||||
.split_whitespace()
|
||||
.for_each(|word| {
|
||||
td_words
|
||||
.entry(word.to_owned())
|
||||
.and_modify(|e| *e = 0)
|
||||
.or_insert(n);
|
||||
});
|
||||
};
|
||||
|
||||
parse(datestr_table.get(&DateCase::Today).unwrap(), 1);
|
||||
parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2);
|
||||
parse(datestr_table.get(&DateCase::Ago).unwrap(), 0);
|
||||
parse(datestr_table.get(&DateCase::Jan).unwrap(), 0);
|
||||
}
|
||||
|
||||
// n days ago
|
||||
{
|
||||
let datestr = datestr_table.get(&DateCase::Ago).unwrap();
|
||||
let tago = timeago::parse(lang, &datestr);
|
||||
assert_eq!(
|
||||
tago,
|
||||
Some(TimeAgo {
|
||||
n: 3,
|
||||
unit: timeago::TimeUnit::Day
|
||||
}),
|
||||
"lang: {}, txt: {}",
|
||||
lang,
|
||||
datestr
|
||||
);
|
||||
}
|
||||
|
||||
// Absolute dates (Jan 3, 2020)
|
||||
months.iter().enumerate().for_each(|(n, m)| {
|
||||
let datestr = datestr_table.get(m).unwrap();
|
||||
|
||||
// Get order of numbers
|
||||
let nums = util::parse_numeric_vec::<u32>(&datestr);
|
||||
let date = dates[n];
|
||||
|
||||
let this_num_order = nums
|
||||
.iter()
|
||||
.filter_map(|(word, n)| {
|
||||
match n {
|
||||
// Today
|
||||
1 => Some((word.to_owned(), "0D".to_owned())),
|
||||
// Yesterday
|
||||
2 => Some((word.to_owned(), "1D".to_owned())),
|
||||
_ => None,
|
||||
.map(|n| {
|
||||
if n == &date.0 {
|
||||
"Y"
|
||||
} else if n == &date.1 {
|
||||
"M"
|
||||
} else if n == &date.2 {
|
||||
"D"
|
||||
} else {
|
||||
panic!("invalid number {} in {}", n, datestr);
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
.collect::<String>();
|
||||
|
||||
assert_eq!(dict_entry.timeago_nd_tokens.len(), 2, "lang: {}, nd_tokens: {:?}", lang, &dict_entry.timeago_nd_tokens);
|
||||
if num_order == "" {
|
||||
num_order = this_num_order;
|
||||
} else {
|
||||
assert_eq!(this_num_order, num_order, "lang: {}", lang);
|
||||
}
|
||||
|
||||
// Insert words into the map
|
||||
timeago::filter_str(&datestr)
|
||||
.split_whitespace()
|
||||
.for_each(|word| {
|
||||
month_words
|
||||
.entry(word.to_owned())
|
||||
.and_modify(|e| *e = 0)
|
||||
.or_insert(n + 1);
|
||||
});
|
||||
});
|
||||
|
||||
month_words.iter().for_each(|(word, m)| {
|
||||
if *m != 0 {
|
||||
dict_entry.months.insert(word.to_owned(), *m as u8);
|
||||
};
|
||||
});
|
||||
|
||||
if collect_nd_tokens {
|
||||
td_words.iter().for_each(|(word, n)| {
|
||||
match n {
|
||||
// Today
|
||||
1 => {
|
||||
dict_entry
|
||||
.timeago_nd_tokens
|
||||
.insert(word.to_owned(), "0D".to_owned());
|
||||
}
|
||||
// Yesterday
|
||||
2 => {
|
||||
dict_entry
|
||||
.timeago_nd_tokens
|
||||
.insert(word.to_owned(), "1D".to_owned());
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
});
|
||||
|
||||
if datestr_tables.len() == 1 {
|
||||
assert_eq!(
|
||||
dict_entry.timeago_nd_tokens.len(),
|
||||
2,
|
||||
"lang: {}, nd_tokens: {:?}",
|
||||
lang,
|
||||
&dict_entry.timeago_nd_tokens
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
dict_entry.date_order = num_order;
|
||||
}
|
||||
|
||||
super::write_dict(&dict);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ fn parse_tu(tu: &str) -> (u8, Option<TimeUnit>) {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
// #[test]
|
||||
fn generate_dictionary() {
|
||||
let dict = super::read_dict();
|
||||
|
||||
|
|
|
|||
|
|
@ -815,23 +815,24 @@ pub fn entry(lang: Language) -> Entry {
|
|||
months: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(11, 10),
|
||||
(0, 0),
|
||||
(0, 0),
|
||||
(8, 0),
|
||||
(5, 8),
|
||||
(4, 0),
|
||||
],
|
||||
entries: &[
|
||||
("oct", 10),
|
||||
("sep", 9),
|
||||
("jul", 7),
|
||||
("jun", 6),
|
||||
("may", 5),
|
||||
("nov", 11),
|
||||
("sept", 9),
|
||||
("apr", 4),
|
||||
("mar", 3),
|
||||
("jan", 1),
|
||||
("aug", 8),
|
||||
("feb", 2),
|
||||
("dec", 12),
|
||||
("mar", 3),
|
||||
("jun", 6),
|
||||
("sep", 9),
|
||||
("may", 5),
|
||||
("jul", 7),
|
||||
("jan", 1),
|
||||
("oct", 10),
|
||||
("feb", 2),
|
||||
("aug", 8),
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
|
|
@ -1199,25 +1200,26 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
date_order: &[DateCmp::D, DateCmp::Y],
|
||||
months: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(5, 2),
|
||||
(2, 1),
|
||||
(9, 0),
|
||||
(7, 6),
|
||||
(1, 5),
|
||||
],
|
||||
entries: &[
|
||||
("déc.", 12),
|
||||
("mai", 5),
|
||||
("sept.", 9),
|
||||
("nov.", 11),
|
||||
("mars", 3),
|
||||
("févr.", 2),
|
||||
("avr.", 4),
|
||||
("janv.", 1),
|
||||
("août", 8),
|
||||
("oct.", 10),
|
||||
("avr.", 4),
|
||||
("juil.", 7),
|
||||
("févr.", 2),
|
||||
("oct.", 10),
|
||||
("déc.", 12),
|
||||
("janv.", 1),
|
||||
("mars", 3),
|
||||
("juin", 6),
|
||||
("juill.", 7),
|
||||
("nov.", 11),
|
||||
("sept.", 9),
|
||||
("mai", 5),
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
|
|
@ -2121,25 +2123,25 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
date_order: &[DateCmp::Y, DateCmp::D],
|
||||
months: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
key: 2980949210194914378,
|
||||
disps: &[
|
||||
(0, 0),
|
||||
(4, 0),
|
||||
(0, 10),
|
||||
(5, 0),
|
||||
(0, 0),
|
||||
],
|
||||
entries: &[
|
||||
("-янв.", 1),
|
||||
("-июн.", 6),
|
||||
("-апр.", 4),
|
||||
("-мар.", 3),
|
||||
("-дек.", 12),
|
||||
("-май", 5),
|
||||
("-июл.", 7),
|
||||
("-авг.", 8),
|
||||
("-ноя.", 11),
|
||||
("-фев.", 2),
|
||||
("-сен.", 9),
|
||||
("-окт.", 10),
|
||||
("фев.", 2),
|
||||
("ноя.", 11),
|
||||
("авг.", 8),
|
||||
("май", 5),
|
||||
("янв.", 1),
|
||||
("окт.", 10),
|
||||
("июл.", 7),
|
||||
("сен.", 9),
|
||||
("июн.", 6),
|
||||
("мар.", 3),
|
||||
("дек.", 12),
|
||||
("апр.", 4),
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
|
|
@ -2961,7 +2963,7 @@ pub fn entry(lang: Language) -> Entry {
|
|||
],
|
||||
},
|
||||
},
|
||||
Language::Pt | Language::PtPt => Entry {
|
||||
Language::Pt => Entry {
|
||||
by_char: false,
|
||||
timeago_tokens: ::phf::Map {
|
||||
key: 10121458955350035957,
|
||||
|
|
@ -3021,6 +3023,51 @@ pub fn entry(lang: Language) -> Entry {
|
|||
],
|
||||
},
|
||||
},
|
||||
Language::PtPt => Entry {
|
||||
by_char: false,
|
||||
timeago_tokens: ::phf::Map {
|
||||
key: 10121458955350035957,
|
||||
disps: &[
|
||||
(6, 9),
|
||||
(0, 0),
|
||||
(2, 6),
|
||||
],
|
||||
entries: &[
|
||||
("segundos", TaToken { n: 1, unit: Some(TimeUnit::Second) }),
|
||||
("dia", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
|
||||
("ano", TaToken { n: 1, unit: Some(TimeUnit::Year) }),
|
||||
("meses", TaToken { n: 1, unit: Some(TimeUnit::Month) }),
|
||||
("anos", TaToken { n: 1, unit: Some(TimeUnit::Year) }),
|
||||
("minuto", TaToken { n: 1, unit: Some(TimeUnit::Minute) }),
|
||||
("semana", TaToken { n: 1, unit: Some(TimeUnit::Week) }),
|
||||
("hora", TaToken { n: 1, unit: Some(TimeUnit::Hour) }),
|
||||
("semanas", TaToken { n: 1, unit: Some(TimeUnit::Week) }),
|
||||
("segundo", TaToken { n: 1, unit: Some(TimeUnit::Second) }),
|
||||
("minutos", TaToken { n: 1, unit: Some(TimeUnit::Minute) }),
|
||||
("horas", TaToken { n: 1, unit: Some(TimeUnit::Hour) }),
|
||||
("mês", TaToken { n: 1, unit: Some(TimeUnit::Month) }),
|
||||
("dias", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
|
||||
],
|
||||
},
|
||||
date_order: &[DateCmp::D, DateCmp::M, DateCmp::Y],
|
||||
months: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
],
|
||||
entries: &[
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(1, 0),
|
||||
],
|
||||
entries: &[
|
||||
("hoje", TaToken { n: 0, unit: Some(TimeUnit::Day) }),
|
||||
("ontem", TaToken { n: 1, unit: Some(TimeUnit::Day) }),
|
||||
],
|
||||
},
|
||||
},
|
||||
Language::Ro => Entry {
|
||||
by_char: false,
|
||||
timeago_tokens: ::phf::Map {
|
||||
|
|
@ -3950,25 +3997,25 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
date_order: &[DateCmp::D, DateCmp::Y],
|
||||
months: ::phf::Map {
|
||||
key: 10121458955350035957,
|
||||
key: 12913932095322966823,
|
||||
disps: &[
|
||||
(2, 5),
|
||||
(0, 0),
|
||||
(5, 9),
|
||||
(0, 1),
|
||||
(9, 6),
|
||||
],
|
||||
entries: &[
|
||||
("-avg,", 8),
|
||||
("-iyn,", 6),
|
||||
("-sen,", 9),
|
||||
("-noy,", 11),
|
||||
("-fev,", 2),
|
||||
("-apr,", 4),
|
||||
("-yan,", 1),
|
||||
("-may,", 5),
|
||||
("-mar,", 3),
|
||||
("-dek,", 12),
|
||||
("-okt,", 10),
|
||||
("-iyl,", 7),
|
||||
("mar,", 3),
|
||||
("may,", 5),
|
||||
("noy,", 11),
|
||||
("avg,", 8),
|
||||
("iyl,", 7),
|
||||
("iyn,", 6),
|
||||
("yan,", 1),
|
||||
("fev,", 2),
|
||||
("dek,", 12),
|
||||
("okt,", 10),
|
||||
("sen,", 9),
|
||||
("apr,", 4),
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
|
|
@ -4160,25 +4207,25 @@ pub fn entry(lang: Language) -> Entry {
|
|||
},
|
||||
date_order: &[DateCmp::D, DateCmp::Y],
|
||||
months: ::phf::Map {
|
||||
key: 15467950696543387533,
|
||||
key: 7485420634051515786,
|
||||
disps: &[
|
||||
(6, 0),
|
||||
(0, 0),
|
||||
(2, 1),
|
||||
(2, 0),
|
||||
(1, 9),
|
||||
(1, 10),
|
||||
],
|
||||
entries: &[
|
||||
("ka-jun", 6),
|
||||
("ka-eph", 4),
|
||||
("ka-sep", 9),
|
||||
("ka-dis", 12),
|
||||
("ka-jan", 1),
|
||||
("ka-nov", 11),
|
||||
("ka-mey", 5),
|
||||
("ka-mas", 3),
|
||||
("ka-feb", 2),
|
||||
("ka-okt", 10),
|
||||
("ka-aga", 8),
|
||||
("ka-jul", 7),
|
||||
("dis", 12),
|
||||
("jul", 7),
|
||||
("aga", 8),
|
||||
("okt", 10),
|
||||
("feb", 2),
|
||||
("nov", 11),
|
||||
("sep", 9),
|
||||
("mas", 3),
|
||||
("eph", 4),
|
||||
("mey", 5),
|
||||
("jan", 1),
|
||||
("jun", 6),
|
||||
],
|
||||
},
|
||||
timeago_nd_tokens: ::phf::Map {
|
||||
|
|
|
|||
|
|
@ -90,11 +90,19 @@ impl Mul<u8> for TimeAgo {
|
|||
}
|
||||
}
|
||||
|
||||
fn filter_str(string: &str) -> String {
|
||||
pub fn filter_str(string: &str) -> String {
|
||||
string
|
||||
.to_lowercase()
|
||||
.chars()
|
||||
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
|
||||
.filter_map(|c| {
|
||||
if c == '\u{200b}' || c.is_ascii_digit() {
|
||||
None
|
||||
} else if c == '-' {
|
||||
Some(' ')
|
||||
} else {
|
||||
Some(c)
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
|
|
@ -439,6 +447,7 @@ mod tests {
|
|||
#[case(Language::En, "Updated today", Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })))]
|
||||
#[case(Language::En, "Updated yesterday", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))]
|
||||
#[case(Language::En, "Updated 2 days ago", Some(ParsedDate::Relative(TimeAgo { n: 2, unit: TimeUnit::Day })))]
|
||||
#[case(Language::Si, "ඊයේ යාවත්කාලීන කරන ලදී", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))]
|
||||
#[case(
|
||||
Language::En,
|
||||
"Last updated on Jun 04, 2003",
|
||||
|
|
@ -473,7 +482,11 @@ mod tests {
|
|||
assert_eq!(
|
||||
parse_date(*lang, samples.get("Yesterday").unwrap()),
|
||||
Some(ParsedDate::Relative(TimeAgo {
|
||||
n: 1,
|
||||
// YT's Singhalese translation has an error (yesterday == today)
|
||||
n: match lang {
|
||||
Language::Si => 0,
|
||||
_ => 1,
|
||||
},
|
||||
unit: TimeUnit::Day
|
||||
})),
|
||||
"lang: {}",
|
||||
|
|
|
|||
Reference in a new issue