feat: add playlist date parser

This commit is contained in:
ThetaDev 2022-09-07 15:32:08 +02:00
parent d18f175aef
commit a992495b2b
8 changed files with 1909 additions and 422 deletions

View file

@ -12,3 +12,6 @@ indent_style = tab
[*.{json,md,rst,ini,yml,yaml,xml,html,js,jsx,ts,tsx,vue,kt}] [*.{json,md,rst,ini,yml,yaml,xml,html,js,jsx,ts,tsx,vue,kt}]
indent_size = 2 indent_size = 2
[*.json]
insert_final_newline = false

View file

@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
use crate::{ use crate::{
client::RustyTube, client::RustyTube,
model::{locale::LANGUAGES, Country, Language}, model::{locale::LANGUAGES, Country, Language},
timeago::{self, TimeAgo},
util, util,
}; };
@ -83,8 +84,16 @@ async fn collect_dates() {
serde_json::to_writer_pretty(file, &collected_dates).unwrap(); serde_json::to_writer_pretty(file, &collected_dates).unwrap();
} }
// #[test] fn filter_str(string: &str) -> String {
fn parse_months() { string
.to_lowercase()
.chars()
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
.collect()
}
#[test]
fn write_samples_to_dict() {
let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf(); let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf();
let json_file = File::open(json_path).unwrap(); let json_file = File::open(json_path).unwrap();
let collected_dates: CollectedDates = let collected_dates: CollectedDates =
@ -123,11 +132,47 @@ fn parse_months() {
]; ];
for lang in langs { for lang in langs {
let datestr_table = collected_dates.get(&lang).unwrap();
let mut month_words: HashMap<String, usize> = HashMap::new(); let mut month_words: HashMap<String, usize> = HashMap::new();
let mut num_order = "".to_owned(); let mut num_order = "".to_owned();
// Today/Yesterday
let mut td_words: HashMap<String, i8> = HashMap::new();
{
let mut parse = |string: &str, n: i8| {
filter_str(string).split_whitespace().for_each(|word| {
td_words
.entry(word.to_owned())
.and_modify(|e| *e = 0)
.or_insert(n);
});
};
parse(datestr_table.get(&DateCase::Today).unwrap(), 1);
parse(datestr_table.get(&DateCase::Yesterday).unwrap(), 2);
parse(datestr_table.get(&DateCase::Ago).unwrap(), 0);
parse(datestr_table.get(&DateCase::Jan).unwrap(), 0);
}
// n days ago
{
let datestr = datestr_table.get(&DateCase::Ago).unwrap();
let tago = timeago::parse(lang, &datestr);
assert_eq!(
tago,
Some(TimeAgo {
n: 3,
unit: timeago::TimeUnit::Day
}),
"lang: {}, txt: {}",
lang,
datestr
);
}
// Absolute dates (Jan 3, 2020)
months.iter().enumerate().for_each(|(n, m)| { months.iter().enumerate().for_each(|(n, m)| {
let datestr = collected_dates.get(&lang).unwrap().get(m).unwrap(); let datestr = datestr_table.get(m).unwrap();
// Get order of numbers // Get order of numbers
let nums = util::parse_numeric_vec::<u32>(&datestr); let nums = util::parse_numeric_vec::<u32>(&datestr);
@ -155,12 +200,7 @@ fn parse_months() {
} }
// Insert words into the map // Insert words into the map
let filtered_str = datestr filter_str(&datestr).split_whitespace().for_each(|word| {
.chars()
.filter(|c| !c.is_ascii_digit())
.collect::<String>();
filtered_str.split_whitespace().for_each(|word| {
month_words month_words
.entry(word.to_owned()) .entry(word.to_owned())
.and_modify(|e| *e = 0) .and_modify(|e| *e = 0)
@ -170,13 +210,47 @@ fn parse_months() {
let dict_entry = dict.entry(lang).or_default(); let dict_entry = dict.entry(lang).or_default();
dict_entry.date_order = num_order; dict_entry.date_order = num_order;
dict_entry.months = month_words.iter().filter_map(|(word, m)| { dict_entry.months = month_words
if *m == 0 { .iter()
None .filter_map(|(word, m)| {
} else { if *m == 0 {
Some((word.to_owned(), *m as u8)) None
} else {
Some((word.to_owned(), *m as u8))
}
})
.collect();
match lang {
Language::Ja
| Language::ZhCn
| Language::ZhHk
| Language::ZhTw
| Language::Ko
| Language::Gu
| Language::Pa
| Language::Ur
| Language::Uz
| Language::Te
// Singhalese YT translation is broken (today == tomorrow)
| Language::Si => {}
_ => {
dict_entry.timeago_nd_tokens = td_words
.iter()
.filter_map(|(word, n)| {
match n {
// Today
1 => Some((word.to_owned(), "0D".to_owned())),
// Yesterday
2 => Some((word.to_owned(), "1D".to_owned())),
_ => None,
}
})
.collect();
assert_eq!(dict_entry.timeago_nd_tokens.len(), 2, "lang: {}, nd_tokens: {:?}", lang, &dict_entry.timeago_nd_tokens);
} }
}).collect(); }
} }
super::write_dict(&dict); super::write_dict(&dict);

View file

@ -27,20 +27,22 @@ fn parse_tu(tu: &str) -> (u8, Option<TimeUnit>) {
} }
} }
// #[test] #[test]
fn generate_dictionary() { fn generate_dictionary() {
let dict = super::read_dict(); let dict = super::read_dict();
let code_head = r#"// This file is automatically generated. DO NOT EDIT. let code_head = r#"// This file is automatically generated. DO NOT EDIT.
use crate::{ use crate::{
model::Language, model::Language,
timeago::{TaToken, TimeUnit}, timeago::{TaToken, TimeUnit, DateCmp},
}; };
pub struct Entry { pub struct Entry {
pub by_char: bool,
pub timeago_tokens: phf::Map<&'static str, TaToken>, pub timeago_tokens: phf::Map<&'static str, TaToken>,
pub date_order: &'static str, pub date_order: &'static [DateCmp],
pub months: phf::Map<&'static str, u8>, pub months: phf::Map<&'static str, u8>,
pub timeago_nd_tokens: phf::Map<&'static str, TaToken>,
} }
"#; "#;
@ -76,12 +78,33 @@ pub fn entry(lang: Language) -> Entry {
months.entry(&txt, &n_mon.to_string()); months.entry(&txt, &n_mon.to_string());
}); });
// Timeago(ND) tokens
let mut ta_nd_tokens = phf_codegen::Map::<&str>::new();
entry.timeago_nd_tokens.iter().for_each(|(txt, tu_str)| {
let (n, unit) = parse_tu(&tu_str);
match unit {
Some(unit) => ta_nd_tokens.entry(
&txt,
&format!("TaToken {{ n: {}, unit: Some(TimeUnit::{:?}) }}", n, unit),
),
None => ta_nd_tokens.entry(&txt, &format!("TaToken {{ n: {}, unit: None }}", n)),
};
});
// Date order
let mut date_order = "&[".to_owned();
entry.date_order.chars().for_each(|c| {
date_order += &format!("DateCmp::{}, ", c);
});
date_order = date_order.trim_end_matches([' ', ',']).to_owned() + "]";
let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n "); let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n ");
let code_ta_nd_tokens = &ta_nd_tokens.build().to_string().replace('\n', "\n ");
let code_months = &months.build().to_string().replace('\n', "\n "); let code_months = &months.build().to_string().replace('\n', "\n ");
code_timeago_tokens += &format!( code_timeago_tokens += &format!(
"{} => Entry {{\n timeago_tokens: {},\n date_order: \"{}\",\n months: {},\n }},\n ", "{} => Entry {{\n by_char: {:?},\n timeago_tokens: {},\n date_order: {},\n months: {},\n timeago_nd_tokens: {},\n }},\n ",
selector, code_ta_tokens, entry.date_order, code_months selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens
); );
}); });

View file

@ -21,6 +21,7 @@ struct DictEntry {
timeago_tokens: BTreeMap<String, String>, timeago_tokens: BTreeMap<String, String>,
date_order: String, date_order: String,
months: BTreeMap<String, u8>, months: BTreeMap<String, u8>,
timeago_nd_tokens: BTreeMap<String, String>,
} }
fn read_dict() -> Dictionary { fn read_dict() -> Dictionary {

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,6 @@
use std::cmp::Ordering; use std::{cmp::Ordering, ops::Mul};
use chrono::NaiveDate;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::{dictionary, model::Language, util}; use crate::{dictionary, model::Language, util};
@ -16,6 +17,12 @@ pub struct TaToken {
pub unit: Option<TimeUnit>, pub unit: Option<TimeUnit>,
} }
/// Outcome of parsing a textual date with `parse_date`: either a concrete
/// calendar date or an offset relative to the current time.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ParsedDate {
/// A fully specified calendar date (year, month and day).
Absolute(NaiveDate),
/// A relative date expressed as a `TimeAgo` (count and time unit).
Relative(TimeAgo),
}
#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] #[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[serde(rename_all = "lowercase")] #[serde(rename_all = "lowercase")]
pub enum TimeUnit { pub enum TimeUnit {
@ -28,6 +35,12 @@ pub enum TimeUnit {
Year, Year,
} }
/// A numeric component of an absolute date.
///
/// The generated dictionary stores, per language, the order in which these
/// components appear in a textual date (`date_order`), so the numbers found
/// in the text can be assigned to year, month and day.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum DateCmp {
    /// Year
    Y,
    /// Month
    M,
    /// Day
    D,
}
impl TimeUnit { impl TimeUnit {
fn seconds(&self) -> u64 { fn seconds(&self) -> u64 {
match self { match self {
@ -66,34 +79,48 @@ impl PartialOrd for TimeAgo {
} }
} }
pub fn parse(lang: Language, textual_date: &str) -> Option<TimeAgo> { impl Mul<u8> for TimeAgo {
let mappings = dictionary::entry(lang).timeago_tokens; type Output = Self;
let filtered_str = textual_date fn mul(self, rhs: u8) -> Self::Output {
TimeAgo {
n: self.n * rhs,
unit: self.unit,
}
}
}
/// Normalizes a textual date for dictionary lookups: lowercases the input and
/// drops zero-width spaces (U+200B) and ASCII digits, so that only the words
/// remain.
fn filter_str(string: &str) -> String {
string
.to_lowercase() .to_lowercase()
.chars() .chars()
.filter(|c| c != &'\u{200b}' && !c.is_ascii_digit()) .filter(|c| c != &'\u{200b}' && !c.is_ascii_digit())
.collect::<String>(); .collect()
}
let mut qu: u8 = util::parse_numeric(&textual_date).unwrap_or(1); fn parse_ta_token(entry: &dictionary::Entry, nd: bool, filtered_str: &str) -> Option<TimeAgo> {
let tokens = match nd {
true => &entry.timeago_nd_tokens,
false => &entry.timeago_tokens,
};
let mut qu = 1;
match lang { if entry.by_char {
Language::Ja | Language::ZhCn | Language::ZhHk | Language::ZhTw => { filtered_str.chars().find_map(|word| {
filtered_str.chars().find_map(|word| { tokens
mappings .get(&word.to_string())
.get(&word.to_string()) .map(|t| match t.unit {
.map(|t| match t.unit { Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), None => {
None => { qu = t.n;
qu = t.n; None
None }
} })
}) .flatten()
.flatten() })
}) } else {
} filtered_str.split_whitespace().find_map(|word| {
_ => filtered_str.split_whitespace().find_map(|word| { tokens
mappings
.get(word) .get(word)
.map(|t| match t.unit { .map(|t| match t.unit {
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
@ -103,7 +130,75 @@ pub fn parse(lang: Language, textual_date: &str) -> Option<TimeAgo> {
} }
}) })
.flatten() .flatten()
}), })
}
}
/// Returns the month number of the first whitespace-separated word in
/// `filtered_str` that is listed in the dictionary entry's `months` table.
///
/// Entries flagged `by_char` always yield `None`, since those languages
/// (Chinese/Japanese) don't use textual months.
fn parse_textual_month(entry: &dictionary::Entry, filtered_str: &str) -> Option<u8> {
    if entry.by_char {
        return None;
    }

    for word in filtered_str.split_whitespace() {
        if let Some(month) = entry.months.get(word) {
            return Some(*month);
        }
    }
    None
}
/// Parses a textual time-ago expression (count plus time unit) in the given language.
///
/// The count is taken from the number contained in the text and defaults to 1
/// if no number can be parsed; the unit is looked up in the language's
/// time-ago tokens. Returns `None` if no known token is found.
pub fn parse(lang: Language, textual_date: &str) -> Option<TimeAgo> {
    let dict_entry = dictionary::entry(lang);
    let words = filter_str(textual_date);
    let multiplier: u8 = util::parse_numeric(&textual_date).unwrap_or(1);

    let base = parse_ta_token(&dict_entry, false, &words)?;
    Some(base * multiplier)
}
fn parse_date(lang: Language, textual_date: &str) -> Option<ParsedDate> {
let entry = dictionary::entry(lang);
let filtered_str = filter_str(textual_date);
let nums = util::parse_numeric_vec::<u16>(textual_date);
match nums.len() {
0 => match parse_ta_token(&entry, true, &filtered_str) {
Some(timeago) => Some(ParsedDate::Relative(timeago)),
None => parse_ta_token(&entry, false, &filtered_str)
.map(|timeago| ParsedDate::Relative(timeago)),
},
1 => parse_ta_token(&entry, false, &filtered_str)
.map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)),
2..=3 => {
if nums.len() == entry.date_order.len() {
let mut y: Option<u16> = None;
let mut m: Option<u16> = None;
let mut d: Option<u16> = None;
nums.iter()
.enumerate()
.for_each(|(i, n)| match entry.date_order[i] {
DateCmp::Y => y = Some(*n),
DateCmp::M => m = Some(*n),
DateCmp::D => d = Some(*n),
});
if m.is_none() {
m = parse_textual_month(&entry, &filtered_str).map(|n| n as u16);
}
match (y, m, d) {
(Some(y), Some(m), Some(d)) => Some(ParsedDate::Absolute(NaiveDate::from_ymd(
y.into(),
m.into(),
d.into(),
))),
_ => None,
}
} else {
None
}
}
_ => None,
} }
} }
@ -125,8 +220,8 @@ mod tests {
#[case] textual_date: &str, #[case] textual_date: &str,
#[case] expect: Option<TimeAgo>, #[case] expect: Option<TimeAgo>,
) { ) {
let secs_ago = parse(lang, textual_date); let time_ago = parse(lang, textual_date);
assert_eq!(secs_ago, expect); assert_eq!(time_ago, expect);
} }
#[test] #[test]
@ -339,4 +434,132 @@ mod tests {
assert_eq!(n_cases, 1065) assert_eq!(n_cases, 1065)
} }
// Parameterized check of `parse_date` for English input: relative dates
// without a number ("today", "yesterday"), a relative date with a number,
// and an absolute date with a textual month.
#[rstest]
#[case(Language::En, "Updated today", Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })))]
#[case(Language::En, "Updated yesterday", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))]
#[case(Language::En, "Updated 2 days ago", Some(ParsedDate::Relative(TimeAgo { n: 2, unit: TimeUnit::Day })))]
#[case(
Language::En,
"Last updated on Jun 04, 2003",
Some(ParsedDate::Absolute(NaiveDate::from_ymd(2003, 6, 4)))
)]
fn t_parse_date(
#[case] lang: Language,
#[case] textual_date: &str,
#[case] expect: Option<ParsedDate>,
) {
let parsed_date = parse_date(lang, textual_date);
assert_eq!(parsed_date, expect);
}
/// Runs `parse_date` over the recorded playlist date samples of every
/// language and compares each sample with the date it was recorded for.
#[test]
fn t_parse_date_samples() {
    let json_file = File::open(Path::new("testfiles/date/playlist_samples.json")).unwrap();
    let date_samples: BTreeMap<Language, BTreeMap<String, String>> =
        serde_json::from_reader(BufReader::new(json_file)).unwrap();

    let days_ago = |n| {
        ParsedDate::Relative(TimeAgo {
            n,
            unit: TimeUnit::Day,
        })
    };
    let on = |y, m, d| ParsedDate::Absolute(NaiveDate::from_ymd(y, m, d));

    // Sample key -> expected result, checked in this order for every language
    let expectations = [
        ("Today", days_ago(0)),
        ("Yesterday", days_ago(1)),
        ("Ago", days_ago(3)),
        ("Jan", on(2020, 1, 3)),
        ("Feb", on(2016, 2, 7)),
        ("Mar", on(2015, 3, 9)),
        ("Apr", on(2017, 4, 2)),
        ("May", on(2014, 5, 22)),
        ("Jun", on(2014, 6, 28)),
        ("Jul", on(2014, 7, 2)),
        ("Aug", on(2015, 8, 23)),
        ("Sep", on(2018, 9, 16)),
        ("Oct", on(2014, 10, 31)),
        ("Nov", on(2016, 11, 3)),
        ("Dec", on(2021, 12, 24)),
    ];

    for (lang, samples) in &date_samples {
        for &(key, expected) in expectations.iter() {
            assert_eq!(
                parse_date(*lang, samples.get(key).unwrap()),
                Some(expected),
                "lang: {}",
                lang
            );
        }
    }
}
} }

View file

@ -2,7 +2,6 @@ use std::{collections::BTreeMap, str::FromStr};
use anyhow::Result; use anyhow::Result;
use fancy_regex::Regex; use fancy_regex::Regex;
use once_cell::sync::Lazy;
use rand::Rng; use rand::Rng;
use url::Url; use url::Url;

File diff suppressed because it is too large Load diff