//! Parser for textual dates and times. //! //! The YouTube API mostly outputs pre-formatted dates and times //! like "18 minutes ago" or "Jul 2, 2014" instead of standardized //! machine-readable date and time formats. //! //! Additionally these formats are localized, meaning they depend //! on the configured language. //! //! This module can parse these dates using an embedded dictionary which //! contains date/time unit tokens for all supported languages. use std::ops::Mul; use serde::{Deserialize, Serialize}; use time::{Date, Duration, Month, OffsetDateTime}; use crate::{ param::Language, util::{self, dictionary, SplitTokens}, }; /// Parsed TimeAgo string, contains amount and time unit. /// /// Example: "14 hours ago" => `TimeAgo {n: 14, unit: TimeUnit::Hour}` #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TimeAgo { /// Number of time units pub n: u8, /// Time unit pub unit: TimeUnit, } /// Parsed date string that may be relative or absolute. /// /// Examples: /// /// - "Jul 2, 2014" => `ParsedDate::Absolute("2014-07-02")` /// - "2 months ago" => `ParsedDate::Relative(TimeAgo {n: 2, unit: TimeUnit::Month})` #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum ParsedDate { /// Absolute date /// /// Example: "Jul 2, 2014" Absolute(Date), /// Relative date /// /// Example: "2 months ago" Relative(TimeAgo), } /// Parsed time unit #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] #[serde(rename_all = "lowercase")] #[allow(missing_docs)] pub enum TimeUnit { Second, Minute, Hour, Day, Week, Month, Year, } /// Value of a parsed TimeAgo token, used in the dictionary #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct TaToken { pub n: u8, pub unit: Option, } pub enum DateCmp { Y, M, D, } impl TimeUnit { pub fn secs(self) -> u32 { match self { TimeUnit::Second => 1, TimeUnit::Minute => 60, TimeUnit::Hour => 3600, TimeUnit::Day => 24 * 3600, TimeUnit::Week => 7 * 24 * 3600, TimeUnit::Month => 30 * 24 * 3600, TimeUnit::Year => 365 * 24 * 3600, } } } impl TimeAgo { fn secs(self) -> u32 { u32::from(self.n) * self.unit.secs() } } impl Mul for TimeAgo { type Output = Self; fn mul(self, rhs: u8) -> Self::Output { TimeAgo { n: self.n * rhs, unit: self.unit, } } } impl From for Duration { fn from(ta: TimeAgo) -> Self { Duration::seconds(ta.secs().into()) } } impl From for OffsetDateTime { fn from(ta: TimeAgo) -> Self { let ts = util::now_sec(); match ta.unit { TimeUnit::Month => ts.replace_date(util::shift_months(ts.date(), -i32::from(ta.n))), TimeUnit::Year => ts.replace_date(util::shift_years(ts.date(), -i32::from(ta.n))), _ => ts - Duration::from(ta), } } } impl From for OffsetDateTime { fn from(date: ParsedDate) -> Self { match date { ParsedDate::Absolute(date) => date.with_hms(0, 0, 0).unwrap().assume_utc(), ParsedDate::Relative(timeago) => timeago.into(), } } } /// Prepare the datestring for parsing: lowercase and filter out unnecessary punctuation fn filter_datestr(string: &str) -> String { string .to_lowercase() .chars() .filter_map(|c| { if matches!(c, '\u{200b}' | '.') || c.is_ascii_digit() { None } else if c == '-' { Some(' ') } else { Some(c) } }) .collect() } struct TaTokenParser<'a> { iter: SplitTokens<'a>, tokens: &'a phf::Map<&'static str, TaToken>, } impl<'a> TaTokenParser<'a> { fn new(entry: &'a dictionary::Entry, by_char: bool, nd: bool, filtered_str: &'a str) -> Self { let tokens = if nd { &entry.timeago_nd_tokens } else { &entry.timeago_tokens }; Self { iter: SplitTokens::new(filtered_str, by_char), tokens, } } } impl<'a> Iterator for TaTokenParser<'a> { type Item = TimeAgo; fn next(&mut self) -> Option { // Quantity for parsing separate quantity + unit tokens let mut qu = 1; self.iter.find_map(|word| { self.tokens.get(word).and_then(|t| match t.unit { Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), None => { qu = t.n; None } }) }) } } fn parse_textual_month(entry: &dictionary::Entry, filtered_str: &str) -> Option { filtered_str .split_whitespace() .find_map(|word| entry.months.get(word).copied()) } /// Parse a TimeAgo string (e.g. "29 minutes ago") into a TimeAgo object. /// /// Returns [`None`] if the date could not be parsed. pub fn parse_timeago(lang: Language, textual_date: &str) -> Option { let entry = dictionary::entry(lang); let filtered_str = filter_datestr(textual_date); let qu: u8 = util::parse_numeric_prod(textual_date).unwrap_or(1); // French uses 'a' as a short form of years. // Since 'a' is also a word in French, it cannot be parsed as a token. if matches!( lang, Language::Fr | Language::FrCa | Language::Es | Language::Es419 | Language::EsUs ) && textual_date.ends_with(" a") { return Some(TimeAgo { n: qu, unit: TimeUnit::Year, }); } TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str) .next() .map(|ta| ta * qu) } /// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object. /// /// Returns [`None`] if the date could not be parsed. pub fn parse_timeago_dt(lang: Language, textual_date: &str) -> Option { parse_timeago(lang, textual_date).map(OffsetDateTime::from) } pub fn parse_timeago_dt_or_warn( lang: Language, textual_date: &str, warnings: &mut Vec, ) -> Option { let res = parse_timeago_dt(lang, textual_date); if res.is_none() { warnings.push(format!("could not parse timeago `{textual_date}`")); } res } /// Parse a textual date (e.g. "29 minutes ago" or "Jul 2, 2014") into a ParsedDate object. /// /// Returns [`None`] if the date could not be parsed. pub fn parse_textual_date(lang: Language, textual_date: &str) -> Option { let entry = dictionary::entry(lang); let by_char = util::lang_by_char(lang); let filtered_str = filter_datestr(textual_date); let nums = util::parse_numeric_vec::(textual_date); match nums.len() { 0 => match TaTokenParser::new(&entry, by_char, true, &filtered_str).next() { Some(timeago) => Some(ParsedDate::Relative(timeago)), None => TaTokenParser::new(&entry, by_char, false, &filtered_str) .next() .map(ParsedDate::Relative), }, 1 => TaTokenParser::new(&entry, by_char, false, &filtered_str) .next() .map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)), 2..=3 => { if nums.len() == entry.date_order.len() { let mut y: Option = None; let mut m: Option = None; let mut d: Option = None; nums.iter() .enumerate() .for_each(|(i, n)| match entry.date_order[i] { DateCmp::Y => y = Some(*n), DateCmp::M => m = Some(*n), DateCmp::D => d = Some(*n), }); // Chinese/Japanese dont use textual months if m.is_none() && !by_char { m = parse_textual_month(&entry, &filtered_str).map(u16::from); } match (y, m, d) { (Some(y), Some(m), Some(d)) => Month::try_from(m as u8) .ok() .and_then(|m| Date::from_calendar_date(y.into(), m, d as u8).ok()) .map(ParsedDate::Absolute), _ => None, } } else { None } } _ => None, } } /// Parse a textual date (e.g. "29 minutes ago" or "Jul 2, 2014") into a Chrono DateTime object. /// /// Returns None if the date could not be parsed. pub fn parse_textual_date_to_dt(lang: Language, textual_date: &str) -> Option { parse_textual_date(lang, textual_date).map(OffsetDateTime::from) } pub fn parse_textual_date_or_warn( lang: Language, textual_date: &str, warnings: &mut Vec, ) -> Option { let res = parse_textual_date_to_dt(lang, textual_date); if res.is_none() { warnings.push(format!("could not parse textual date `{textual_date}`")); } res } /// Parse a textual video duration (e.g. "11 minutes, 20 seconds") /// /// Returns None if the duration could not be parsed pub fn parse_video_duration(lang: Language, video_duration: &str) -> Option { let entry = dictionary::entry(lang); let by_char = util::lang_by_char(lang); let parts = split_duration_txt(video_duration, matches!(lang, Language::Si | Language::Sw)); let mut secs = 0; for part in parts { let mut n = if part.digits.is_empty() { 1 } else { part.digits.parse::().ok()? }; let mut tokens = TaTokenParser::new(&entry, by_char, false, &part.word).peekable(); tokens.peek()?; tokens.for_each(|ta| { secs += n * ta.secs(); n = 1; }); } Some(secs) } pub fn parse_video_duration_or_warn( lang: Language, video_duration: &str, warnings: &mut Vec, ) -> Option { let res = parse_video_duration(lang, video_duration); if res.is_none() { warnings.push(format!("could not parse video duration `{video_duration}`")); } res } #[derive(Default)] struct DurationTxtSegment { digits: String, word: String, } /// Split a video duration string into its segments. /// /// Each segment consists of a word and a string of digits (one of them may be empty). /// /// The `start_word` parameter determines whether the segments should start with a word /// instead of a number. This is the case in Swahili and Singhalese. /// /// Example (start_word=false): /// - `1 minute, 13 seconds` -> `{1;minute} {13;seconds}` /// - `foo 1 minute, 13 seconds bar` -> `{foo} {1;minute} {13;seconds bar}` /// /// Example (start_word=true): /// - `dakika 1 na sekunde 1` -> `{1;dakika} {1;na sekunde}` /// - `foo dakika 1 na sekunde 1 bar` -> `{1;foo dakika} {1;na sekunde} {bar}` fn split_duration_txt(txt: &str, start_word: bool) -> Vec { let mut segments = Vec::new(); // 1: parse digits, 2: parse word let mut state: u8 = 0; let mut seg = DurationTxtSegment::default(); for c in txt.trim().chars() { if c.is_ascii_digit() { if state == 2 && (!seg.digits.is_empty() || (!start_word && segments.is_empty())) { segments.push(seg); seg = DurationTxtSegment::default(); } seg.digits.push(c); state = 1; } else { if (state == 1) && (!seg.word.is_empty() || (start_word && segments.is_empty())) { segments.push(seg); seg = DurationTxtSegment::default(); } if !matches!(c, '.' | ',') { c.to_lowercase().for_each(|c| seg.word.push(c)); } state = 2; } } if !seg.word.is_empty() || !seg.digits.is_empty() { segments.push(seg); } segments } #[cfg(test)] mod tests { use std::{collections::BTreeMap, fs::File, io::BufReader}; use path_macro::path; use rstest::rstest; use time::macros::{date, datetime}; use super::*; use crate::util::tests::TESTFILES; #[rstest] #[case::de(Language::De, "vor 1 Sekunde", Some(TimeAgo { n: 1, unit: TimeUnit::Second }))] #[case::ar(Language::Ar, "قبل ساعة واحدة", Some(TimeAgo { n: 1, unit: TimeUnit::Hour }))] // No-break space #[case::nbsp(Language::De, "Vor 3\u{a0}Tagen aktualisiert", Some(TimeAgo { n: 3, unit: TimeUnit::Day }))] fn t_parse( #[case] lang: Language, #[case] textual_date: &str, #[case] expect: Option, ) { let time_ago = parse_timeago(lang, textual_date); assert_eq!(time_ago, expect); } #[test] fn t_testfile() { let json_path = path!(*TESTFILES / "dict" / "timeago_samples.json"); let expect = [ TimeAgo { n: 10, unit: TimeUnit::Minute, }, TimeAgo { n: 20, unit: TimeUnit::Minute, }, TimeAgo { n: 1, unit: TimeUnit::Hour, }, TimeAgo { n: 2, unit: TimeUnit::Hour, }, TimeAgo { n: 7, unit: TimeUnit::Hour, }, TimeAgo { n: 8, unit: TimeUnit::Hour, }, TimeAgo { n: 9, unit: TimeUnit::Hour, }, TimeAgo { n: 10, unit: TimeUnit::Hour, }, TimeAgo { n: 11, unit: TimeUnit::Hour, }, TimeAgo { n: 12, unit: TimeUnit::Hour, }, TimeAgo { n: 13, unit: TimeUnit::Hour, }, TimeAgo { n: 14, unit: TimeUnit::Hour, }, TimeAgo { n: 15, unit: TimeUnit::Hour, }, TimeAgo { n: 3, unit: TimeUnit::Hour, }, TimeAgo { n: 4, unit: TimeUnit::Hour, }, TimeAgo { n: 4, unit: TimeUnit::Hour, }, TimeAgo { n: 5, unit: TimeUnit::Hour, }, TimeAgo { n: 6, unit: TimeUnit::Hour, }, TimeAgo { n: 6, unit: TimeUnit::Hour, }, TimeAgo { n: 20, unit: TimeUnit::Hour, }, TimeAgo { n: 2, unit: TimeUnit::Day, }, TimeAgo { n: 3, unit: TimeUnit::Day, }, TimeAgo { n: 5, unit: TimeUnit::Day, }, TimeAgo { n: 6, unit: TimeUnit::Day, }, TimeAgo { n: 8, unit: TimeUnit::Day, }, TimeAgo { n: 10, unit: TimeUnit::Day, }, TimeAgo { n: 12, unit: TimeUnit::Day, }, TimeAgo { n: 2, unit: TimeUnit::Week, }, TimeAgo { n: 3, unit: TimeUnit::Week, }, TimeAgo { n: 4, unit: TimeUnit::Week, }, TimeAgo { n: 1, unit: TimeUnit::Month, }, TimeAgo { n: 8, unit: TimeUnit::Month, }, TimeAgo { n: 11, unit: TimeUnit::Month, }, TimeAgo { n: 1, unit: TimeUnit::Year, }, TimeAgo { n: 2, unit: TimeUnit::Year, }, TimeAgo { n: 3, unit: TimeUnit::Year, }, TimeAgo { n: 4, unit: TimeUnit::Year, }, ]; let json_file = File::open(json_path).unwrap(); let strings_map: BTreeMap> = serde_json::from_reader(BufReader::new(json_file)).unwrap(); for (lang, strings) in &strings_map { assert_eq!(strings.len(), expect.len()); strings.iter().enumerate().for_each(|(n, s)| { assert_eq!( parse_timeago(*lang, s), Some(expect[n]), "Language: {lang}, txt: `{s}`" ); }); } } #[test] fn t_testfile_short() { let json_path = path!(*TESTFILES / "dict" / "timeago_samples_short.json"); let expect = [ TimeAgo { n: 35, unit: TimeUnit::Minute, }, TimeAgo { n: 50, unit: TimeUnit::Minute, }, TimeAgo { n: 1, unit: TimeUnit::Hour, }, TimeAgo { n: 2, unit: TimeUnit::Hour, }, TimeAgo { n: 3, unit: TimeUnit::Hour, }, TimeAgo { n: 4, unit: TimeUnit::Hour, }, TimeAgo { n: 5, unit: TimeUnit::Hour, }, TimeAgo { n: 6, unit: TimeUnit::Hour, }, TimeAgo { n: 7, unit: TimeUnit::Hour, }, TimeAgo { n: 8, unit: TimeUnit::Hour, }, TimeAgo { n: 9, unit: TimeUnit::Hour, }, TimeAgo { n: 12, unit: TimeUnit::Hour, }, TimeAgo { n: 17, unit: TimeUnit::Hour, }, TimeAgo { n: 18, unit: TimeUnit::Hour, }, TimeAgo { n: 19, unit: TimeUnit::Hour, }, TimeAgo { n: 20, unit: TimeUnit::Hour, }, TimeAgo { n: 10, unit: TimeUnit::Hour, }, TimeAgo { n: 11, unit: TimeUnit::Hour, }, TimeAgo { n: 13, unit: TimeUnit::Hour, }, TimeAgo { n: 1, unit: TimeUnit::Day, }, TimeAgo { n: 2, unit: TimeUnit::Day, }, TimeAgo { n: 3, unit: TimeUnit::Day, }, TimeAgo { n: 4, unit: TimeUnit::Day, }, TimeAgo { n: 6, unit: TimeUnit::Day, }, TimeAgo { n: 8, unit: TimeUnit::Day, }, TimeAgo { n: 10, unit: TimeUnit::Day, }, TimeAgo { n: 11, unit: TimeUnit::Day, }, TimeAgo { n: 12, unit: TimeUnit::Day, }, TimeAgo { n: 13, unit: TimeUnit::Day, }, TimeAgo { n: 2, unit: TimeUnit::Week, }, TimeAgo { n: 3, unit: TimeUnit::Week, }, TimeAgo { n: 1, unit: TimeUnit::Month, }, TimeAgo { n: 4, unit: TimeUnit::Week, }, TimeAgo { n: 7, unit: TimeUnit::Month, }, TimeAgo { n: 10, unit: TimeUnit::Month, }, TimeAgo { n: 1, unit: TimeUnit::Year, }, TimeAgo { n: 2, unit: TimeUnit::Year, }, TimeAgo { n: 3, unit: TimeUnit::Year, }, TimeAgo { n: 4, unit: TimeUnit::Year, }, TimeAgo { n: 5, unit: TimeUnit::Year, }, ]; let json_file = File::open(json_path).unwrap(); let strings_map: BTreeMap> = serde_json::from_reader(BufReader::new(json_file)).unwrap(); for (lang, strings) in &strings_map { assert_eq!(strings.len(), expect.len(), "Language: {lang}"); strings.iter().enumerate().for_each(|(n, s)| { let mut exp = expect[n]; if *lang == Language::Mn && exp.unit == TimeUnit::Week { exp.unit = TimeUnit::Day; exp.n *= 7; } assert_eq!( parse_timeago(*lang, s), Some(exp), "Language: {lang}, txt: `{s}`" ); }); } } #[test] fn t_timeago_table() { #[derive(Debug, Clone, Deserialize)] struct TimeagoTable { entries: BTreeMap>, } #[derive(Debug, Clone, Deserialize)] struct TimeagoTableEntry { cases: BTreeMap, } let json_path = path!(*TESTFILES / "dict" / "timeago_table.json"); let json_file = File::open(json_path).unwrap(); let timeago_table: TimeagoTable = serde_json::from_reader(BufReader::new(json_file)).unwrap(); let mut n_cases = 0; timeago_table.entries.iter().for_each(|(lang, entries)| { for (t, entry) in entries { entry.cases.iter().for_each(|(txt, n)| { let timeago = parse_timeago(*lang, txt); assert_eq!( timeago, Some(TimeAgo { n: *n, unit: *t }), "lang: {lang}, txt: {txt}" ); n_cases += 1; }); } }); assert_eq!(n_cases, 1065); } #[rstest] #[case(Language::En, "Updated today", Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })))] #[case(Language::En, "Updated yesterday", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))] #[case(Language::En, "Updated 2 days ago", Some(ParsedDate::Relative(TimeAgo { n: 2, unit: TimeUnit::Day })))] #[case(Language::Si, "ඊයේ යාවත්කාලීන කරන ලදී", Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })))] #[case( Language::En, "Last updated on Jun 04, 2003", Some(ParsedDate::Absolute(date!(2003-6-4))) )] #[case( Language::Bn, "যোগ দিয়েছেন 24 সেপ, 2013", Some(ParsedDate::Absolute(date!(2013-9-24))) )] fn t_parse_date( #[case] lang: Language, #[case] textual_date: &str, #[case] expect: Option, ) { let parsed_date = parse_textual_date(lang, textual_date); assert_eq!(parsed_date, expect); } #[test] fn t_parse_date_samples() { let json_path = path!(*TESTFILES / "dict" / "playlist_samples.json"); let json_file = File::open(json_path).unwrap(); let date_samples: BTreeMap> = serde_json::from_reader(BufReader::new(json_file)).unwrap(); for (lang, samples) in &date_samples { assert_eq!( parse_textual_date(*lang, samples.get("Today").unwrap()), Some(ParsedDate::Relative(TimeAgo { n: 0, unit: TimeUnit::Day })), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Yesterday").unwrap()), Some(ParsedDate::Relative(TimeAgo { n: 1, unit: TimeUnit::Day })), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Ago").unwrap()), Some(ParsedDate::Relative(TimeAgo { n: 5, unit: TimeUnit::Day })), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Jan").unwrap()), Some(ParsedDate::Absolute(date!(2020 - 1 - 3))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Feb").unwrap()), Some(ParsedDate::Absolute(date!(2016 - 2 - 7))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Mar").unwrap()), Some(ParsedDate::Absolute(date!(2015 - 3 - 9))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Apr").unwrap()), Some(ParsedDate::Absolute(date!(2017 - 4 - 2))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("May").unwrap()), Some(ParsedDate::Absolute(date!(2014 - 5 - 22))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Jun").unwrap()), Some(ParsedDate::Absolute(date!(2014 - 6 - 28))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Jul").unwrap()), Some(ParsedDate::Absolute(date!(2014 - 7 - 2))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Aug").unwrap()), Some(ParsedDate::Absolute(date!(2015 - 8 - 23))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Sep").unwrap()), Some(ParsedDate::Absolute(date!(2018 - 9 - 16))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Oct").unwrap()), Some(ParsedDate::Absolute(date!(2014 - 10 - 31))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Nov").unwrap()), Some(ParsedDate::Absolute(date!(2016 - 11 - 3))), "lang: {lang}" ); assert_eq!( parse_textual_date(*lang, samples.get("Dec").unwrap()), Some(ParsedDate::Absolute(date!(2021 - 12 - 24))), "lang: {lang}" ); } } #[test] fn t_parse_video_duration() { let json_path = path!(*TESTFILES / "dict" / "video_duration_samples.json"); let json_file = File::open(json_path).unwrap(); let date_samples: BTreeMap> = serde_json::from_reader(BufReader::new(json_file)).unwrap(); for (lang, samples) in &date_samples { for (txt, duration) in samples { assert_eq!( parse_video_duration(*lang, txt), Some(*duration), "lang: {lang}; txt: `{txt}`" ); } } } #[rstest] #[case(Language::Ar, "19 دقيقة وثانيتان", 1142)] #[case(Language::Ar, "دقيقة و13 ثانية", 73)] #[case(Language::Sw, "dakika 1 na sekunde 13", 73)] fn t_parse_video_duration2( #[case] lang: Language, #[case] video_duration: &str, #[case] expect: u32, ) { assert_eq!(parse_video_duration(lang, video_duration), Some(expect)); } #[test] fn t_to_datetime() { // Absolute date let date = parse_textual_date_to_dt(Language::En, "Last updated on Jan 3, 2020").unwrap(); assert_eq!(date, datetime!(2020-1-3 0:00 +0)); // Relative date let date = parse_textual_date_to_dt(Language::En, "1 year ago").unwrap(); let now = OffsetDateTime::now_utc(); assert_eq!(date.year(), now.year() - 1); } }