feat: add absolute dates/months to dictionary

This commit is contained in:
ThetaDev 2022-09-06 21:27:28 +02:00
parent c9433d721d
commit d18f175aef
10 changed files with 9942 additions and 1834 deletions

View file

@ -1,12 +1,19 @@
#![cfg(test)]
use std::{collections::BTreeMap, fs::File, path::Path};
use std::{
collections::{BTreeMap, HashMap},
fs::File,
hash::Hash,
io::BufReader,
path::Path,
};
use serde::{Deserialize, Serialize};
use crate::{
client::RustyTube,
model::{locale::LANGUAGES, Country, Language},
util,
};
type CollectedDates = BTreeMap<Language, BTreeMap<DateCase, String>>;
@ -30,7 +37,7 @@ enum DateCase {
Dec,
}
#[test_log::test(tokio::test)]
// #[test_log::test(tokio::test)]
async fn collect_dates() {
let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf();
if json_path.exists() {
@ -44,7 +51,7 @@ async fn collect_dates() {
),
(DateCase::Yesterday, "PLmB6td997u3kUOrfFwkULZ910ho44oQSy"),
(DateCase::Ago, "PL7zsB-C3aNu2yRY2869T0zj1FhtRIu5am"),
(DateCase::Jan, "PL1J-6JOckZtHxTA3hN5SK7gBQaFfKzeXr"),
(DateCase::Jan, "PL1J-6JOckZtFjcni6Xj1pLYglJp6JCpKD"),
(DateCase::Feb, "PL1J-6JOckZtETrbzwZE7mRIIK6BzWNLAs"),
(DateCase::Mar, "PL1J-6JOckZtG3AVdvBXhMO64mB2k3BtKi"),
(DateCase::Apr, "PL1J-6JOckZtE_rUpK24S6X5hOE4eQoprN"),
@ -75,3 +82,102 @@ async fn collect_dates() {
let file = File::create(json_path).unwrap();
serde_json::to_writer_pretty(file, &collected_dates).unwrap();
}
/// Derives the numeric date order and the month-name words of every language in the
/// dictionary from the collected playlist date samples and writes the result back.
///
/// For each language and each month sample, the numbers found in the date string are
/// matched against the expected date of that sample to determine in which order
/// year (`Y`), month (`M`) and day (`D`) are written. The whitespace-separated words
/// that remain after removing all ASCII digits are recorded: a word that was seen exactly
/// once is stored as a name of that month, a word that was seen again is discarded.
///
/// # Panics
/// Panics if the sample file or the dictionary cannot be read, if a language or month
/// is missing from the samples, if a sample contains a number that is not part of its
/// expected date, or if the number order differs between the months of one language.
// #[test]
fn parse_months() {
    let json_path = Path::new("testfiles/date/playlist_samples.json").to_path_buf();
    let json_file = File::open(json_path).unwrap();
    let collected_dates: CollectedDates =
        serde_json::from_reader(BufReader::new(json_file)).unwrap();
    let mut dict = super::read_dict();
    // Copy the keys up front so `dict` can be modified inside the loop.
    let langs = dict.keys().map(|k| k.to_owned()).collect::<Vec<_>>();

    let months = [
        DateCase::Jan,
        DateCase::Feb,
        DateCase::Mar,
        DateCase::Apr,
        DateCase::May,
        DateCase::Jun,
        DateCase::Jul,
        DateCase::Aug,
        DateCase::Sep,
        DateCase::Oct,
        DateCase::Nov,
        DateCase::Dec,
    ];

    // Expected (year, month, day) of the sample belonging to the same index of `months`.
    let dates: [(u32, u32, u32); 12] = [
        (2020, 1, 3),
        (2016, 2, 7),
        (2015, 3, 9),
        (2017, 4, 2),
        (2014, 5, 22),
        (2014, 6, 28),
        (2014, 7, 2),
        (2015, 8, 23),
        (2018, 9, 16),
        (2014, 10, 31),
        (2016, 11, 3),
        (2021, 12, 24),
    ];

    for lang in langs {
        let lang_dates = collected_dates
            .get(&lang)
            .unwrap_or_else(|| panic!("no collected dates for language {:?}", lang));

        // Word -> month number. `None` marks a word that occurred more than once and
        // therefore cannot identify a single month.
        let mut month_words: HashMap<String, Option<u8>> = HashMap::new();
        let mut num_order = String::new();

        let samples = months.iter().zip(dates.iter());
        for (month_no, (m, &(year, month, day))) in (1u8..).zip(samples) {
            let datestr = lang_dates.get(m).unwrap_or_else(|| {
                panic!("no sample for month {} in language {:?}", month_no, lang)
            });

            // Get order of numbers
            let this_num_order = util::parse_numeric_vec::<u32>(datestr)
                .iter()
                .map(|&n| {
                    if n == year {
                        "Y"
                    } else if n == month {
                        "M"
                    } else if n == day {
                        "D"
                    } else {
                        panic!("invalid number {} in {}", n, datestr);
                    }
                })
                .collect::<String>();

            // The first non-empty order becomes the reference, all later ones must agree.
            if num_order.is_empty() {
                num_order = this_num_order;
            } else {
                assert_eq!(this_num_order, num_order);
            }

            // Insert words into the map
            let filtered_str = datestr
                .chars()
                .filter(|c| !c.is_ascii_digit())
                .collect::<String>();

            for word in filtered_str.split_whitespace() {
                month_words
                    .entry(word.to_owned())
                    .and_modify(|e| *e = None)
                    .or_insert(Some(month_no));
            }
        }

        let dict_entry = dict.entry(lang).or_default();
        dict_entry.date_order = num_order;
        // Keep only the words that were seen exactly once.
        dict_entry.months = month_words
            .into_iter()
            .filter_map(|(word, m)| m.map(|m| (word, m)))
            .collect();
    }

    super::write_dict(&dict);
}

View file

@ -1,31 +1,11 @@
#![cfg(test)]
use std::{
collections::BTreeMap,
fmt::Debug,
fs::File,
io::{BufReader},
};
use crate::{model::Language, timeago::TimeUnit};
use crate::{timeago::TimeUnit};
use fancy_regex::Regex;
use once_cell::sync::Lazy;
use serde::Deserialize;
const DICT_PATH: &str = "testfiles/date/dictionary.json";
const TARGET_FILE: &str = "src/dictionary.rs";
type Dictionary = BTreeMap<Language, DictEntry>;
/// Dictionary data of a single language, deserialized from the JSON file at `DICT_PATH`.
#[derive(Debug, Deserialize)]
struct DictEntry {
    /// Additional languages that are added to this entry's match selector
    /// (`Language::A | Language::B`) when the dictionary code is generated.
    /// Empty if the key is missing in the JSON.
    #[serde(default)]
    equivalent: Vec<Language>,
    /// NOTE(review): not read anywhere in the visible code; the name suggests the text
    /// of this language is tokenized per character - confirm against the parser.
    /// `false` if the key is missing in the JSON.
    #[serde(default)]
    by_char: bool,
    /// Maps a text token to a time unit string that is decoded by `parse_tu`.
    timeago_tokens: BTreeMap<String, String>,
}
fn parse_tu(tu: &str) -> (u8, Option<TimeUnit>) {
static TU_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(\d*)(\w?)$").unwrap());
match TU_PATTERN.captures(tu).unwrap() {
@ -47,51 +27,62 @@ fn parse_tu(tu: &str) -> (u8, Option<TimeUnit>) {
}
}
/// Load the dictionary from the JSON file at `DICT_PATH`.
///
/// Panics if the file cannot be opened or its content does not deserialize
/// into a `Dictionary`.
fn read_dict() -> Dictionary {
    let reader = BufReader::new(File::open(DICT_PATH).unwrap());
    serde_json::from_reader(reader).unwrap()
}
// #[test]
fn generate_dictionary() {
let dict = read_dict();
let dict = super::read_dict();
let code_head = r#"// This file is automatically generated. DO NOT EDIT.
use crate::{
model::Language,
timeago::{TaToken, TimeUnit},
};
pub struct Entry {
pub timeago_tokens: phf::Map<&'static str, TaToken>,
pub date_order: &'static str,
pub months: phf::Map<&'static str, u8>,
}
"#;
let mut code_timeago_tokens = r#"#[rustfmt::skip]
pub(crate) fn get_timeago_tokens(lang: Language) -> phf::Map<&'static str, TaToken> {
pub fn entry(lang: Language) -> Entry {
match lang {
"#
.to_owned();
dict.iter().for_each(|(lang, entry)| {
// Create a map for the language
let mut map = phf_codegen::Map::<&str>::new();
entry.timeago_tokens.iter().for_each(|(txt, tu_str)| {
let (n, unit) = parse_tu(&tu_str);
match unit {
Some(unit) => map.entry(
&txt,
&format!("TaToken {{ n: {}, unit: Some(TimeUnit::{:?}) }}", n, unit),
),
None => map.entry(&txt, &format!("TaToken {{ n: {}, unit: None }}", n)),
};
});
// Match selector
let mut selector = format!("Language::{:?}", lang);
entry.equivalent.iter().for_each(|eq| {
selector += &format!(" | Language::{:?}", eq);
});
let code_map = &map.build().to_string().replace('\n', "\n ");
// Timeago tokens
let mut ta_tokens = phf_codegen::Map::<&str>::new();
entry.timeago_tokens.iter().for_each(|(txt, tu_str)| {
let (n, unit) = parse_tu(&tu_str);
match unit {
Some(unit) => ta_tokens.entry(
&txt,
&format!("TaToken {{ n: {}, unit: Some(TimeUnit::{:?}) }}", n, unit),
),
None => ta_tokens.entry(&txt, &format!("TaToken {{ n: {}, unit: None }}", n)),
};
});
code_timeago_tokens += &format!("{} => {},\n ", selector, code_map);
// Months
let mut months = phf_codegen::Map::<&str>::new();
entry.months.iter().for_each(|(txt, n_mon)| {
months.entry(&txt, &n_mon.to_string());
});
let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n ");
let code_months = &months.build().to_string().replace('\n', "\n ");
code_timeago_tokens += &format!(
"{} => Entry {{\n timeago_tokens: {},\n date_order: \"{}\",\n months: {},\n }},\n ",
selector, code_ta_tokens, entry.date_order, code_months
);
});
code_timeago_tokens = code_timeago_tokens.trim_end().to_owned() + "\n }\n}\n";

View file

@ -1,4 +1,34 @@
#![cfg(test)]
use std::{collections::BTreeMap, fs::File, io::BufReader};
use serde::{Serialize, Deserialize};
use crate::model::Language;
mod collect_playlist_dates;
mod gen_dictionary;
mod gen_locales;
const DICT_PATH: &str = "testfiles/date/dictionary.json";
type Dictionary = BTreeMap<Language, DictEntry>;
/// Dictionary data of a single language, stored in the JSON file at `DICT_PATH`.
#[derive(Debug, Default, Serialize, Deserialize)]
// Every field that is missing in the JSON falls back to its `Default` value.
#[serde(default)]
struct DictEntry {
    /// Additional languages that are added to this entry's match selector
    /// (`Language::A | Language::B`) when the dictionary code is generated.
    equivalent: Vec<Language>,
    /// NOTE(review): not read anywhere in the visible code; the name suggests the text
    /// of this language is tokenized per character - confirm against the parser.
    by_char: bool,
    /// Maps a text token to a time unit string that is decoded by `parse_tu`
    /// in the dictionary generator.
    timeago_tokens: BTreeMap<String, String>,
    /// Order in which the numbers of an absolute date are written, one letter per number:
    /// `Y` (year), `M` (month), `D` (day). Filled in by `parse_months`.
    date_order: String,
    /// Maps a word to the number of the month (1-12) it stands for.
    /// Filled in by `parse_months`.
    months: BTreeMap<String, u8>,
}
/// Load the dictionary from the JSON file at `DICT_PATH`.
///
/// Panics if the file cannot be opened or its content does not deserialize
/// into a `Dictionary`.
fn read_dict() -> Dictionary {
    let reader = BufReader::new(File::open(DICT_PATH).unwrap());
    serde_json::from_reader(reader).unwrap()
}
/// Serialize `dict` as pretty-printed JSON and store it at `DICT_PATH`,
/// replacing the previous content of the file.
///
/// Panics if the dictionary cannot be serialized or the file cannot be written.
fn write_dict(dict: &Dictionary) {
    // Serialize into memory before touching the file: a serialization failure then
    // leaves the existing dictionary intact instead of a truncated file, and the data
    // is handed to the OS in one piece instead of many small unbuffered writes.
    let json = serde_json::to_vec_pretty(dict).unwrap();
    std::fs::write(DICT_PATH, json).unwrap();
}

File diff suppressed because it is too large Load diff

View file

@ -11,7 +11,7 @@ pub struct TimeAgo {
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct TaToken {
pub struct TaToken {
pub n: u8,
pub unit: Option<TimeUnit>,
}
@ -67,7 +67,7 @@ impl PartialOrd for TimeAgo {
}
pub fn parse(lang: Language, textual_date: &str) -> Option<TimeAgo> {
let mappings = dictionary::get_timeago_tokens(lang);
let mappings = dictionary::entry(lang).timeago_tokens;
let filtered_str = textual_date
.to_lowercase()

View file

@ -59,8 +59,36 @@ pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
where
F: FromStr,
{
static NUM_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new("\\D+").unwrap());
NUM_PATTERN.replace_all(string, "").parse()
let mut buf = String::new();
for c in string.chars() {
if c.is_ascii_digit() {
buf.push(c);
}
}
buf.parse()
}
/// Parse all numbers occurring in a string and return them as a vec.
///
/// A number is a maximal run of consecutive ASCII digits; every other character acts
/// as a separator. Runs that cannot be parsed as `F` (for example because the value
/// does not fit into the target type) are skipped.
pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
where
    F: FromStr,
{
    string
        .split(|c: char| !c.is_ascii_digit())
        // Adjacent separators produce empty pieces, which are not numbers.
        .filter(|run| !run.is_empty())
        .filter_map(|run| run.parse().ok())
        .collect()
}
#[cfg(test)]
@ -76,4 +104,13 @@ mod tests {
let n = parse_numeric::<u32>(string).unwrap();
assert_eq!(n, expect);
}
#[rstest]
#[case("15.03.2022", vec![15, 3, 2022])]
#[case("4 Hello World 2", vec![4, 2])]
#[case("最后更新时间2020年1月3日", vec![2020, 1, 3])]
fn t_parse_numeric_vec(#[case] string: &str, #[case] expect: Vec<u32>) {
    // Every digit run of the input has to be returned, in order of occurrence.
    assert_eq!(parse_numeric_vec::<u32>(string), expect);
}
}