Completed timeago table

This commit is contained in:
ThetaDev 2022-09-04 00:04:44 +02:00
parent 17094d121b
commit 2bba9064fc
3 changed files with 4392 additions and 136 deletions

View file

@ -7,22 +7,19 @@ use std::{
path::Path,
};
use anyhow::anyhow;
use fancy_regex::Regex;
use futures::{stream, StreamExt};
use intl_pluralrules::{PluralCategory, PluralRuleType, PluralRules};
use log::{error, info};
use once_cell::sync::Lazy;
use reqwest::Method;
use serde::{Deserialize, Serialize};
use unic_langid::LanguageIdentifier;
use crate::{
client::{
response::{self, video::CommentListItem},
ClientType, ContextYT, RustyTube,
},
client::{response, ClientType, ContextYT, RustyTube},
model::{Country, Language},
timeago::{self, TimeAgo, TimeUnit, LANGUAGES},
timeago::{self, TimeUnit, TimeagoPattern, LANGUAGES},
};
#[derive(Clone, Debug, Serialize)]
@ -77,49 +74,48 @@ async fn get_channel_datestrings(rp: &RustyTube, channel_id: &str) -> Vec<String
.collect::<Vec<_>>()
}
async fn get_comment_initial_ctoken(rp: &RustyTube, video_id: &str) -> (String, String) {
async fn get_comment_initial_ctoken(rp: &RustyTube, video_id: &str, latest: bool) -> String {
let video_response = rp.get_video_response(video_id).await.unwrap();
let top = video_response
.contents
.two_column_watch_next_results
.results
.results
.contents
.iter()
.find_map(|c| match c {
response::video::VideoResultsItem::ItemSectionRenderer {
contents,
section_identifier,
} => match section_identifier == "comment-item-section" {
true => match &contents[0] {
response::video::ItemSection::ContinuationItemRenderer {
continuation_endpoint,
} => Some(continuation_endpoint.continuation_command.token.to_owned()),
_ => None,
match latest {
true => video_response
.engagement_panels
.iter()
.find_map(|p| {
p.engagement_panel_section_list_renderer
.header
.engagement_panel_title_header_renderer
.menu
.sort_filter_sub_menu_renderer
.sub_menu_items
.get(1)
.map(|i| i.service_endpoint.continuation_command.token.to_owned())
})
.unwrap(),
false => video_response
.contents
.two_column_watch_next_results
.results
.results
.contents
.iter()
.find_map(|c| match c {
response::video::VideoResultsItem::ItemSectionRenderer {
contents,
section_identifier,
} => match section_identifier == "comment-item-section" {
true => match &contents[0] {
response::video::ItemSection::ContinuationItemRenderer {
continuation_endpoint,
} => Some(continuation_endpoint.continuation_command.token.to_owned()),
_ => None,
},
false => None,
},
false => None,
},
_ => None,
})
.unwrap();
let latest = video_response
.engagement_panels
.iter()
.find_map(|p| {
p.engagement_panel_section_list_renderer
.header
.engagement_panel_title_header_renderer
.menu
.sort_filter_sub_menu_renderer
.sub_menu_items
.get(1)
.map(|i| i.service_endpoint.continuation_command.token.to_owned())
})
.unwrap();
(top, latest)
_ => None,
})
.unwrap(),
}
}
async fn get_comment_datestrings(rp: &RustyTube, ctoken: &str) -> (Vec<String>, Option<String>) {
@ -253,6 +249,7 @@ struct Ruleset {
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
enum PluralCat {
Zero,
One,
Two,
Few,
@ -260,17 +257,15 @@ enum PluralCat {
Other,
}
impl TryFrom<PluralCategory> for PluralCat {
type Error = anyhow::Error;
fn try_from(value: PluralCategory) -> Result<Self, Self::Error> {
impl From<PluralCategory> for PluralCat {
fn from(value: PluralCategory) -> Self {
match value {
PluralCategory::ZERO => Err(anyhow!("zero is not supported")),
PluralCategory::ONE => Ok(Self::One),
PluralCategory::TWO => Ok(Self::Two),
PluralCategory::FEW => Ok(Self::Few),
PluralCategory::MANY => Ok(Self::Many),
PluralCategory::OTHER => Ok(Self::Other),
PluralCategory::ZERO => Self::Zero,
PluralCategory::ONE => Self::One,
PluralCategory::TWO => Self::Two,
PluralCategory::FEW => Self::Few,
PluralCategory::MANY => Self::Many,
PluralCategory::OTHER => Self::Other,
}
}
}
@ -308,11 +303,15 @@ static PLURAL_RULES: Lazy<BTreeMap<String, HashSet<PluralCat>>> = Lazy::new(|| {
.collect::<BTreeMap<_, _>>()
});
type TimeagoTable = BTreeMap<Language, BTreeMap<TimeUnit, TimeagoTableEntry>>;
/// Working state of the timeago table builder, (de)serializable with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct TimeagoTable {
/// Collection state per language and time unit.
entries: BTreeMap<Language, BTreeMap<TimeUnit, TimeagoTableEntry>>,
/// Date strings that could not be parsed, grouped by language.
errors: BTreeMap<Language, HashSet<String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct TimeagoTableEntry {
cases: BTreeMap<String, TimeAgo>,
cases: BTreeMap<String, u32>,
missing_plurals: HashSet<PluralCat>,
}
@ -327,85 +326,153 @@ const TIME_UNITS: [TimeUnit; 7] = [
];
fn new_timeago_table() -> TimeagoTable {
LANGUAGES
.iter()
.filter_map(|lang| {
// Check if language is redundant
match lang {
Language::EnGb
| Language::EnIn
| Language::FrCa
| Language::EsUs
| Language::Es419 => None,
_ => {
let cldr_lang_str = match lang {
Language::SrLatn => "sr".to_owned(),
Language::ZhCn | Language::ZhHk | Language::ZhTw => "zh".to_owned(),
_ => lang.to_string(),
};
TimeagoTable {
entries: LANGUAGES
.iter()
.filter_map(|lang| {
// Check if language is redundant
match lang {
Language::EnGb
| Language::EnIn
| Language::FrCa
| Language::EsUs
| Language::Es419 => None,
_ => {
let cldr_lang_str = match lang {
Language::SrLatn => "sr".to_owned(),
Language::ZhCn | Language::ZhHk | Language::ZhTw => "zh".to_owned(),
_ => lang.to_string(),
};
let m = TIME_UNITS
.iter()
.map(|t| {
let missing_plurals = if t == &TimeUnit::Week {
// Week only has 3 valid values (1-3)
let mut mp = HashSet::new();
let m = TIME_UNITS
.iter()
.map(|t| {
let missing_plurals = if t == &TimeUnit::Week {
// Week only has 3 valid values (2-4)
let mut mp = HashSet::new();
let l_id = cldr_lang_str.parse::<LanguageIdentifier>().unwrap();
let pr =
PluralRules::create(l_id, PluralRuleType::CARDINAL).unwrap();
let l_id = cldr_lang_str.parse::<LanguageIdentifier>().unwrap();
let pr = PluralRules::create(l_id, PluralRuleType::CARDINAL)
.unwrap();
mp.insert(PluralCat::try_from(pr.select(1).unwrap()).unwrap());
mp.insert(PluralCat::try_from(pr.select(2).unwrap()).unwrap());
mp.insert(PluralCat::try_from(pr.select(3).unwrap()).unwrap());
mp.insert(PluralCat::from(pr.select(2).unwrap()));
mp.insert(PluralCat::from(pr.select(3).unwrap()));
mp.insert(PluralCat::from(pr.select(4).unwrap()));
mp
} else {
PLURAL_RULES.get(&cldr_lang_str).unwrap().clone()
};
mp
} else {
PLURAL_RULES.get(&cldr_lang_str).unwrap().clone()
};
(
t.to_owned(),
TimeagoTableEntry {
cases: BTreeMap::new(),
missing_plurals,
},
)
})
.collect();
(
t.to_owned(),
TimeagoTableEntry {
cases: BTreeMap::new(),
missing_plurals,
},
)
})
.collect();
Some((lang.to_owned(), m))
Some((lang.to_owned(), m))
}
}
}
})
.collect()
})
.collect(),
errors: BTreeMap::new(),
}
}
#[test]
fn t_new_timeago_table() {
fn read_timeago_table() -> TimeagoTable {
let json_path = Path::new("testfiles/date/timeago_table.json").to_path_buf();
if json_path.exists() {
return;
let file = File::open(json_path).unwrap();
serde_json::from_reader(BufReader::new(file)).unwrap()
} else {
new_timeago_table()
}
let file = File::create(json_path).unwrap();
serde_json::to_writer_pretty(file, &new_timeago_table()).unwrap();
}
#[tokio::test]
async fn t_tmp() {
let rp = RustyTube::new();
let (top, latest) = get_comment_initial_ctoken(&rp, "gQlMMD8auMs").await;
// let (top, latest) = get_comment_initial_ctoken(&rp, "9bZkp7q19f0").await;
let mut ctoken = latest;
/// Serializes `timeago_table` as pretty-printed JSON to
/// `testfiles/date/timeago_table.json`, replacing any existing file.
///
/// # Panics
///
/// Panics if the file cannot be created or written, or if serialization fails.
fn write_timeago_table(timeago_table: &TimeagoTable) {
    use std::io::Write as _;

    let json_path = Path::new("testfiles/date/timeago_table.json");
    let file = File::create(json_path).unwrap();

    // The serializer emits many small writes; buffer them instead of hitting
    // the file handle for each one.
    let mut writer = std::io::BufWriter::new(file);
    serde_json::to_writer_pretty(&mut writer, timeago_table).unwrap();

    // Flush explicitly: errors during the implicit flush on drop are discarded.
    writer.flush().unwrap();
}
fn insert_timeago_table(
timeago_table: &mut TimeagoTable,
lang: &Language,
date_str: &str,
limit: Option<TimeUnit>,
ignore_1s: bool,
) -> bool {
let pattern = TimeagoPattern::from(lang.to_owned());
match pattern.parse(date_str) {
Some(timeago) => {
let entry = timeago_table
.entries
.get_mut(lang)
.unwrap()
.get_mut(&timeago.unit)
.unwrap();
let cldr_lang_str = &lang.to_string()[0..2];
let l_id: LanguageIdentifier = cldr_lang_str.parse().unwrap();
let pl_pat = PluralRules::create(l_id, PluralRuleType::CARDINAL).unwrap();
let pl = PluralCat::from(pl_pat.select(timeago.n).unwrap());
// Collect the case if its plural type is missing
if entry.missing_plurals.remove(&pl) {
entry.cases.insert(date_str.to_owned(), timeago.n);
info!(
"Collected `{}` ({} {:?})",
date_str, timeago.n, timeago.unit
);
}
timeago_table
.entries
.get(lang)
.unwrap()
.iter()
.all(|(t, entry)| {
(limit.is_some() && t > &limit.unwrap())
|| entry.missing_plurals.is_empty()
|| (ignore_1s
&& t == &TimeUnit::Second
&& entry.missing_plurals.len() == 1
&& entry.missing_plurals.contains(&PluralCat::One))
})
}
None => {
error!("Could not parse `{}`", date_str);
let errors = timeago_table
.errors
.entry(*lang)
.or_insert_with(|| HashSet::new());
errors.insert(date_str.to_owned());
false
}
}
}
async fn insert_timeago_table_datestrings(
rp: &RustyTube,
timeago_table: &mut TimeagoTable,
video_id: &str,
latest: bool,
limit: Option<TimeUnit>,
ignore_1s: bool,
) {
let mut ctoken = get_comment_initial_ctoken(&rp, video_id, latest).await;
let brace_pattern = Regex::new(r"\(.+\)").unwrap();
let lang = &rp.localization.language;
let err_baseline = timeago_table.errors.len();
for _ in 0..100 {
for _ in 0..40 {
let (strings, new_ctoken) = get_comment_datestrings(&rp, &ctoken).await;
/*
strings
let res = strings
.iter()
.map(|s| {
// Remove zero-width space characters
@ -417,14 +484,186 @@ async fn t_tmp() {
let s = s.trim();
s.to_owned()
})
.for_each(|s| println!("{}", s));
*/
println!("n: {}", strings.len());
.find(|s| insert_timeago_table(timeago_table, lang, &s, limit, ignore_1s));
if res.is_some() {
break;
}
if timeago_table.errors.len() > err_baseline {
return;
}
if let Some(new_ctoken) = new_ctoken {
ctoken = new_ctoken.to_owned();
} else {
error!("end of comments");
break;
}
}
}
/// Fetches the date strings of channel `channel_id` and feeds each of them
/// into `timeago_table` for the language `rp` is configured with.
///
/// No unit limit is applied and the completion flag returned by
/// `insert_timeago_table` is discarded: every string is processed.
async fn insert_timeago_table_datestrings_channel(
    rp: &RustyTube,
    timeago_table: &mut TimeagoTable,
    channel_id: &str,
) {
    let lang = &rp.localization.language;

    for raw in get_channel_datestrings(rp, channel_id).await {
        // Remove zero-width space characters
        let without_zwsp = raw.replace('\u{200b}', "");
        insert_timeago_table(timeago_table, lang, without_zwsp.trim(), None, false);
    }
}
#[test_log::test(tokio::test)]
async fn t_build_timeago_table() {
let mut timeago_table = read_timeago_table();
let ignore_1s = false;
let langs = timeago_table
.entries
.keys()
.map(|k| k.to_owned())
.collect::<Vec<_>>();
for lang in langs {
if timeago_table
.entries
.get(&lang)
.unwrap()
.iter()
.all(|(t, entry)| {
entry.missing_plurals.is_empty()
|| (ignore_1s
&& t == &TimeUnit::Second
&& entry.missing_plurals.len() == 1
&& entry.missing_plurals.contains(&PluralCat::One))
})
{
continue;
}
let rp = RustyTube::new_with_ua(lang, Country::Us, None);
println!("{}: 1s!", lang);
{
let ctoken = get_comment_initial_ctoken(&rp, "gQlMMD8auMs", true).await;
// let ctoken = get_comment_initial_ctoken(&rp, "k6jqx9kZgPM", true).await;
let brace_pattern = Regex::new(r"\(.+\)").unwrap();
let lang = &rp.localization.language;
let err_baseline = timeago_table.errors.len();
loop {
let (strings, _) = get_comment_datestrings(&rp, &ctoken).await;
println!("{}", strings[0]);
let res = strings
.iter()
.map(|s| {
// Remove zero-width space characters
let s = s.replace('\u{200b}', "");
// Remove braces
let s = brace_pattern.replace(&s, "");
let s = s.trim();
s.to_owned()
})
.find(|s| {
insert_timeago_table(
&mut timeago_table,
lang,
&s,
Some(TimeUnit::Second),
ignore_1s,
)
});
if res.is_some() {
break;
}
if timeago_table.errors.len() > err_baseline {
break;
}
}
}
println!("{}: 2s - n min", lang);
insert_timeago_table_datestrings(
&rp,
&mut timeago_table,
"gQlMMD8auMs",
true,
Some(TimeUnit::Minute),
ignore_1s,
)
.await;
println!("{}: x hr", lang);
insert_timeago_table_datestrings(
&rp,
&mut timeago_table,
"TohrPm3ICJE",
true,
Some(TimeUnit::Hour),
ignore_1s,
)
.await;
println!("{}: 1 hr - n day", lang);
insert_timeago_table_datestrings(
&rp,
&mut timeago_table,
"J9NQFACZYEU",
true,
Some(TimeUnit::Day),
ignore_1s,
)
.await;
println!("{}: week", lang);
insert_timeago_table_datestrings(
&rp,
&mut timeago_table,
"-zPDx6HQ_9w",
true,
Some(TimeUnit::Week),
ignore_1s,
)
.await;
println!("{}: 1 yr - n yr", lang);
insert_timeago_table_datestrings_channel(
&rp,
&mut timeago_table,
"UCEOXxzW2vU0P-0THehuIIeg",
)
.await;
println!("{}: 11 mon", lang);
insert_timeago_table_datestrings_channel(
&rp,
&mut timeago_table,
"UCY1kMZp36IQSyNx_9h4mpCg",
)
.await;
println!("{}: 13 yr", lang);
insert_timeago_table_datestrings_channel(
&rp,
&mut timeago_table,
"UCfw6qEAJMDbmgqQbuoB5moA",
)
.await;
write_timeago_table(&timeago_table);
}
}

View file

@ -1,4 +1,4 @@
use std::{borrow::Cow, str::FromStr, vec};
use std::{borrow::Cow, str::FromStr, vec, cmp::Ordering};
use anyhow::Result;
use fancy_regex::Regex;
@ -92,7 +92,7 @@ pub const LANGUAGES: [Language; 83] = [
Language::Zu,
];
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize, Eq)]
pub struct TimeAgo {
pub n: u32,
pub unit: TimeUnit,
@ -139,7 +139,7 @@ impl From<Language> for TimeagoPattern<'_> {
},
Language::Am => TimeagoPattern {
word_separator: " ",
seconds: vec!["ሰኮንዶች", "ሴኮንድ"],
seconds: vec!["ሰኮንዶች", "ሴኮንድ", "ሰከንድ", "ሰከንዶች"],
minutes: vec!["ደቂቃ", "ደቂቃዎች"],
hours: vec!["ሰዓት", "ሰዓቶች"],
// INFO: add days[0]
@ -151,14 +151,28 @@ impl From<Language> for TimeagoPattern<'_> {
},
Language::Ar => TimeagoPattern {
word_separator: " ",
seconds: vec!["ثانية", "ثانيتين", "ثوانٍ"],
seconds: vec!["ثانية", "ثوانٍ", "ثوانِ"],
minutes: vec!["دقائق", "دقيقة", "دقيقتين"],
hours: vec!["ساعات", "ساعة", "ساعتين"],
days: vec!["أيام", "يوم", "يومين", "يومًا"],
weeks: vec!["أسابيع", "أسبوع", "أسبوعين"],
months: vec!["أشهر", "شهر", "شهرين", "شهرًا"],
years: vec!["سنة", "سنتين", "سنوات"],
hours: vec!["ساعات", "ساعة"],
days: vec!["أيام", "يوم", "يومًا"],
weeks: vec!["أسابيع", "أسبوع"],
months: vec!["أشهر", "شهر", "شهرًا"],
years: vec!["سنة", "سنوات"],
special_cases: vec![
(
"ثانيتين",
TimeAgo {
n: 2,
unit: TimeUnit::Second,
},
),
(
"دقيقتين",
TimeAgo {
n: 2,
unit: TimeUnit::Minute,
},
),
(
"ساعتين",
TimeAgo {
@ -281,7 +295,7 @@ impl From<Language> for TimeagoPattern<'_> {
seconds: vec!["sekundami", "sekundou"],
minutes: vec!["minutami", "minutou"],
hours: vec!["hodinami", "hodinou"],
days: vec!["dny", "včera"],
days: vec!["dny", "dnem"],
weeks: vec!["týdnem", "týdny"],
months: vec!["měsícem", "měsíci"],
years: vec!["rokem", "roky", "lety"],
@ -521,6 +535,20 @@ impl From<Language> for TimeagoPattern<'_> {
months: vec!["חודש", "חודשים"],
years: vec!["שנה", "שנים"],
special_cases: vec![
(
"שתי שניות",
TimeAgo {
n: 2,
unit: TimeUnit::Second,
},
),
(
"שתי דקות",
TimeAgo {
n: 2,
unit: TimeUnit::Minute,
},
),
(
"שעתיים",
TimeAgo {
@ -662,7 +690,7 @@ impl From<Language> for TimeagoPattern<'_> {
Language::Lv => TimeagoPattern {
word_separator: " ",
seconds: vec!["sekundes", "sekundēm"],
minutes: vec!["minūtes", "minūtēm", "minūtes"],
minutes: vec!["minūtes", "minūtēm"],
hours: vec!["stundas", "stundām"],
days: vec!["dienas", "dienām"],
weeks: vec!["nedēļas", "nedēļām"],
@ -867,7 +895,7 @@ impl From<Language> for TimeagoPattern<'_> {
seconds: vec!["sekundama", "sekundami", "sekundo"],
minutes: vec!["minutama", "minutami", "minuto"],
hours: vec!["urama", "urami", "uro"],
days: vec!["dnem", "dnevi", "dnevoma"],
days: vec!["dnem", "dnevi", "dnevoma", "dnevom"],
weeks: vec!["tedni", "tednom", "tednoma"],
months: vec!["mesecem", "mesecema", "meseci"],
years: vec!["leti", "letom", "letoma"],
@ -932,9 +960,9 @@ impl From<Language> for TimeagoPattern<'_> {
},
Language::Ta => TimeagoPattern {
word_separator: " ",
// INFO: fixed minutes hours months, TODO: 1 second
// INFO: fixed minutes hours months
// 2 விநாடிகளுக்கு முன்
seconds: vec!["வினாடி", "வினாடிகளுக்கு"],
seconds: vec!["வினாடி", "வினாடிகளுக்கு", "விநாடிகளுக்கு", "விநாடிக்கு"],
// 1 நிமிடத்திற்கு முன் 2 நிமிடங்களுக்கு முன்
minutes: vec!["நிமிடங்களுக்கு", "நிமிடத்திற்கு", "நிமிடங்கள்", "நிமிடம்"],
hours: vec!["மணிநேரம்"],
@ -1065,7 +1093,7 @@ impl From<Language> for TimeagoPattern<'_> {
seconds: vec!["amasekhondi", "isekhondi"],
minutes: vec!["amaminithi", "iminithi"],
hours: vec!["emahoreni", "amahora", "ihora"],
days: vec!["ezinsukwini", "izinsuku", "usuku"],
days: vec!["ezinsukwini", "izinsuku", "usuku", "osukwini"],
weeks: vec!["amaviki", "iviki"],
months: vec!["inyanga", "izinyanga"],
years: vec!["iminyaka", "unyaka"],
@ -1163,6 +1191,44 @@ impl TimeagoPattern<'_> {
}
}
impl TimeUnit {
fn seconds(&self) -> u64 {
match self {
TimeUnit::Second => 1,
TimeUnit::Minute => 60,
TimeUnit::Hour => 3600,
TimeUnit::Day => 24 * 3600,
TimeUnit::Week => 7 * 24 * 3600,
TimeUnit::Month => 30 * 24 * 3600,
TimeUnit::Year => 365 * 24 * 3600,
}
}
}
impl TimeAgo {
    /// Returns the length of this time span in seconds: `n` times the
    /// length of its unit.
    fn seconds(&self) -> u64 {
        let unit_len = self.unit.seconds();
        u64::from(self.n) * unit_len
    }
}
impl PartialEq for TimeAgo {
    /// Two time spans are equal when they cover the same number of seconds,
    /// regardless of the unit they are expressed in.
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.seconds(), other.seconds());
        lhs == rhs
    }
}
impl Ord for TimeAgo {
    /// Orders time spans by the number of seconds they cover.
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self.seconds();
        let rhs = other.seconds();
        Ord::cmp(&lhs, &rhs)
    }
}
impl PartialOrd for TimeAgo {
    /// Always yields an ordering; delegates to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
#[cfg(test)]
mod tests {
use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};

File diff suppressed because it is too large Load diff