Completed timeago table
This commit is contained in:
parent
17094d121b
commit
2bba9064fc
3 changed files with 4392 additions and 136 deletions
|
|
@ -7,22 +7,19 @@ use std::{
|
|||
path::Path,
|
||||
};
|
||||
|
||||
use anyhow::anyhow;
|
||||
use fancy_regex::Regex;
|
||||
use futures::{stream, StreamExt};
|
||||
use intl_pluralrules::{PluralCategory, PluralRuleType, PluralRules};
|
||||
use log::{error, info};
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::Method;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use unic_langid::LanguageIdentifier;
|
||||
|
||||
use crate::{
|
||||
client::{
|
||||
response::{self, video::CommentListItem},
|
||||
ClientType, ContextYT, RustyTube,
|
||||
},
|
||||
client::{response, ClientType, ContextYT, RustyTube},
|
||||
model::{Country, Language},
|
||||
timeago::{self, TimeAgo, TimeUnit, LANGUAGES},
|
||||
timeago::{self, TimeUnit, TimeagoPattern, LANGUAGES},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -77,49 +74,48 @@ async fn get_channel_datestrings(rp: &RustyTube, channel_id: &str) -> Vec<String
|
|||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
async fn get_comment_initial_ctoken(rp: &RustyTube, video_id: &str) -> (String, String) {
|
||||
async fn get_comment_initial_ctoken(rp: &RustyTube, video_id: &str, latest: bool) -> String {
|
||||
let video_response = rp.get_video_response(video_id).await.unwrap();
|
||||
|
||||
let top = video_response
|
||||
.contents
|
||||
.two_column_watch_next_results
|
||||
.results
|
||||
.results
|
||||
.contents
|
||||
.iter()
|
||||
.find_map(|c| match c {
|
||||
response::video::VideoResultsItem::ItemSectionRenderer {
|
||||
contents,
|
||||
section_identifier,
|
||||
} => match section_identifier == "comment-item-section" {
|
||||
true => match &contents[0] {
|
||||
response::video::ItemSection::ContinuationItemRenderer {
|
||||
continuation_endpoint,
|
||||
} => Some(continuation_endpoint.continuation_command.token.to_owned()),
|
||||
_ => None,
|
||||
match latest {
|
||||
true => video_response
|
||||
.engagement_panels
|
||||
.iter()
|
||||
.find_map(|p| {
|
||||
p.engagement_panel_section_list_renderer
|
||||
.header
|
||||
.engagement_panel_title_header_renderer
|
||||
.menu
|
||||
.sort_filter_sub_menu_renderer
|
||||
.sub_menu_items
|
||||
.get(1)
|
||||
.map(|i| i.service_endpoint.continuation_command.token.to_owned())
|
||||
})
|
||||
.unwrap(),
|
||||
false => video_response
|
||||
.contents
|
||||
.two_column_watch_next_results
|
||||
.results
|
||||
.results
|
||||
.contents
|
||||
.iter()
|
||||
.find_map(|c| match c {
|
||||
response::video::VideoResultsItem::ItemSectionRenderer {
|
||||
contents,
|
||||
section_identifier,
|
||||
} => match section_identifier == "comment-item-section" {
|
||||
true => match &contents[0] {
|
||||
response::video::ItemSection::ContinuationItemRenderer {
|
||||
continuation_endpoint,
|
||||
} => Some(continuation_endpoint.continuation_command.token.to_owned()),
|
||||
_ => None,
|
||||
},
|
||||
false => None,
|
||||
},
|
||||
false => None,
|
||||
},
|
||||
_ => None,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let latest = video_response
|
||||
.engagement_panels
|
||||
.iter()
|
||||
.find_map(|p| {
|
||||
p.engagement_panel_section_list_renderer
|
||||
.header
|
||||
.engagement_panel_title_header_renderer
|
||||
.menu
|
||||
.sort_filter_sub_menu_renderer
|
||||
.sub_menu_items
|
||||
.get(1)
|
||||
.map(|i| i.service_endpoint.continuation_command.token.to_owned())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
(top, latest)
|
||||
_ => None,
|
||||
})
|
||||
.unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_comment_datestrings(rp: &RustyTube, ctoken: &str) -> (Vec<String>, Option<String>) {
|
||||
|
|
@ -253,6 +249,7 @@ struct Ruleset {
|
|||
|
||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
enum PluralCat {
|
||||
Zero,
|
||||
One,
|
||||
Two,
|
||||
Few,
|
||||
|
|
@ -260,17 +257,15 @@ enum PluralCat {
|
|||
Other,
|
||||
}
|
||||
|
||||
impl TryFrom<PluralCategory> for PluralCat {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: PluralCategory) -> Result<Self, Self::Error> {
|
||||
impl From<PluralCategory> for PluralCat {
|
||||
fn from(value: PluralCategory) -> Self {
|
||||
match value {
|
||||
PluralCategory::ZERO => Err(anyhow!("zero is not supported")),
|
||||
PluralCategory::ONE => Ok(Self::One),
|
||||
PluralCategory::TWO => Ok(Self::Two),
|
||||
PluralCategory::FEW => Ok(Self::Few),
|
||||
PluralCategory::MANY => Ok(Self::Many),
|
||||
PluralCategory::OTHER => Ok(Self::Other),
|
||||
PluralCategory::ZERO => Self::Zero,
|
||||
PluralCategory::ONE => Self::One,
|
||||
PluralCategory::TWO => Self::Two,
|
||||
PluralCategory::FEW => Self::Few,
|
||||
PluralCategory::MANY => Self::Many,
|
||||
PluralCategory::OTHER => Self::Other,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -308,11 +303,15 @@ static PLURAL_RULES: Lazy<BTreeMap<String, HashSet<PluralCat>>> = Lazy::new(|| {
|
|||
.collect::<BTreeMap<_, _>>()
|
||||
});
|
||||
|
||||
type TimeagoTable = BTreeMap<Language, BTreeMap<TimeUnit, TimeagoTableEntry>>;
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct TimeagoTable {
|
||||
entries: BTreeMap<Language, BTreeMap<TimeUnit, TimeagoTableEntry>>,
|
||||
errors: BTreeMap<Language, HashSet<String>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct TimeagoTableEntry {
|
||||
cases: BTreeMap<String, TimeAgo>,
|
||||
cases: BTreeMap<String, u32>,
|
||||
missing_plurals: HashSet<PluralCat>,
|
||||
}
|
||||
|
||||
|
|
@ -327,85 +326,153 @@ const TIME_UNITS: [TimeUnit; 7] = [
|
|||
];
|
||||
|
||||
fn new_timeago_table() -> TimeagoTable {
|
||||
LANGUAGES
|
||||
.iter()
|
||||
.filter_map(|lang| {
|
||||
// Check if language is redundant
|
||||
match lang {
|
||||
Language::EnGb
|
||||
| Language::EnIn
|
||||
| Language::FrCa
|
||||
| Language::EsUs
|
||||
| Language::Es419 => None,
|
||||
_ => {
|
||||
let cldr_lang_str = match lang {
|
||||
Language::SrLatn => "sr".to_owned(),
|
||||
Language::ZhCn | Language::ZhHk | Language::ZhTw => "zh".to_owned(),
|
||||
_ => lang.to_string(),
|
||||
};
|
||||
TimeagoTable {
|
||||
entries: LANGUAGES
|
||||
.iter()
|
||||
.filter_map(|lang| {
|
||||
// Check if language is redundant
|
||||
match lang {
|
||||
Language::EnGb
|
||||
| Language::EnIn
|
||||
| Language::FrCa
|
||||
| Language::EsUs
|
||||
| Language::Es419 => None,
|
||||
_ => {
|
||||
let cldr_lang_str = match lang {
|
||||
Language::SrLatn => "sr".to_owned(),
|
||||
Language::ZhCn | Language::ZhHk | Language::ZhTw => "zh".to_owned(),
|
||||
_ => lang.to_string(),
|
||||
};
|
||||
|
||||
let m = TIME_UNITS
|
||||
.iter()
|
||||
.map(|t| {
|
||||
let missing_plurals = if t == &TimeUnit::Week {
|
||||
// Week only has 3 valid values (1-3)
|
||||
let mut mp = HashSet::new();
|
||||
let m = TIME_UNITS
|
||||
.iter()
|
||||
.map(|t| {
|
||||
let missing_plurals = if t == &TimeUnit::Week {
|
||||
// Week only has 3 valid values (2-4)
|
||||
let mut mp = HashSet::new();
|
||||
|
||||
let l_id = cldr_lang_str.parse::<LanguageIdentifier>().unwrap();
|
||||
let pr =
|
||||
PluralRules::create(l_id, PluralRuleType::CARDINAL).unwrap();
|
||||
let l_id = cldr_lang_str.parse::<LanguageIdentifier>().unwrap();
|
||||
let pr = PluralRules::create(l_id, PluralRuleType::CARDINAL)
|
||||
.unwrap();
|
||||
|
||||
mp.insert(PluralCat::try_from(pr.select(1).unwrap()).unwrap());
|
||||
mp.insert(PluralCat::try_from(pr.select(2).unwrap()).unwrap());
|
||||
mp.insert(PluralCat::try_from(pr.select(3).unwrap()).unwrap());
|
||||
mp.insert(PluralCat::from(pr.select(2).unwrap()));
|
||||
mp.insert(PluralCat::from(pr.select(3).unwrap()));
|
||||
mp.insert(PluralCat::from(pr.select(4).unwrap()));
|
||||
|
||||
mp
|
||||
} else {
|
||||
PLURAL_RULES.get(&cldr_lang_str).unwrap().clone()
|
||||
};
|
||||
mp
|
||||
} else {
|
||||
PLURAL_RULES.get(&cldr_lang_str).unwrap().clone()
|
||||
};
|
||||
|
||||
(
|
||||
t.to_owned(),
|
||||
TimeagoTableEntry {
|
||||
cases: BTreeMap::new(),
|
||||
missing_plurals,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
(
|
||||
t.to_owned(),
|
||||
TimeagoTableEntry {
|
||||
cases: BTreeMap::new(),
|
||||
missing_plurals,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some((lang.to_owned(), m))
|
||||
Some((lang.to_owned(), m))
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect(),
|
||||
errors: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn t_new_timeago_table() {
|
||||
fn read_timeago_table() -> TimeagoTable {
|
||||
let json_path = Path::new("testfiles/date/timeago_table.json").to_path_buf();
|
||||
if json_path.exists() {
|
||||
return;
|
||||
let file = File::open(json_path).unwrap();
|
||||
serde_json::from_reader(BufReader::new(file)).unwrap()
|
||||
} else {
|
||||
new_timeago_table()
|
||||
}
|
||||
|
||||
let file = File::create(json_path).unwrap();
|
||||
serde_json::to_writer_pretty(file, &new_timeago_table()).unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn t_tmp() {
|
||||
let rp = RustyTube::new();
|
||||
let (top, latest) = get_comment_initial_ctoken(&rp, "gQlMMD8auMs").await;
|
||||
// let (top, latest) = get_comment_initial_ctoken(&rp, "9bZkp7q19f0").await;
|
||||
let mut ctoken = latest;
|
||||
fn write_timeago_table(timeago_table: &TimeagoTable) {
|
||||
let json_path = Path::new("testfiles/date/timeago_table.json").to_path_buf();
|
||||
let file = File::create(json_path).unwrap();
|
||||
serde_json::to_writer_pretty(file, timeago_table).unwrap();
|
||||
}
|
||||
|
||||
fn insert_timeago_table(
|
||||
timeago_table: &mut TimeagoTable,
|
||||
lang: &Language,
|
||||
date_str: &str,
|
||||
limit: Option<TimeUnit>,
|
||||
ignore_1s: bool,
|
||||
) -> bool {
|
||||
let pattern = TimeagoPattern::from(lang.to_owned());
|
||||
match pattern.parse(date_str) {
|
||||
Some(timeago) => {
|
||||
let entry = timeago_table
|
||||
.entries
|
||||
.get_mut(lang)
|
||||
.unwrap()
|
||||
.get_mut(&timeago.unit)
|
||||
.unwrap();
|
||||
|
||||
let cldr_lang_str = &lang.to_string()[0..2];
|
||||
let l_id: LanguageIdentifier = cldr_lang_str.parse().unwrap();
|
||||
let pl_pat = PluralRules::create(l_id, PluralRuleType::CARDINAL).unwrap();
|
||||
let pl = PluralCat::from(pl_pat.select(timeago.n).unwrap());
|
||||
|
||||
// Collect the case if its plural type is missing
|
||||
if entry.missing_plurals.remove(&pl) {
|
||||
entry.cases.insert(date_str.to_owned(), timeago.n);
|
||||
info!(
|
||||
"Collected `{}` ({} {:?})",
|
||||
date_str, timeago.n, timeago.unit
|
||||
);
|
||||
}
|
||||
|
||||
timeago_table
|
||||
.entries
|
||||
.get(lang)
|
||||
.unwrap()
|
||||
.iter()
|
||||
.all(|(t, entry)| {
|
||||
(limit.is_some() && t > &limit.unwrap())
|
||||
|| entry.missing_plurals.is_empty()
|
||||
|| (ignore_1s
|
||||
&& t == &TimeUnit::Second
|
||||
&& entry.missing_plurals.len() == 1
|
||||
&& entry.missing_plurals.contains(&PluralCat::One))
|
||||
})
|
||||
}
|
||||
None => {
|
||||
error!("Could not parse `{}`", date_str);
|
||||
let errors = timeago_table
|
||||
.errors
|
||||
.entry(*lang)
|
||||
.or_insert_with(|| HashSet::new());
|
||||
errors.insert(date_str.to_owned());
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn insert_timeago_table_datestrings(
|
||||
rp: &RustyTube,
|
||||
timeago_table: &mut TimeagoTable,
|
||||
video_id: &str,
|
||||
latest: bool,
|
||||
limit: Option<TimeUnit>,
|
||||
ignore_1s: bool,
|
||||
) {
|
||||
let mut ctoken = get_comment_initial_ctoken(&rp, video_id, latest).await;
|
||||
let brace_pattern = Regex::new(r"\(.+\)").unwrap();
|
||||
let lang = &rp.localization.language;
|
||||
let err_baseline = timeago_table.errors.len();
|
||||
|
||||
for _ in 0..100 {
|
||||
for _ in 0..40 {
|
||||
let (strings, new_ctoken) = get_comment_datestrings(&rp, &ctoken).await;
|
||||
|
||||
/*
|
||||
strings
|
||||
let res = strings
|
||||
.iter()
|
||||
.map(|s| {
|
||||
// Remove zero-width space characters
|
||||
|
|
@ -417,14 +484,186 @@ async fn t_tmp() {
|
|||
let s = s.trim();
|
||||
s.to_owned()
|
||||
})
|
||||
.for_each(|s| println!("{}", s));
|
||||
*/
|
||||
println!("n: {}", strings.len());
|
||||
.find(|s| insert_timeago_table(timeago_table, lang, &s, limit, ignore_1s));
|
||||
|
||||
if res.is_some() {
|
||||
break;
|
||||
}
|
||||
|
||||
if timeago_table.errors.len() > err_baseline {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(new_ctoken) = new_ctoken {
|
||||
ctoken = new_ctoken.to_owned();
|
||||
} else {
|
||||
error!("end of comments");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn insert_timeago_table_datestrings_channel(
|
||||
rp: &RustyTube,
|
||||
timeago_table: &mut TimeagoTable,
|
||||
channel_id: &str,
|
||||
) {
|
||||
let lang = &rp.localization.language;
|
||||
|
||||
let strings = get_channel_datestrings(rp, channel_id).await;
|
||||
|
||||
strings
|
||||
.iter()
|
||||
.map(|s| {
|
||||
// Remove zero-width space characters
|
||||
let s = s.replace('\u{200b}', "");
|
||||
|
||||
let s = s.trim();
|
||||
s.to_owned()
|
||||
})
|
||||
.for_each(|s| {
|
||||
insert_timeago_table(timeago_table, lang, &s, None, false);
|
||||
});
|
||||
}
|
||||
|
||||
#[test_log::test(tokio::test)]
|
||||
async fn t_build_timeago_table() {
|
||||
let mut timeago_table = read_timeago_table();
|
||||
let ignore_1s = false;
|
||||
let langs = timeago_table
|
||||
.entries
|
||||
.keys()
|
||||
.map(|k| k.to_owned())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for lang in langs {
|
||||
if timeago_table
|
||||
.entries
|
||||
.get(&lang)
|
||||
.unwrap()
|
||||
.iter()
|
||||
.all(|(t, entry)| {
|
||||
entry.missing_plurals.is_empty()
|
||||
|| (ignore_1s
|
||||
&& t == &TimeUnit::Second
|
||||
&& entry.missing_plurals.len() == 1
|
||||
&& entry.missing_plurals.contains(&PluralCat::One))
|
||||
})
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let rp = RustyTube::new_with_ua(lang, Country::Us, None);
|
||||
|
||||
println!("{}: 1s!", lang);
|
||||
{
|
||||
let ctoken = get_comment_initial_ctoken(&rp, "gQlMMD8auMs", true).await;
|
||||
// let ctoken = get_comment_initial_ctoken(&rp, "k6jqx9kZgPM", true).await;
|
||||
let brace_pattern = Regex::new(r"\(.+\)").unwrap();
|
||||
let lang = &rp.localization.language;
|
||||
let err_baseline = timeago_table.errors.len();
|
||||
|
||||
loop {
|
||||
let (strings, _) = get_comment_datestrings(&rp, &ctoken).await;
|
||||
println!("{}", strings[0]);
|
||||
|
||||
let res = strings
|
||||
.iter()
|
||||
.map(|s| {
|
||||
// Remove zero-width space characters
|
||||
let s = s.replace('\u{200b}', "");
|
||||
|
||||
// Remove braces
|
||||
let s = brace_pattern.replace(&s, "");
|
||||
|
||||
let s = s.trim();
|
||||
s.to_owned()
|
||||
})
|
||||
.find(|s| {
|
||||
insert_timeago_table(
|
||||
&mut timeago_table,
|
||||
lang,
|
||||
&s,
|
||||
Some(TimeUnit::Second),
|
||||
ignore_1s,
|
||||
)
|
||||
});
|
||||
|
||||
if res.is_some() {
|
||||
break;
|
||||
}
|
||||
|
||||
if timeago_table.errors.len() > err_baseline {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("{}: 2s - n min", lang);
|
||||
insert_timeago_table_datestrings(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"gQlMMD8auMs",
|
||||
true,
|
||||
Some(TimeUnit::Minute),
|
||||
ignore_1s,
|
||||
)
|
||||
.await;
|
||||
println!("{}: x hr", lang);
|
||||
insert_timeago_table_datestrings(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"TohrPm3ICJE",
|
||||
true,
|
||||
Some(TimeUnit::Hour),
|
||||
ignore_1s,
|
||||
)
|
||||
.await;
|
||||
println!("{}: 1 hr - n day", lang);
|
||||
insert_timeago_table_datestrings(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"J9NQFACZYEU",
|
||||
true,
|
||||
Some(TimeUnit::Day),
|
||||
ignore_1s,
|
||||
)
|
||||
.await;
|
||||
println!("{}: week", lang);
|
||||
insert_timeago_table_datestrings(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"-zPDx6HQ_9w",
|
||||
true,
|
||||
Some(TimeUnit::Week),
|
||||
ignore_1s,
|
||||
)
|
||||
.await;
|
||||
|
||||
println!("{}: 1 yr - n yr", lang);
|
||||
insert_timeago_table_datestrings_channel(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"UCEOXxzW2vU0P-0THehuIIeg",
|
||||
)
|
||||
.await;
|
||||
|
||||
println!("{}: 11 mon", lang);
|
||||
insert_timeago_table_datestrings_channel(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"UCY1kMZp36IQSyNx_9h4mpCg",
|
||||
)
|
||||
.await;
|
||||
|
||||
println!("{}: 13 yr", lang);
|
||||
insert_timeago_table_datestrings_channel(
|
||||
&rp,
|
||||
&mut timeago_table,
|
||||
"UCfw6qEAJMDbmgqQbuoB5moA",
|
||||
)
|
||||
.await;
|
||||
|
||||
write_timeago_table(&timeago_table);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Reference in a new issue