fix: add dictionary support for short timeago strings
This commit is contained in:
parent
cc2cadc309
commit
0cd018e37a
10 changed files with 6308 additions and 1694 deletions
83
codegen/src/collect_video_dates.rs
Normal file
83
codegen/src/collect_video_dates.rs
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
use std::{
|
||||
collections::{BTreeMap, HashSet},
|
||||
fs::File,
|
||||
};
|
||||
|
||||
use futures::{stream, StreamExt};
|
||||
use path_macro::path;
|
||||
use rustypipe::{
|
||||
client::{RustyPipe, RustyPipeQuery},
|
||||
param::{Language, LANGUAGES},
|
||||
};
|
||||
|
||||
use crate::util::DICT_DIR;
|
||||
|
||||
pub async fn collect_video_dates(concurrency: usize) {
|
||||
let json_path = path!(*DICT_DIR / "timeago_samples_short.json");
|
||||
let rp = RustyPipe::builder()
|
||||
.visitor_data("Cgtwel9tMkh2eHh0USiyzc6jBg%3D%3D")
|
||||
.build();
|
||||
|
||||
let channels = [
|
||||
"UCeY0bbntWzzVIaj2z3QigXg",
|
||||
"UCcmpeVbSSQlZRvHfdC-CRwg",
|
||||
"UC65afEgL62PGFWXY7n6CUbA",
|
||||
"UCEOXxzW2vU0P-0THehuIIeg",
|
||||
];
|
||||
|
||||
let mut lang_strings: BTreeMap<Language, Vec<String>> = BTreeMap::new();
|
||||
for lang in LANGUAGES {
|
||||
println!("{lang}");
|
||||
let query = rp.query().lang(lang);
|
||||
let strings = stream::iter(channels)
|
||||
.map(|id| get_channel_datestrings(&query, id))
|
||||
.buffered(concurrency)
|
||||
.collect::<Vec<_>>()
|
||||
.await
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect::<Vec<_>>();
|
||||
lang_strings.insert(lang, strings);
|
||||
}
|
||||
|
||||
let mut en_strings_uniq: HashSet<&str> = HashSet::new();
|
||||
let mut uniq_ids: HashSet<usize> = HashSet::new();
|
||||
|
||||
lang_strings[&Language::En]
|
||||
.iter()
|
||||
.enumerate()
|
||||
.for_each(|(n, s)| {
|
||||
if en_strings_uniq.insert(s) {
|
||||
uniq_ids.insert(n);
|
||||
}
|
||||
});
|
||||
|
||||
let strings_map = lang_strings
|
||||
.iter()
|
||||
.map(|(lang, strings)| {
|
||||
(
|
||||
lang,
|
||||
strings
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(n, _)| uniq_ids.contains(n))
|
||||
.map(|(_, s)| s)
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
})
|
||||
.collect::<BTreeMap<_, _>>();
|
||||
|
||||
let file = File::create(json_path).unwrap();
|
||||
serde_json::to_writer_pretty(file, &strings_map).unwrap();
|
||||
}
|
||||
|
||||
async fn get_channel_datestrings(rp: &RustyPipeQuery, id: &str) -> Vec<String> {
|
||||
let channel = rp.channel_videos(id).await.unwrap();
|
||||
|
||||
channel
|
||||
.content
|
||||
.items
|
||||
.into_iter()
|
||||
.filter_map(|itm| itm.publish_date_txt)
|
||||
.collect()
|
||||
}
|
||||
|
|
@ -4,6 +4,7 @@ mod abtest;
|
|||
mod collect_album_types;
|
||||
mod collect_large_numbers;
|
||||
mod collect_playlist_dates;
|
||||
mod collect_video_dates;
|
||||
mod collect_video_durations;
|
||||
mod download_testfiles;
|
||||
mod gen_dictionary;
|
||||
|
|
@ -27,6 +28,7 @@ enum Commands {
|
|||
CollectLargeNumbers,
|
||||
CollectAlbumTypes,
|
||||
CollectVideoDurations,
|
||||
CollectVideoDates,
|
||||
ParsePlaylistDates,
|
||||
ParseLargeNumbers,
|
||||
ParseAlbumTypes,
|
||||
|
|
@ -60,6 +62,9 @@ async fn main() {
|
|||
Commands::CollectVideoDurations => {
|
||||
collect_video_durations::collect_video_durations(cli.concurrency).await;
|
||||
}
|
||||
Commands::CollectVideoDates => {
|
||||
collect_video_dates::collect_video_dates(cli.concurrency).await;
|
||||
}
|
||||
Commands::ParsePlaylistDates => collect_playlist_dates::write_samples_to_dict(),
|
||||
Commands::ParseLargeNumbers => collect_large_numbers::write_samples_to_dict(),
|
||||
Commands::ParseAlbumTypes => collect_album_types::write_samples_to_dict(),
|
||||
|
|
|
|||
Reference in a new issue