feat: add number_tokens for parsing large nums to dictionary
This commit is contained in:
parent
67ae1eb21d
commit
5d19259a14
21 changed files with 5219 additions and 38 deletions
358
codegen/src/collect_large_numbers.rs
Normal file
358
codegen/src/collect_large_numbers.rs
Normal file
|
|
@ -0,0 +1,358 @@
|
|||
use std::collections::HashMap;
|
||||
use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use fancy_regex::Regex;
|
||||
use futures::{stream, StreamExt};
|
||||
use once_cell::sync::Lazy;
|
||||
use reqwest::{header, Client};
|
||||
use rustypipe::model::{locale::LANGUAGES, Language};
|
||||
use serde::Deserialize;
|
||||
use serde_with::serde_as;
|
||||
use serde_with::VecSkipError;
|
||||
|
||||
use crate::util::{self, Text};
|
||||
|
||||
type CollectedNumbers = BTreeMap<Language, BTreeMap<u8, (String, u64)>>;
|
||||
|
||||
/// Collect video view count texts in every supported language
|
||||
/// and write them to `testfiles/dict/large_number_samples.json`.
|
||||
///
|
||||
/// YouTube's API outputs the subscriber count of a channel only in a
|
||||
/// approximated format (e.g *880K subscribers*), which varies
|
||||
/// by language.
|
||||
///
|
||||
/// To parse these numbers correctly we need to collect textual numbers
|
||||
/// of different orders of magnitude in every language. This script extracts
|
||||
/// the view count texts from the most popular videos of different channels.
|
||||
///
|
||||
/// We extract these instead of subscriber counts because the YouTube API
|
||||
/// outputs view counts both in approximated and exact format, so we can use
|
||||
/// the exact counts to figure out the tokens.
|
||||
pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
|
||||
let mut json_path = project_root.to_path_buf();
|
||||
json_path.push("testfiles/dict/large_number_samples.json");
|
||||
|
||||
let channels = [
|
||||
"UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
|
||||
"UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
|
||||
"UC6mIxFTvXkWQVEHPsEdflzQ", // 10e6 (1.7M)
|
||||
"UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
|
||||
"UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
|
||||
"UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
|
||||
"UCGiJh0NZ52wRhYKYnuZI08Q", // 10e1 (37)
|
||||
];
|
||||
|
||||
let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
|
||||
.map(|lang| async move {
|
||||
let mut entry = BTreeMap::new();
|
||||
|
||||
for (n, ch_id) in channels.iter().enumerate() {
|
||||
let channel = get_channel(ch_id, lang)
|
||||
.await
|
||||
.context(format!("{}-{}", lang, n))
|
||||
.unwrap();
|
||||
|
||||
channel.view_counts.iter().for_each(|(num, txt)| {
|
||||
entry.insert(get_mag(*num), (txt.to_owned(), *num));
|
||||
});
|
||||
|
||||
println!("collected {}-{}", lang, n);
|
||||
}
|
||||
|
||||
(lang, entry)
|
||||
})
|
||||
.buffer_unordered(concurrency)
|
||||
.collect()
|
||||
.await;
|
||||
|
||||
let file = File::create(json_path).unwrap();
|
||||
serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
|
||||
}
|
||||
|
||||
/// Attempt to parse the numbers collected by `collect-large-numbers`
|
||||
/// and write the results to `dictionary.json`.
|
||||
pub fn write_samples_to_dict(project_root: &Path) {
|
||||
let mut json_path = project_root.to_path_buf();
|
||||
json_path.push("testfiles/dict/large_number_samples.json");
|
||||
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
let collected_nums: CollectedNumbers =
|
||||
serde_json::from_reader(BufReader::new(json_file)).unwrap();
|
||||
let mut dict = util::read_dict(project_root);
|
||||
let langs = dict.keys().map(|k| k.to_owned()).collect::<Vec<_>>();
|
||||
|
||||
static POINT_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\d(\.|,)\d{1,3}(?:\D|$)").unwrap());
|
||||
|
||||
for lang in langs {
|
||||
let dict_entry = dict.entry(lang).or_default();
|
||||
|
||||
let mut e_langs = dict_entry.equivalent.clone();
|
||||
e_langs.push(lang);
|
||||
|
||||
let comma_decimal = collected_nums
|
||||
.get(&lang)
|
||||
.unwrap()
|
||||
.iter()
|
||||
.find_map(|(mag, (txt, _))| {
|
||||
let point = POINT_REGEX
|
||||
.captures(txt)
|
||||
.unwrap()
|
||||
.map(|c| c.get(1).unwrap().as_str());
|
||||
|
||||
if let Some(point) = point {
|
||||
let num_all = util::parse_numeric::<u64>(txt).unwrap();
|
||||
// If the number parsed from all digits has the same order of
|
||||
// magnitude as the actual number, it must be a separator.
|
||||
// Otherwise it is a decimal point
|
||||
return Some((get_mag(num_all) == *mag) ^ (point == ","));
|
||||
}
|
||||
None
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let decimal_point = match comma_decimal {
|
||||
true => ",",
|
||||
false => ".",
|
||||
};
|
||||
|
||||
// Search for tokens
|
||||
|
||||
// This map holds all the tokens we encounter while parsing the language
|
||||
// If a new token is found, it is stored in this map with the derived order of
|
||||
// magnitude.
|
||||
// If the token is found again with a different derived order of magnitude,
|
||||
// its value in the map is set to None.
|
||||
let mut found_tokens: HashMap<String, Option<u8>> = HashMap::new();
|
||||
|
||||
let mut insert_token = |token: String, mag: u8| {
|
||||
let found_token = found_tokens.entry(token).or_insert(match mag {
|
||||
0 => None,
|
||||
x => Some(x),
|
||||
});
|
||||
|
||||
if let Some(f) = found_token {
|
||||
if *f != mag {
|
||||
*found_token = None;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for lang in e_langs {
|
||||
let entry = collected_nums.get(&lang).unwrap();
|
||||
|
||||
entry.iter().for_each(|(mag, (txt, _))| {
|
||||
let filtered = util::filter_largenumstr(txt);
|
||||
|
||||
let tokens: Vec<String> = match dict_entry.by_char {
|
||||
true => filtered.chars().map(|c| c.to_string()).collect(),
|
||||
false => filtered.split_whitespace().map(|c| c.to_string()).collect(),
|
||||
};
|
||||
|
||||
let num_before_point =
|
||||
util::parse_numeric::<u64>(txt.split(decimal_point).next().unwrap()).unwrap();
|
||||
let mag_before_point = get_mag(num_before_point);
|
||||
let mut mag_remaining = mag - mag_before_point;
|
||||
|
||||
tokens.iter().for_each(|t| {
|
||||
// These tokens are correct in all languages
|
||||
// and are used to parse combined prefixes like `1.1K crore` (en-IN)
|
||||
let known_tmag: u8 = if t.len() == 1 {
|
||||
match t.as_str() {
|
||||
"K" | "k" => 3,
|
||||
"M" => 6,
|
||||
// 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
|
||||
_ => 0,
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// K/M/B
|
||||
if known_tmag > 0 {
|
||||
mag_remaining = mag_remaining
|
||||
.checked_sub(known_tmag)
|
||||
.expect("known magnitude incorrect");
|
||||
} else {
|
||||
insert_token(t.to_owned(), mag_remaining);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Insert collected data into dictionary
|
||||
dict_entry.number_tokens = found_tokens
|
||||
.into_iter()
|
||||
.filter_map(|(k, v)| v.map(|v| (k, v)))
|
||||
.collect();
|
||||
dict_entry.comma_decimal = comma_decimal;
|
||||
}
|
||||
|
||||
util::write_dict(project_root, &dict);
|
||||
}
|
||||
|
||||
/// Return the order of magnitude of `n`, i.e. `floor(log10(n))`.
///
/// `0` has no order of magnitude and is mapped to `0`.
///
/// The result is computed with integer arithmetic, because a `u64` above
/// 2^53 is not exactly representable as `f64` and may be rounded up to the
/// next power of ten.
fn get_mag(n: u64) -> u8 {
    let mut mag = 0;
    let mut rest = n;

    while rest >= 10 {
        rest /= 10;
        mag += 1;
    }
    mag
}
|
||||
|
||||
/*
|
||||
YouTube channel videos response
|
||||
*/
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct Channel {
|
||||
contents: Contents,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct Contents {
|
||||
two_column_browse_results_renderer: TabsRenderer,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct TabsRenderer {
|
||||
#[serde_as(as = "VecSkipError<_>")]
|
||||
tabs: Vec<TabRendererWrap>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct TabRendererWrap {
|
||||
tab_renderer: TabRenderer,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct TabRenderer {
|
||||
content: SectionListRendererWrap,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct SectionListRendererWrap {
|
||||
section_list_renderer: SectionListRenderer,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct SectionListRenderer {
|
||||
contents: Vec<ItemSectionRendererWrap>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ItemSectionRendererWrap {
|
||||
item_section_renderer: ItemSectionRenderer,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ItemSectionRenderer {
|
||||
contents: Vec<GridRendererWrap>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct GridRendererWrap {
|
||||
grid_renderer: GridRenderer,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct GridRenderer {
|
||||
#[serde_as(as = "VecSkipError<_>")]
|
||||
items: Vec<VideoListItem>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct VideoListItem {
|
||||
grid_video_renderer: GridVideoRenderer,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct GridVideoRenderer {
|
||||
/// `24,194 views`
|
||||
view_count_text: Text,
|
||||
/// `19K views`
|
||||
short_view_count_text: Text,
|
||||
}
|
||||
|
||||
/// View counts extracted from a channel response.
#[derive(Debug, Clone)]
struct ChannelData {
    /// Pairs of the exact view count and the approximated view count text
    /// of each video.
    view_counts: Vec<(u64, String)>,
}
|
||||
|
||||
async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
|
||||
let client = Client::new();
|
||||
|
||||
let body = format!(
|
||||
"{}{}{}{}{}",
|
||||
r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":""##,
|
||||
lang,
|
||||
r##"","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}},"params":"EgZ2aWRlb3MYASAAMAE%3D","browseId":""##,
|
||||
channel_id,
|
||||
"\"}"
|
||||
);
|
||||
|
||||
let resp = client
|
||||
.post("https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.body(body)
|
||||
.send().await?
|
||||
.error_for_status()?;
|
||||
|
||||
let channel = resp.json::<Channel>().await?;
|
||||
|
||||
Ok(ChannelData {
|
||||
view_counts: channel
|
||||
.contents
|
||||
.two_column_browse_results_renderer
|
||||
.tabs
|
||||
.get(0)
|
||||
.map(|tab| {
|
||||
tab.tab_renderer.content.section_list_renderer.contents[0]
|
||||
.item_section_renderer
|
||||
.contents[0]
|
||||
.grid_renderer
|
||||
.items
|
||||
.iter()
|
||||
.map(|itm| {
|
||||
(
|
||||
util::parse_numeric(
|
||||
&itm.grid_video_renderer.view_count_text.simple_text,
|
||||
)
|
||||
.unwrap(),
|
||||
itm.grid_video_renderer
|
||||
.short_view_count_text
|
||||
.simple_text
|
||||
.to_owned(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test() {
|
||||
let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
dbg!(channel);
|
||||
}
|
||||
|
||||
/// Run [`write_samples_to_dict`] against the dictionary of this repository.
///
/// The project root is derived from the crate directory instead of an
/// absolute path, so the test does not depend on the machine it runs on.
#[test]
fn test2() {
    // NOTE(review): assumes the codegen crate sits directly below the
    // project root (`<root>/codegen`) - confirm against the repository layout.
    let crate_dir = std::env::var_os("CARGO_MANIFEST_DIR")
        .expect("CARGO_MANIFEST_DIR is not set, run the test with cargo");
    let project_root = Path::new(&crate_dir)
        .parent()
        .expect("crate directory has no parent directory");

    write_samples_to_dict(project_root);
}
|
||||
|
|
@ -38,7 +38,7 @@ enum DateCase {
|
|||
}
|
||||
|
||||
/// Collect 'Playlist updated' dates in every supported language
|
||||
/// and write them to `testfiles/date/playlist_samples.json`.
|
||||
/// and write them to `testfiles/dict/playlist_samples.json`.
|
||||
///
|
||||
/// YouTube's API outputs the update date of playlists only in a
|
||||
/// textual format (e.g. *Last updated on Jan 3, 2020*), which varies
|
||||
|
|
@ -55,13 +55,15 @@ enum DateCase {
|
|||
/// - one playlist updated yesterday
|
||||
/// - one playlist updated 2-7 days ago
|
||||
/// - one playlist from every month. Note that there should not
|
||||
/// be any dates which include the same number twice (e.g. 01.01.2020).
|
||||
/// be any dates which include the same number twice (e.g. 01.01.2020).
|
||||
///
|
||||
/// **IMPORTANT:**
|
||||
///
|
||||
/// Because the relative dates change with time, the first three playlists
|
||||
/// should be checked and eventually changed before running the program.
|
||||
/// have to be checked and, if necessary, changed before running the program.
|
||||
pub async fn collect_dates(project_root: &Path, concurrency: usize) {
|
||||
let mut json_path = project_root.to_path_buf();
|
||||
json_path.push("testfiles/date/playlist_samples.json");
|
||||
json_path.push("testfiles/dict/playlist_samples.json");
|
||||
|
||||
// These are the sample playlists
|
||||
let cases = [
|
||||
|
|
@ -115,7 +117,7 @@ pub async fn collect_dates(project_root: &Path, concurrency: usize) {
|
|||
/// parsed automatically and require manual work.
|
||||
pub fn write_samples_to_dict(project_root: &Path) {
|
||||
let mut json_path = project_root.to_path_buf();
|
||||
json_path.push("testfiles/date/playlist_samples.json");
|
||||
json_path.push("testfiles/dict/playlist_samples.json");
|
||||
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
let collected_dates: CollectedDates =
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ async fn video_details(testfiles: &Path) {
|
|||
async fn comments_top(testfiles: &Path) {
|
||||
let mut json_path = testfiles.to_path_buf();
|
||||
json_path.push("video_details");
|
||||
json_path.push(format!("comments_top.json"));
|
||||
json_path.push("comments_top.json");
|
||||
if json_path.exists() {
|
||||
return;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,17 +34,47 @@ pub fn generate_dictionary(project_root: &Path) {
|
|||
let dict = util::read_dict(project_root);
|
||||
|
||||
let code_head = r#"// This file is automatically generated. DO NOT EDIT.
|
||||
// See codegen/gen_dictionary.rs for the generation code.
|
||||
use crate::{
|
||||
model::Language,
|
||||
timeago::{DateCmp, TaToken, TimeUnit},
|
||||
};
|
||||
|
||||
/// The dictionary contains the information required to parse dates and numbers
|
||||
/// in all supported languages.
|
||||
pub struct Entry {
|
||||
/// Should the language be parsed by character instead of by word?
|
||||
/// (e.g. Chinese/Japanese)
|
||||
pub by_char: bool,
|
||||
/// Tokens for parsing timeago strings.
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
///
|
||||
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
|
||||
/// `h`(our), `m`(inute), `s`(econd)
|
||||
pub timeago_tokens: phf::Map<&'static str, TaToken>,
|
||||
/// Order in which to parse numeric date components. Formatted as
|
||||
/// a string of date identifiers (Y, M, D).
|
||||
///
|
||||
/// Examples:
|
||||
///
|
||||
/// - 03.01.2020 => `"DMY"`
|
||||
/// - Jan 3, 2020 => `"DY"`
|
||||
pub date_order: &'static [DateCmp],
|
||||
/// Tokens for parsing month names.
|
||||
///
|
||||
/// Format: Parsed token -> Month number (starting from 1)
|
||||
pub months: phf::Map<&'static str, u8>,
|
||||
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
pub timeago_nd_tokens: phf::Map<&'static str, TaToken>,
|
||||
/// Are commas (instead of points) used as decimal separators?
|
||||
pub comma_decimal: bool,
|
||||
/// Tokens for parsing decimal prefixes (K, M, B, ...)
|
||||
///
|
||||
/// Format: Parsed token -> decimal power
|
||||
pub number_tokens: phf::Map<&'static str, u8>,
|
||||
}
|
||||
"#;
|
||||
|
||||
|
|
@ -100,12 +130,19 @@ pub fn entry(lang: Language) -> Entry {
|
|||
});
|
||||
date_order = date_order.trim_end_matches([' ', ',']).to_owned() + "]";
|
||||
|
||||
// Number tokens
|
||||
let mut number_tokens = phf_codegen::Map::<&str>::new();
|
||||
entry.number_tokens.iter().for_each(|(txt, mag)| {
|
||||
number_tokens.entry(txt, &mag.to_string());
|
||||
});
|
||||
|
||||
let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n ");
|
||||
let code_ta_nd_tokens = &ta_nd_tokens.build().to_string().replace('\n', "\n ");
|
||||
let code_months = &months.build().to_string().replace('\n', "\n ");
|
||||
let code_number_tokens = &number_tokens.build().to_string().replace('\n', "\n ");
|
||||
|
||||
let _ = write!(code_timeago_tokens, "{} => Entry {{\n by_char: {:?},\n timeago_tokens: {},\n date_order: {},\n months: {},\n timeago_nd_tokens: {},\n }},\n ",
|
||||
selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens);
|
||||
let _ = write!(code_timeago_tokens, "{} => Entry {{\n by_char: {:?},\n timeago_tokens: {},\n date_order: {},\n months: {},\n timeago_nd_tokens: {},\n comma_decimal: {:?},\n number_tokens: {},\n }},\n ",
|
||||
selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens, entry.comma_decimal, code_number_tokens);
|
||||
});
|
||||
|
||||
code_timeago_tokens = code_timeago_tokens.trim_end().to_owned() + "\n }\n}\n";
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ use serde::Deserialize;
|
|||
use serde_with::serde_as;
|
||||
use serde_with::VecSkipError;
|
||||
|
||||
use crate::util::Text;
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
|
|
@ -135,12 +137,6 @@ struct LanguageCountryCommand {
|
|||
hl: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct Text {
|
||||
simple_text: String,
|
||||
}
|
||||
|
||||
pub async fn generate_locales(project_root: &Path) {
|
||||
let (languages, countries) = get_locales().await;
|
||||
|
||||
|
|
@ -284,7 +280,7 @@ pub enum Country {
|
|||
async fn get_locales() -> (BTreeMap<String, String>, BTreeMap<String, String>) {
|
||||
let client = Client::new();
|
||||
let resp = client
|
||||
.post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8")
|
||||
.post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.body(
|
||||
r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":"en","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}}}"##
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
mod collect_large_numbers;
|
||||
mod collect_playlist_dates;
|
||||
mod download_testfiles;
|
||||
mod gen_dictionary;
|
||||
|
|
@ -21,7 +22,9 @@ struct Cli {
|
|||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
CollectPlaylistDates,
|
||||
WritePlaylistDates,
|
||||
CollectLargeNumbers,
|
||||
ParsePlaylistDates,
|
||||
ParseLargeNumbers,
|
||||
GenLocales,
|
||||
GenDict,
|
||||
DownloadTestfiles,
|
||||
|
|
@ -36,8 +39,14 @@ async fn main() {
|
|||
Commands::CollectPlaylistDates => {
|
||||
collect_playlist_dates::collect_dates(&cli.project_root, cli.concurrency).await;
|
||||
}
|
||||
Commands::WritePlaylistDates => {
|
||||
collect_playlist_dates::write_samples_to_dict(&cli.project_root);
|
||||
Commands::CollectLargeNumbers => {
|
||||
collect_large_numbers::collect_large_numbers(&cli.project_root, cli.concurrency).await;
|
||||
}
|
||||
Commands::ParsePlaylistDates => {
|
||||
collect_playlist_dates::write_samples_to_dict(&cli.project_root)
|
||||
}
|
||||
Commands::ParseLargeNumbers => {
|
||||
collect_large_numbers::write_samples_to_dict(&cli.project_root)
|
||||
}
|
||||
Commands::GenLocales => {
|
||||
gen_locales::generate_locales(&cli.project_root).await;
|
||||
|
|
|
|||
|
|
@ -3,19 +3,53 @@ use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path, str::FromS
|
|||
use rustypipe::model::Language;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
const DICT_PATH: &str = "testfiles/date/dictionary.json";
|
||||
const DICT_PATH: &str = "testfiles/dict/dictionary.json";
|
||||
|
||||
type Dictionary = BTreeMap<Language, DictEntry>;
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DictEntry {
|
||||
/// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
|
||||
pub equivalent: Vec<Language>,
|
||||
/// Should the language be parsed by character instead of by word?
|
||||
/// (e.g. Chinese/Japanese)
|
||||
pub by_char: bool,
|
||||
/// Tokens for parsing timeago strings.
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
///
|
||||
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
|
||||
/// `h`(our), `m`(inute), `s`(econd)
|
||||
pub timeago_tokens: BTreeMap<String, String>,
|
||||
/// Order in which to parse numeric date components. Formatted as
|
||||
/// a string of date identifiers (Y, M, D).
|
||||
///
|
||||
/// Examples:
|
||||
///
|
||||
/// - 03.01.2020 => `"DMY"`
|
||||
/// - Jan 3, 2020 => `"DY"`
|
||||
pub date_order: String,
|
||||
/// Tokens for parsing month names.
|
||||
///
|
||||
/// Format: Parsed token -> Month number (starting from 1)
|
||||
pub months: BTreeMap<String, u8>,
|
||||
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
pub timeago_nd_tokens: BTreeMap<String, String>,
|
||||
/// Are commas (instead of points) used as decimal separators?
|
||||
pub comma_decimal: bool,
|
||||
/// Tokens for parsing decimal prefixes (K, M, B, ...)
|
||||
///
|
||||
/// Format: Parsed token -> decimal power
|
||||
pub number_tokens: BTreeMap<String, u8>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Text {
|
||||
pub simple_text: String,
|
||||
}
|
||||
|
||||
pub fn read_dict(project_root: &Path) -> Dictionary {
|
||||
|
|
@ -48,6 +82,27 @@ pub fn filter_datestr(string: &str) -> String {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Strip everything that belongs to the numeric part of a large number string.
///
/// ASCII digits, points, commas and zero-width spaces (U+200B) are removed;
/// all other characters are kept in their original order.
pub fn filter_largenumstr(string: &str) -> String {
    let mut kept = String::new();

    for c in string.chars() {
        let is_numeric_part = c.is_ascii_digit() || c == '.' || c == ',' || c == '\u{200b}';
        if !is_numeric_part {
            kept.push(c);
        }
    }
    kept
}
|
||||
|
||||
/// Parse the ASCII digits of a string, ignoring every other character.
///
/// The digits are concatenated in order of appearance and handed to
/// `F::from_str`, whose result is returned unchanged.
pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
where
    F: FromStr,
{
    string
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>()
        .parse()
}
|
||||
|
||||
/// Parse all numbers occurring in a string and return them as a vec
|
||||
pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
|
||||
where
|
||||
|
|
|
|||
|
|
@ -59,6 +59,6 @@ Dec PL1J-6JOckZtHo91uApeb10Qlf2XhkfM-9 24.12.2021
|
|||
10e6: 1.7M UC6mIxFTvXkWQVEHPsEdflzQ
|
||||
10e5: 125K UCD0y51PJfvkZNe3y3FR5riw
|
||||
10e4: 27K UCNcN0dW43zE0Om3278fjY8A
|
||||
10e3: 5K UCNcN0dW43zE0Om3278fjY8A
|
||||
10e3: 5K UC0QEucPrn0-Ddi3JBTcs5Kw
|
||||
10e2: 388 UCllyEQfcoiPN68zHv6mGHDQ
|
||||
10e1: 37 UCNcN0dW43zE0Om3278fjY8A
|
||||
10e1: 37 UCGiJh0NZ52wRhYKYnuZI08Q
|
||||
|
|
|
|||
|
|
@ -72,7 +72,6 @@ impl MapResponse<ChannelVideos> for response::Channel {
|
|||
c: ChannelVideos {
|
||||
id: header.channel_id,
|
||||
name: header.title,
|
||||
subscriber_count_txt: header.subscriber_count_text,
|
||||
},
|
||||
warnings,
|
||||
})
|
||||
|
|
|
|||
|
|
@ -82,12 +82,14 @@ pub struct HeaderRenderer {
|
|||
pub channel_id: String,
|
||||
/// Channel name
|
||||
pub title: String,
|
||||
/// Approximate subscriber count (e.g. `880K subscribers`), depends on language
|
||||
#[serde_as(as = "Text")]
|
||||
pub subscriber_count_text: String,
|
||||
/// Approximate subscriber count (e.g. `880K subscribers`), depends on language.
|
||||
///
|
||||
/// `None` if the subscriber count is hidden.
|
||||
#[serde_as(as = "Option<Text>")]
|
||||
pub subscriber_count_text: Option<String>,
|
||||
pub avatar: Thumbnails,
|
||||
#[serde_as(as = "VecSkipError<_>")]
|
||||
pub badges: Vec<ChannelBadge>,
|
||||
#[serde_as(as = "Option<VecSkipError<_>>")]
|
||||
pub badges: Option<Vec<ChannelBadge>>,
|
||||
pub banner: Thumbnails,
|
||||
pub mobile_banner: Thumbnails,
|
||||
/// Fullscreen (16:9) channel banner
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ pub struct GridVideoRenderer {
|
|||
pub published_time_text: Option<String>,
|
||||
#[serde_as(as = "Option<Text>")]
|
||||
pub view_count_text: Option<String>,
|
||||
/// Contains video length
|
||||
#[serde_as(as = "VecSkipError<_>")]
|
||||
pub thumbnail_overlays: Vec<TimeOverlay>,
|
||||
}
|
||||
|
|
@ -397,6 +398,10 @@ pub trait IsLive {
|
|||
fn is_live(&self) -> bool;
|
||||
}
|
||||
|
||||
/// Check whether a video is marked as a YouTube Short.
pub trait IsShort {
    /// Returns `true` if the video is a short.
    fn is_short(&self) -> bool;
}
|
||||
|
||||
impl IsLive for Vec<VideoBadge> {
|
||||
fn is_live(&self) -> bool {
|
||||
self.iter().any(|badge| {
|
||||
|
|
@ -404,3 +409,19 @@ impl IsLive for Vec<VideoBadge> {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl IsLive for Vec<TimeOverlay> {
|
||||
fn is_live(&self) -> bool {
|
||||
self.iter().any(|overlay| {
|
||||
overlay.thumbnail_overlay_time_status_renderer.style == TimeOverlayStyle::Live
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl IsShort for Vec<TimeOverlay> {
|
||||
fn is_short(&self) -> bool {
|
||||
self.iter().any(|overlay| {
|
||||
overlay.thumbnail_overlay_time_status_renderer.style == TimeOverlayStyle::Shorts
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
1021
src/dictionary.rs
1021
src/dictionary.rs
File diff suppressed because it is too large
Load diff
|
|
@ -310,7 +310,7 @@ pub struct RecommendedVideo {
|
|||
pub publish_date_txt: Option<String>,
|
||||
/// View count
|
||||
///
|
||||
/// Is `None` if it could not be parsed
|
||||
/// `None` if it could not be extracted.
|
||||
pub view_count: Option<u64>,
|
||||
/// Is the video an active livestream?
|
||||
pub is_live: bool,
|
||||
|
|
@ -400,6 +400,43 @@ pub struct ChannelVideos {
|
|||
pub id: String,
|
||||
/// Channel name
|
||||
pub name: String,
|
||||
/// Textual subscriber count (e.g. `2.3M subscribers`), depends on language setting
|
||||
pub subscriber_count_txt: String,
|
||||
/*
|
||||
/// Channel subscriber count
|
||||
///
|
||||
/// `None` if the subscriber count was hidden by the owner
|
||||
/// or could not be parsed.
|
||||
pub subscriber_count: Option<u64>,
|
||||
pub videos: Paginator<ChannelVideo>,
|
||||
*/
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[non_exhaustive]
|
||||
pub struct ChannelVideo {
|
||||
/// Unique YouTube video ID
|
||||
pub id: String,
|
||||
/// Video title
|
||||
pub title: String,
|
||||
/// Video length in seconds.
|
||||
///
|
||||
/// Is `None` for livestreams.
|
||||
pub length: Option<u32>,
|
||||
/// Video thumbnail
|
||||
pub thumbnail: Vec<Thumbnail>,
|
||||
/// Video publishing date.
|
||||
///
|
||||
/// `None` if the date could not be parsed.
|
||||
pub publish_date: Option<DateTime<Local>>,
|
||||
/// Textual video publish date (e.g. `11 months ago`, depends on language)
|
||||
///
|
||||
/// Is `None` for livestreams.
|
||||
pub publish_date_txt: Option<String>,
|
||||
/// View count
|
||||
///
|
||||
/// `None` if it could not be extracted.
|
||||
pub view_count: Option<u64>,
|
||||
/// Is the video an active livestream?
|
||||
pub is_live: bool,
|
||||
/// Is the video a YouTube Short video (vertical and <60s)?
|
||||
pub is_short: bool,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ pub trait ToHtml {
|
|||
}
|
||||
|
||||
impl TextComponent {
|
||||
pub fn get_text<'a>(&'a self) -> &'a str {
|
||||
pub fn get_text(&self) -> &str {
|
||||
match self {
|
||||
TextComponent::Text(text) => text,
|
||||
TextComponent::Web { text, .. } => text,
|
||||
|
|
|
|||
|
|
@ -247,7 +247,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn t_testfile() {
|
||||
let json_path = Path::new("testfiles/date/timeago_samples.json");
|
||||
let json_path = Path::new("testfiles/dict/timeago_samples.json");
|
||||
|
||||
let expect = [
|
||||
TimeAgo {
|
||||
|
|
@ -430,7 +430,7 @@ mod tests {
|
|||
cases: BTreeMap<String, u8>,
|
||||
}
|
||||
|
||||
let json_path = Path::new("testfiles/date/timeago_table.json");
|
||||
let json_path = Path::new("testfiles/dict/timeago_table.json");
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
let timeago_table: TimeagoTable =
|
||||
serde_json::from_reader(BufReader::new(json_file)).unwrap();
|
||||
|
|
@ -477,7 +477,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn t_parse_date_samples() {
|
||||
let json_path = Path::new("testfiles/date/playlist_samples.json");
|
||||
let json_path = Path::new("testfiles/dict/playlist_samples.json");
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
let date_samples: BTreeMap<Language, BTreeMap<String, String>> =
|
||||
serde_json::from_reader(BufReader::new(json_file)).unwrap();
|
||||
|
|
|
|||
|
|
@ -35,6 +35,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"gister": "1D",
|
||||
"vandag": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"m": 6,
|
||||
"mjd": 9
|
||||
}
|
||||
},
|
||||
"am": {
|
||||
|
|
@ -74,6 +79,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ትላንት": "1D",
|
||||
"ዛሬ": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ሚ": 6,
|
||||
"ሺ": 3,
|
||||
"ቢ": 9
|
||||
}
|
||||
},
|
||||
"ar": {
|
||||
|
|
@ -110,6 +121,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"اليوم": "0D",
|
||||
"بالأمس": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ألف": 3,
|
||||
"مليار": 9,
|
||||
"مليون": 6
|
||||
}
|
||||
},
|
||||
"as": {
|
||||
|
|
@ -129,6 +146,15 @@
|
|||
"timeago_nd_tokens": {
|
||||
"আজি": "0D",
|
||||
"কালি": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"কোঃটা": 9,
|
||||
"নিঃটা": 6,
|
||||
"নিযুতটা": 6,
|
||||
"লাখটা": 5,
|
||||
"শঃ": 9,
|
||||
"হাজাৰটা": 3
|
||||
}
|
||||
},
|
||||
"az": {
|
||||
|
|
@ -161,6 +187,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"bugün": "0D",
|
||||
"dünən": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mln": 6,
|
||||
"mlrd": 9
|
||||
}
|
||||
},
|
||||
"be": {
|
||||
|
|
@ -210,6 +241,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"сёння": "0D",
|
||||
"ўчора": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"млн": 6,
|
||||
"млрд": 9,
|
||||
"тыс": 3
|
||||
}
|
||||
},
|
||||
"bg": {
|
||||
|
|
@ -236,6 +273,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"вчера": "1D",
|
||||
"днес": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"млн": 6,
|
||||
"млрд": 9,
|
||||
"хил": 3
|
||||
}
|
||||
},
|
||||
"bn": {
|
||||
|
|
@ -268,6 +311,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"আজ": "0D",
|
||||
"গতকাল": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"লাটি": 5,
|
||||
"শত": 9,
|
||||
"হাটি": 3
|
||||
}
|
||||
},
|
||||
"bs": {
|
||||
|
|
@ -312,6 +361,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"danas": "0D",
|
||||
"jučer": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"hilj": 3,
|
||||
"mil": 6,
|
||||
"mlr": 9
|
||||
}
|
||||
},
|
||||
"ca": {
|
||||
|
|
@ -351,6 +406,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ahir": "1D",
|
||||
"avui": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"m": 3,
|
||||
"mM": 9
|
||||
}
|
||||
},
|
||||
"cs": {
|
||||
|
|
@ -378,6 +438,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dnes": "0D",
|
||||
"včera": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 6,
|
||||
"mld": 9,
|
||||
"tis": 3
|
||||
}
|
||||
},
|
||||
"da": {
|
||||
|
|
@ -416,6 +482,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dag": "0D",
|
||||
"går": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mia": 9,
|
||||
"mio": 6
|
||||
}
|
||||
},
|
||||
"de": {
|
||||
|
|
@ -442,6 +513,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"gestern": "1D",
|
||||
"heute": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"Mio": 6,
|
||||
"Mrd": 9
|
||||
}
|
||||
},
|
||||
"el": {
|
||||
|
|
@ -481,6 +557,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"σήμερα": "0D",
|
||||
"χτες": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"δισ": 9,
|
||||
"εκ": 6,
|
||||
"χιλ": 3
|
||||
}
|
||||
},
|
||||
"en": {
|
||||
|
|
@ -524,11 +606,59 @@
|
|||
"timeago_nd_tokens": {
|
||||
"today": "0D",
|
||||
"yesterday": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"crore": 7,
|
||||
"lakh": 5
|
||||
}
|
||||
},
|
||||
"es": {
|
||||
"equivalent": [],
|
||||
"by_char": false,
|
||||
"timeago_tokens": {
|
||||
"año": "Y",
|
||||
"años": "Y",
|
||||
"día": "D",
|
||||
"días": "D",
|
||||
"hora": "h",
|
||||
"horas": "h",
|
||||
"mes": "M",
|
||||
"meses": "M",
|
||||
"minuto": "m",
|
||||
"minutos": "m",
|
||||
"segundo": "s",
|
||||
"segundos": "s",
|
||||
"semana": "W",
|
||||
"semanas": "W"
|
||||
},
|
||||
"date_order": "DY",
|
||||
"months": {
|
||||
"abr": 4,
|
||||
"ago": 8,
|
||||
"dic": 12,
|
||||
"ene": 1,
|
||||
"feb": 2,
|
||||
"jul": 7,
|
||||
"jun": 6,
|
||||
"mar": 3,
|
||||
"may": 5,
|
||||
"nov": 11,
|
||||
"oct": 10,
|
||||
"sept": 9
|
||||
},
|
||||
"timeago_nd_tokens": {
|
||||
"ayer": "1D",
|
||||
"hoy": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 9
|
||||
}
|
||||
},
|
||||
"es-US": {
|
||||
"equivalent": [
|
||||
"es-US",
|
||||
"es-419"
|
||||
],
|
||||
"by_char": false,
|
||||
|
|
@ -566,6 +696,10 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ayer": "1D",
|
||||
"hoy": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"mil": 9
|
||||
}
|
||||
},
|
||||
"et": {
|
||||
|
|
@ -607,6 +741,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"eile": "1D",
|
||||
"täna": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mld": 9,
|
||||
"mln": 6,
|
||||
"tuh": 3
|
||||
}
|
||||
},
|
||||
"eu": {
|
||||
|
|
@ -642,7 +782,9 @@
|
|||
"timeago_nd_tokens": {
|
||||
"atzo": "1D",
|
||||
"gaur": "0D"
|
||||
}
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {}
|
||||
},
|
||||
"fa": {
|
||||
"equivalent": [],
|
||||
|
|
@ -674,6 +816,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"امروز": "0D",
|
||||
"دیروز": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"میلیارد": 9,
|
||||
"میلیون": 6,
|
||||
"هزار": 3
|
||||
}
|
||||
},
|
||||
"fi": {
|
||||
|
|
@ -700,6 +848,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"eilen": "1D",
|
||||
"tänään": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"milj": 6,
|
||||
"mrd": 9,
|
||||
"t": 3
|
||||
}
|
||||
},
|
||||
"fil": {
|
||||
|
|
@ -732,6 +886,10 @@
|
|||
"timeago_nd_tokens": {
|
||||
"kahapon": "1D",
|
||||
"ngayong": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
}
|
||||
},
|
||||
"fr": {
|
||||
|
|
@ -773,6 +931,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"aujourd'hui": "0D",
|
||||
"hier": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"G": 9,
|
||||
"Md": 9
|
||||
}
|
||||
},
|
||||
"gl": {
|
||||
|
|
@ -812,7 +975,9 @@
|
|||
"timeago_nd_tokens": {
|
||||
"hoxe": "0D",
|
||||
"onte": "1D"
|
||||
}
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {}
|
||||
},
|
||||
"gu": {
|
||||
"equivalent": [],
|
||||
|
|
@ -844,6 +1009,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"આજે": "0D",
|
||||
"ગઈ": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"અબજ": 9,
|
||||
"કરોડ": 7,
|
||||
"લાખ": 5,
|
||||
"હજાર": 3
|
||||
}
|
||||
},
|
||||
"hi": {
|
||||
|
|
@ -876,6 +1048,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"आज": "0D",
|
||||
"कल": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"अ॰": 9,
|
||||
"क॰": 7,
|
||||
"लाख": 5,
|
||||
"हज़ार": 3
|
||||
}
|
||||
},
|
||||
"hr": {
|
||||
|
|
@ -920,6 +1099,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"danas": "0D",
|
||||
"jučer": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 6,
|
||||
"mlr": 9,
|
||||
"tis": 3
|
||||
}
|
||||
},
|
||||
"hu": {
|
||||
|
|
@ -959,6 +1144,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ma": "0D",
|
||||
"tegnap": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"E": 3,
|
||||
"Mrd": 9
|
||||
}
|
||||
},
|
||||
"hy": {
|
||||
|
|
@ -991,6 +1181,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"այսօր": "0D",
|
||||
"երեկ": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"հզր": 3,
|
||||
"մլն": 6,
|
||||
"մլրդ": 9
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
|
|
@ -1023,6 +1219,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ini": "0D",
|
||||
"kemarin": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"jt": 6,
|
||||
"rb": 3
|
||||
}
|
||||
},
|
||||
"is": {
|
||||
|
|
@ -1062,6 +1263,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dag": "0D",
|
||||
"gær": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"m": 6,
|
||||
"ma": 9,
|
||||
"þ": 3
|
||||
}
|
||||
},
|
||||
"it": {
|
||||
|
|
@ -1101,6 +1308,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ieri": "1D",
|
||||
"oggi": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"Mln": 6,
|
||||
"Mrd": 9
|
||||
}
|
||||
},
|
||||
"iw": {
|
||||
|
|
@ -1146,6 +1358,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"אתמול": "1D",
|
||||
"היום": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"K": 3,
|
||||
"M": 6
|
||||
}
|
||||
},
|
||||
"ja": {
|
||||
|
|
@ -1165,6 +1383,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"日": "1D",
|
||||
"本": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"万": 4,
|
||||
"億": 8
|
||||
}
|
||||
},
|
||||
"ka": {
|
||||
|
|
@ -1197,6 +1420,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"გუშინ": "1D",
|
||||
"დღეს": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"ათ": 3,
|
||||
"მლნ": 6,
|
||||
"მლრდ": 9
|
||||
}
|
||||
},
|
||||
"kk": {
|
||||
|
|
@ -1229,6 +1458,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"бүгін": "0D",
|
||||
"кеше": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"м": 3,
|
||||
"млн": 6,
|
||||
"млрд": 9,
|
||||
"мың": 3
|
||||
}
|
||||
},
|
||||
"km": {
|
||||
|
|
@ -1261,6 +1497,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"បានធ្វើបច្ចុប្បន្នភាពថ្ងៃនេះ": "0D",
|
||||
"បានធ្វើបច្ចុប្បន្នភាពម្សិលមិញ": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"ប៊ីលាន": 9,
|
||||
"ពាន់": 3,
|
||||
"លាន": 6
|
||||
}
|
||||
},
|
||||
"kn": {
|
||||
|
|
@ -1300,6 +1542,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ಇಂದು": "0D",
|
||||
"ನಿನ್ನೆ": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ಕೋಟಿ": 7,
|
||||
"ಲಕ್ಷ": 5
|
||||
}
|
||||
},
|
||||
"ko": {
|
||||
|
|
@ -1318,6 +1565,12 @@
|
|||
"months": {},
|
||||
"timeago_nd_tokens": {
|
||||
"오늘": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"만회": 4,
|
||||
"억회": 8,
|
||||
"천회": 3
|
||||
}
|
||||
},
|
||||
"ky": {
|
||||
|
|
@ -1350,6 +1603,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"бүгүн": "0D",
|
||||
"кечээ": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"миң": 3,
|
||||
"млд": 9,
|
||||
"млн": 6
|
||||
}
|
||||
},
|
||||
"lo": {
|
||||
|
|
@ -1382,6 +1641,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ອັບເດດມື້ນີ້": "0D",
|
||||
"ອັບເດດມື້ວານນີ້": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"ກີບ": 3,
|
||||
"ຕື້": 9,
|
||||
"ພັນ": 3,
|
||||
"ລ້ານ": 6
|
||||
}
|
||||
},
|
||||
"lt": {
|
||||
|
|
@ -1415,6 +1681,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"vakar": "1D",
|
||||
"šiandien": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mln": 6,
|
||||
"mlrd": 9,
|
||||
"tūkst": 3
|
||||
}
|
||||
},
|
||||
"lv": {
|
||||
|
|
@ -1454,6 +1726,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"vakar": "1D",
|
||||
"šodien": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"milj": 6,
|
||||
"mljrd": 9,
|
||||
"tūkst": 3
|
||||
}
|
||||
},
|
||||
"mk": {
|
||||
|
|
@ -1480,6 +1758,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"вчера": "1D",
|
||||
"денес": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"М": 6,
|
||||
"илј": 3,
|
||||
"мил": 6,
|
||||
"милј": 9
|
||||
}
|
||||
},
|
||||
"ml": {
|
||||
|
|
@ -1512,6 +1797,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ഇന്നലെ": "1D",
|
||||
"ഇന്ന്": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"കോടി": 7,
|
||||
"ലക്ഷം": 5
|
||||
}
|
||||
},
|
||||
"mn": {
|
||||
|
|
@ -1531,6 +1821,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"өнөөдөр": "0D",
|
||||
"өчигдөр": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"мянга": 3,
|
||||
"сая": 6,
|
||||
"тэрбум": 9
|
||||
}
|
||||
},
|
||||
"mr": {
|
||||
|
|
@ -1570,6 +1866,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"आज": "0D",
|
||||
"काल": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"अब्ज": 9,
|
||||
"कोटी": 7,
|
||||
"लाख": 5,
|
||||
"ह": 3
|
||||
}
|
||||
},
|
||||
"ms": {
|
||||
|
|
@ -1602,6 +1905,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ini": "0D",
|
||||
"semalam": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"J": 6
|
||||
}
|
||||
},
|
||||
"my": {
|
||||
|
|
@ -1635,6 +1943,15 @@
|
|||
"timeago_nd_tokens": {
|
||||
"မနေ့က": "1D",
|
||||
"ယနေ့": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ကုဋေ": 7,
|
||||
"ကုဋေထ": 10,
|
||||
"ထောင်": 3,
|
||||
"သန်း": 6,
|
||||
"သိန်း": 5,
|
||||
"သောင်း": 4
|
||||
}
|
||||
},
|
||||
"ne": {
|
||||
|
|
@ -1667,6 +1984,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"आज": "0D",
|
||||
"हिजो": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"अरब": 9,
|
||||
"करोड": 7,
|
||||
"लाख": 5,
|
||||
"हजार": 3
|
||||
}
|
||||
},
|
||||
"nl": {
|
||||
|
|
@ -1704,6 +2028,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"gisteren": "1D",
|
||||
"vandaag": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mld": 9,
|
||||
"mln": 6
|
||||
}
|
||||
},
|
||||
"no": {
|
||||
|
|
@ -1743,6 +2072,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dag": "0D",
|
||||
"går": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mill": 6,
|
||||
"mrd": 9
|
||||
}
|
||||
},
|
||||
"or": {
|
||||
|
|
@ -1775,6 +2109,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ଆଜି": "0D",
|
||||
"ଗତକାଲି": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ନିଟି": 6,
|
||||
"ବିଟି": 9,
|
||||
"ହଟି": 3
|
||||
}
|
||||
},
|
||||
"pa": {
|
||||
|
|
@ -1810,6 +2150,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ਅੱਜ": "0D",
|
||||
"ਬੀੇਤੇ": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ਅਰਬ": 9,
|
||||
"ਕਰੋੜ": 7,
|
||||
"ਲੱਖ": 5,
|
||||
"ਹਜ਼ਾਰ": 3
|
||||
}
|
||||
},
|
||||
"pl": {
|
||||
|
|
@ -1854,6 +2201,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dzisiaj": "0D",
|
||||
"wczoraj": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mld": 9,
|
||||
"mln": 6,
|
||||
"tys": 3
|
||||
}
|
||||
},
|
||||
"pt": {
|
||||
|
|
@ -1893,6 +2246,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"hoje": "0D",
|
||||
"ontem": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"bi": 9,
|
||||
"mi": 6,
|
||||
"mil": 3
|
||||
}
|
||||
},
|
||||
"pt-PT": {
|
||||
|
|
@ -1919,6 +2278,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"hoje": "0D",
|
||||
"ontem": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mM": 9,
|
||||
"mil": 3
|
||||
}
|
||||
},
|
||||
"ro": {
|
||||
|
|
@ -1958,6 +2322,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"astăzi": "0D",
|
||||
"ieri": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 6,
|
||||
"mld": 9
|
||||
}
|
||||
},
|
||||
"ru": {
|
||||
|
|
@ -2003,6 +2372,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"вчера": "1D",
|
||||
"сегодня": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"млн": 6,
|
||||
"млрд": 9,
|
||||
"тыс": 3
|
||||
}
|
||||
},
|
||||
"si": {
|
||||
|
|
@ -2036,6 +2411,12 @@
|
|||
"අද": "0D",
|
||||
"ඊයෙ": "1D",
|
||||
"ඊයේ": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ද": 3,
|
||||
"බි": 9,
|
||||
"මි": 6
|
||||
}
|
||||
},
|
||||
"sk": {
|
||||
|
|
@ -2062,6 +2443,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dnes": "0D",
|
||||
"včera": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mil": 6,
|
||||
"mld": 9,
|
||||
"tis": 3
|
||||
}
|
||||
},
|
||||
"sl": {
|
||||
|
|
@ -2109,6 +2496,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"danes": "0D",
|
||||
"včeraj": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mio": 6,
|
||||
"mrd": 9,
|
||||
"tis": 3
|
||||
}
|
||||
},
|
||||
"sq": {
|
||||
|
|
@ -2144,6 +2537,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"dje": "1D",
|
||||
"sot": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"mijë": 3,
|
||||
"mld": 9,
|
||||
"mln": 6
|
||||
}
|
||||
},
|
||||
"sr": {
|
||||
|
|
@ -2172,6 +2571,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"данас": "0D",
|
||||
"јуче": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"мил": 6,
|
||||
"млрд": 9,
|
||||
"хиљ": 3
|
||||
}
|
||||
},
|
||||
"sr-Latn": {
|
||||
|
|
@ -2201,6 +2606,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"danas": "0D",
|
||||
"juče": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"hilj": 3,
|
||||
"mil": 6,
|
||||
"mlrd": 9
|
||||
}
|
||||
},
|
||||
"sv": {
|
||||
|
|
@ -2239,6 +2650,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"idag": "0D",
|
||||
"igår": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"md": 9,
|
||||
"mn": 6
|
||||
}
|
||||
},
|
||||
"sw": {
|
||||
|
|
@ -2273,6 +2689,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"jana": "1D",
|
||||
"leo": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9,
|
||||
"elfu": 3
|
||||
}
|
||||
},
|
||||
"ta": {
|
||||
|
|
@ -2311,6 +2732,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"இன்று": "0D",
|
||||
"நேற்று": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"கோடி": 7,
|
||||
"லட்சம்": 5
|
||||
}
|
||||
},
|
||||
"te": {
|
||||
|
|
@ -2350,6 +2776,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"ఈ": "0D",
|
||||
"నిన్న": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"కోట్లు": 7,
|
||||
"లక్ష": 5,
|
||||
"లక్షలు": 5
|
||||
}
|
||||
},
|
||||
"th": {
|
||||
|
|
@ -2382,6 +2814,15 @@
|
|||
"timeago_nd_tokens": {
|
||||
"อัปเดตแล้ววันนี้": "0D",
|
||||
"อัปเดตแล้วเมื่อวาน": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"พัน": 3,
|
||||
"พันล้าน": 9,
|
||||
"ล้าน": 6,
|
||||
"หมื่น": 4,
|
||||
"หมื่นล้าน": 10,
|
||||
"แสน": 5
|
||||
}
|
||||
},
|
||||
"tr": {
|
||||
|
|
@ -2414,6 +2855,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"bugün": "0D",
|
||||
"dün": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"B": 3,
|
||||
"Mn": 6,
|
||||
"Mr": 9
|
||||
}
|
||||
},
|
||||
"uk": {
|
||||
|
|
@ -2459,6 +2906,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"вчора": "1D",
|
||||
"сьогодні": "0D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"млн": 6,
|
||||
"млрд": 9,
|
||||
"тис": 3
|
||||
}
|
||||
},
|
||||
"ur": {
|
||||
|
|
@ -2497,6 +2950,13 @@
|
|||
"timeago_nd_tokens": {
|
||||
"آج": "0D",
|
||||
"کل": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"ارب": 9,
|
||||
"لاکھ": 5,
|
||||
"کروڑ": 7,
|
||||
"ہزار": 3
|
||||
}
|
||||
},
|
||||
"uz": {
|
||||
|
|
@ -2529,6 +2989,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"bugun": "0D",
|
||||
"kecha": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"ming": 3,
|
||||
"mln": 6,
|
||||
"mlrd": 9
|
||||
}
|
||||
},
|
||||
"vi": {
|
||||
|
|
@ -2549,6 +3015,12 @@
|
|||
"timeago_nd_tokens": {
|
||||
"nay": "0D",
|
||||
"qua": "1D"
|
||||
},
|
||||
"comma_decimal": true,
|
||||
"number_tokens": {
|
||||
"N": 3,
|
||||
"T": 9,
|
||||
"Tr": 6
|
||||
}
|
||||
},
|
||||
"zh-CN": {
|
||||
|
|
@ -2568,6 +3040,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"今": "0D",
|
||||
"日": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"万": 4,
|
||||
"亿": 8
|
||||
}
|
||||
},
|
||||
"zh-HK": {
|
||||
|
|
@ -2588,6 +3065,10 @@
|
|||
"timeago_nd_tokens": {
|
||||
"今": "0D",
|
||||
"天": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
}
|
||||
},
|
||||
"zh-TW": {
|
||||
|
|
@ -2607,6 +3088,11 @@
|
|||
"timeago_nd_tokens": {
|
||||
"今": "0D",
|
||||
"天": "1D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"億": 8,
|
||||
"萬": 4
|
||||
}
|
||||
},
|
||||
"zu": {
|
||||
|
|
@ -2646,6 +3132,10 @@
|
|||
"timeago_nd_tokens": {
|
||||
"izolo": "1D",
|
||||
"namuhla": "0D"
|
||||
},
|
||||
"comma_decimal": false,
|
||||
"number_tokens": {
|
||||
"B": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
3156
testfiles/dict/large_number_samples.json
Normal file
3156
testfiles/dict/large_number_samples.json
Normal file
File diff suppressed because it is too large
Load diff
Reference in a new issue