feat: add number_tokens for parsing large nums to dictionary

This commit is contained in:
ThetaDev 2022-09-23 15:04:22 +02:00
parent 67ae1eb21d
commit 5d19259a14
21 changed files with 5219 additions and 38 deletions

View file

@ -0,0 +1,358 @@
use std::collections::HashMap;
use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
use anyhow::{Context, Result};
use fancy_regex::Regex;
use futures::{stream, StreamExt};
use once_cell::sync::Lazy;
use reqwest::{header, Client};
use rustypipe::model::{locale::LANGUAGES, Language};
use serde::Deserialize;
use serde_with::serde_as;
use serde_with::VecSkipError;
use crate::util::{self, Text};
type CollectedNumbers = BTreeMap<Language, BTreeMap<u8, (String, u64)>>;
/// Collect video view count texts in every supported language
/// and write them to `testfiles/dict/large_number_samples.json`.
///
/// YouTube's API outputs the subscriber count of a channel only in an
/// approximated format (e.g. *880K subscribers*), which varies
/// by language.
///
/// To parse these numbers correctly we need to collect textual numbers
/// of different orders of magnitude in every language. This script extracts
/// the view count texts from the most popular videos of different channels.
///
/// We extract these instead of subscriber counts because the YouTube API
/// outputs view counts both in approximated and exact format, so we can use
/// the exact counts to figure out the tokens.
///
/// `concurrency` is the maximum number of languages that are processed at the
/// same time; the channels of a single language are fetched one after another.
///
/// # Panics
///
/// Panics if a channel cannot be fetched or parsed, or if the output file
/// cannot be created or written.
pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
    let json_path = project_root.join("testfiles/dict/large_number_samples.json");

    // Sample channels, annotated with the order of magnitude they are meant to cover.
    let channels = [
        "UCq-Fj5jknLsUf-MWSy4_brA", // 10^8 (225M)
        "UCcdwLMPsaU2ezNSJU1nFoBQ", // 10^7 (60M)
        "UC6mIxFTvXkWQVEHPsEdflzQ", // 10^6 (1.7M)
        "UCD0y51PJfvkZNe3y3FR5riw", // 10^5 (125K)
        "UCNcN0dW43zE0Om3278fjY8A", // 10^4 (27K)
        "UC0QEucPrn0-Ddi3JBTcs5Kw", // 10^3 (5K)
        "UCGiJh0NZ52wRhYKYnuZI08Q", // 10^1 (37)
    ];

    let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
        .map(|lang| async move {
            // Order of magnitude -> (approximated text, exact number).
            // A later sample with the same magnitude replaces an earlier one.
            let mut entry: BTreeMap<u8, (String, u64)> = BTreeMap::new();

            for (n, ch_id) in channels.iter().enumerate() {
                let channel = get_channel(ch_id, lang)
                    .await
                    // Build the context message only if the request actually failed
                    .with_context(|| format!("{}-{}", lang, n))
                    .unwrap();

                // The channel data is owned, so the texts can be moved instead of cloned
                entry.extend(
                    channel
                        .view_counts
                        .into_iter()
                        .map(|(num, txt)| (get_mag(num), (txt, num))),
                );

                println!("collected {}-{}", lang, n);
            }

            (lang, entry)
        })
        .buffer_unordered(concurrency)
        .collect()
        .await;

    // The serializer emits many small writes; buffer them and flush explicitly,
    // because errors during the implicit flush on drop would be lost.
    let mut writer = std::io::BufWriter::new(File::create(json_path).unwrap());
    serde_json::to_writer_pretty(&mut writer, &collected_numbers).unwrap();
    std::io::Write::flush(&mut writer).unwrap();
}
/// Attempt to parse the numbers collected by `collect_large_numbers`
/// and write the results to `dictionary.json`.
///
/// Reads `testfiles/dict/large_number_samples.json` below `project_root` and,
/// for every language in the dictionary, derives
///
/// - `comma_decimal`: whether a comma is used as the decimal separator
/// - `number_tokens`: the textual tokens (e.g. *K*, *M*) mapped to the order
///   of magnitude they stand for
///
/// The samples of all languages listed as `equivalent` are evaluated together
/// with the samples of the language itself.
///
/// # Panics
///
/// Panics if the sample file cannot be opened or deserialized, if a language
/// (or one of its equivalents) has no samples, if none of the samples of a
/// language contains a `.`/`,` between digits, if a sample contains no digits
/// before the decimal separator, or if a `K`/`M` token does not fit the
/// remaining order of magnitude.
pub fn write_samples_to_dict(project_root: &Path) {
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/dict/large_number_samples.json");
let json_file = File::open(json_path).unwrap();
let collected_nums: CollectedNumbers =
serde_json::from_reader(BufReader::new(json_file)).unwrap();
let mut dict = util::read_dict(project_root);
// Copy the keys first so the dictionary can be mutated while iterating
let langs = dict.keys().map(|k| k.to_owned()).collect::<Vec<_>>();
// Matches a digit, a `.` or `,` (capture group 1) and 1-3 further digits
// that are followed by a non-digit or the end of the string
static POINT_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\d(\.|,)\d{1,3}(?:\D|$)").unwrap());
for lang in langs {
let dict_entry = dict.entry(lang).or_default();
// The language itself plus all languages that are treated equally
let mut e_langs = dict_entry.equivalent.clone();
e_langs.push(lang);
// Determine the decimal separator from the first sample that has a
// `.` or `,` between digits
let comma_decimal = collected_nums
.get(&lang)
.unwrap()
.iter()
.find_map(|(mag, (txt, _))| {
let point = POINT_REGEX
.captures(txt)
.unwrap()
.map(|c| c.get(1).unwrap().as_str());
if let Some(point) = point {
let num_all = util::parse_numeric::<u64>(txt).unwrap();
// If the number parsed from all digits has the same order of
// magnitude as the actual number, it must be a separator.
// Otherwise it is a decimal point
//
// separator && "," => decimal point is "." => false
// separator && "." => decimal point is "," => true
// decimal   && "," => true
// decimal   && "." => false
return Some((get_mag(num_all) == *mag) ^ (point == ","));
}
None
})
.unwrap();
let decimal_point = match comma_decimal {
true => ",",
false => ".",
};
// Search for tokens
// This map holds all the tokens we encounter while parsing the language
// If a new token is found, it is stored in this map with the derived order of
// magnitude.
// If the token is found again with a different derived order of magnitude,
// its value in the map is set to None.
let mut found_tokens: HashMap<String, Option<u8>> = HashMap::new();
// A token that is first seen with magnitude 0 is stored as None right away.
// Once a token is None it stays None, so it is dropped from the final result.
let mut insert_token = |token: String, mag: u8| {
let found_token = found_tokens.entry(token).or_insert(match mag {
0 => None,
x => Some(x),
});
if let Some(f) = found_token {
if *f != mag {
*found_token = None;
}
}
};
// Note: this `lang` shadows the outer loop variable
for lang in e_langs {
let entry = collected_nums.get(&lang).unwrap();
entry.iter().for_each(|(mag, (txt, _))| {
// Strip digits and separators so only the textual tokens remain
let filtered = util::filter_largenumstr(txt);
let tokens: Vec<String> = match dict_entry.by_char {
true => filtered.chars().map(|c| c.to_string()).collect(),
false => filtered.split_whitespace().map(|c| c.to_string()).collect(),
};
// Order of magnitude of the digits in front of the decimal separator
let num_before_point =
util::parse_numeric::<u64>(txt.split(decimal_point).next().unwrap()).unwrap();
let mag_before_point = get_mag(num_before_point);
// The part of the magnitude that has to be expressed by the tokens.
// NOTE(review): this subtraction underflows (panic in debug builds) if the
// digits before the separator have a higher magnitude than the exact
// number - confirm this cannot happen with the collected samples.
let mut mag_remaining = mag - mag_before_point;
tokens.iter().for_each(|t| {
// These tokens are correct in all languages
// and are used to parse combined prefixes like `1.1K crore` (en-IN)
let known_tmag: u8 = if t.len() == 1 {
match t.as_str() {
"K" | "k" => 3,
"M" => 6,
// 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
_ => 0,
}
} else {
0
};
// K/M/B
// A known token only reduces the remaining magnitude; every other
// token is recorded with the magnitude that is left at this point
if known_tmag > 0 {
mag_remaining = mag_remaining
.checked_sub(known_tmag)
.expect("known magnitude incorrect");
} else {
insert_token(t.to_owned(), mag_remaining);
}
});
});
}
// Insert collected data into dictionary
// (ambiguous tokens, i.e. those set to None, are left out)
dict_entry.number_tokens = found_tokens
.into_iter()
.filter_map(|(k, v)| v.map(|v| (k, v)))
.collect();
dict_entry.comma_decimal = comma_decimal;
}
util::write_dict(project_root, &dict);
}
/// Return the decimal order of magnitude of `n`, i.e. `floor(log10(n))`.
///
/// `0` has no logarithm and is mapped to magnitude `0`.
///
/// The result is computed with integer arithmetic. Going through `f64` is
/// lossy for values above 2^53: `999_999_999_999_999_999` would be rounded up
/// to `10^18` and reported with magnitude 18 instead of 17.
fn get_mag(n: u64) -> u8 {
    let mut mag = 0;
    let mut rest = n;
    // Every division by 10 that leaves at least two digits adds one magnitude
    while rest >= 10 {
        rest /= 10;
        mag += 1;
    }
    mag
}
/*
YouTube channel videos response
*/
/// Root object of the YouTube channel videos response that is
/// deserialized in `get_channel`.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Channel {
contents: Contents,
}
/// Value of the `contents` key of the response; holds the
/// `twoColumnBrowseResultsRenderer` object.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Contents {
two_column_browse_results_renderer: TabsRenderer,
}
/// List of tabs of the channel page.
#[serde_as]
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct TabsRenderer {
/// Tabs that do not match the expected structure are skipped
/// instead of failing the whole deserialization.
#[serde_as(as = "VecSkipError<_>")]
tabs: Vec<TabRendererWrap>,
}
/// Wrapper object around the `tabRenderer` key of a tab.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct TabRendererWrap {
tab_renderer: TabRenderer,
}
/// A single tab; only its `content` is deserialized.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct TabRenderer {
content: SectionListRendererWrap,
}
/// Wrapper object around the `sectionListRenderer` key of a tab's content.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SectionListRendererWrap {
section_list_renderer: SectionListRenderer,
}
/// List of sections of a tab. `get_channel` only reads the first one.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SectionListRenderer {
contents: Vec<ItemSectionRendererWrap>,
}
/// Wrapper object around the `itemSectionRenderer` key of a section.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct ItemSectionRendererWrap {
item_section_renderer: ItemSectionRenderer,
}
/// Contents of a section. `get_channel` only reads the first grid.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct ItemSectionRenderer {
contents: Vec<GridRendererWrap>,
}
/// Wrapper object around the `gridRenderer` key of a section item.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct GridRendererWrap {
grid_renderer: GridRenderer,
}
/// Grid holding the video items of the channel.
#[serde_as]
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct GridRenderer {
/// Items that do not match the expected structure are skipped
/// instead of failing the whole deserialization.
#[serde_as(as = "VecSkipError<_>")]
items: Vec<VideoListItem>,
}
/// Wrapper object around the `gridVideoRenderer` key of a grid item.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct VideoListItem {
grid_video_renderer: GridVideoRenderer,
}
/// A video of the channel; only the two view count texts are deserialized.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct GridVideoRenderer {
/// Exact view count text, e.g. `24,194 views`
view_count_text: Text,
/// Approximated view count text, e.g. `19K views`
short_view_count_text: Text,
}
/// Data extracted from a channel response by `get_channel`.
#[derive(Clone, Debug)]
struct ChannelData {
/// One entry per video: the number parsed from the exact view count text
/// and the approximated view count text as returned by YouTube.
view_counts: Vec<(u64, String)>,
}
/// Fetch the video list of a channel in the given language and extract the
/// exact and the approximated view count of every video.
///
/// The request is sent to YouTube's `browse` endpoint with `lang` as the `hl`
/// parameter of the client context.
/// NOTE(review): the hard-coded `params` value presumably selects the videos
/// tab sorted by popularity - confirm before changing it.
///
/// If the response contains no tab, section or grid in the expected place,
/// the returned list is empty.
///
/// # Errors
///
/// Returns an error if the request fails, the server answers with an error
/// status, the response cannot be deserialized or an exact view count text
/// does not contain a parseable number.
async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
    let client = Client::new();
    let body = format!(
        "{}{}{}{}{}",
        r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":""##,
        lang,
        r##"","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}},"params":"EgZ2aWRlb3MYASAAMAE%3D","browseId":""##,
        channel_id,
        "\"}"
    );

    let resp = client
        .post("https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
        .header(header::CONTENT_TYPE, "application/json")
        .body(body)
        .send().await?
        .error_for_status()?;

    let channel = resp.json::<Channel>().await?;

    // Walk down to the first grid with checked accesses: a missing section or
    // grid is treated like a missing tab (no videos) instead of panicking.
    let items = channel
        .contents
        .two_column_browse_results_renderer
        .tabs
        .first()
        .and_then(|tab| tab.tab_renderer.content.section_list_renderer.contents.first())
        .and_then(|section| section.item_section_renderer.contents.first())
        .map(|grid| grid.grid_renderer.items.as_slice())
        .unwrap_or_default();

    let view_counts = items
        .iter()
        .map(|itm| {
            let video = &itm.grid_video_renderer;
            let exact_txt = &video.view_count_text.simple_text;
            // Report an unparseable count as an error instead of panicking
            util::parse_numeric::<u64>(exact_txt)
                .with_context(|| format!("could not parse view count `{}`", exact_txt))
                .map(|exact| (exact, video.short_view_count_text.simple_text.to_owned()))
        })
        .collect::<Result<Vec<_>>>()?;

    Ok(ChannelData { view_counts })
}
/// Manual smoke test: fetches one channel in Azerbaijani and dumps the
/// extracted view counts.
// NOTE(review): this sends a live request to YouTube and asserts nothing
// beyond "did not fail" - consider ignoring it by default.
#[tokio::test]
async fn test() {
let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
.await
.unwrap();
dbg!(channel);
}
/// Manual helper that runs `write_samples_to_dict` against a local checkout.
///
/// Ignored by default because it points to a developer-local absolute path
/// and rewrites the dictionary file as a side effect. Run it explicitly with
/// `cargo test -- --ignored` after adjusting the path.
#[test]
#[ignore = "depends on a developer-local checkout path and rewrites the dictionary"]
fn test2() {
    write_samples_to_dict(Path::new(
        "/home/thetadev/Documents/Programmieren/Rust/rustypipe",
    ));
}

View file

@ -38,7 +38,7 @@ enum DateCase {
}
/// Collect 'Playlist updated' dates in every supported language
/// and write them to `testfiles/date/playlist_samples.json`.
/// and write them to `testfiles/dict/playlist_samples.json`.
///
/// YouTube's API outputs the update date of playlists only in a
/// textual format (e.g. *Last updated on Jan 3, 2020*), which varies
@ -55,13 +55,15 @@ enum DateCase {
/// - one playlist updated yesterday
/// - one playlist updated 2-7 days ago
/// - one playlist from every month. Note that there should not
/// be any dates which include the same number twice (e.g. 01.01.2020).
/// be any dates which include the same number twice (e.g. 01.01.2020).
///
/// **IMPORTANT:**
///
/// Because the relative dates change with time, the first three playlists
/// should be checked and eventually changed before running the program.
/// have to be checked and, if necessary, changed before running the program.
pub async fn collect_dates(project_root: &Path, concurrency: usize) {
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/date/playlist_samples.json");
json_path.push("testfiles/dict/playlist_samples.json");
// These are the sample playlists
let cases = [
@ -115,7 +117,7 @@ pub async fn collect_dates(project_root: &Path, concurrency: usize) {
/// parsed automatically and require manual work.
pub fn write_samples_to_dict(project_root: &Path) {
let mut json_path = project_root.to_path_buf();
json_path.push("testfiles/date/playlist_samples.json");
json_path.push("testfiles/dict/playlist_samples.json");
let json_file = File::open(json_path).unwrap();
let collected_dates: CollectedDates =

View file

@ -146,7 +146,7 @@ async fn video_details(testfiles: &Path) {
async fn comments_top(testfiles: &Path) {
let mut json_path = testfiles.to_path_buf();
json_path.push("video_details");
json_path.push(format!("comments_top.json"));
json_path.push("comments_top.json");
if json_path.exists() {
return;
}

View file

@ -34,17 +34,47 @@ pub fn generate_dictionary(project_root: &Path) {
let dict = util::read_dict(project_root);
let code_head = r#"// This file is automatically generated. DO NOT EDIT.
// See codegen/gen_dictionary.rs for the generation code.
use crate::{
model::Language,
timeago::{DateCmp, TaToken, TimeUnit},
};
/// The dictionary contains the information required to parse dates and numbers
/// in all supported languages.
pub struct Entry {
/// Should the language be parsed by character instead of by word?
/// (e.g. Chinese/Japanese)
pub by_char: bool,
/// Tokens for parsing timeago strings.
///
/// Format: Parsed token -> \[Quantity\] Identifier
///
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
/// `h`(our), `m`(inute), `s`(econd)
pub timeago_tokens: phf::Map<&'static str, TaToken>,
/// Order in which to parse numeric date components. Formatted as
/// a string of date identifiers (Y, M, D).
///
/// Examples:
///
/// - 03.01.2020 => `"DMY"`
/// - Jan 3, 2020 => `"DY"`
pub date_order: &'static [DateCmp],
/// Tokens for parsing month names.
///
/// Format: Parsed token -> Month number (starting from 1)
pub months: phf::Map<&'static str, u8>,
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
///
/// Format: Parsed token -> \[Quantity\] Identifier
pub timeago_nd_tokens: phf::Map<&'static str, TaToken>,
/// Are commas (instead of points) used as decimal separators?
pub comma_decimal: bool,
/// Tokens for parsing decimal prefixes (K, M, B, ...)
///
/// Format: Parsed token -> decimal power
pub number_tokens: phf::Map<&'static str, u8>,
}
"#;
@ -100,12 +130,19 @@ pub fn entry(lang: Language) -> Entry {
});
date_order = date_order.trim_end_matches([' ', ',']).to_owned() + "]";
// Number tokens
let mut number_tokens = phf_codegen::Map::<&str>::new();
entry.number_tokens.iter().for_each(|(txt, mag)| {
number_tokens.entry(txt, &mag.to_string());
});
let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n ");
let code_ta_nd_tokens = &ta_nd_tokens.build().to_string().replace('\n', "\n ");
let code_months = &months.build().to_string().replace('\n', "\n ");
let code_number_tokens = &number_tokens.build().to_string().replace('\n', "\n ");
let _ = write!(code_timeago_tokens, "{} => Entry {{\n by_char: {:?},\n timeago_tokens: {},\n date_order: {},\n months: {},\n timeago_nd_tokens: {},\n }},\n ",
selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens);
let _ = write!(code_timeago_tokens, "{} => Entry {{\n by_char: {:?},\n timeago_tokens: {},\n date_order: {},\n months: {},\n timeago_nd_tokens: {},\n comma_decimal: {:?},\n number_tokens: {},\n }},\n ",
selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens, entry.comma_decimal, code_number_tokens);
});
code_timeago_tokens = code_timeago_tokens.trim_end().to_owned() + "\n }\n}\n";

View file

@ -8,6 +8,8 @@ use serde::Deserialize;
use serde_with::serde_as;
use serde_with::VecSkipError;
use crate::util::Text;
#[serde_as]
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
@ -135,12 +137,6 @@ struct LanguageCountryCommand {
hl: String,
}
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct Text {
simple_text: String,
}
pub async fn generate_locales(project_root: &Path) {
let (languages, countries) = get_locales().await;
@ -284,7 +280,7 @@ pub enum Country {
async fn get_locales() -> (BTreeMap<String, String>, BTreeMap<String, String>) {
let client = Client::new();
let resp = client
.post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8")
.post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
.header(header::CONTENT_TYPE, "application/json")
.body(
r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":"en","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}}}"##

View file

@ -1,3 +1,4 @@
mod collect_large_numbers;
mod collect_playlist_dates;
mod download_testfiles;
mod gen_dictionary;
@ -21,7 +22,9 @@ struct Cli {
#[derive(Subcommand)]
enum Commands {
CollectPlaylistDates,
WritePlaylistDates,
CollectLargeNumbers,
ParsePlaylistDates,
ParseLargeNumbers,
GenLocales,
GenDict,
DownloadTestfiles,
@ -36,8 +39,14 @@ async fn main() {
Commands::CollectPlaylistDates => {
collect_playlist_dates::collect_dates(&cli.project_root, cli.concurrency).await;
}
Commands::WritePlaylistDates => {
collect_playlist_dates::write_samples_to_dict(&cli.project_root);
Commands::CollectLargeNumbers => {
collect_large_numbers::collect_large_numbers(&cli.project_root, cli.concurrency).await;
}
Commands::ParsePlaylistDates => {
collect_playlist_dates::write_samples_to_dict(&cli.project_root)
}
Commands::ParseLargeNumbers => {
collect_large_numbers::write_samples_to_dict(&cli.project_root)
}
Commands::GenLocales => {
gen_locales::generate_locales(&cli.project_root).await;

View file

@ -3,19 +3,53 @@ use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path, str::FromS
use rustypipe::model::Language;
use serde::{Deserialize, Serialize};
const DICT_PATH: &str = "testfiles/date/dictionary.json";
const DICT_PATH: &str = "testfiles/dict/dictionary.json";
type Dictionary = BTreeMap<Language, DictEntry>;
/// Dictionary entry holding the data needed to parse dates and numbers
/// of one language (the dictionary maps a `Language` to its entry).
///
/// Fields that are missing during deserialization take their default value.
#[derive(Debug, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct DictEntry {
/// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
pub equivalent: Vec<Language>,
/// Should the language be parsed by character instead of by word?
/// (e.g. Chinese/Japanese)
pub by_char: bool,
/// Tokens for parsing timeago strings.
///
/// Format: Parsed token -> \[Quantity\] Identifier
///
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
/// `h`(our), `m`(inute), `s`(econd)
pub timeago_tokens: BTreeMap<String, String>,
/// Order in which to parse numeric date components. Formatted as
/// a string of date identifiers (Y, M, D).
///
/// Examples:
///
/// - 03.01.2020 => `"DMY"`
/// - Jan 3, 2020 => `"DY"`
pub date_order: String,
/// Tokens for parsing month names.
///
/// Format: Parsed token -> Month number (starting from 1)
pub months: BTreeMap<String, u8>,
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
///
/// Format: Parsed token -> \[Quantity\] Identifier
pub timeago_nd_tokens: BTreeMap<String, String>,
/// Are commas (instead of points) used as decimal separators?
pub comma_decimal: bool,
/// Tokens for parsing decimal prefixes (K, M, B, ...)
///
/// Format: Parsed token -> decimal power
pub number_tokens: BTreeMap<String, u8>,
}
/// Text object of a YouTube response whose content is stored
/// under the `simpleText` key.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Text {
pub simple_text: String,
}
pub fn read_dict(project_root: &Path) -> Dictionary {
@ -48,6 +82,27 @@ pub fn filter_datestr(string: &str) -> String {
.collect()
}
/// Remove the numeric part of a large number string so that only the
/// textual tokens remain.
///
/// ASCII digits, `.`, `,` and zero-width spaces (U+200B) are dropped;
/// every other character is kept in its original order.
pub fn filter_largenumstr(string: &str) -> String {
    let mut kept = String::new();
    for ch in string.chars() {
        let is_numeric_part =
            ch.is_ascii_digit() || ch == '.' || ch == ',' || ch == '\u{200b}';
        if !is_numeric_part {
            kept.push(ch);
        }
    }
    kept
}
/// Parse the number formed by all ASCII digits of a string.
///
/// Every character that is not an ASCII digit is discarded and the remaining
/// digits are concatenated before parsing, so `24,194 views` yields `24194`.
/// Returns the parse error of `F` if no digits are left or the digits do not
/// form a valid `F`.
pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
where
    F: FromStr,
{
    string
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>()
        .parse()
}
/// Parse all numbers occurring in a string and return them as a vec
pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
where