feat: add number_tokens for parsing large nums to dictionary

2022-09-23 15:04:22 +02:00 · 2022-09-23 15:04:22 +02:00 · 5d19259a14
commit 5d19259a14
parent 67ae1eb21d
21 changed files with 5219 additions and 38 deletions
--- a/codegen/src/collect_large_numbers.rs
+++ b/codegen/src/collect_large_numbers.rs
@ -0,0 +1,358 @@
+use std::collections::HashMap;
+use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
+
+use anyhow::{Context, Result};
+use fancy_regex::Regex;
+use futures::{stream, StreamExt};
+use once_cell::sync::Lazy;
+use reqwest::{header, Client};
+use rustypipe::model::{locale::LANGUAGES, Language};
+use serde::Deserialize;
+use serde_with::serde_as;
+use serde_with::VecSkipError;
+
+use crate::util::{self, Text};
+
+/// Collected samples: language -> order of magnitude (see `get_mag`) ->
+/// (short view count text, exact view count).
+type CollectedNumbers = BTreeMap<Language, BTreeMap<u8, (String, u64)>>;
+
+/// Collect video view count texts in every supported language
+/// and write them to `testfiles/dict/large_number_samples.json`.
+///
+/// YouTube's API outputs the subscriber count of a channel only in an
+/// approximated format (e.g. *880K subscribers*), which varies
+/// by language.
+///
+/// To parse these numbers correctly we need to collect textual numbers
+/// of different orders of magnitude in every language. This script extracts
+/// the view count texts from the most popular videos of different channels.
+///
+/// We extract these instead of subscriber counts because the YouTube API
+/// outputs view counts both in approximated and exact format, so we can use
+/// the exact counts to figure out the tokens.
+///
+/// Up to `concurrency` languages are processed at the same time; the channels
+/// of a single language are requested one after another.
+///
+/// # Panics
+///
+/// Panics if a channel cannot be fetched or the output file cannot be written.
+pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
+    let mut json_path = project_root.to_path_buf();
+    json_path.push("testfiles/dict/large_number_samples.json");
+
+    let channels = [
+        "UCq-Fj5jknLsUf-MWSy4_brA", // 10e8 (225M)
+        "UCcdwLMPsaU2ezNSJU1nFoBQ", // 10e7 (60M)
+        "UC6mIxFTvXkWQVEHPsEdflzQ", // 10e6 (1.7M)
+        "UCD0y51PJfvkZNe3y3FR5riw", // 10e5 (125K)
+        "UCNcN0dW43zE0Om3278fjY8A", // 10e4 (27K)
+        "UC0QEucPrn0-Ddi3JBTcs5Kw", // 10e3 (5K)
+        "UCGiJh0NZ52wRhYKYnuZI08Q", // 10e1 (37)
+    ];
+
+    let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
+        .map(|lang| async move {
+            let mut entry = BTreeMap::new();
+
+            for (n, ch_id) in channels.iter().enumerate() {
+                let channel = get_channel(ch_id, lang)
+                    .await
+                    // The context message is only built if the request failed
+                    .with_context(|| format!("{}-{}", lang, n))
+                    .unwrap();
+
+                // One sample is kept per order of magnitude (a later sample
+                // replaces an earlier one). The texts are moved into the map,
+                // no clone is necessary.
+                for (num, txt) in channel.view_counts {
+                    entry.insert(get_mag(num), (txt, num));
+                }
+
+                println!("collected {}-{}", lang, n);
+            }
+
+            (lang, entry)
+        })
+        .buffer_unordered(concurrency)
+        .collect()
+        .await;
+
+    // serde_json issues many small writes, so buffer them and flush explicitly
+    // (a flush error would be silently dropped if left to `Drop`).
+    let file = File::create(json_path).unwrap();
+    let mut writer = std::io::BufWriter::new(file);
+    serde_json::to_writer_pretty(&mut writer, &collected_numbers).unwrap();
+    std::io::Write::flush(&mut writer).unwrap();
+}
+
+/// Attempt to parse the numbers collected by `collect-large-numbers`
+/// and write the results to `dictionary.json`.
+///
+/// For every language present in the dictionary this
+///
+/// 1. detects whether the comma is used as the decimal separator
+///    (`comma_decimal`) and
+/// 2. derives the order of magnitude of every textual token that appears in
+///    the short view count texts (`number_tokens`), using the samples of the
+///    language itself and of its `equivalent` languages.
+///
+/// # Panics
+///
+/// Panics if the sample file cannot be read or parsed, if a language has no
+/// samples, if no sample of a language contains a point/comma between digits
+/// or if a sample text contains no parseable number.
+pub fn write_samples_to_dict(project_root: &Path) {
+    let mut json_path = project_root.to_path_buf();
+    json_path.push("testfiles/dict/large_number_samples.json");
+
+    let json_file = File::open(json_path).unwrap();
+    let collected_nums: CollectedNumbers =
+        serde_json::from_reader(BufReader::new(json_file)).unwrap();
+    let mut dict = util::read_dict(project_root);
+    let langs = dict.keys().map(|k| k.to_owned()).collect::<Vec<_>>();
+
+    // Matches a digit, a point or comma (capture group 1) and 1-3 more digits
+    // which are followed by a non-digit character or the end of the text.
+    static POINT_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\d(\.|,)\d{1,3}(?:\D|$)").unwrap());
+
+    for lang in langs {
+        let dict_entry = dict.entry(lang).or_default();
+
+        // The language itself plus all languages that share its dictionary entry
+        let mut e_langs = dict_entry.equivalent.clone();
+        e_langs.push(lang);
+
+        // Use the first sample that contains a point/comma between digits
+        // to find out which of the two is the decimal separator.
+        let comma_decimal = collected_nums
+            .get(&lang)
+            .unwrap()
+            .iter()
+            .find_map(|(mag, (txt, _))| {
+                let point = POINT_REGEX
+                    .captures(txt)
+                    .unwrap()
+                    .map(|c| c.get(1).unwrap().as_str());
+
+                if let Some(point) = point {
+                    let num_all = util::parse_numeric::<u64>(txt).unwrap();
+                    // If the number parsed from all digits has the same order of
+                    // magnitude as the actual number, it must be a separator.
+                    // Otherwise it is a decimal point
+                    //
+                    // separator "," or decimal "." => false (XOR of equal values),
+                    // separator "." or decimal "," => true
+                    return Some((get_mag(num_all) == *mag) ^ (point == ","));
+                }
+                None
+            })
+            .unwrap();
+
+        let decimal_point = match comma_decimal {
+            true => ",",
+            false => ".",
+        };
+
+        // Search for tokens
+
+        // This map holds all the tokens we encounter while parsing the language
+        // If a new token is found, it is stored in this map with the derived order of
+        // magnitude.
+        // If the token is found again with a different derived order of magnitude,
+        // its value in the map is set to None.
+        let mut found_tokens: HashMap<String, Option<u8>> = HashMap::new();
+
+        // A token first seen with magnitude 0 is stored as None right away,
+        // so it never ends up in the dictionary.
+        let mut insert_token = |token: String, mag: u8| {
+            let found_token = found_tokens.entry(token).or_insert(match mag {
+                0 => None,
+                x => Some(x),
+            });
+
+            if let Some(f) = found_token {
+                if *f != mag {
+                    *found_token = None;
+                }
+            }
+        };
+
+        for lang in e_langs {
+            let entry = collected_nums.get(&lang).unwrap();
+
+            entry.iter().for_each(|(mag, (txt, _))| {
+                // Text without digits, points, commas and zero-width spaces
+                let filtered = util::filter_largenumstr(txt);
+
+                let tokens: Vec<String> = match dict_entry.by_char {
+                    true => filtered.chars().map(|c| c.to_string()).collect(),
+                    false => filtered.split_whitespace().map(|c| c.to_string()).collect(),
+                };
+
+                // The order of magnitude contributed by the textual tokens is the
+                // magnitude of the exact number minus the magnitude of the digits
+                // in front of the decimal separator.
+                let num_before_point =
+                    util::parse_numeric::<u64>(txt.split(decimal_point).next().unwrap()).unwrap();
+                let mag_before_point = get_mag(num_before_point);
+                let mut mag_remaining = mag - mag_before_point;
+
+                tokens.iter().for_each(|t| {
+                    // These tokens are correct in all languages
+                    // and are used to parse combined prefixes like `1.1K crore` (en-IN)
+                    let known_tmag: u8 = if t.len() == 1 {
+                        match t.as_str() {
+                            "K" | "k" => 3,
+                            "M" => 6,
+                            // 'm' means 10^3 in Catalan, 'B' means 10^3 in Turkish
+                            _ => 0,
+                        }
+                    } else {
+                        0
+                    };
+
+                    // K/k/M: subtract the known magnitude instead of storing the token
+                    if known_tmag > 0 {
+                        mag_remaining = mag_remaining
+                            .checked_sub(known_tmag)
+                            .expect("known magnitude incorrect");
+                    } else {
+                        insert_token(t.to_owned(), mag_remaining);
+                    }
+                });
+            });
+        }
+
+        // Insert collected data into dictionary
+        // (tokens with an ambiguous magnitude, i.e. `None`, are dropped)
+        dict_entry.number_tokens = found_tokens
+            .into_iter()
+            .filter_map(|(k, v)| v.map(|v| (k, v)))
+            .collect();
+        dict_entry.comma_decimal = comma_decimal;
+    }
+
+    util::write_dict(project_root, &dict);
+}
+
+/// Get the order of magnitude of `n`, i.e. the number of its decimal digits
+/// minus one (`0..=9` => 0, `10..=99` => 1, ...).
+///
+/// Uses integer arithmetic only: a floating point `log10` can round up for
+/// values just below a power of ten (e.g. `10^15 - 1`) and would report a
+/// magnitude that is one too high.
+fn get_mag(n: u64) -> u8 {
+    let mut mag = 0;
+    let mut rest = n;
+    while rest >= 10 {
+        rest /= 10;
+        mag += 1;
+    }
+    mag
+}
+
+/*
+YouTube channel videos response
+*/
+
+/// Root object of the YouTube channel videos response.
+///
+/// Only the path leading to the video grid is declared here.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct Channel {
+    contents: Contents,
+}
+
+/// `contents` object of the response (JSON key `twoColumnBrowseResultsRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct Contents {
+    two_column_browse_results_renderer: TabsRenderer,
+}
+
+/// List of the channel page tabs.
+#[serde_as]
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct TabsRenderer {
+    /// Tabs that do not match the expected structure are skipped (`VecSkipError`).
+    #[serde_as(as = "VecSkipError<_>")]
+    tabs: Vec<TabRendererWrap>,
+}
+
+/// Wrapper object around a tab (JSON key `tabRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct TabRendererWrap {
+    tab_renderer: TabRenderer,
+}
+
+/// A single tab; only its `content` is deserialized.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct TabRenderer {
+    content: SectionListRendererWrap,
+}
+
+/// Wrapper object around the section list (JSON key `sectionListRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct SectionListRendererWrap {
+    section_list_renderer: SectionListRenderer,
+}
+
+/// Sections of a tab. `get_channel` only reads the first one.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct SectionListRenderer {
+    contents: Vec<ItemSectionRendererWrap>,
+}
+
+/// Wrapper object around an item section (JSON key `itemSectionRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ItemSectionRendererWrap {
+    item_section_renderer: ItemSectionRenderer,
+}
+
+/// Contents of an item section. `get_channel` only reads the first grid.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct ItemSectionRenderer {
+    contents: Vec<GridRendererWrap>,
+}
+
+/// Wrapper object around the video grid (JSON key `gridRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct GridRendererWrap {
+    grid_renderer: GridRenderer,
+}
+
+/// Grid holding the video items of the channel.
+#[serde_as]
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct GridRenderer {
+    /// Items that do not match `VideoListItem` are skipped (`VecSkipError`).
+    #[serde_as(as = "VecSkipError<_>")]
+    items: Vec<VideoListItem>,
+}
+
+/// Wrapper object around a video item (JSON key `gridVideoRenderer`).
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct VideoListItem {
+    grid_video_renderer: GridVideoRenderer,
+}
+
+/// View count texts of a single video, in exact and in approximated form.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct GridVideoRenderer {
+    /// `24,194 views`
+    view_count_text: Text,
+    /// `19K views`
+    short_view_count_text: Text,
+}
+
+/// Data extracted from a channel videos response by `get_channel`.
+#[derive(Clone, Debug)]
+struct ChannelData {
+    /// One entry per video: (exact view count, short view count text)
+    view_counts: Vec<(u64, String)>,
+}
+
+/// Request the video list of the channel `channel_id` with the interface
+/// language set to `lang` and extract the view counts of the listed videos.
+///
+/// Returns an empty list if the response contains no tab, section or grid.
+///
+/// # Errors
+///
+/// Returns an error if the request fails, the server answers with an error
+/// status, the response does not match the expected structure or an exact
+/// view count text contains no parseable number.
+async fn get_channel(channel_id: &str, lang: Language) -> Result<ChannelData> {
+    // One client (and connection pool) is shared by all requests instead of
+    // setting up a new one for every channel of every language.
+    static CLIENT: Lazy<Client> = Lazy::new(Client::new);
+
+    let body = format!(
+        "{}{}{}{}{}",
+        r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":""##,
+        lang,
+        r##"","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}},"params":"EgZ2aWRlb3MYASAAMAE%3D","browseId":""##,
+        channel_id,
+        "\"}"
+    );
+
+    let resp = CLIENT
+        .post("https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
+        .header(header::CONTENT_TYPE, "application/json")
+        .body(body)
+        .send().await?
+        .error_for_status()?;
+
+    let channel = resp.json::<Channel>().await?;
+
+    // Walk down to the video grid of the first tab without indexing, so that
+    // an unexpectedly empty list yields no samples instead of a panic.
+    let items = channel
+        .contents
+        .two_column_browse_results_renderer
+        .tabs
+        .first()
+        .and_then(|tab| tab.tab_renderer.content.section_list_renderer.contents.first())
+        .and_then(|section| section.item_section_renderer.contents.first())
+        .map(|grid| grid.grid_renderer.items.as_slice())
+        .unwrap_or_default();
+
+    let view_counts = items
+        .iter()
+        .map(|itm| -> Result<(u64, String)> {
+            let video = &itm.grid_video_renderer;
+            let exact_txt = &video.view_count_text.simple_text;
+            let exact = util::parse_numeric::<u64>(exact_txt)
+                .with_context(|| format!("could not parse view count `{}`", exact_txt))?;
+
+            Ok((exact, video.short_view_count_text.simple_text.to_owned()))
+        })
+        .collect::<Result<Vec<_>>>()?;
+
+    Ok(ChannelData { view_counts })
+}
+
+/// Manual smoke test: fetch one channel with `Language::Az` and print the
+/// extracted view counts.
+///
+/// Ignored by default because it needs network access
+/// (run it with `cargo test -- --ignored --nocapture`).
+#[tokio::test]
+#[ignore = "requires network access to youtube.com"]
+async fn test() {
+    let channel = get_channel("UCcdwLMPsaU2ezNSJU1nFoBQ", Language::Az)
+        .await
+        .unwrap();
+
+    assert!(
+        !channel.view_counts.is_empty(),
+        "no view counts were extracted"
+    );
+    dbg!(channel);
+}
+
+/// Manual run of `write_samples_to_dict` against this checkout.
+///
+/// Ignored by default because it rewrites the dictionary file.
+#[test]
+#[ignore = "rewrites testfiles/dict/dictionary.json"]
+fn test2() {
+    // NOTE(review): assumes the codegen crate (and its Cargo.toml) is located
+    // directly inside the project root - confirm.
+    let project_root = Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("codegen crate has no parent directory");
+
+    write_samples_to_dict(project_root);
+}
--- a/codegen/src/collect_playlist_dates.rs
+++ b/codegen/src/collect_playlist_dates.rs
@ -38,7 +38,7 @@ enum DateCase {
 }

 /// Collect 'Playlist updated' dates in every supported language
-/// and write them to `testfiles/date/playlist_samples.json`.
+/// and write them to `testfiles/dict/playlist_samples.json`.
 ///
 /// YouTube's API outputs the update date of playlists only in a
 /// textual format (e.g. *Last updated on Jan 3, 2020*), which varies
@ -55,13 +55,15 @@ enum DateCase {
 /// - one playlist updated yesterday
 /// - one playlist updated 2-7 days ago
 /// - one playlist from every month. Note that there should not
-/// be any dates which include the same number twice (e.g. 01.01.2020).
+///   be any dates which include the same number twice (e.g. 01.01.2020).
+///
+/// **IMPORTANT:**
 ///
 /// Because the relative dates change with time, the first three playlists
-/// should be checked and eventually changed before running the program.
+/// have to be checked and, if necessary, changed before running the program.
 pub async fn collect_dates(project_root: &Path, concurrency: usize) {
    let mut json_path = project_root.to_path_buf();
-    json_path.push("testfiles/date/playlist_samples.json");
+    json_path.push("testfiles/dict/playlist_samples.json");

    // These are the sample playlists
    let cases = [
@ -115,7 +117,7 @@ pub async fn collect_dates(project_root: &Path, concurrency: usize) {
 /// parsed automatically and require manual work.
 pub fn write_samples_to_dict(project_root: &Path) {
    let mut json_path = project_root.to_path_buf();
-    json_path.push("testfiles/date/playlist_samples.json");
+    json_path.push("testfiles/dict/playlist_samples.json");

    let json_file = File::open(json_path).unwrap();
    let collected_dates: CollectedDates =
--- a/codegen/src/download_testfiles.rs
+++ b/codegen/src/download_testfiles.rs
@ -146,7 +146,7 @@ async fn video_details(testfiles: &Path) {
 async fn comments_top(testfiles: &Path) {
    let mut json_path = testfiles.to_path_buf();
    json_path.push("video_details");
-    json_path.push(format!("comments_top.json"));
+    json_path.push("comments_top.json");
    if json_path.exists() {
        return;
    }
--- a/codegen/src/gen_dictionary.rs
+++ b/codegen/src/gen_dictionary.rs
@ -34,17 +34,47 @@ pub fn generate_dictionary(project_root: &Path) {
    let dict = util::read_dict(project_root);

    let code_head = r#"// This file is automatically generated. DO NOT EDIT.
+// See codegen/gen_dictionary.rs for the generation code.
 use crate::{
    model::Language,
    timeago::{DateCmp, TaToken, TimeUnit},
 };

+/// The dictionary contains the information required to parse dates and numbers
+/// in all supported languages.
 pub struct Entry {
+    /// Should the language be parsed by character instead of by word?
+    /// (e.g. Chinese/Japanese)
    pub by_char: bool,
+    /// Tokens for parsing timeago strings.
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
+    ///
+    /// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
+    /// `h`(our), `m`(inute), `s`(econd)
    pub timeago_tokens: phf::Map<&'static str, TaToken>,
+    /// Order in which to parse numeric date components. Formatted as
+    /// a string of date identifiers (Y, M, D).
+    ///
+    /// Examples:
+    ///
+    /// - 03.01.2020 => `"DMY"`
+    /// - Jan 3, 2020 => `"DY"`
    pub date_order: &'static [DateCmp],
+    /// Tokens for parsing month names.
+    ///
+    /// Format: Parsed token -> Month number (starting from 1)
    pub months: phf::Map<&'static str, u8>,
+    /// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
    pub timeago_nd_tokens: phf::Map<&'static str, TaToken>,
+    /// Are commas (instead of points) used as decimal separators?
+    pub comma_decimal: bool,
+    /// Tokens for parsing decimal prefixes (K, M, B, ...)
+    ///
+    /// Format: Parsed token -> decimal power
+    pub number_tokens: phf::Map<&'static str, u8>,
 }
 "#;

@ -100,12 +130,19 @@ pub fn entry(lang: Language) -> Entry {
        });
        date_order = date_order.trim_end_matches([' ', ',']).to_owned() + "]";

+        // Number tokens
+        let mut number_tokens = phf_codegen::Map::<&str>::new();
+        entry.number_tokens.iter().for_each(|(txt, mag)| {
+            number_tokens.entry(txt, &mag.to_string());
+        });
+
        let code_ta_tokens = &ta_tokens.build().to_string().replace('\n', "\n            ");
        let code_ta_nd_tokens = &ta_nd_tokens.build().to_string().replace('\n', "\n            ");
        let code_months = &months.build().to_string().replace('\n', "\n            ");
+        let code_number_tokens = &number_tokens.build().to_string().replace('\n', "\n            ");

-        let _ = write!(code_timeago_tokens, "{} => Entry {{\n            by_char: {:?},\n            timeago_tokens: {},\n            date_order: {},\n            months: {},\n            timeago_nd_tokens: {},\n        }},\n        ",
-        selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens);
+        let _ = write!(code_timeago_tokens, "{} => Entry {{\n            by_char: {:?},\n            timeago_tokens: {},\n            date_order: {},\n            months: {},\n            timeago_nd_tokens: {},\n            comma_decimal: {:?},\n            number_tokens: {},\n        }},\n        ",
+        selector, entry.by_char, code_ta_tokens, date_order, code_months, code_ta_nd_tokens, entry.comma_decimal, code_number_tokens);
    });

    code_timeago_tokens = code_timeago_tokens.trim_end().to_owned() + "\n    }\n}\n";
--- a/codegen/src/gen_locales.rs
+++ b/codegen/src/gen_locales.rs
@ -8,6 +8,8 @@ use serde::Deserialize;
 use serde_with::serde_as;
 use serde_with::VecSkipError;

+use crate::util::Text;
+
 #[serde_as]
 #[derive(Clone, Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
@ -135,12 +137,6 @@ struct LanguageCountryCommand {
    hl: String,
 }

-#[derive(Clone, Debug, Deserialize)]
-#[serde(rename_all = "camelCase")]
-struct Text {
-    simple_text: String,
-}
-
 pub async fn generate_locales(project_root: &Path) {
    let (languages, countries) = get_locales().await;

@ -284,7 +280,7 @@ pub enum Country {
 async fn get_locales() -> (BTreeMap<String, String>, BTreeMap<String, String>) {
    let client = Client::new();
    let resp = client
-        .post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8")
+        .post("https://www.youtube.com/youtubei/v1/account/account_menu?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false")
        .header(header::CONTENT_TYPE, "application/json")
        .body(
            r##"{"context":{"client":{"clientName":"WEB","clientVersion":"2.20220914.06.00","platform":"DESKTOP","originalUrl":"https://www.youtube.com/","hl":"en","gl":"US"},"request":{"internalExperimentFlags":[],"useSsl":true},"user":{"lockedSafetyMode":false}}}"##
--- a/codegen/src/main.rs
+++ b/codegen/src/main.rs
@ -1,3 +1,4 @@
+mod collect_large_numbers;
 mod collect_playlist_dates;
 mod download_testfiles;
 mod gen_dictionary;
@ -21,7 +22,9 @@ struct Cli {
 #[derive(Subcommand)]
 enum Commands {
    CollectPlaylistDates,
-    WritePlaylistDates,
+    CollectLargeNumbers,
+    ParsePlaylistDates,
+    ParseLargeNumbers,
    GenLocales,
    GenDict,
    DownloadTestfiles,
@ -36,8 +39,14 @@ async fn main() {
        Commands::CollectPlaylistDates => {
            collect_playlist_dates::collect_dates(&cli.project_root, cli.concurrency).await;
        }
-        Commands::WritePlaylistDates => {
-            collect_playlist_dates::write_samples_to_dict(&cli.project_root);
+        Commands::CollectLargeNumbers => {
+            collect_large_numbers::collect_large_numbers(&cli.project_root, cli.concurrency).await;
+        }
+        Commands::ParsePlaylistDates => {
+            collect_playlist_dates::write_samples_to_dict(&cli.project_root)
+        }
+        Commands::ParseLargeNumbers => {
+            collect_large_numbers::write_samples_to_dict(&cli.project_root)
        }
        Commands::GenLocales => {
            gen_locales::generate_locales(&cli.project_root).await;
--- a/codegen/src/util.rs
+++ b/codegen/src/util.rs
@ -3,19 +3,53 @@ use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path, str::FromS
 use rustypipe::model::Language;
 use serde::{Deserialize, Serialize};

-const DICT_PATH: &str = "testfiles/date/dictionary.json";
+const DICT_PATH: &str = "testfiles/dict/dictionary.json";

 type Dictionary = BTreeMap<Language, DictEntry>;

 #[derive(Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
 pub struct DictEntry {
+    /// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
    pub equivalent: Vec<Language>,
+    /// Should the language be parsed by character instead of by word?
+    /// (e.g. Chinese/Japanese)
    pub by_char: bool,
+    /// Tokens for parsing timeago strings.
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
+    ///
+    /// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
+    /// `h`(our), `m`(inute), `s`(econd)
    pub timeago_tokens: BTreeMap<String, String>,
+    /// Order in which to parse numeric date components. Formatted as
+    /// a string of date identifiers (Y, M, D).
+    ///
+    /// Examples:
+    ///
+    /// - 03.01.2020 => `"DMY"`
+    /// - Jan 3, 2020 => `"DY"`
    pub date_order: String,
+    /// Tokens for parsing month names.
+    ///
+    /// Format: Parsed token -> Month number (starting from 1)
    pub months: BTreeMap<String, u8>,
+    /// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
    pub timeago_nd_tokens: BTreeMap<String, String>,
+    /// Are commas (instead of points) used as decimal separators?
+    pub comma_decimal: bool,
+    /// Tokens for parsing decimal prefixes (K, M, B, ...)
+    ///
+    /// Format: Parsed token -> decimal power
+    pub number_tokens: BTreeMap<String, u8>,
+}
+
+/// Text object from a YouTube response; the text is read from the
+/// `simpleText` key.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Text {
+    pub simple_text: String,
+}

 pub fn read_dict(project_root: &Path) -> Dictionary {
@ -48,6 +82,27 @@ pub fn filter_datestr(string: &str) -> String {
        .collect()
 }

+/// Remove all ASCII digits, points, commas and zero-width spaces (U+200B)
+/// from a number text, leaving only its textual part.
+pub fn filter_largenumstr(string: &str) -> String {
+    let mut kept = String::new();
+    for ch in string.chars() {
+        let is_numeric_part = ch.is_ascii_digit() || matches!(ch, '\u{200b}' | '.' | ',');
+        if !is_numeric_part {
+            kept.push(ch);
+        }
+    }
+    kept
+}
+
+/// Concatenate all ASCII digits of `string` and parse them as `F`.
+///
+/// Every other character is ignored. The parse error of `F` is returned
+/// if the collected digits are not a valid value.
+pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
+where
+    F: FromStr,
+{
+    string
+        .chars()
+        .filter(char::is_ascii_digit)
+        .collect::<String>()
+        .parse()
+}
+
 /// Parse all numbers occurring in a string and return them as a vec
 pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
 where
--- a/notes/video_ids.txt
+++ b/notes/video_ids.txt
@ -59,6 +59,6 @@ Dec PL1J-6JOckZtHo91uApeb10Qlf2XhkfM-9 24.12.2021
 10e6: 1.7M UC6mIxFTvXkWQVEHPsEdflzQ
 10e5: 125K UCD0y51PJfvkZNe3y3FR5riw
 10e4: 27K UCNcN0dW43zE0Om3278fjY8A
-10e3: 5K UCNcN0dW43zE0Om3278fjY8A
+10e3: 5K UC0QEucPrn0-Ddi3JBTcs5Kw
 10e2: 388 UCllyEQfcoiPN68zHv6mGHDQ
-10e1: 37 UCNcN0dW43zE0Om3278fjY8A
+10e1: 37 UCGiJh0NZ52wRhYKYnuZI08Q
--- a/src/client/channel.rs
+++ b/src/client/channel.rs
@ -72,7 +72,6 @@ impl MapResponse<ChannelVideos> for response::Channel {
            c: ChannelVideos {
                id: header.channel_id,
                name: header.title,
-                subscriber_count_txt: header.subscriber_count_text,
            },
            warnings,
        })
--- a/src/client/response/channel.rs
+++ b/src/client/response/channel.rs
@ -82,12 +82,14 @@ pub struct HeaderRenderer {
    pub channel_id: String,
    /// Channel name
    pub title: String,
-    /// Approximate subscriber count (e.g. `880K subscribers`), depends on language
-    #[serde_as(as = "Text")]
-    pub subscriber_count_text: String,
+    /// Approximate subscriber count (e.g. `880K subscribers`), depends on language.
+    ///
+    /// `None` if the subscriber count is hidden.
+    #[serde_as(as = "Option<Text>")]
+    pub subscriber_count_text: Option<String>,
    pub avatar: Thumbnails,
-    #[serde_as(as = "VecSkipError<_>")]
-    pub badges: Vec<ChannelBadge>,
+    #[serde_as(as = "Option<VecSkipError<_>>")]
+    pub badges: Option<Vec<ChannelBadge>>,
    pub banner: Thumbnails,
    pub mobile_banner: Thumbnails,
    /// Fullscreen (16:9) channel banner
--- a/src/client/response/mod.rs
+++ b/src/client/response/mod.rs
@ -93,6 +93,7 @@ pub struct GridVideoRenderer {
    pub published_time_text: Option<String>,
    #[serde_as(as = "Option<Text>")]
    pub view_count_text: Option<String>,
+    /// Contains video length
    #[serde_as(as = "VecSkipError<_>")]
    pub thumbnail_overlays: Vec<TimeOverlay>,
 }
@ -397,6 +398,10 @@ pub trait IsLive {
    fn is_live(&self) -> bool;
 }

+/// Check whether an item is marked as a YouTube Short.
+pub trait IsShort {
+    /// Returns `true` if the item is marked as a Short.
+    fn is_short(&self) -> bool;
+}
+
 impl IsLive for Vec<VideoBadge> {
    fn is_live(&self) -> bool {
        self.iter().any(|badge| {
@ -404,3 +409,19 @@ impl IsLive for Vec<VideoBadge> {
        })
    }
 }
+
+impl IsLive for Vec<TimeOverlay> {
+    /// Returns `true` if at least one overlay has the `Live` style.
+    fn is_live(&self) -> bool {
+        self.iter()
+            .map(|overlay| &overlay.thumbnail_overlay_time_status_renderer.style)
+            .any(|style| *style == TimeOverlayStyle::Live)
+    }
+}
+
+impl IsShort for Vec<TimeOverlay> {
+    /// Returns `true` if at least one overlay has the `Shorts` style.
+    fn is_short(&self) -> bool {
+        self.iter()
+            .map(|overlay| &overlay.thumbnail_overlay_time_status_renderer.style)
+            .any(|style| *style == TimeOverlayStyle::Shorts)
+    }
+}
--- a/src/dictionary.rs
+++ b/src/dictionary.rs
--- a/src/model/mod.rs
+++ b/src/model/mod.rs
@ -310,7 +310,7 @@ pub struct RecommendedVideo {
    pub publish_date_txt: Option<String>,
    /// View count
    ///
-    /// Is `None` if it could not be parsed
+    /// `None` if it could not be extracted.
    pub view_count: Option<u64>,
    /// Is the video an active livestream?
    pub is_live: bool,
@ -400,6 +400,43 @@ pub struct ChannelVideos {
    pub id: String,
    /// Channel name
    pub name: String,
-    /// Textual subscriber count (e.g. `2.3M subscribers`), depends on language setting
-    pub subscriber_count_txt: String,
+    /*
+    /// Channel subscriber count
+    ///
+    /// `None` if the subscriber count was hidden by the owner
+    /// or could not be parsed.
+    pub subscriber_count: Option<u64>,
+    pub videos: Paginator<ChannelVideo>,
+    */
+}
+
+/// Video item of a channel.
+///
+/// NOTE(review): presumably the item type of the currently commented-out
+/// `videos` paginator of `ChannelVideos` - confirm once that field is enabled.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[non_exhaustive]
+pub struct ChannelVideo {
+    /// Unique YouTube video ID
+    pub id: String,
+    /// Video title
+    pub title: String,
+    /// Video length in seconds.
+    ///
+    /// Is `None` for livestreams.
+    pub length: Option<u32>,
+    /// Video thumbnail
+    pub thumbnail: Vec<Thumbnail>,
+    /// Video publishing date.
+    ///
+    /// `None` if the date could not be parsed.
+    pub publish_date: Option<DateTime<Local>>,
+    /// Textual video publish date (e.g. `11 months ago`, depends on language)
+    ///
+    /// Is `None` for livestreams.
+    pub publish_date_txt: Option<String>,
+    /// View count
+    ///
+    /// `None` if it could not be extracted.
+    pub view_count: Option<u64>,
+    /// Is the video an active livestream?
+    pub is_live: bool,
+    /// Is the video a YouTube Short video (vertical and <60s)?
+    pub is_short: bool,
+}
--- a/src/model/richtext.rs
+++ b/src/model/richtext.rs
@ -53,7 +53,7 @@ pub trait ToHtml {
 }

 impl TextComponent {
-    pub fn get_text<'a>(&'a self) -> &'a str {
+    pub fn get_text(&self) -> &str {
        match self {
            TextComponent::Text(text) => text,
            TextComponent::Web { text, .. } => text,
--- a/src/timeago.rs
+++ b/src/timeago.rs
@ -247,7 +247,7 @@ mod tests {

    #[test]
    fn t_testfile() {
-        let json_path = Path::new("testfiles/date/timeago_samples.json");
+        let json_path = Path::new("testfiles/dict/timeago_samples.json");

        let expect = [
            TimeAgo {
@ -430,7 +430,7 @@ mod tests {
            cases: BTreeMap<String, u8>,
        }

-        let json_path = Path::new("testfiles/date/timeago_table.json");
+        let json_path = Path::new("testfiles/dict/timeago_table.json");
        let json_file = File::open(json_path).unwrap();
        let timeago_table: TimeagoTable =
            serde_json::from_reader(BufReader::new(json_file)).unwrap();
@ -477,7 +477,7 @@ mod tests {

    #[test]
    fn t_parse_date_samples() {
-        let json_path = Path::new("testfiles/date/playlist_samples.json");
+        let json_path = Path::new("testfiles/dict/playlist_samples.json");
        let json_file = File::open(json_path).unwrap();
        let date_samples: BTreeMap<Language, BTreeMap<String, String>> =
            serde_json::from_reader(BufReader::new(json_file)).unwrap();
--- a/testfiles/dict/cldr_pluralrules_cardinals.json
+++ b/testfiles/dict/cldr_pluralrules_cardinals.json
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
@ -35,6 +35,11 @@
    "timeago_nd_tokens": {
      "gister": "1D",
      "vandag": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "m": 6,
+      "mjd": 9
    }
  },
  "am": {
@ -74,6 +79,12 @@
    "timeago_nd_tokens": {
      "ትላንት": "1D",
      "ዛሬ": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ሚ": 6,
+      "ሺ": 3,
+      "ቢ": 9
    }
  },
  "ar": {
@ -110,6 +121,12 @@
    "timeago_nd_tokens": {
      "اليوم": "0D",
      "بالأمس": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ألف": 3,
+      "مليار": 9,
+      "مليون": 6
    }
  },
  "as": {
@ -129,6 +146,15 @@
    "timeago_nd_tokens": {
      "আজি": "0D",
      "কালি": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "কোঃটা": 9,
+      "নিঃটা": 6,
+      "নিযুতটা": 6,
+      "লাখটা": 5,
+      "শঃ": 9,
+      "হাজাৰটা": 3
    }
  },
  "az": {
@ -161,6 +187,11 @@
    "timeago_nd_tokens": {
      "bugün": "0D",
      "dünən": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mln": 6,
+      "mlrd": 9
    }
  },
  "be": {
@ -210,6 +241,12 @@
    "timeago_nd_tokens": {
      "сёння": "0D",
      "ўчора": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "млн": 6,
+      "млрд": 9,
+      "тыс": 3
    }
  },
  "bg": {
@ -236,6 +273,12 @@
    "timeago_nd_tokens": {
      "вчера": "1D",
      "днес": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "млн": 6,
+      "млрд": 9,
+      "хил": 3
    }
  },
  "bn": {
@ -268,6 +311,12 @@
    "timeago_nd_tokens": {
      "আজ": "0D",
      "গতকাল": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "লাটি": 5,
+      "শত": 9,
+      "হাটি": 3
    }
  },
  "bs": {
@ -312,6 +361,12 @@
    "timeago_nd_tokens": {
      "danas": "0D",
      "jučer": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "hilj": 3,
+      "mil": 6,
+      "mlr": 9
    }
  },
  "ca": {
@ -351,6 +406,11 @@
    "timeago_nd_tokens": {
      "ahir": "1D",
      "avui": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "m": 3,
+      "mM": 9
    }
  },
  "cs": {
@ -378,6 +438,12 @@
    "timeago_nd_tokens": {
      "dnes": "0D",
      "včera": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mil": 6,
+      "mld": 9,
+      "tis": 3
    }
  },
  "da": {
@ -416,6 +482,11 @@
    "timeago_nd_tokens": {
      "dag": "0D",
      "går": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mia": 9,
+      "mio": 6
    }
  },
  "de": {
@ -442,6 +513,11 @@
    "timeago_nd_tokens": {
      "gestern": "1D",
      "heute": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "Mio": 6,
+      "Mrd": 9
    }
  },
  "el": {
@ -481,6 +557,12 @@
    "timeago_nd_tokens": {
      "σήμερα": "0D",
      "χτες": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "δισ": 9,
+      "εκ": 6,
+      "χιλ": 3
    }
  },
  "en": {
@ -524,11 +606,59 @@
    "timeago_nd_tokens": {
      "today": "0D",
      "yesterday": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9,
+      "crore": 7,
+      "lakh": 5
    }
  },
  "es": {
+    "equivalent": [],
+    "by_char": false,
+    "timeago_tokens": {
+      "año": "Y",
+      "años": "Y",
+      "día": "D",
+      "días": "D",
+      "hora": "h",
+      "horas": "h",
+      "mes": "M",
+      "meses": "M",
+      "minuto": "m",
+      "minutos": "m",
+      "segundo": "s",
+      "segundos": "s",
+      "semana": "W",
+      "semanas": "W"
+    },
+    "date_order": "DY",
+    "months": {
+      "abr": 4,
+      "ago": 8,
+      "dic": 12,
+      "ene": 1,
+      "feb": 2,
+      "jul": 7,
+      "jun": 6,
+      "mar": 3,
+      "may": 5,
+      "nov": 11,
+      "oct": 10,
+      "sept": 9
+    },
+    "timeago_nd_tokens": {
+      "ayer": "1D",
+      "hoy": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mil": 9
+    }
+  },
+  "es-US": {
    "equivalent": [
-      "es-US",
      "es-419"
    ],
    "by_char": false,
@ -566,6 +696,10 @@
    "timeago_nd_tokens": {
      "ayer": "1D",
      "hoy": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "mil": 9
    }
  },
  "et": {
@ -607,6 +741,12 @@
    "timeago_nd_tokens": {
      "eile": "1D",
      "täna": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mld": 9,
+      "mln": 6,
+      "tuh": 3
    }
  },
  "eu": {
@ -642,7 +782,9 @@
    "timeago_nd_tokens": {
      "atzo": "1D",
      "gaur": "0D"
-    }
+    },
+    "comma_decimal": true,
+    "number_tokens": {}
  },
  "fa": {
    "equivalent": [],
@ -674,6 +816,12 @@
    "timeago_nd_tokens": {
      "امروز": "0D",
      "دیروز": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "میلیارد": 9,
+      "میلیون": 6,
+      "هزار": 3
    }
  },
  "fi": {
@ -700,6 +848,12 @@
    "timeago_nd_tokens": {
      "eilen": "1D",
      "tänään": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "milj": 6,
+      "mrd": 9,
+      "t": 3
    }
  },
  "fil": {
@ -732,6 +886,10 @@
    "timeago_nd_tokens": {
      "kahapon": "1D",
      "ngayong": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9
    }
  },
  "fr": {
@ -773,6 +931,11 @@
    "timeago_nd_tokens": {
      "aujourd'hui": "0D",
      "hier": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "G": 9,
+      "Md": 9
    }
  },
  "gl": {
@ -812,7 +975,9 @@
    "timeago_nd_tokens": {
      "hoxe": "0D",
      "onte": "1D"
-    }
+    },
+    "comma_decimal": true,
+    "number_tokens": {}
  },
  "gu": {
    "equivalent": [],
@ -844,6 +1009,13 @@
    "timeago_nd_tokens": {
      "આજે": "0D",
      "ગઈ": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "અબજ": 9,
+      "કરોડ": 7,
+      "લાખ": 5,
+      "હજાર": 3
    }
  },
  "hi": {
@ -876,6 +1048,13 @@
    "timeago_nd_tokens": {
      "आज": "0D",
      "कल": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "अ॰": 9,
+      "क॰": 7,
+      "लाख": 5,
+      "हज़ार": 3
    }
  },
  "hr": {
@ -920,6 +1099,12 @@
    "timeago_nd_tokens": {
      "danas": "0D",
      "jučer": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mil": 6,
+      "mlr": 9,
+      "tis": 3
    }
  },
  "hu": {
@ -959,6 +1144,11 @@
    "timeago_nd_tokens": {
      "ma": "0D",
      "tegnap": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "E": 3,
+      "Mrd": 9
    }
  },
  "hy": {
@ -991,6 +1181,12 @@
    "timeago_nd_tokens": {
      "այսօր": "0D",
      "երեկ": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "հզր": 3,
+      "մլն": 6,
+      "մլրդ": 9
    }
  },
  "id": {
@ -1023,6 +1219,11 @@
    "timeago_nd_tokens": {
      "ini": "0D",
      "kemarin": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "jt": 6,
+      "rb": 3
    }
  },
  "is": {
@ -1062,6 +1263,12 @@
    "timeago_nd_tokens": {
      "dag": "0D",
      "gær": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "m": 6,
+      "ma": 9,
+      "þ": 3
    }
  },
  "it": {
@ -1101,6 +1308,11 @@
    "timeago_nd_tokens": {
      "ieri": "1D",
      "oggi": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "Mln": 6,
+      "Mrd": 9
    }
  },
  "iw": {
@ -1146,6 +1358,12 @@
    "timeago_nd_tokens": {
      "אתמול": "1D",
      "היום": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "‫B‏‬": 9,
+      "‫K‏‬": 3,
+      "‫M‏‬": 6
    }
  },
  "ja": {
@ -1165,6 +1383,11 @@
    "timeago_nd_tokens": {
      "日": "1D",
      "本": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "万": 4,
+      "億": 8
    }
  },
  "ka": {
@ -1197,6 +1420,12 @@
    "timeago_nd_tokens": {
      "გუშინ": "1D",
      "დღეს": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "ათ": 3,
+      "მლნ": 6,
+      "მლრდ": 9
    }
  },
  "kk": {
@ -1229,6 +1458,13 @@
    "timeago_nd_tokens": {
      "бүгін": "0D",
      "кеше": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "м": 3,
+      "млн": 6,
+      "млрд": 9,
+      "мың": 3
    }
  },
  "km": {
@ -1261,6 +1497,12 @@
    "timeago_nd_tokens": {
      "បានធ្វើបច្ចុប្បន្នភាពថ្ងៃនេះ": "0D",
      "បានធ្វើបច្ចុប្បន្នភាពម្សិលមិញ": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "ប៊ីលាន": 9,
+      "ពាន់": 3,
+      "លាន": 6
    }
  },
  "kn": {
@ -1300,6 +1542,11 @@
    "timeago_nd_tokens": {
      "ಇಂದು": "0D",
      "ನಿನ್ನೆ": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ಕೋಟಿ": 7,
+      "ಲಕ್ಷ": 5
    }
  },
  "ko": {
@ -1318,6 +1565,12 @@
    "months": {},
    "timeago_nd_tokens": {
      "오늘": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "만회": 4,
+      "억회": 8,
+      "천회": 3
    }
  },
  "ky": {
@ -1350,6 +1603,12 @@
    "timeago_nd_tokens": {
      "бүгүн": "0D",
      "кечээ": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "миң": 3,
+      "млд": 9,
+      "млн": 6
    }
  },
  "lo": {
@ -1382,6 +1641,13 @@
    "timeago_nd_tokens": {
      "ອັບເດດມື້ນີ້": "0D",
      "ອັບເດດມື້ວານນີ້": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "ກີບ": 3,
+      "ຕື້": 9,
+      "ພັນ": 3,
+      "ລ້ານ": 6
    }
  },
  "lt": {
@ -1415,6 +1681,12 @@
    "timeago_nd_tokens": {
      "vakar": "1D",
      "šiandien": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mln": 6,
+      "mlrd": 9,
+      "tūkst": 3
    }
  },
  "lv": {
@ -1454,6 +1726,12 @@
    "timeago_nd_tokens": {
      "vakar": "1D",
      "šodien": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "milj": 6,
+      "mljrd": 9,
+      "tūkst": 3
    }
  },
  "mk": {
@ -1480,6 +1758,13 @@
    "timeago_nd_tokens": {
      "вчера": "1D",
      "денес": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "М": 6,
+      "илј": 3,
+      "мил": 6,
+      "милј": 9
    }
  },
  "ml": {
@ -1512,6 +1797,11 @@
    "timeago_nd_tokens": {
      "ഇന്നലെ": "1D",
      "ഇന്ന്": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "കോടി": 7,
+      "ലക്ഷം": 5
    }
  },
  "mn": {
@ -1531,6 +1821,12 @@
    "timeago_nd_tokens": {
      "өнөөдөр": "0D",
      "өчигдөр": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "мянга": 3,
+      "сая": 6,
+      "тэрбум": 9
    }
  },
  "mr": {
@ -1570,6 +1866,13 @@
    "timeago_nd_tokens": {
      "आज": "0D",
      "काल": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "अब्ज": 9,
+      "कोटी": 7,
+      "लाख": 5,
+      "ह": 3
    }
  },
  "ms": {
@ -1602,6 +1905,11 @@
    "timeago_nd_tokens": {
      "ini": "0D",
      "semalam": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9,
+      "J": 6
    }
  },
  "my": {
@ -1635,6 +1943,15 @@
    "timeago_nd_tokens": {
      "မနေ့က": "1D",
      "ယနေ့": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ကုဋေ": 7,
+      "ကုဋေထ": 10,
+      "ထောင်": 3,
+      "သန်း": 6,
+      "သိန်း": 5,
+      "သောင်း": 4
    }
  },
  "ne": {
@ -1667,6 +1984,13 @@
    "timeago_nd_tokens": {
      "आज": "0D",
      "हिजो": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "अरब": 9,
+      "करोड": 7,
+      "लाख": 5,
+      "हजार": 3
    }
  },
  "nl": {
@ -1704,6 +2028,11 @@
    "timeago_nd_tokens": {
      "gisteren": "1D",
      "vandaag": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mld": 9,
+      "mln": 6
    }
  },
  "no": {
@ -1743,6 +2072,11 @@
    "timeago_nd_tokens": {
      "dag": "0D",
      "går": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mill": 6,
+      "mrd": 9
    }
  },
  "or": {
@ -1775,6 +2109,12 @@
    "timeago_nd_tokens": {
      "ଆଜି": "0D",
      "ଗତକାଲି": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ନିଟି": 6,
+      "ବିଟି": 9,
+      "ହଟି": 3
    }
  },
  "pa": {
@ -1810,6 +2150,13 @@
    "timeago_nd_tokens": {
      "ਅੱਜ": "0D",
      "ਬੀੇਤੇ": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ਅਰਬ": 9,
+      "ਕਰੋੜ": 7,
+      "ਲੱਖ": 5,
+      "ਹਜ਼ਾਰ": 3
    }
  },
  "pl": {
@ -1854,6 +2201,12 @@
    "timeago_nd_tokens": {
      "dzisiaj": "0D",
      "wczoraj": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mld": 9,
+      "mln": 6,
+      "tys": 3
    }
  },
  "pt": {
@ -1893,6 +2246,12 @@
    "timeago_nd_tokens": {
      "hoje": "0D",
      "ontem": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "bi": 9,
+      "mi": 6,
+      "mil": 3
    }
  },
  "pt-PT": {
@ -1919,6 +2278,11 @@
    "timeago_nd_tokens": {
      "hoje": "0D",
      "ontem": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mM": 9,
+      "mil": 3
    }
  },
  "ro": {
@ -1958,6 +2322,11 @@
    "timeago_nd_tokens": {
      "astăzi": "0D",
      "ieri": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mil": 6,
+      "mld": 9
    }
  },
  "ru": {
@ -2003,6 +2372,12 @@
    "timeago_nd_tokens": {
      "вчера": "1D",
      "сегодня": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "млн": 6,
+      "млрд": 9,
+      "тыс": 3
    }
  },
  "si": {
@ -2036,6 +2411,12 @@
      "අද": "0D",
      "ඊයෙ": "1D",
      "ඊයේ": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ද": 3,
+      "බි": 9,
+      "මි": 6
    }
  },
  "sk": {
@ -2062,6 +2443,12 @@
    "timeago_nd_tokens": {
      "dnes": "0D",
      "včera": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mil": 6,
+      "mld": 9,
+      "tis": 3
    }
  },
  "sl": {
@ -2109,6 +2496,12 @@
    "timeago_nd_tokens": {
      "danes": "0D",
      "včeraj": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mio": 6,
+      "mrd": 9,
+      "tis": 3
    }
  },
  "sq": {
@ -2144,6 +2537,12 @@
    "timeago_nd_tokens": {
      "dje": "1D",
      "sot": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "mijë": 3,
+      "mld": 9,
+      "mln": 6
    }
  },
  "sr": {
@ -2172,6 +2571,12 @@
    "timeago_nd_tokens": {
      "данас": "0D",
      "јуче": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "мил": 6,
+      "млрд": 9,
+      "хиљ": 3
    }
  },
  "sr-Latn": {
@ -2201,6 +2606,12 @@
    "timeago_nd_tokens": {
      "danas": "0D",
      "juče": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "hilj": 3,
+      "mil": 6,
+      "mlrd": 9
    }
  },
  "sv": {
@ -2239,6 +2650,11 @@
    "timeago_nd_tokens": {
      "idag": "0D",
      "igår": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "md": 9,
+      "mn": 6
    }
  },
  "sw": {
@ -2273,6 +2689,11 @@
    "timeago_nd_tokens": {
      "jana": "1D",
      "leo": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9,
+      "elfu": 3
    }
  },
  "ta": {
@ -2311,6 +2732,11 @@
    "timeago_nd_tokens": {
      "இன்று": "0D",
      "நேற்று": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "கோடி": 7,
+      "லட்சம்": 5
    }
  },
  "te": {
@ -2350,6 +2776,12 @@
    "timeago_nd_tokens": {
      "ఈ": "0D",
      "నిన్న": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "కోట్లు": 7,
+      "లక్ష": 5,
+      "లక్షలు": 5
    }
  },
  "th": {
@ -2382,6 +2814,15 @@
    "timeago_nd_tokens": {
      "อัปเดตแล้ววันนี้": "0D",
      "อัปเดตแล้วเมื่อวาน": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "พัน": 3,
+      "พันล้าน": 9,
+      "ล้าน": 6,
+      "หมื่น": 4,
+      "หมื่นล้าน": 10,
+      "แสน": 5
    }
  },
  "tr": {
@ -2414,6 +2855,12 @@
    "timeago_nd_tokens": {
      "bugün": "0D",
      "dün": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "B": 3,
+      "Mn": 6,
+      "Mr": 9
    }
  },
  "uk": {
@ -2459,6 +2906,12 @@
    "timeago_nd_tokens": {
      "вчора": "1D",
      "сьогодні": "0D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "млн": 6,
+      "млрд": 9,
+      "тис": 3
    }
  },
  "ur": {
@ -2497,6 +2950,13 @@
    "timeago_nd_tokens": {
      "آج": "0D",
      "کل": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "ارب": 9,
+      "لاکھ": 5,
+      "کروڑ": 7,
+      "ہزار": 3
    }
  },
  "uz": {
@ -2529,6 +2989,12 @@
    "timeago_nd_tokens": {
      "bugun": "0D",
      "kecha": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "ming": 3,
+      "mln": 6,
+      "mlrd": 9
    }
  },
  "vi": {
@ -2549,6 +3015,12 @@
    "timeago_nd_tokens": {
      "nay": "0D",
      "qua": "1D"
+    },
+    "comma_decimal": true,
+    "number_tokens": {
+      "N": 3,
+      "T": 9,
+      "Tr": 6
    }
  },
  "zh-CN": {
@ -2568,6 +3040,11 @@
    "timeago_nd_tokens": {
      "今": "0D",
      "日": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "万": 4,
+      "亿": 8
    }
  },
  "zh-HK": {
@ -2588,6 +3065,10 @@
    "timeago_nd_tokens": {
      "今": "0D",
      "天": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9
    }
  },
  "zh-TW": {
@ -2607,6 +3088,11 @@
    "timeago_nd_tokens": {
      "今": "0D",
      "天": "1D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "億": 8,
+      "萬": 4
    }
  },
  "zu": {
@ -2646,6 +3132,10 @@
    "timeago_nd_tokens": {
      "izolo": "1D",
      "namuhla": "0D"
+    },
+    "comma_decimal": false,
+    "number_tokens": {
+      "B": 9
    }
  }
 }
--- a/testfiles/dict/large_number_samples.json
+++ b/testfiles/dict/large_number_samples.json
--- a/testfiles/dict/playlist_samples.json
+++ b/testfiles/dict/playlist_samples.json
--- a/testfiles/dict/timeago_samples.json
+++ b/testfiles/dict/timeago_samples.json
--- a/testfiles/dict/timeago_table.json
+++ b/testfiles/dict/timeago_table.json