fix: improve number parsing, add number_nd_tokens

add dictionary overrides
2023-05-06 17:27:51 +02:00 · 2023-05-06 17:27:51 +02:00 · 19781eab36
commit 19781eab36
parent 97492780c6
13 changed files with 33097 additions and 35712 deletions
--- a/codegen/src/collect_album_types.rs
+++ b/codegen/src/collect_album_types.rs
@ -72,7 +72,7 @@ pub fn write_samples_to_dict(project_root: &Path) {
        });
    }

-    util::write_dict(project_root, &dict);
+    util::write_dict(project_root, dict);
 }

 #[derive(Debug, Deserialize)]
--- a/codegen/src/collect_large_numbers.rs
+++ b/codegen/src/collect_large_numbers.rs
@ -1,6 +1,10 @@
-use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
-use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path};
+use std::{
+    collections::{BTreeMap, HashMap, HashSet},
+    fs::File,
+    io::BufReader,
+    path::Path,
+};

 use anyhow::{Context, Result};
 use futures::{stream, StreamExt};
@ -9,26 +13,12 @@ use path_macro::path;
 use regex::Regex;
 use rustypipe::client::{ClientType, RustyPipe, RustyPipeQuery};
 use rustypipe::param::{locale::LANGUAGES, Language};
-use serde::{Deserialize, Serialize};
+use serde::Deserialize;
 use serde_with::{serde_as, DefaultOnError, VecSkipError};

-use crate::util::{self, QBrowse, QCont, Text};
+use crate::util::{self, QBrowse, QCont, Text, TextRuns};

-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
-#[serde(untagged)]
-enum NumKey {
-    Mag(u8),
-    S(NumKeyS),
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
-#[serde(rename_all = "lowercase")]
-enum NumKeyS {
-    Zero,
-    One,
-}
-
-type CollectedNumbers = BTreeMap<Language, BTreeMap<NumKey, (String, u64)>>;
+type CollectedNumbers = BTreeMap<Language, BTreeMap<String, u64>>;

 /// Collect video view count texts in every supported language
 /// and write them to `testfiles/dict/large_number_samples_all.json`.
@ -45,9 +35,7 @@ type CollectedNumbers = BTreeMap<Language, BTreeMap<NumKey, (String, u64)>>;
 /// outputs view counts both in approximated and exact format, so we can use
 /// the exact counts to figure out the tokens.
 pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
-    let json_path = path!(project_root / "testfiles" / "dict" / "large_number_samples.json");
-    let json_path_all =
-        path!(project_root / "testfiles" / "dict" / "large_number_samples_all.json");
+    let json_path = path!(project_root / "testfiles" / "dict" / "large_number_samples_all.json");
    let rp = RustyPipe::new();

    let channels = [
@ -64,6 +52,16 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
        "UCQXYK94vDqOEkPbTCyL0OjA", // (1)
    ];

+    // YTM outputs the subscriber count in a shortened format in some languages
+    let music_channels = [
+        "UC_1N84buVNgR_-3gDZ9Jtxg", // 10e8 (158M)
+        "UCRw0x9_EfawqmgDI2IgQLLg", // 10e7 (29M)
+        "UChWu2clmvJ5wN_0Ic5dnqmw", // 10e6 (1.9M)
+        "UCOYiPDuimprrGHgFy4_Fw8Q", // 10e5 (149K)
+        "UC8nZf9WyVIxNMly_hy2PTyQ", // 10e4 (17K)
+        "UCaltNL5XvZ7dKvBsBPi-gqg", // 10e3 (8K)
+    ];
+
    // Build a lookup table for the channel's subscriber counts
    let subscriber_counts: Arc<BTreeMap<String, u64>> = stream::iter(channels)
        .map(|c| {
@ -80,10 +78,26 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
        .await
        .into();

-    let collected_numbers_all: BTreeMap<Language, BTreeMap<String, u64>> = stream::iter(LANGUAGES)
+    let music_subscriber_counts: Arc<BTreeMap<String, u64>> = stream::iter(music_channels)
+        .map(|c| {
+            let rp = rp.query();
+            async move {
+                let subscriber_count = music_channel_subscribers(&rp, c).await.unwrap();
+
+                let n = util::parse_largenum_en(&subscriber_count).unwrap();
+                (c.to_owned(), n)
+            }
+        })
+        .buffer_unordered(concurrency)
+        .collect::<BTreeMap<_, _>>()
+        .await
+        .into();
+
+    let collected_numbers: CollectedNumbers = stream::iter(LANGUAGES)
        .map(|lang| {
            let rp = rp.query().lang(lang);
            let subscriber_counts = subscriber_counts.clone();
+            let music_subscriber_counts = music_subscriber_counts.clone();
            async move {
                let mut entry = BTreeMap::new();

@ -101,6 +115,15 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
                    println!("collected {lang}-{n}");
                }

+                for (n, ch_id) in music_channels.iter().enumerate() {
+                    let subscriber_count = music_channel_subscribers(&rp, ch_id)
+                        .await
+                        .context(format!("{lang}-music-{n}"))
+                        .unwrap();
+                    entry.insert(subscriber_count, music_subscriber_counts[*ch_id]);
+                    println!("collected {lang}-music-{n}");
+                }
+
                (lang, entry)
            }
        })
@ -108,61 +131,13 @@ pub async fn collect_large_numbers(project_root: &Path, concurrency: usize) {
        .collect()
        .await;

-    let collected_numbers: CollectedNumbers = collected_numbers_all
-        .iter()
-        .map(|(lang, entry)| {
-            let mut e2 = BTreeMap::new();
-            entry.iter().for_each(|(txt, num)| {
-                let key = if num == &0 {
-                    NumKey::S(NumKeyS::Zero)
-                } else if num == &1 {
-                    NumKey::S(NumKeyS::One)
-                } else {
-                    NumKey::Mag(get_mag(*num))
-                };
-
-                e2.insert(key, (txt.to_owned(), *num));
-            });
-            (*lang, e2)
-        })
-        .collect();
-
    let file = File::create(json_path).unwrap();
    serde_json::to_writer_pretty(file, &collected_numbers).unwrap();
-
-    let file = File::create(json_path_all).unwrap();
-    serde_json::to_writer_pretty(file, &collected_numbers_all).unwrap();
 }

 /// Attempt to parse the numbers collected by `collect-large-numbers`
 /// and write the results to `dictionary.json`.
 pub fn write_samples_to_dict(project_root: &Path) {
-    /*
-    Manual corrections:
-    as
-    "কোঃটা": 9,
-    "নিঃটা": 6,
-    "নিযুতটা": 6,
-    "লাখটা": 5,
-    "হাজাৰটা": 3
-
-    ar
-    "ألف": 3,
-    "آلاف": 3,
-    "مليار": 9,
-    "مليون": 6
-
-    bn
-    "লাটি": 5,
-    "শত": 2,
-    "হাটি": 3,
-    "কোটি": 7
-
-    es/es-US
-    "mil": 3,
-    "M": 6
-    */
-
    let json_path = path!(project_root / "testfiles" / "dict" / "large_number_samples.json");

    let json_file = File::open(json_path).unwrap();
@ -179,27 +154,21 @@ pub fn write_samples_to_dict(project_root: &Path) {
        let mut e_langs = dict_entry.equivalent.clone();
        e_langs.push(lang);

-        let comma_decimal = collected_nums
-            .get(&lang)
-            .unwrap()
+        let comma_decimal = collected_nums[&lang]
            .iter()
-            .find_map(|(key, (txt, _))| {
-                match key {
-                    NumKey::Mag(mag) => {
-                        let point = POINT_REGEX
-                            .captures(txt)
-                            .map(|c| c.get(1).unwrap().as_str());
+            .find_map(|(txt, val)| {
+                let point = POINT_REGEX
+                    .captures(txt)
+                    .map(|c| c.get(1).unwrap().as_str());

-                        if let Some(point) = point {
-                            let num_all = util::parse_numeric::<u64>(txt).unwrap();
-                            // If the number parsed from all digits has the same order of
-                            // magnitude as the actual number, it must be a separator.
-                            // Otherwise it is a decimal point
-                            return Some((get_mag(num_all) == *mag) ^ (point == ","));
-                        }
-                    }
-                    NumKey::S(_) => {}
+                if let Some(point) = point {
+                    let num_all = util::parse_numeric::<u64>(txt).unwrap();
+                    // If the number parsed from all digits has the same order of
+                    // magnitude as the actual number, the matched character must be
+                    // a grouping separator. Otherwise it is the decimal point.
+                    return Some((get_mag(num_all) == get_mag(*val)) ^ (point == ","));
                }
+
                None
            })
            .unwrap();
@ -217,6 +186,7 @@ pub fn write_samples_to_dict(project_root: &Path) {
        // If the token is found again with a different derived order of magnitude,
        // its value in the map is set to None.
        let mut found_tokens: HashMap<String, Option<u8>> = HashMap::new();
+        let mut found_nd_tokens: HashMap<String, Option<u8>> = HashMap::new();

        let mut insert_token = |token: String, mag: u8| {
            let found_token = found_tokens.entry(token).or_insert(match mag {
@ -231,22 +201,30 @@ pub fn write_samples_to_dict(project_root: &Path) {
            }
        };

+        let mut insert_nd_token = |token: String, n: Option<u8>| {
+            let found_token = found_nd_tokens.entry(token).or_insert(n);
+
+            if let Some(f) = found_token {
+                if Some(*f) != n {
+                    *found_token = None;
+                }
+            }
+        };
+
        for lang in e_langs {
            let entry = collected_nums.get(&lang).unwrap();

-            entry.iter().for_each(|(key, (txt, _))| {
-                match key {
-                    NumKey::Mag(mag) => {
-                        let filtered = util::filter_largenumstr(txt);
+            entry.iter().for_each(|(txt, val)| {
+                let filtered = util::filter_largenumstr(txt);
+                let mag = get_mag(*val);

-                        let tokens: Vec<String> = match dict_entry.by_char {
-                            true => filtered.chars().map(|c| c.to_string()).collect(),
-                            false => filtered.split_whitespace().map(|c| c.to_string()).collect(),
-                        };
+                let tokens: Vec<String> = match dict_entry.by_char || lang == Language::Ko {
+                    true => filtered.chars().map(|c| c.to_string()).collect(),
+                    false => filtered.split_whitespace().map(|c| c.to_string()).collect(),
+                };

-                        let num_before_point =
-                            util::parse_numeric::<u64>(txt.split(decimal_point).next().unwrap())
-                                .unwrap();
+                match util::parse_numeric::<u64>(txt.split(decimal_point).next().unwrap()) {
+                    Ok(num_before_point) => {
                        let mag_before_point = get_mag(num_before_point);
                        let mut mag_remaining = mag - mag_before_point;

@ -272,9 +250,22 @@ pub fn write_samples_to_dict(project_root: &Path) {
                            } else {
                                insert_token(t.to_owned(), mag_remaining);
                            }
+                            insert_nd_token(t.to_owned(), None);
                        });
                    }
-                    NumKey::S(_) => {}
+                    Err(e) => {
+                        if matches!(e.kind(), std::num::IntErrorKind::Empty) {
+                            // Text does not contain any digits, search for nd_tokens
+                            tokens.iter().for_each(|t| {
+                                insert_nd_token(
+                                    t.to_owned(),
+                                    Some((*val).try_into().expect("nd_token value too large")),
+                                );
+                            });
+                        } else {
+                            panic!("{e}, txt: {txt}")
+                        }
+                    }
                }
            });
        }
@ -284,6 +275,10 @@ pub fn write_samples_to_dict(project_root: &Path) {
            .into_iter()
            .filter_map(|(k, v)| v.map(|v| (k, v)))
            .collect();
+        dict_entry.number_nd_tokens = found_nd_tokens
+            .into_iter()
+            .filter_map(|(k, v)| v.map(|v| (k, v)))
+            .collect();
        dict_entry.comma_decimal = comma_decimal;

        // Check for duplicates
@ -291,9 +286,13 @@ pub fn write_samples_to_dict(project_root: &Path) {
        if !dict_entry.number_tokens.values().all(|x| uniq.insert(x)) {
            println!("Warning: collected duplicate tokens for {lang}");
        }
+        let mut uniq = HashSet::new();
+        if !dict_entry.number_nd_tokens.values().all(|x| uniq.insert(x)) {
+            println!("Warning: collected duplicate nd_tokens for {lang}");
+        }
    }

-    util::write_dict(project_root, &dict);
+    util::write_dict(project_root, dict);
 }

 fn get_mag(n: u64) -> u8 {
@ -304,59 +303,59 @@ fn get_mag(n: u64) -> u8 {
 YouTube channel videos response
 */

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct Channel {
    contents: Contents,
    header: ChannelHeader,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ChannelHeader {
    c4_tabbed_header_renderer: HeaderRenderer,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct HeaderRenderer {
    subscriber_count_text: Text,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct Contents {
    two_column_browse_results_renderer: TabsRenderer,
 }

 #[serde_as]
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct TabsRenderer {
    #[serde_as(as = "VecSkipError<_>")]
    tabs: Vec<TabRendererWrap>,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct TabRendererWrap {
    tab_renderer: TabRenderer,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct TabRenderer {
    content: RichGridRendererWrap,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct RichGridRendererWrap {
    rich_grid_renderer: RichGridRenderer,
 }

 #[serde_as]
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct RichGridRenderer {
    #[serde_as(as = "VecSkipError<_>")]
@ -366,25 +365,25 @@ struct RichGridRenderer {
    header: Option<RichGridHeader>,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct RichItemRendererWrap {
    rich_item_renderer: RichItemRenderer,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct RichItemRenderer {
    content: VideoRendererWrap,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct VideoRendererWrap {
    video_renderer: VideoRenderer,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct VideoRenderer {
    /// `24,194 views`
@ -393,65 +392,100 @@ struct VideoRenderer {
    short_view_count_text: Text,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct RichGridHeader {
    feed_filter_chip_bar_renderer: ChipBar,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ChipBar {
    contents: Vec<Chip>,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct Chip {
    chip_cloud_chip_renderer: ChipRenderer,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ChipRenderer {
    navigation_endpoint: NavigationEndpoint,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct NavigationEndpoint {
    continuation_command: ContinuationCommand,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ContinuationCommand {
    token: String,
 }

 #[serde_as]
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ContinuationResponse {
    // #[serde_as(as = "VecSkipError<_>")]
    on_response_received_actions: Vec<ContinuationAction>,
 }

-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ContinuationAction {
    reload_continuation_items_command: ContinuationItemsWrap,
 }

 #[serde_as]
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct ContinuationItemsWrap {
    #[serde_as(as = "VecSkipError<_>")]
    continuation_items: Vec<RichItemRendererWrap>,
 }

-#[derive(Clone, Debug)]
+/*
+YouTube Music channel data
+*/
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct MusicChannel {
+    header: MusicHeader,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct MusicHeader {
+    #[serde(alias = "musicVisualHeaderRenderer")]
+    music_immersive_header_renderer: MusicHeaderRenderer,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct MusicHeaderRenderer {
+    subscription_button: SubscriptionButton,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct SubscriptionButton {
+    subscribe_button_renderer: SubscriptionButtonRenderer,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct SubscriptionButtonRenderer {
+    subscriber_count_text: TextRuns,
+}
+
+#[derive(Debug)]
 struct ChannelData {
    view_counts: BTreeMap<u64, String>,
    subscriber_count: String,
@ -460,7 +494,7 @@ struct ChannelData {
 async fn get_channel(query: &RustyPipeQuery, channel_id: &str) -> Result<ChannelData> {
    let resp = query
        .raw(
-            ClientType::DesktopMusic,
+            ClientType::Desktop,
            "browse",
            &QBrowse {
                context: query.get_context(ClientType::Desktop, true, None).await,
@ -540,18 +574,31 @@ async fn get_channel(query: &RustyPipeQuery, channel_id: &str) -> Result<Channel
    })
 }

-#[cfg(test)]
-mod tests {
-    use rustypipe::client::RustyPipe;
+async fn music_channel_subscribers(query: &RustyPipeQuery, channel_id: &str) -> Result<String> {
+    let resp = query
+        .raw(
+            ClientType::DesktopMusic,
+            "browse",
+            &QBrowse {
+                context: query
+                    .get_context(ClientType::DesktopMusic, true, None)
+                    .await,
+                browse_id: channel_id,
+                params: None,
+            },
+        )
+        .await?;

-    use super::*;
-
-    #[tokio::test]
-    async fn t() {
-        let rp = RustyPipe::new();
-        let x = get_channel(&rp.query(), "UCQXYK94vDqOEkPbTCyL0OjA")
-            .await
-            .unwrap();
-        dbg!(&x);
-    }
+    let channel = serde_json::from_str::<MusicChannel>(&resp)?;
+    channel
+        .header
+        .music_immersive_header_renderer
+        .subscription_button
+        .subscribe_button_renderer
+        .subscriber_count_text
+        .runs
+        .into_iter()
+        .next()
+        .map(|t| t.text)
+        .ok_or_else(|| anyhow::anyhow!("no text"))
 }
--- a/codegen/src/collect_playlist_dates.rs
+++ b/codegen/src/collect_playlist_dates.rs
@ -291,5 +291,5 @@ pub fn write_samples_to_dict(project_root: &Path) {
        dict_entry.date_order = num_order;
    }

-    util::write_dict(project_root, &dict);
+    util::write_dict(project_root, dict);
 }
--- a/codegen/src/gen_dictionary.rs
+++ b/codegen/src/gen_dictionary.rs
@ -73,6 +73,10 @@ pub(crate) struct Entry {
    ///
    /// Format: Parsed token -> decimal power
    pub number_tokens: phf::Map<&'static str, u8>,
+    /// Tokens for parsing number strings with no digits (e.g. "No videos")
+    ///
+    /// Format: Parsed token -> value
+    pub number_nd_tokens: phf::Map<&'static str, u8>,
    /// Names of album types (Album, Single, ...)
    ///
    /// Format: Parsed text -> Album type
@ -138,6 +142,12 @@ pub(crate) fn entry(lang: Language) -> Entry {
            number_tokens.entry(txt, &mag.to_string());
        });

+        // Number nd tokens
+        let mut number_nd_tokens = phf_codegen::Map::<&str>::new();
+        entry.number_nd_tokens.iter().for_each(|(txt, mag)| {
+            number_nd_tokens.entry(txt, &mag.to_string());
+        });
+
        // Album types
        let mut album_types = phf_codegen::Map::<&str>::new();
        entry.album_types.iter().for_each(|(txt, album_type)| {
@ -148,10 +158,11 @@ pub(crate) fn entry(lang: Language) -> Entry {
        let code_ta_nd_tokens = &ta_nd_tokens.build().to_string().replace('\n', "\n            ");
        let code_months = &months.build().to_string().replace('\n', "\n            ");
        let code_number_tokens = &number_tokens.build().to_string().replace('\n', "\n            ");
+        let code_number_nd_tokens = &number_nd_tokens.build().to_string().replace('\n', "\n            ");
        let code_album_types = &album_types.build().to_string().replace('\n', "\n            ");

-        let _ = write!(code_timeago_tokens, "{} => Entry {{\n            timeago_tokens: {},\n            date_order: {},\n            months: {},\n            timeago_nd_tokens: {},\n            comma_decimal: {:?},\n            number_tokens: {},\n            album_types: {},\n        }},\n        ",
-        selector, code_ta_tokens, date_order, code_months, code_ta_nd_tokens, entry.comma_decimal, code_number_tokens, code_album_types);
+        write!(code_timeago_tokens, "{} => Entry {{\n            timeago_tokens: {},\n            date_order: {},\n            months: {},\n            timeago_nd_tokens: {},\n            comma_decimal: {:?},\n            number_tokens: {},\n            number_nd_tokens: {},\n            album_types: {},\n        }},\n        ",
+        selector, code_ta_tokens, date_order, code_months, code_ta_nd_tokens, entry.comma_decimal, code_number_tokens, code_number_nd_tokens, code_album_types).unwrap();
    });

    code_timeago_tokens = code_timeago_tokens.trim_end().to_owned() + "\n    }\n}\n";
--- a/codegen/src/util.rs
+++ b/codegen/src/util.rs
@ -12,8 +12,11 @@ use rustypipe::{client::YTContext, model::AlbumType, param::Language};
 use serde::{Deserialize, Serialize};

 static DICT_PATH: Lazy<PathBuf> = Lazy::new(|| path!("testfiles" / "dict" / "dictionary.json"));
+static DICT_OVERRIDE_PATH: Lazy<PathBuf> =
+    Lazy::new(|| path!("testfiles" / "dict" / "dictionary_override.json"));

 type Dictionary = BTreeMap<Language, DictEntry>;
+type DictionaryOverride = BTreeMap<Language, DictOverrideEntry>;

 #[derive(Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
@ -62,6 +65,13 @@ pub struct DictEntry {
    pub album_types: BTreeMap<String, AlbumType>,
 }

+#[derive(Debug, Default, Serialize, Deserialize)]
+#[serde(default)]
+pub struct DictOverrideEntry {
+    pub number_tokens: BTreeMap<String, Option<u8>>,
+    pub number_nd_tokens: BTreeMap<String, Option<u8>>,
+}
+
 #[derive(Debug, Serialize)]
 #[serde(rename_all = "camelCase")]
 pub struct QBrowse<'a> {
@ -95,10 +105,41 @@ pub fn read_dict(project_root: &Path) -> Dictionary {
    serde_json::from_reader(BufReader::new(json_file)).unwrap()
 }

-pub fn write_dict(project_root: &Path, dict: &Dictionary) {
+pub fn read_dict_override(project_root: &Path) -> DictionaryOverride {
+    let json_path = path!(project_root / *DICT_OVERRIDE_PATH);
+    let json_file = File::open(json_path).unwrap();
+    serde_json::from_reader(BufReader::new(json_file)).unwrap()
+}
+
+pub fn write_dict(project_root: &Path, dict: Dictionary) {
+    let dict_override = read_dict_override(project_root);
+
    let json_path = path!(project_root / *DICT_PATH);
    let json_file = File::create(json_path).unwrap();
-    serde_json::to_writer_pretty(json_file, dict).unwrap();
+
+    fn apply_map<K: Clone + Ord, V: Clone>(map: &mut BTreeMap<K, V>, or: &BTreeMap<K, Option<V>>) {
+        or.iter().for_each(|(key, val)| match val {
+            Some(val) => {
+                map.insert(key.clone(), val.clone());
+            }
+            None => {
+                map.remove(key);
+            }
+        });
+    }
+
+    let dict: Dictionary = dict
+        .into_iter()
+        .map(|(lang, mut entry)| {
+            if let Some(or) = dict_override.get(&lang) {
+                apply_map(&mut entry.number_tokens, &or.number_tokens);
+                apply_map(&mut entry.number_nd_tokens, &or.number_nd_tokens);
+            }
+            (lang, entry)
+        })
+        .collect();
+
+    serde_json::to_writer_pretty(json_file, &dict).unwrap();
 }

 pub fn filter_datestr(string: &str) -> String {
@ -133,6 +174,7 @@ pub fn filter_largenumstr(string: &str) -> String {
                    | ','
            ) && !c.is_ascii_digit()
        })
+        .flat_map(char::to_lowercase)
        .collect()
 }

--- a/notes/dictionary.md
+++ b/notes/dictionary.md
@ -30,3 +30,5 @@ build a dictionary.
 - Examples: "1.4M views"
 - There is an exception for the value 0 ("no views") and in some languages for the value
  1 (pt: "Um vídeo")
+- Special case: in language "gu" the "no views" text ("જોવાયાની સંખ્યા") contains no unique
+  tokens to parse, so it has to be matched as a whole string
--- a/src/client/response/video_item.rs
+++ b/src/client/response/video_item.rs
@ -515,7 +515,7 @@ impl<T> YouTubeListMapper<T> {
            publish_date_txt: pub_date_txt,
            view_count: video
                .view_count_text
-                .map(|txt| util::parse_large_numstr(&txt, lang).unwrap_or_default()),
+                .and_then(|txt| util::parse_large_numstr_or_warn(&txt, lang, &mut self.warnings)),
            is_live: false,
            is_short: true,
            is_upcoming: false,
--- a/src/util/dictionary.rs
+++ b/src/util/dictionary.rs
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@ -290,87 +290,83 @@ pub fn parse_large_numstr<F>(string: &str, lang: Language) -> Option<F>
 where
    F: TryFrom<u64>,
 {
+    // Special case for Gujarati: the "no views" text contains no token that
+    // identifies it, because both of its words also occur in every other
+    // view count text. This may be a translation error.
+    if lang == Language::Gu && string == "જોવાયાની સંખ્યા" {
+        return 0.try_into().ok();
+    }
+
    let dict_entry = dictionary::entry(lang);
+    let by_char = lang_by_char(lang) || lang == Language::Ko;
    let decimal_point = match dict_entry.comma_decimal {
        true => ',',
        false => '.',
    };

-    let (num, mut exp, filtered) = {
-        let mut buf = String::new();
-        let mut filtered = String::new();
-        let mut exp = 0;
-        let mut after_point = false;
-        for c in string.chars() {
-            if c.is_ascii_digit() {
-                buf.push(c);
+    let mut digits = String::new();
+    let mut filtered = String::new();
+    let mut exp = 0;
+    let mut after_point = false;

-                if after_point {
-                    exp -= 1;
-                }
-            } else if c == decimal_point {
-                after_point = true;
-            } else if !matches!(
-                c,
-                '\u{200b}'
-                    | '\u{202b}'
-                    | '\u{202c}'
-                    | '\u{202e}'
-                    | '\u{200e}'
-                    | '\u{200f}'
-                    | '.'
-                    | ','
-            ) {
-                filtered.push(c);
+    for c in string.chars() {
+        if c.is_ascii_digit() {
+            digits.push(c);
+
+            if after_point {
+                exp -= 1;
            }
+        } else if c == decimal_point {
+            after_point = true;
+        } else if !matches!(
+            c,
+            '\u{200b}' | '\u{202b}' | '\u{202c}' | '\u{202e}' | '\u{200e}' | '\u{200f}' | '.' | ','
+        ) {
+            c.to_lowercase().for_each(|c| filtered.push(c));
        }
-        if buf.is_empty() {
-            // TODO: integrate into dictionary
-            if lang == Language::Ar && string.contains("واحد")
-                || lang == Language::Iw && string.contains("אחד")
-                || lang == Language::As && string.contains('১') // ১টা
-                || lang == Language::Bn && string.contains('১')
-                || lang == Language::Fa && string.contains('۱')
-                || lang == Language::Is && (string.contains("Eitt ") || string.contains("Einn "))
-                || lang == Language::My && string.contains('၁')
-                || lang == Language::No && string.contains("Én ")
-                || lang == Language::Pt && string.contains("Um ")
-                || lang == Language::Ro && string.contains("Un ")
-            {
-                return 1.try_into().ok();
-            }
-
-            return None;
-        } else {
-            (buf.parse::<u64>().ok()?, exp, filtered)
-        }
-    };
-
-    let lookup_token = |token: &str| match token {
-        "K" | "k" => Some(3),
-        _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
-    };
-
-    if lang_by_char(lang) || lang == Language::Ko {
-        exp += filtered
-            .chars()
-            .filter_map(|token| lookup_token(&token.to_string()))
-            .sum::<i32>();
-    } else {
-        exp += filtered
-            .split_whitespace()
-            .filter_map(lookup_token)
-            .sum::<i32>();
    }

-    F::try_from(some_or_bail!(
-        num.checked_mul(some_or_bail!(
-            (10_u64).checked_pow(ok_or_bail!(exp.try_into(), None)),
+    if digits.is_empty() {
+        if by_char {
+            filtered
+                .chars()
+                .find_map(|c| dict_entry.number_nd_tokens.get(&c.to_string()))
+                .and_then(|n| (*n as u64).try_into().ok())
+        } else {
+            filtered
+                .split_whitespace()
+                .find_map(|token| dict_entry.number_nd_tokens.get(token))
+                .and_then(|n| (*n as u64).try_into().ok())
+        }
+    } else {
+        let num = digits.parse::<u64>().ok()?;
+
+        let lookup_token = |token: &str| match token {
+            "k" => Some(3),
+            _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
+        };
+
+        if by_char {
+            exp += filtered
+                .chars()
+                .filter_map(|token| lookup_token(&token.to_string()))
+                .sum::<i32>();
+        } else {
+            exp += filtered
+                .split_whitespace()
+                .filter_map(lookup_token)
+                .sum::<i32>();
+        }
+
+        F::try_from(some_or_bail!(
+            num.checked_mul(some_or_bail!(
+                (10_u64).checked_pow(ok_or_bail!(exp.try_into(), None)),
+                None
+            )),
            None
-        )),
-        None
-    ))
-    .ok()
+        ))
+        .ok()
+    }
 }

 pub fn parse_large_numstr_or_warn<F>(
@ -516,9 +512,10 @@ pub(crate) mod tests {
    #[case(
        Language::Iw,
        "\u{200f}\u{202b}3.36M\u{200f}\u{202c}\u{200f} \u{200f}מנויים\u{200f}",
-        3360000
+        3_360_000
    )]
-    fn t_parse_large_numstr_1(#[case] lang: Language, #[case] string: &str, #[case] expect: u64) {
+    #[case(Language::As, "১ জন গ্ৰাহক", 1)]
+    fn t_parse_large_numstr(#[case] lang: Language, #[case] string: &str, #[case] expect: u64) {
        let res = parse_large_numstr::<u64>(string, lang).unwrap();
        assert_eq!(res, expect);
    }
@ -527,20 +524,6 @@ pub(crate) mod tests {
    fn t_parse_large_numstr_samples() {
        let json_path = path!(*TESTFILES / "dict" / "large_number_samples.json");
        let json_file = File::open(json_path).unwrap();
-        let number_samples: BTreeMap<Language, BTreeMap<String, (String, u64)>> =
-            serde_json::from_reader(BufReader::new(json_file)).unwrap();
-
-        number_samples.iter().for_each(|(lang, entry)| {
-            entry.iter().for_each(|(_, (txt, expect))| {
-                testcase_parse_large_numstr(txt, *lang, *expect);
-            });
-        });
-    }
-
-    #[test]
-    fn t_parse_large_numstr_samples2() {
-        let json_path = path!(*TESTFILES / "dict" / "large_number_samples_all.json");
-        let json_file = File::open(json_path).unwrap();
        let number_samples: BTreeMap<Language, BTreeMap<String, u64>> =
            serde_json::from_reader(BufReader::new(json_file)).unwrap();

@ -565,8 +548,9 @@ pub(crate) mod tests {
            }
        };

-        // TODO: add support for zero values
-        let res = parse_large_numstr::<u64>(string, lang).unwrap_or_default();
-        assert_eq!(res, rounded, "{string} (lang: {lang}, exact: {expect})");
+        let emsg = format!("{string} (lang: {lang}, exact: {expect})");
+
+        let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
+        assert_eq!(res, rounded, "{emsg}");
    }
 }
--- a/testfiles/dict/dictionary.json
+++ b/testfiles/dict/dictionary.json
@ -41,6 +41,9 @@
      "m": 6,
      "mjd": 9
    },
+    "number_nd_tokens": {
+      "nie": 0
+    },
    "album_types": {
      "album": "Album",
      "drama": "Show",
@ -93,6 +96,9 @@
      "ሺ": 3,
      "ቢ": 9
    },
+    "number_nd_tokens": {
+      "የለዉም": 0
+    },
    "album_types": {
      "ትዕይንት": "Show",
      "ነጠላ": "Single",
@ -143,6 +149,10 @@
      "مليار": 9,
      "مليون": 6
    },
+    "number_nd_tokens": {
+      "لا": 0,
+      "واحد": 1
+    },
    "album_types": {
      "أغنية منفردة": "Single",
      "ألبوم": "Album",
@ -172,6 +182,7 @@
    "comma_decimal": false,
    "number_tokens": {
      "কোঃটা": 9,
+      "নিঃ": 6,
      "নিঃটা": 6,
      "নিযুত": 6,
      "নিযুতটা": 6,
@ -180,8 +191,11 @@
      "লাখটা": 5,
      "হা": 3,
      "হাজাৰ": 3,
-      "হাজাৰটা": 3,
-      "নিঃ": 6
+      "হাজাৰটা": 3
+    },
+    "number_nd_tokens": {
+      "নাই": 0,
+      "১": 1
    },
    "album_types": {
      "ep": "Ep",
@ -229,6 +243,9 @@
      "mln": 6,
      "mlrd": 9
    },
+    "number_nd_tokens": {
+      "yoxdur": 0
+    },
    "album_types": {
      "albom": "Album",
      "audio kitab": "Audiobook",
@ -291,6 +308,9 @@
      "млрд": 9,
      "тыс": 3
    },
+    "number_nd_tokens": {
+      "няма": 0
+    },
    "album_types": {
      "альбом": "Album",
      "аўдыякніга": "Audiobook",
@ -330,6 +350,9 @@
      "млрд": 9,
      "хил": 3
    },
+    "number_nd_tokens": {
+      "няма": 0
+    },
    "album_types": {
      "албум": "Album",
      "аудиокнига": "Audiobook",
@ -379,6 +402,10 @@
      "হা": 3,
      "হাটি": 3
    },
+    "number_nd_tokens": {
+      "০": 0,
+      "১": 1
+    },
    "album_types": {
      "অডিওবুক": "Audiobook",
      "অ্যালবাম": "Album",
@ -437,6 +464,9 @@
      "mil": 6,
      "mlr": 9
    },
+    "number_nd_tokens": {
+      "nema": 0
+    },
    "album_types": {
      "album": "Album",
      "audio knjiga": "Audiobook",
@ -485,9 +515,11 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 6,
-      "m": 3,
-      "kM": 9
+      "km": 9,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "sense": 0
    },
    "album_types": {
      "audiollibre": "Audiobook",
@ -532,6 +564,7 @@
      "mld": 9,
      "tis": 3
    },
+    "number_nd_tokens": {},
    "album_types": {
      "album": "Album",
      "audiokniha": "Audiobook",
@ -582,6 +615,9 @@
      "mia": 9,
      "mio": 6
    },
+    "number_nd_tokens": {
+      "ingen": 0
+    },
    "album_types": {
      "album": "Album",
      "ep": "Ep",
@ -617,8 +653,11 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "Mio": 6,
-      "Mrd": 9
+      "mio": 6,
+      "mrd": 9
+    },
+    "number_nd_tokens": {
+      "keine": 0
    },
    "album_types": {
      "album": "Album",
@ -672,6 +711,9 @@
      "εκ": 6,
      "χιλ": 3
    },
+    "number_nd_tokens": {
+      "καμία": 0
+    },
    "album_types": {
      "ep": "Ep",
      "single": "Single",
@ -681,7 +723,10 @@
    }
  },
  "en": {
-    "equivalent": ["en-GB", "en-IN"],
+    "equivalent": [
+      "en-GB",
+      "en-IN"
+    ],
    "by_char": false,
    "timeago_tokens": {
      "day": "D",
@ -721,10 +766,13 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "M": 6,
+      "b": 9,
      "crore": 7,
-      "lakh": 5
+      "lakh": 5,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "no": 0
    },
    "album_types": {
      "album": "Album",
@ -774,9 +822,10 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 6,
+      "m": 6,
      "mil": 3
    },
+    "number_nd_tokens": {},
    "album_types": {
      "audiodrama": "Show",
      "audiolibro": "Audiobook",
@ -786,7 +835,9 @@
    }
  },
  "es-US": {
-    "equivalent": ["es-419"],
+    "equivalent": [
+      "es-419"
+    ],
    "by_char": false,
    "timeago_tokens": {
      "año": "Y",
@ -825,9 +876,12 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "M": 6,
+      "m": 6,
      "mil": 3
    },
+    "number_nd_tokens": {
+      "sin": 0
+    },
    "album_types": {
      "audiolibro": "Audiobook",
      "ep": "Ep",
@ -882,6 +936,9 @@
      "mln": 6,
      "tuh": 3
    },
+    "number_nd_tokens": {
+      "pole": 0
+    },
    "album_types": {
      "album": "Album",
      "audioraamat": "Audiobook",
@ -926,7 +983,10 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 6
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "ez": 0
    },
    "album_types": {
      "albuma": "Album",
@ -973,6 +1033,10 @@
      "میلیون": 6,
      "هزار": 3
    },
+    "number_nd_tokens": {
+      "بدون": 0,
+      "۱": 1
+    },
    "album_types": {
      "آلبوم": "Album",
      "تک آهنگ": "Single",
@ -1012,6 +1076,10 @@
      "mrd": 9,
      "t": 3
    },
+    "number_nd_tokens": {
+      "ei": 0,
+      "katselukertoja": 0
+    },
    "album_types": {
      "albumi": "Album",
      "ep": "Ep",
@ -1053,8 +1121,11 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "M": 6
+      "b": 9,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "walang": 0
    },
    "album_types": {
      "album": "Album",
@ -1065,7 +1136,9 @@
    }
  },
  "fr": {
-    "equivalent": ["fr-CA"],
+    "equivalent": [
+      "fr-CA"
+    ],
    "by_char": false,
    "timeago_tokens": {
      "an": "Y",
@ -1104,9 +1177,13 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "G": 9,
-      "M": 6,
-      "Md": 9
+      "g": 9,
+      "m": 6,
+      "md": 9
+    },
+    "number_nd_tokens": {
+      "aucun": 0,
+      "aucune": 0
    },
    "album_types": {
      "album": "Album",
@ -1158,7 +1235,10 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 6
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "ningunha": 0
    },
    "album_types": {
      "audiolibro": "Audiobook",
@ -1206,6 +1286,7 @@
      "લાખ": 5,
      "હજાર": 3
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "આલ્બમ": "Album",
@ -1252,6 +1333,9 @@
      "लाख": 5,
      "हज़ार": 3
    },
+    "number_nd_tokens": {
+      "नहीं": 0
+    },
    "album_types": {
      "ईपी": "Ep",
      "एल्‍बम": "Album",
@ -1310,6 +1394,9 @@
      "mlr": 9,
      "tis": 3
    },
+    "number_nd_tokens": {
+      "nema": 0
+    },
    "album_types": {
      "album": "Album",
      "audioknjiga": "Audiobook",
@ -1360,9 +1447,12 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "E": 3,
-      "M": 6,
-      "Mrd": 9
+      "e": 3,
+      "m": 6,
+      "mrd": 9
+    },
+    "number_nd_tokens": {
+      "nincs": 0
    },
    "album_types": {
      "album": "Album",
@ -1409,6 +1499,10 @@
      "մլն": 6,
      "մլրդ": 9
    },
+    "number_nd_tokens": {
+      "դիտումներ": 0,
+      "չկան": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ալբոմ": "Album",
@ -1450,10 +1544,13 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 9,
      "jt": 6,
+      "m": 9,
      "rb": 3
    },
+    "number_nd_tokens": {
+      "belum": 0
+    },
    "album_types": {
      "acara": "Show",
      "album": "Album",
@ -1509,6 +1606,10 @@
      "ma": 9,
      "þ": 3
    },
+    "number_nd_tokens": {
+      "einn": 1,
+      "ekkert": 0
+    },
    "album_types": {
      "ep": "Ep",
      "hljóðbók": "Audiobook",
@ -1557,8 +1658,11 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "Mln": 6,
-      "Mrd": 9
+      "mln": 6,
+      "mrd": 9
+    },
+    "number_nd_tokens": {
+      "nessuna": 0
    },
    "album_types": {
      "album": "Album",
@ -1615,9 +1719,12 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "K": 3,
-      "M": 6
+      "b": 9,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "אחד": 1,
+      "אין": 0
    },
    "album_types": {
      "אלבום": "Album",
@ -1650,6 +1757,7 @@
      "万": 4,
      "億": 8
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "アルバム": "Album",
@ -1697,6 +1805,9 @@
      "მლნ": 6,
      "მლრდ": 9
    },
+    "number_nd_tokens": {
+      "არ": 0
+    },
    "album_types": {
      "ალბომი": "Album",
      "აუდიოწიგნი": "Audiobook",
@ -1743,6 +1854,9 @@
      "млрд": 9,
      "мың": 3
    },
+    "number_nd_tokens": {
+      "ешкім": 0
+    },
    "album_types": {
      "ep": "Ep",
      "альбом": "Album",
@ -1790,6 +1904,7 @@
      "ពាន់": 3,
      "លាន": 6
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "កម្មវិធីទូរទស្សន៍": "Show",
@ -1843,6 +1958,9 @@
      "ಕೋಟಿ": 7,
      "ಲಕ್ಷ": 5
    },
+    "number_nd_tokens": {
+      "ವೀಕ್ಷಣೆಗಳಿಲ್ಲ": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ಆಡಿಯೋಬುಕ್": "Audiobook",
@ -1866,8 +1984,8 @@
    "date_order": "YMD",
    "months": {},
    "timeago_nd_tokens": {
-      "오늘": "0D",
-      "어제": "1D"
+      "어제": "1D",
+      "오늘": "0D"
    },
    "comma_decimal": false,
    "number_tokens": {
@ -1875,6 +1993,9 @@
      "억": 8,
      "천": 3
    },
+    "number_nd_tokens": {
+      "없": 0
+    },
    "album_types": {
      "ep": "Ep",
      "싱글": "Single",
@ -1920,6 +2041,9 @@
      "млд": 9,
      "млн": 6
    },
+    "number_nd_tokens": {
+      "эч": 0
+    },
    "album_types": {
      "альбом": "Album",
      "аудиокитеп": "Audiobook",
@ -1968,6 +2092,9 @@
      "ພັນ": 3,
      "ລ້ານ": 6
    },
+    "number_nd_tokens": {
+      "ຍັງບໍ່ມີຄົນເບິ່ງເທື່ອ": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ຊິງເກິນ": "Single",
@ -2017,6 +2144,9 @@
      "mlrd": 9,
      "tūkst": 3
    },
+    "number_nd_tokens": {
+      "nėra": 0
+    },
    "album_types": {
      "albumas": "Album",
      "garsinė knyga": "Audiobook",
@ -2069,6 +2199,9 @@
      "mljrd": 9,
      "tūkst": 3
    },
+    "number_nd_tokens": {
+      "nav": 0
+    },
    "album_types": {
      "albums": "Album",
      "audiogrāmata": "Audiobook",
@ -2104,11 +2237,14 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "М": 6,
      "илј": 3,
+      "м": 6,
      "мил": 6,
      "милј": 9
    },
+    "number_nd_tokens": {
+      "нема": 0
+    },
    "album_types": {
      "ep": "Ep",
      "албум": "Album",
@ -2153,6 +2289,9 @@
      "കോടി": 7,
      "ലക്ഷം": 5
    },
+    "number_nd_tokens": {
+      "ഇല്ല": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ആല്‍‌ബം": "Album",
@ -2187,6 +2326,9 @@
      "сая": 6,
      "тэрбум": 9
    },
+    "number_nd_tokens": {
+      "үзэлтгүй": 0
+    },
    "album_types": {
      "ep": "Ep",
      "аудио ном": "Audiobook",
@ -2243,6 +2385,9 @@
      "लाख": 5,
      "ह": 3
    },
+    "number_nd_tokens": {
+      "नाहीत": 0
+    },
    "album_types": {
      "अल्बम": "Album",
      "ऑडिओबुक": "Audiobook",
@ -2284,8 +2429,11 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "J": 6
+      "b": 9,
+      "j": 6
+    },
+    "number_nd_tokens": {
+      "tiada": 0
    },
    "album_types": {
      "album": "Album",
@ -2330,11 +2478,15 @@
    "comma_decimal": false,
    "number_tokens": {
      "ကုဋေ": 7,
+      "ထ": 3,
      "ထောင်": 3,
      "သန်း": 6,
      "သိန်း": 5,
-      "သောင်း": 4,
-      "ထ": 3
+      "သောင်း": 4
+    },
+    "number_nd_tokens": {
+      "မရှိ": 0,
+      "၁": 1
    },
    "album_types": {
      "ep": "Ep",
@ -2350,12 +2502,12 @@
    "timeago_tokens": {
      "घण्टा": "h",
      "दिन": "D",
+      "दिनअघि": "D",
      "महिना": "M",
      "मिनेट": "m",
      "वर्ष": "Y",
      "सेकेन्ड": "s",
-      "हप्ता": "W",
-      "दिनअघि": "D"
+      "हप्ता": "W"
    },
    "date_order": "YD",
    "months": {
@ -2383,6 +2535,9 @@
      "लाख": 5,
      "हजार": 3
    },
+    "number_nd_tokens": {
+      "छैन": 0
+    },
    "album_types": {
      "ep": "Ep",
      "अडियोबुक": "Audiobook",
@ -2432,6 +2587,9 @@
      "mld": 9,
      "mln": 6
    },
+    "number_nd_tokens": {
+      "geen": 0
+    },
    "album_types": {
      "aflevering": "Show",
      "album": "Album",
@ -2483,6 +2641,9 @@
      "mill": 6,
      "mrd": 9
    },
+    "number_nd_tokens": {
+      "ingen": 0
+    },
    "album_types": {
      "album": "Album",
      "ep": "Ep",
@ -2525,15 +2686,18 @@
    },
    "comma_decimal": false,
    "number_tokens": {
+      "ନି": 6,
      "ନିଜଣ": 6,
      "ନିଟି": 6,
+      "ବି": 9,
      "ବିଜଣ": 9,
      "ବିଟି": 9,
-      "ହଜଣ": 3,
-      "ହଟି": 3,
      "ହ": 3,
-      "ନି": 6,
-      "ବି": 9
+      "ହଜଣ": 3,
+      "ହଟି": 3
+    },
+    "number_nd_tokens": {
+      "ନାହିଁ": 0
    },
    "album_types": {
      "ep": "Ep",
@ -2584,6 +2748,9 @@
      "ਲੱਖ": 5,
      "ਹਜ਼ਾਰ": 3
    },
+    "number_nd_tokens": {
+      "ਨਹੀਂ": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ਆਡੀਓ-ਕਿਤਾਬ": "Audiobook",
@ -2643,6 +2810,9 @@
      "mln": 6,
      "tys": 3
    },
+    "number_nd_tokens": {
+      "brak": 0
+    },
    "album_types": {
      "album": "Album",
      "audiobook": "Audiobook",
@ -2695,6 +2865,7 @@
      "mi": 6,
      "mil": 3
    },
+    "number_nd_tokens": {},
    "album_types": {
      "audiolivro": "Audiobook",
      "ep": "Ep",
@ -2730,10 +2901,11 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "M": 6,
-      "mM": 9,
-      "mil": 3
+      "m": 6,
+      "mil": 3,
+      "mm": 9
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "livro áudio": "Audiobook",
@ -2785,6 +2957,10 @@
      "mil": 6,
      "mld": 9
    },
+    "number_nd_tokens": {
+      "nicio": 0,
+      "un": 1
+    },
    "album_types": {
      "album": "Album",
      "carte audio": "Audiobook",
@ -2843,6 +3019,7 @@
      "млрд": 9,
      "тыс": 3
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "альбом": "Album",
@ -2888,6 +3065,9 @@
      "බි": 9,
      "මි": 6
    },
+    "number_nd_tokens": {
+      "නැත": 0
+    },
    "album_types": {
      "ඇල්බමය": "Album",
      "තනි": "Single",
@ -2930,6 +3110,9 @@
      "mld": 9,
      "tis": 3
    },
+    "number_nd_tokens": {
+      "žiadne": 0
+    },
    "album_types": {
      "album": "Album",
      "audiokniha": "Audiobook",
@ -2993,6 +3176,9 @@
      "mrd": 9,
      "tis": 3
    },
+    "number_nd_tokens": {
+      "brez": 0
+    },
    "album_types": {
      "album": "Album",
      "ep": "Ep",
@ -3041,6 +3227,9 @@
      "mld": 9,
      "mln": 6
    },
+    "number_nd_tokens": {
+      "nuk": 0
+    },
    "album_types": {
      "album": "Album",
      "ep": "Ep",
@ -3084,6 +3273,9 @@
      "млрд": 9,
      "хиљ": 3
    },
+    "number_nd_tokens": {
+      "нема": 0
+    },
    "album_types": {
      "ep": "Ep",
      "албум": "Album",
@ -3128,6 +3320,9 @@
      "mil": 6,
      "mlrd": 9
    },
+    "number_nd_tokens": {
+      "nema": 0
+    },
    "album_types": {
      "album": "Album",
      "audio-knjiga": "Audiobook",
@ -3178,6 +3373,9 @@
      "md": 9,
      "mn": 6
    },
+    "number_nd_tokens": {
+      "inga": 0
+    },
    "album_types": {
      "album": "Album",
      "ep": "Ep",
@ -3221,9 +3419,12 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "M": 6,
-      "elfu": 3
+      "b": 9,
+      "elfu": 3,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "haijatazamwa": 0
    },
    "album_types": {
      "albamu": "Album",
@ -3278,6 +3479,9 @@
      "கோடி": 7,
      "லட்சம்": 5
    },
+    "number_nd_tokens": {
+      "இல்லை": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ஆடியோ புத்தகம்": "Audiobook",
@ -3331,6 +3535,9 @@
      "లక్ష": 5,
      "లక్షలు": 5
    },
+    "number_nd_tokens": {
+      "లేవు": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ఆడియోబుక్": "Audiobook",
@ -3348,11 +3555,11 @@
      "นาทีที่ผ่านมา": "m",
      "ปีที่แล้ว": "Y",
      "วันที่ผ่านมา": "D",
+      "วันที่แล้ว": "D",
      "วินาที": "s",
      "วินาทีที่ผ่านมา": "s",
      "สัปดาห์ที่ผ่านมา": "W",
-      "เดือนที่ผ่านมา": "M",
-      "วันที่แล้ว": "D"
+      "เดือนที่ผ่านมา": "M"
    },
    "date_order": "DY",
    "months": {
@ -3382,6 +3589,9 @@
      "หมื่นล้าน": 10,
      "แสน": 5
    },
+    "number_nd_tokens": {
+      "ไม่มีการดู": 0
+    },
    "album_types": {
      "ep": "Ep",
      "ซิงเกิล": "Single",
@ -3423,9 +3633,12 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "B": 3,
-      "Mn": 6,
-      "Mr": 9
+      "b": 3,
+      "mn": 6,
+      "mr": 9
+    },
+    "number_nd_tokens": {
+      "yok": 0
    },
    "album_types": {
      "albüm": "Album",
@ -3485,6 +3698,9 @@
      "млрд": 9,
      "тис": 3
    },
+    "number_nd_tokens": {
+      "жодного": 0
+    },
    "album_types": {
      "альбом": "Album",
      "аудіодрама": "Show",
@ -3537,6 +3753,9 @@
      "کروڑ": 7,
      "ہزار": 3
    },
+    "number_nd_tokens": {
+      "نہیں": 0
+    },
    "album_types": {
      "ep": "Ep",
      "آڈیو بک": "Audiobook",
@ -3582,6 +3801,7 @@
      "mln": 6,
      "mlrd": 9
    },
+    "number_nd_tokens": {},
    "album_types": {
      "albom": "Album",
      "audiokitob": "Audiobook",
@ -3611,10 +3831,11 @@
    },
    "comma_decimal": true,
    "number_tokens": {
-      "N": 3,
-      "T": 9,
-      "Tr": 6
+      "n": 3,
+      "t": 9,
+      "tr": 6
    },
+    "number_nd_tokens": {},
    "album_types": {
      "chương trình": "Show",
      "sách nói": "Audiobook",
@ -3646,6 +3867,9 @@
      "万": 4,
      "亿": 8
    },
+    "number_nd_tokens": {
+      "无": 0
+    },
    "album_types": {
      "专辑": "Album",
      "单曲": "Single",
@ -3675,9 +3899,10 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "M": 6
+      "b": 9,
+      "m": 6
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "單曲": "Single",
@ -3709,6 +3934,7 @@
      "億": 8,
      "萬": 4
    },
+    "number_nd_tokens": {},
    "album_types": {
      "ep": "Ep",
      "單曲": "Single",
@ -3757,8 +3983,11 @@
    },
    "comma_decimal": false,
    "number_tokens": {
-      "B": 9,
-      "M": 6
+      "b": 9,
+      "m": 6
+    },
+    "number_nd_tokens": {
+      "akukho": 0
    },
    "album_types": {
      "bonisa": "Show",
--- a/testfiles/dict/dictionary_override.json
+++ b/testfiles/dict/dictionary_override.json
@ -0,0 +1,163 @@
+{
+  "af": {
+    "number_nd_tokens": {
+      "geen": null
+    }
+  },
+  "am": {
+    "number_nd_tokens": {
+      "ምንም": null
+    }
+  },
+  "as": {
+    "number_tokens": {
+      "লা": 5,
+      "হা": 3,
+      "শঃ": null
+    },
+    "number_nd_tokens": {
+      "কোনো": null
+    }
+  },
+  "bn": {
+    "number_tokens": {
+      "কোটি": 7,
+      "শত": 2
+    }
+  },
+  "es": {
+    "number_tokens": {
+      "m": 6,
+      "mil": 3
+    }
+  },
+  "es-US": {
+    "number_tokens": {
+      "m": 6,
+      "mil": 3
+    }
+  },
+  "et": {
+    "number_nd_tokens": {
+      "vaatamisi": null
+    }
+  },
+  "eu": {
+    "number_nd_tokens": {
+      "dago": null,
+      "ikustaldirik": null
+    }
+  },
+  "fr": {
+    "number_tokens": {
+      "d’abonnés": null
+    }
+  },
+  "hy": {
+    "number_nd_tokens": {
+      "Դիտումներ": null
+    }
+  },
+  "is": {
+    "number_nd_tokens": {
+      "áskrifandi": null,
+      "enn": null
+    }
+  },
+  "iw": {
+    "number_nd_tokens": {
+      "מנוי": null
+    }
+  },
+  "ka": {
+    "number_nd_tokens": {
+      "არის": null,
+      "ნახვები": null
+    }
+  },
+  "kk": {
+    "number_nd_tokens": {
+      "көрмеген": null
+    }
+  },
+  "kn": {
+    "number_nd_tokens": {
+      "ಯಾವುದೇ": null
+    }
+  },
+  "ko": {
+    "number_nd_tokens": {
+      "음": null
+    }
+  },
+  "ky": {
+    "number_nd_tokens": {
+      "ким": null,
+      "көрө": null,
+      "элек": null
+    }
+  },
+  "my": {
+    "number_tokens": {
+      "ကုဋေ": 7,
+      "သောင်း": 4,
+      "ထ": 3
+    }
+  },
+  "ne": {
+    "number_nd_tokens": {
+      "कुनै": null
+    }
+  },
+  "no": {
+    "number_nd_tokens": {
+      "avspillinger": null
+    }
+  },
+  "or": {
+    "number_tokens": {
+      "ବିଜଣ": 9,
+      "ବି": 9
+    },
+    "number_nd_tokens": {
+      "କୌଣସି": null
+    }
+  },
+  "pa": {
+    "number_nd_tokens": {
+      "ਕਿਸੇ": null,
+      "ਨੇ": null
+    }
+  },
+  "ro": {
+    "number_nd_tokens": {
+      "abonat": null,
+      "vizionare": null
+    }
+  },
+  "sq": {
+    "number_nd_tokens": {
+      "ka": null
+    }
+  },
+  "uk": {
+    "number_nd_tokens": {
+      "перегляду": null
+    }
+  },
+  "ur": {
+    "number_nd_tokens": {
+      "کوئی": null
+    }
+  },
+  "zh-CN": {
+    "number_nd_tokens": {
+      "人": null
+    }
+  },
+  "zu": {
+    "number_nd_tokens": {
+      "kubukwa": null
+    }
+  }
+}
--- a/testfiles/dict/large_number_samples.json
+++ b/testfiles/dict/large_number_samples.json
--- a/testfiles/dict/large_number_samples_all.json
+++ b/testfiles/dict/large_number_samples_all.json