feat: add number_tokens for parsing large nums to dictionary

2022-09-23 15:04:22 +02:00 · 2022-09-23 15:04:22 +02:00 · 5d19259a14
commit 5d19259a14
parent 67ae1eb21d
21 changed files with 5219 additions and 38 deletions
--- a/codegen/src/util.rs
+++ b/codegen/src/util.rs
@ -3,19 +3,53 @@ use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path, str::FromS
 use rustypipe::model::Language;
 use serde::{Deserialize, Serialize};

-const DICT_PATH: &str = "testfiles/date/dictionary.json";
+const DICT_PATH: &str = "testfiles/dict/dictionary.json";

 type Dictionary = BTreeMap<Language, DictEntry>;

 #[derive(Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
 pub struct DictEntry {
+    /// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
    pub equivalent: Vec<Language>,
+    /// Should the language be parsed by character instead of by word?
+    /// (e.g. Chinese/Japanese)
    pub by_char: bool,
+    /// Tokens for parsing timeago strings.
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
+    ///
+    /// Identifiers: `Y`(ear), `M`(onth), `W`(eek), `D`(ay),
+    /// `h`(our), `m`(inute), `s`(econd)
    pub timeago_tokens: BTreeMap<String, String>,
+    /// Order in which to parse numeric date components. Formatted as
+    /// a string of date identifiers (Y, M, D).
+    ///
+    /// Examples:
+    ///
+    /// - 03.01.2020 => `"DMY"`
+    /// - Jan 3, 2020 => `"DY"`
    pub date_order: String,
+    /// Tokens for parsing month names.
+    ///
+    /// Format: Parsed token -> Month number (starting from 1)
    pub months: BTreeMap<String, u8>,
+    /// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
+    ///
+    /// Format: Parsed token -> \[Quantity\] Identifier
    pub timeago_nd_tokens: BTreeMap<String, String>,
+    /// Are commas (instead of points) used as decimal separators?
+    pub comma_decimal: bool,
+    /// Tokens for parsing decimal prefixes (K, M, B, ...)
+    ///
+    /// Format: Parsed token -> decimal power
+    pub number_tokens: BTreeMap<String, u8>,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Text { // Deserialize-only wrapper around a single string field
+    pub simple_text: String, // read from the `simpleText` key because of serde's `rename_all = "camelCase"`
 }

 pub fn read_dict(project_root: &Path) -> Dictionary {
@ -48,6 +82,27 @@ pub fn filter_datestr(string: &str) -> String {
        .collect()
 }

+pub fn filter_largenumstr(string: &str) -> String { // Returns `string` with its numeric part removed, keeping every other character in order
+    string
+        .chars()
+        .filter(|c| !matches!(c, '\u{200b}' | '.' | ',') && !c.is_ascii_digit()) // drop zero-width spaces (U+200B), points, commas and ASCII digits; non-ASCII digits are kept
+        .collect()
+}
+
+/// Parse a string as `F` after removing every character that is not an ASCII digit (signs, separators and letters are dropped); returns `F`'s parse error if the remaining text cannot be parsed
+pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
+where
+    F: FromStr,
+{
+    let mut buf = String::new();
+    for c in string.chars() {
+        if c.is_ascii_digit() { // only 0-9 survive; '-', '.', ',' and non-ASCII digits are skipped
+            buf.push(c);
+        }
+    }
+    buf.parse() // if the input held no digits, the empty string is handed to `F::from_str`
+}
+
 /// Parse all numbers occurring in a string and return them as a vec
 pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
 where