feat: add number_tokens for parsing large nums to dictionary
This commit is contained in:
parent
67ae1eb21d
commit
5d19259a14
21 changed files with 5219 additions and 38 deletions
|
|
@ -3,19 +3,53 @@ use std::{collections::BTreeMap, fs::File, io::BufReader, path::Path, str::FromS
|
|||
use rustypipe::model::Language;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Location of the dictionary JSON file.
///
/// NOTE(review): presumably joined onto the `project_root` passed to
/// `read_dict` — confirm against that function's body.
const DICT_PATH: &str = "testfiles/dict/dictionary.json";
|
||||
|
||||
/// Maps each [`Language`] to its [`DictEntry`]; this is the type returned by
/// [`read_dict`]. Being a `BTreeMap`, entries are kept sorted by language.
type Dictionary = BTreeMap<Language, DictEntry>;
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DictEntry {
|
||||
/// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
|
||||
pub equivalent: Vec<Language>,
|
||||
/// Should the language be parsed by character instead of by word?
|
||||
/// (e.g. Chinese/Japanese)
|
||||
pub by_char: bool,
|
||||
/// Tokens for parsing timeago strings.
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
///
|
||||
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
|
||||
/// `h`(our), `m`(inute), `s`(econd)
|
||||
pub timeago_tokens: BTreeMap<String, String>,
|
||||
/// Order in which to parse numeric date components. Formatted as
|
||||
/// a string of date identifiers (Y, M, D).
|
||||
///
|
||||
/// Examples:
|
||||
///
|
||||
/// - 03.01.2020 => `"DMY"`
|
||||
/// - Jan 3, 2020 => `"DY"`
|
||||
pub date_order: String,
|
||||
/// Tokens for parsing month names.
|
||||
///
|
||||
/// Format: Parsed token -> Month number (starting from 1)
|
||||
pub months: BTreeMap<String, u8>,
|
||||
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
pub timeago_nd_tokens: BTreeMap<String, String>,
|
||||
/// Are commas (instead of points) used as decimal separators?
|
||||
pub comma_decimal: bool,
|
||||
/// Tokens for parsing decimal prefixes (K, M, B, ...)
|
||||
///
|
||||
/// Format: Parsed token -> decimal power
|
||||
pub number_tokens: BTreeMap<String, u8>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct Text {
|
||||
pub simple_text: String,
|
||||
}
|
||||
|
||||
pub fn read_dict(project_root: &Path) -> Dictionary {
|
||||
|
|
@ -48,6 +82,27 @@ pub fn filter_datestr(string: &str) -> String {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Strip the numeric part from a large-number string.
///
/// Removes every ASCII digit, `.`, `,` and zero-width space (`U+200B`) and
/// returns the remaining characters in their original order. All other
/// characters — including whitespace and non-ASCII digits — are kept.
///
/// NOTE(review): the remainder is presumably what gets matched against
/// `number_tokens` (e.g. `"1.2K"` -> `"K"`) — confirm against the callers.
pub fn filter_largenumstr(string: &str) -> String {
    string
        .chars()
        .filter(|c| !matches!(c, '\u{200b}' | '.' | ',') && !c.is_ascii_digit())
        .collect()
}
|
||||
|
||||
/// Parse the ASCII digits of a string as a single number.
///
/// Every character that is not an ASCII digit (`0`-`9`) is discarded before
/// parsing, so signs, decimal separators and grouping characters are dropped
/// as well: `"1,234"` parses as `1234` and `"-5"` as `5`.
///
/// # Errors
///
/// Returns `F::Err` if the remaining digits cannot be parsed as `F`, e.g.
/// when the string contains no digits at all or the value does not fit in `F`.
pub fn parse_numeric<F>(string: &str) -> Result<F, F::Err>
where
    F: FromStr,
{
    string
        .chars()
        .filter(char::is_ascii_digit)
        .collect::<String>()
        .parse()
}
|
||||
|
||||
/// Parse all numbers occurring in a string and return them as a vec
|
||||
pub fn parse_numeric_vec<F>(string: &str) -> Vec<F>
|
||||
where
|
||||
|
|
|
|||
Reference in a new issue