refactor: add iterators for parsing tokens

2023-05-07 18:00:49 +02:00 · 2023-05-07 18:00:49 +02:00 · 0008e305c2
commit 0008e305c2
parent b3331b36a7
3 changed files with 153 additions and 124 deletions
--- a/README.md
+++ b/README.md
@ -1,5 +1,7 @@
 # RustyPipe
 [![CI status](https://ci.thetadev.de/api/badges/ThetaDev/rustypipe/status.svg)](https://ci.thetadev.de/ThetaDev/rustypipe)
 Client for the public YouTube / YouTube Music API (Innertube),
 inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
@ -7,25 +9,25 @@ inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
 ### YouTube
- [X] **Player** (video/audio streams, subtitles)
+- **Player** (video/audio streams, subtitles)
- [X] **Playlist**
+- **Playlist**
- [X] **VideoDetails** (metadata, comments, recommended videos)
+- **VideoDetails** (metadata, comments, recommended videos)
- [X] **Channel** (videos, shorts, livestreams, playlists, info, search)
+- **Channel** (videos, shorts, livestreams, playlists, info, search)
- [X] **ChannelRSS**
+- **ChannelRSS**
- [X] **Search** (with filters)
+- **Search** (with filters)
- [X] **Search suggestions**
+- **Search suggestions**
- [X] **Trending**
+- **Trending**
- [X] **URL resolver**
+- **URL resolver**
 ### YouTube Music
- [X] **Playlist**
+- **Playlist**
- [X] **Album**
+- **Album**
- [X] **Artist**
+- **Artist**
- [X] **Search**
+- **Search**
- [X] **Search suggestions**
+- **Search suggestions**
- [X] **Radio**
+- **Radio**
- [X] **Track details** (lyrics, recommendations)
+- **Track details** (lyrics, recommendations)
- [X] **Moods/Genres**
+- **Moods/Genres**
- [X] **Charts**
+- **Charts**
- [X] **New**
+- **New** (albums, music videos)
--- a/src/util/mod.rs
+++ b/src/util/mod.rs
@ -10,7 +10,7 @@ pub use protobuf::{string_from_pb, ProtoBuilder};
 use std::{
    borrow::{Borrow, Cow},
    collections::BTreeMap,
-    str::FromStr,
+    str::{FromStr, SplitWhitespace},
 };
 use base64::Engine;
@ -331,36 +331,18 @@ where
    }
    if digits.is_empty() {
-        if by_char {
+        SplitTokens::new(&filtered, by_char)
-            filtered
+            .find_map(|token| dict_entry.number_nd_tokens.get(token))
-                .chars()
+            .and_then(|n| (*n as u64).try_into().ok())
                .find_map(|c| dict_entry.number_nd_tokens.get(&c.to_string()))
                .and_then(|n| (*n as u64).try_into().ok())
        } else {
            filtered
                .split_whitespace()
                .find_map(|token| dict_entry.number_nd_tokens.get(token))
                .and_then(|n| (*n as u64).try_into().ok())
        }
    } else {
        let num = digits.parse::<u64>().ok()?;
-        let lookup_token = |token: &str| match token {
+        exp += SplitTokens::new(&filtered, by_char)
-            "k" => Some(3),
+            .filter_map(|token| match token {
-            _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
+                "k" => Some(3),
-        };
+                _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
-
+            })
-        if by_char {
+            .sum::<i32>();
            exp += filtered
                .chars()
                .filter_map(|token| lookup_token(&token.to_string()))
                .sum::<i32>();
        } else {
            exp += filtered
                .split_whitespace()
                .filter_map(lookup_token)
                .sum::<i32>();
        }
        F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok()
    }
@ -415,6 +397,62 @@ pub fn b64_decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, base64::DecodeErr
    base64::engine::general_purpose::STANDARD.decode(input)
 }
 /// Iterator that walks a string one character at a time, yielding each
 /// character as a `&str` slice borrowed from the original string.
 pub struct SplitChar<'a> {
    /// The string being traversed.
    txt: &'a str,
    /// Byte offset of the next character to yield.
    index: usize,
 }
 impl<'a> From<&'a str> for SplitChar<'a> {
    fn from(value: &'a str) -> Self {
        Self { txt: value, index: 0 }
    }
 }
 impl<'a> Iterator for SplitChar<'a> {
    type Item = &'a str;
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the reference out so the returned slice is tied to `'a`, not to `&mut self`.
        let txt: &'a str = self.txt;
        let rest = txt.get(self.index..)?;
        // `rest` is empty once the whole string has been consumed.
        let first = rest.chars().next()?;
        let begin = self.index;
        // Advance by the UTF-8 width of the character so the offset stays on a char boundary.
        self.index = begin + first.len_utf8();
        Some(&txt[begin..self.index])
    }
 }
 /// Tokenizer over a string slice that yields either whitespace-separated
 /// words or single characters, depending on how it was constructed.
 pub enum SplitTokens<'a> {
    /// Yields whitespace-separated words.
    Word(SplitWhitespace<'a>),
    /// Yields one character at a time.
    Char(SplitChar<'a>),
 }
 impl<'a> SplitTokens<'a> {
    /// Creates a tokenizer over `s`: per character if `by_char` is set, per word otherwise.
    pub fn new(s: &'a str, by_char: bool) -> Self {
        if by_char {
            Self::Char(SplitChar::from(s))
        } else {
            Self::Word(s.split_whitespace())
        }
    }
 }
 impl<'a> Iterator for SplitTokens<'a> {
    type Item = &'a str;
    fn next(&mut self) -> Option<Self::Item> {
        // Delegate to whichever underlying iterator this tokenizer wraps.
        match self {
            Self::Word(words) => words.next(),
            Self::Char(chars) => chars.next(),
        }
    }
 }
 #[cfg(test)]
 pub(crate) mod tests {
    use std::{fs::File, io::BufReader, path::PathBuf};
@ -550,4 +588,22 @@ pub(crate) mod tests {
        let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
        assert_eq!(res, rounded, "{emsg}");
    }
    /// Character mode must yield one token per character (ASCII and multi-byte
    /// alike), and concatenating the tokens must reproduce the input.
    #[test]
    fn split_char() {
        let teststr = "abc今天更新def";
        let chars: Vec<&str> = SplitTokens::new(teststr, true).collect();
        assert_eq!(chars.len(), 10);
        let rejoined = chars.concat();
        assert_eq!(rejoined, teststr)
    }
    /// Word mode must split on whitespace, and joining the tokens with single
    /// spaces must reproduce the input.
    #[test]
    fn split_words() {
        let teststr = "abc 今天更新 ghi";
        let words: Vec<&str> = SplitTokens::new(teststr, false).collect();
        assert_eq!(words.len(), 3);
        let rejoined = words.join(" ");
        assert_eq!(rejoined, teststr)
    }
 }
--- a/src/util/timeago.rs
+++ b/src/util/timeago.rs
@ -17,7 +17,7 @@ use time::{Date, Duration, Month, OffsetDateTime};
 use crate::{
    param::Language,
-    util::{self, dictionary},
+    util::{self, dictionary, SplitTokens},
 };
 /// Parsed TimeAgo string, contains amount and time unit.
@ -149,79 +149,39 @@ fn filter_str(string: &str) -> String {
        .collect()
 }
-fn parse_ta_token(
+struct TaTokenParser<'a> {
-    entry: &dictionary::Entry,
+    iter: SplitTokens<'a>,
-    by_char: bool,
+    tokens: &'a phf::Map<&'static str, TaToken>,
-    nd: bool,
+}
    filtered_str: &str,
 ) -> Option<TimeAgo> {
    let tokens = match nd {
        true => &entry.timeago_nd_tokens,
        false => &entry.timeago_tokens,
    };
    let mut qu = 1;
-    if by_char {
+impl<'a> TaTokenParser<'a> {
-        filtered_str.chars().find_map(|word| {
+    fn new(entry: &'a dictionary::Entry, by_char: bool, nd: bool, filtered_str: &'a str) -> Self {
-            tokens.get(&word.to_string()).and_then(|t| match t.unit {
+        let tokens = match nd {
-                Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
+            true => &entry.timeago_nd_tokens,
-                None => {
+            false => &entry.timeago_tokens,
-                    qu = t.n;
+        };
-                    None
+        Self {
-                }
+            iter: SplitTokens::new(filtered_str, by_char),
-            })
+            tokens,
-        })
+        }
    } else {
        filtered_str.split_whitespace().find_map(|word| {
            tokens.get(word).and_then(|t| match t.unit {
                Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
                None => {
                    qu = t.n;
                    None
                }
            })
        })
    }
 }
-fn parse_ta_tokens(
+impl<'a> Iterator for TaTokenParser<'a> {
-    entry: &dictionary::Entry,
+    type Item = TimeAgo;
    by_char: bool,
    nd: bool,
    filtered_str: &str,
 ) -> Vec<TimeAgo> {
    let tokens = match nd {
        true => &entry.timeago_nd_tokens,
        false => &entry.timeago_tokens,
    };
    let mut qu = 1;
-    if by_char {
+    fn next(&mut self) -> Option<Self::Item> {
-        filtered_str
+        // Quantity for parsing separate quantity + unit tokens
-            .chars()
+        let mut qu = 1;
-            .filter_map(|word| {
+        self.iter.find_map(|word| {
-                tokens.get(&word.to_string()).and_then(|t| match t.unit {
+            self.tokens.get(word).and_then(|t| match t.unit {
-                    Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
+                Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
-                    None => {
+                None => {
-                        qu = t.n;
+                    qu = t.n;
-                        None
+                    None
-                    }
+                }
                })
            })
-            .collect()
+        })
    } else {
        filtered_str
            .split_whitespace()
            .filter_map(|word| {
                tokens.get(word).and_then(|t| match t.unit {
                    Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
                    None => {
                        qu = t.n;
                        None
                    }
                })
            })
            .collect()
    }
 }
@ -240,7 +200,9 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option<TimeAgo> {
    let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
-    parse_ta_token(&entry, util::lang_by_char(lang), false, &filtered_str).map(|ta| ta * qu)
+    TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str)
        .next()
        .map(|ta| ta * qu)
 }
 /// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object.
@ -273,11 +235,14 @@ pub fn parse_textual_date(lang: Language, textual_date: &str) -> Option<ParsedDa
    let nums = util::parse_numeric_vec::<u16>(textual_date);
    match nums.len() {
-        0 => match parse_ta_token(&entry, by_char, true, &filtered_str) {
+        0 => match TaTokenParser::new(&entry, by_char, true, &filtered_str).next() {
            Some(timeago) => Some(ParsedDate::Relative(timeago)),
-            None => parse_ta_token(&entry, by_char, false, &filtered_str).map(ParsedDate::Relative),
+            None => TaTokenParser::new(&entry, by_char, false, &filtered_str)
                .next()
                .map(ParsedDate::Relative),
        },
-        1 => parse_ta_token(&entry, by_char, false, &filtered_str)
+        1 => TaTokenParser::new(&entry, by_char, false, &filtered_str)
            .next()
            .map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)),
        2..=3 => {
            if nums.len() == entry.date_order.len() {
@ -348,12 +313,10 @@ pub fn parse_video_duration(lang: Language, video_duration: &str) -> Option<u32>
        } else {
            part.digits.parse::<u32>().ok()?
        };
-        let tokens = parse_ta_tokens(&entry, by_char, false, &part.word);
+        let mut tokens = TaTokenParser::new(&entry, by_char, false, &part.word).peekable();
-        if tokens.is_empty() {
+        tokens.peek()?;
            return None;
        }
-        tokens.iter().for_each(|ta| {
+        tokens.for_each(|ta| {
            secs += n * ta.secs() as u32;
            n = 1;
        });
@ -805,4 +768,12 @@ mod tests {
        let now = OffsetDateTime::now_utc();
        assert_eq!(date.year(), now.year() - 1);
    }
    /// Slicing a string at the byte offset just past its final character must
    /// yield an empty string rather than panic.
    #[test]
    fn tx() {
        let s = "Abcdef";
        let (last_idx, last_char) = s
            .char_indices()
            .last()
            .expect("test string is a non-empty literal");
        // `last_idx + len_utf8` equals `s.len()`, which is a valid (empty) slice start.
        let t = &s[(last_idx + last_char.len_utf8())..];
        assert_eq!(t, "");
    }
 }