From 0008e305c222f6f2a7dffd7aed403f6c46b69228 Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Sun, 7 May 2023 18:00:49 +0200 Subject: [PATCH] refactor: add iterators for parsing tokens --- README.md | 40 +++++++------- src/util/mod.rs | 112 +++++++++++++++++++++++++++++---------- src/util/timeago.rs | 125 +++++++++++++++++--------------------------- 3 files changed, 153 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index 62df601..a768901 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # RustyPipe +[![CI status](https://ci.thetadev.de/api/badges/ThetaDev/rustypipe/status.svg)](https://ci.thetadev.de/ThetaDev/rustypipe) + Client for the public YouTube / YouTube Music API (Innertube), inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor). @@ -7,25 +9,25 @@ inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor). ### YouTube -- [X] **Player** (video/audio streams, subtitles) -- [X] **Playlist** -- [X] **VideoDetails** (metadata, comments, recommended videos) -- [X] **Channel** (videos, shorts, livestreams, playlists, info, search) -- [X] **ChannelRSS** -- [X] **Search** (with filters) -- [X] **Search suggestions** -- [X] **Trending** -- [X] **URL resolver** +- **Player** (video/audio streams, subtitles) +- **Playlist** +- **VideoDetails** (metadata, comments, recommended videos) +- **Channel** (videos, shorts, livestreams, playlists, info, search) +- **ChannelRSS** +- **Search** (with filters) +- **Search suggestions** +- **Trending** +- **URL resolver** ### YouTube Music -- [X] **Playlist** -- [X] **Album** -- [X] **Artist** -- [X] **Search** -- [X] **Search suggestions** -- [X] **Radio** -- [X] **Track details** (lyrics, recommendations) -- [X] **Moods/Genres** -- [X] **Charts** -- [X] **New** +- **Playlist** +- **Album** +- **Artist** +- **Search** +- **Search suggestions** +- **Radio** +- **Track details** (lyrics, recommendations) +- **Moods/Genres** +- **Charts** +- **New** (albums, music videos) diff --git a/src/util/mod.rs b/src/util/mod.rs index b005caf..a91375f 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -10,7 +10,7 @@ pub use protobuf::{string_from_pb, ProtoBuilder}; use std::{ borrow::{Borrow, Cow}, collections::BTreeMap, - str::FromStr, + str::{FromStr, SplitWhitespace}, }; use base64::Engine; @@ -331,36 +331,18 @@ where } if digits.is_empty() { - if by_char { - filtered - .chars() - .find_map(|c| dict_entry.number_nd_tokens.get(&c.to_string())) - .and_then(|n| (*n as u64).try_into().ok()) - } else { - filtered - .split_whitespace() - .find_map(|token| dict_entry.number_nd_tokens.get(token)) - .and_then(|n| (*n as u64).try_into().ok()) - } + SplitTokens::new(&filtered, by_char) + .find_map(|token| dict_entry.number_nd_tokens.get(token)) + .and_then(|n| (*n as u64).try_into().ok()) } else { let num = digits.parse::().ok()?; - let lookup_token = |token: &str| match token { - "k" => Some(3), - _ => dict_entry.number_tokens.get(token).map(|t| *t as i32), - }; - - if by_char { - exp += filtered - .chars() - .filter_map(|token| lookup_token(&token.to_string())) - .sum::(); - } else { - exp += filtered - .split_whitespace() - .filter_map(lookup_token) - .sum::(); - } + exp += SplitTokens::new(&filtered, by_char) + .filter_map(|token| match token { + "k" => Some(3), + _ => dict_entry.number_tokens.get(token).map(|t| *t as i32), + }) + .sum::(); F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok() } @@ -415,6 +397,62 @@ pub fn b64_decode>(input: T) -> Result, base64::DecodeErr base64::engine::general_purpose::STANDARD.decode(input) } +/// An iterator over the chars in a string (in str format) +pub struct SplitChar<'a> { + txt: &'a str, + index: usize, +} + +impl<'a> From<&'a str> for SplitChar<'a> { + fn from(value: &'a str) -> Self { + Self { + txt: value, + index: 0, + } + } +} + +impl<'a> Iterator for SplitChar<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + self.txt + .get(self.index..) + .and_then(|txt| txt.chars().next()) + .map(|c| { + let start = self.index; + self.index += c.len_utf8(); + &self.txt[start..self.index] + }) + } +} + +/// An iterator for parsing strings. It can either iterate over words or characters. +pub enum SplitTokens<'a> { + Word(SplitWhitespace<'a>), + Char(SplitChar<'a>), +} + +impl<'a> SplitTokens<'a> { + pub fn new(s: &'a str, by_char: bool) -> Self { + match by_char { + true => Self::Char(SplitChar::from(s)), + false => Self::Word(s.split_whitespace()), + } + } +} + +impl<'a> Iterator for SplitTokens<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + match self { + SplitTokens::Word(iter) => iter.next(), + SplitTokens::Char(iter) => iter.next(), + } + } +} + #[cfg(test)] pub(crate) mod tests { use std::{fs::File, io::BufReader, path::PathBuf}; @@ -550,4 +588,22 @@ pub(crate) mod tests { let res = parse_large_numstr::(string, lang).expect(&emsg); assert_eq!(res, rounded, "{emsg}"); } + + #[test] + fn split_char() { + let teststr = "abc今天更新def"; + let res = SplitTokens::new(teststr, true).collect::>(); + assert_eq!(res.len(), 10); + let res_str = res.into_iter().collect::(); + assert_eq!(res_str, teststr) + } + + #[test] + fn split_words() { + let teststr = "abc 今天更新 ghi"; + let res = SplitTokens::new(teststr, false).collect::>(); + assert_eq!(res.len(), 3); + let res_str = res.join(" "); + assert_eq!(res_str, teststr) + } } diff --git a/src/util/timeago.rs b/src/util/timeago.rs index a384d9a..f5c368e 100644 --- a/src/util/timeago.rs +++ b/src/util/timeago.rs @@ -17,7 +17,7 @@ use time::{Date, Duration, Month, OffsetDateTime}; use crate::{ param::Language, - util::{self, dictionary}, + util::{self, dictionary, SplitTokens}, }; /// Parsed TimeAgo string, contains amount and time unit. @@ -149,79 +149,39 @@ fn filter_str(string: &str) -> String { .collect() } -fn parse_ta_token( - entry: &dictionary::Entry, - by_char: bool, - nd: bool, - filtered_str: &str, -) -> Option { - let tokens = match nd { - true => &entry.timeago_nd_tokens, - false => &entry.timeago_tokens, - }; - let mut qu = 1; +struct TaTokenParser<'a> { + iter: SplitTokens<'a>, + tokens: &'a phf::Map<&'static str, TaToken>, +} - if by_char { - filtered_str.chars().find_map(|word| { - tokens.get(&word.to_string()).and_then(|t| match t.unit { - Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), - None => { - qu = t.n; - None - } - }) - }) - } else { - filtered_str.split_whitespace().find_map(|word| { - tokens.get(word).and_then(|t| match t.unit { - Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), - None => { - qu = t.n; - None - } - }) - }) +impl<'a> TaTokenParser<'a> { + fn new(entry: &'a dictionary::Entry, by_char: bool, nd: bool, filtered_str: &'a str) -> Self { + let tokens = match nd { + true => &entry.timeago_nd_tokens, + false => &entry.timeago_tokens, + }; + Self { + iter: SplitTokens::new(filtered_str, by_char), + tokens, + } } } -fn parse_ta_tokens( - entry: &dictionary::Entry, - by_char: bool, - nd: bool, - filtered_str: &str, -) -> Vec { - let tokens = match nd { - true => &entry.timeago_nd_tokens, - false => &entry.timeago_tokens, - }; - let mut qu = 1; +impl<'a> Iterator for TaTokenParser<'a> { + type Item = TimeAgo; - if by_char { - filtered_str - .chars() - .filter_map(|word| { - tokens.get(&word.to_string()).and_then(|t| match t.unit { - Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), - None => { - qu = t.n; - None - } - }) + fn next(&mut self) -> Option { + // Quantity for parsing separate quantity + unit tokens + let mut qu = 1; + self.iter.find_map(|word| { + self.tokens.get(word).and_then(|t| match t.unit { + Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), + None => { + qu = t.n; + None + } }) - .collect() - } else { - filtered_str - .split_whitespace() - .filter_map(|word| { - tokens.get(word).and_then(|t| match t.unit { - Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), - None => { - qu = t.n; - None - } - }) - }) - .collect() + }) } } @@ -240,7 +200,9 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option { let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1); - parse_ta_token(&entry, util::lang_by_char(lang), false, &filtered_str).map(|ta| ta * qu) + TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str) + .next() + .map(|ta| ta * qu) } /// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object. @@ -273,11 +235,14 @@ pub fn parse_textual_date(lang: Language, textual_date: &str) -> Option(textual_date); match nums.len() { - 0 => match parse_ta_token(&entry, by_char, true, &filtered_str) { + 0 => match TaTokenParser::new(&entry, by_char, true, &filtered_str).next() { Some(timeago) => Some(ParsedDate::Relative(timeago)), - None => parse_ta_token(&entry, by_char, false, &filtered_str).map(ParsedDate::Relative), + None => TaTokenParser::new(&entry, by_char, false, &filtered_str) + .next() + .map(ParsedDate::Relative), }, - 1 => parse_ta_token(&entry, by_char, false, &filtered_str) + 1 => TaTokenParser::new(&entry, by_char, false, &filtered_str) + .next() .map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)), 2..=3 => { if nums.len() == entry.date_order.len() { @@ -348,12 +313,10 @@ pub fn parse_video_duration(lang: Language, video_duration: &str) -> Option } else { part.digits.parse::().ok()? }; - let tokens = parse_ta_tokens(&entry, by_char, false, &part.word); - if tokens.is_empty() { - return None; - } + let mut tokens = TaTokenParser::new(&entry, by_char, false, &part.word).peekable(); + tokens.peek()?; - tokens.iter().for_each(|ta| { + tokens.for_each(|ta| { secs += n * ta.secs() as u32; n = 1; }); @@ -805,4 +768,12 @@ mod tests { let now = OffsetDateTime::now_utc(); assert_eq!(date.year(), now.year() - 1); } + + #[test] + fn tx() { + let s = "Abcdef"; + let lc: (usize, char) = s.char_indices().last().unwrap(); + let t = &s[(lc.0 + lc.1.len_utf8())..]; + dbg!(&t); + } }