refactor: add iterators for parsing tokens

This commit is contained in:
ThetaDev 2023-05-07 18:00:49 +02:00
parent b3331b36a7
commit 0008e305c2
3 changed files with 153 additions and 124 deletions

View file

@ -1,5 +1,7 @@
# RustyPipe # RustyPipe
[![CI status](https://ci.thetadev.de/api/badges/ThetaDev/rustypipe/status.svg)](https://ci.thetadev.de/ThetaDev/rustypipe)
Client for the public YouTube / YouTube Music API (Innertube), Client for the public YouTube / YouTube Music API (Innertube),
inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor). inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
@ -7,25 +9,25 @@ inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
### YouTube ### YouTube
- [X] **Player** (video/audio streams, subtitles) - **Player** (video/audio streams, subtitles)
- [X] **Playlist** - **Playlist**
- [X] **VideoDetails** (metadata, comments, recommended videos) - **VideoDetails** (metadata, comments, recommended videos)
- [X] **Channel** (videos, shorts, livestreams, playlists, info, search) - **Channel** (videos, shorts, livestreams, playlists, info, search)
- [X] **ChannelRSS** - **ChannelRSS**
- [X] **Search** (with filters) - **Search** (with filters)
- [X] **Search suggestions** - **Search suggestions**
- [X] **Trending** - **Trending**
- [X] **URL resolver** - **URL resolver**
### YouTube Music ### YouTube Music
- [X] **Playlist** - **Playlist**
- [X] **Album** - **Album**
- [X] **Artist** - **Artist**
- [X] **Search** - **Search**
- [X] **Search suggestions** - **Search suggestions**
- [X] **Radio** - **Radio**
- [X] **Track details** (lyrics, recommendations) - **Track details** (lyrics, recommendations)
- [X] **Moods/Genres** - **Moods/Genres**
- [X] **Charts** - **Charts**
- [X] **New** - **New** (albums, music videos)

View file

@ -10,7 +10,7 @@ pub use protobuf::{string_from_pb, ProtoBuilder};
use std::{ use std::{
borrow::{Borrow, Cow}, borrow::{Borrow, Cow},
collections::BTreeMap, collections::BTreeMap,
str::FromStr, str::{FromStr, SplitWhitespace},
}; };
use base64::Engine; use base64::Engine;
@ -331,36 +331,18 @@ where
} }
if digits.is_empty() { if digits.is_empty() {
if by_char { SplitTokens::new(&filtered, by_char)
filtered .find_map(|token| dict_entry.number_nd_tokens.get(token))
.chars() .and_then(|n| (*n as u64).try_into().ok())
.find_map(|c| dict_entry.number_nd_tokens.get(&c.to_string()))
.and_then(|n| (*n as u64).try_into().ok())
} else {
filtered
.split_whitespace()
.find_map(|token| dict_entry.number_nd_tokens.get(token))
.and_then(|n| (*n as u64).try_into().ok())
}
} else { } else {
let num = digits.parse::<u64>().ok()?; let num = digits.parse::<u64>().ok()?;
let lookup_token = |token: &str| match token { exp += SplitTokens::new(&filtered, by_char)
"k" => Some(3), .filter_map(|token| match token {
_ => dict_entry.number_tokens.get(token).map(|t| *t as i32), "k" => Some(3),
}; _ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
})
if by_char { .sum::<i32>();
exp += filtered
.chars()
.filter_map(|token| lookup_token(&token.to_string()))
.sum::<i32>();
} else {
exp += filtered
.split_whitespace()
.filter_map(lookup_token)
.sum::<i32>();
}
F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok() F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok()
} }
@ -415,6 +397,62 @@ pub fn b64_decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, base64::DecodeErr
base64::engine::general_purpose::STANDARD.decode(input) base64::engine::general_purpose::STANDARD.decode(input)
} }
/// Iterator that yields every character of a string as a one-character
/// `&str` slice borrowed from the input (no per-character `String` is built).
pub struct SplitChar<'a> {
    /// The complete string being walked.
    txt: &'a str,
    /// Byte offset of the next character to yield.
    index: usize,
}

impl<'a> From<&'a str> for SplitChar<'a> {
    /// Start iterating at the first character of `txt`.
    fn from(txt: &'a str) -> Self {
        Self { txt, index: 0 }
    }
}

impl<'a> Iterator for SplitChar<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Remaining, not yet yielded part of the input.
        let rest = self.txt.get(self.index..)?;
        // Width in bytes of the next character; `None` once the input is used up.
        let width = rest.chars().next()?.len_utf8();

        let start = self.index;
        self.index = start + width;
        Some(&self.txt[start..self.index])
    }
}
/// Token iterator used when parsing strings: it walks the input either word
/// by word (split on whitespace) or character by character.
pub enum SplitTokens<'a> {
    /// Whitespace-separated words.
    Word(SplitWhitespace<'a>),
    /// Single characters, each yielded as a `&str` slice.
    Char(SplitChar<'a>),
}

impl<'a> SplitTokens<'a> {
    /// Create a token iterator over `s`.
    ///
    /// With `by_char` set, every character becomes its own token; otherwise
    /// the string is split on whitespace.
    pub fn new(s: &'a str, by_char: bool) -> Self {
        if by_char {
            Self::Char(SplitChar::from(s))
        } else {
            Self::Word(s.split_whitespace())
        }
    }
}

impl<'a> Iterator for SplitTokens<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Delegate to whichever underlying iterator was selected in `new`.
        match self {
            Self::Word(words) => words.next(),
            Self::Char(chars) => chars.next(),
        }
    }
}
#[cfg(test)] #[cfg(test)]
pub(crate) mod tests { pub(crate) mod tests {
use std::{fs::File, io::BufReader, path::PathBuf}; use std::{fs::File, io::BufReader, path::PathBuf};
@ -550,4 +588,22 @@ pub(crate) mod tests {
let res = parse_large_numstr::<u64>(string, lang).expect(&emsg); let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
assert_eq!(res, rounded, "{emsg}"); assert_eq!(res, rounded, "{emsg}");
} }
/// Character mode: every character (ASCII or multi-byte) is its own token
/// and concatenating the tokens restores the input.
#[test]
fn split_char() {
    let input = "abc今天更新def";
    let tokens: Vec<&str> = SplitTokens::new(input, true).collect();
    assert_eq!(tokens.len(), 10);

    let rebuilt: String = tokens.concat();
    assert_eq!(rebuilt, input)
}
/// Word mode: the input is split on whitespace and joining the tokens with
/// single spaces restores the input.
#[test]
fn split_words() {
    let input = "abc 今天更新 ghi";
    let tokens: Vec<&str> = SplitTokens::new(input, false).collect();
    assert_eq!(tokens.len(), 3);

    let rebuilt = tokens.join(" ");
    assert_eq!(rebuilt, input)
}
} }

View file

@ -17,7 +17,7 @@ use time::{Date, Duration, Month, OffsetDateTime};
use crate::{ use crate::{
param::Language, param::Language,
util::{self, dictionary}, util::{self, dictionary, SplitTokens},
}; };
/// Parsed TimeAgo string, contains amount and time unit. /// Parsed TimeAgo string, contains amount and time unit.
@ -149,79 +149,39 @@ fn filter_str(string: &str) -> String {
.collect() .collect()
} }
fn parse_ta_token( struct TaTokenParser<'a> {
entry: &dictionary::Entry, iter: SplitTokens<'a>,
by_char: bool, tokens: &'a phf::Map<&'static str, TaToken>,
nd: bool, }
filtered_str: &str,
) -> Option<TimeAgo> {
let tokens = match nd {
true => &entry.timeago_nd_tokens,
false => &entry.timeago_tokens,
};
let mut qu = 1;
if by_char { impl<'a> TaTokenParser<'a> {
filtered_str.chars().find_map(|word| { fn new(entry: &'a dictionary::Entry, by_char: bool, nd: bool, filtered_str: &'a str) -> Self {
tokens.get(&word.to_string()).and_then(|t| match t.unit { let tokens = match nd {
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), true => &entry.timeago_nd_tokens,
None => { false => &entry.timeago_tokens,
qu = t.n; };
None Self {
} iter: SplitTokens::new(filtered_str, by_char),
}) tokens,
}) }
} else {
filtered_str.split_whitespace().find_map(|word| {
tokens.get(word).and_then(|t| match t.unit {
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
None => {
qu = t.n;
None
}
})
})
} }
} }
fn parse_ta_tokens( impl<'a> Iterator for TaTokenParser<'a> {
entry: &dictionary::Entry, type Item = TimeAgo;
by_char: bool,
nd: bool,
filtered_str: &str,
) -> Vec<TimeAgo> {
let tokens = match nd {
true => &entry.timeago_nd_tokens,
false => &entry.timeago_tokens,
};
let mut qu = 1;
if by_char { fn next(&mut self) -> Option<Self::Item> {
filtered_str // Quantity for parsing separate quantity + unit tokens
.chars() let mut qu = 1;
.filter_map(|word| { self.iter.find_map(|word| {
tokens.get(&word.to_string()).and_then(|t| match t.unit { self.tokens.get(word).and_then(|t| match t.unit {
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }), Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
None => { None => {
qu = t.n; qu = t.n;
None None
} }
})
}) })
.collect() })
} else {
filtered_str
.split_whitespace()
.filter_map(|word| {
tokens.get(word).and_then(|t| match t.unit {
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
None => {
qu = t.n;
None
}
})
})
.collect()
} }
} }
@ -240,7 +200,9 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option<TimeAgo> {
let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1); let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
parse_ta_token(&entry, util::lang_by_char(lang), false, &filtered_str).map(|ta| ta * qu) TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str)
.next()
.map(|ta| ta * qu)
} }
/// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object. /// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object.
@ -273,11 +235,14 @@ pub fn parse_textual_date(lang: Language, textual_date: &str) -> Option<ParsedDa
let nums = util::parse_numeric_vec::<u16>(textual_date); let nums = util::parse_numeric_vec::<u16>(textual_date);
match nums.len() { match nums.len() {
0 => match parse_ta_token(&entry, by_char, true, &filtered_str) { 0 => match TaTokenParser::new(&entry, by_char, true, &filtered_str).next() {
Some(timeago) => Some(ParsedDate::Relative(timeago)), Some(timeago) => Some(ParsedDate::Relative(timeago)),
None => parse_ta_token(&entry, by_char, false, &filtered_str).map(ParsedDate::Relative), None => TaTokenParser::new(&entry, by_char, false, &filtered_str)
.next()
.map(ParsedDate::Relative),
}, },
1 => parse_ta_token(&entry, by_char, false, &filtered_str) 1 => TaTokenParser::new(&entry, by_char, false, &filtered_str)
.next()
.map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)), .map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)),
2..=3 => { 2..=3 => {
if nums.len() == entry.date_order.len() { if nums.len() == entry.date_order.len() {
@ -348,12 +313,10 @@ pub fn parse_video_duration(lang: Language, video_duration: &str) -> Option<u32>
} else { } else {
part.digits.parse::<u32>().ok()? part.digits.parse::<u32>().ok()?
}; };
let tokens = parse_ta_tokens(&entry, by_char, false, &part.word); let mut tokens = TaTokenParser::new(&entry, by_char, false, &part.word).peekable();
if tokens.is_empty() { tokens.peek()?;
return None;
}
tokens.iter().for_each(|ta| { tokens.for_each(|ta| {
secs += n * ta.secs() as u32; secs += n * ta.secs() as u32;
n = 1; n = 1;
}); });
@ -805,4 +768,12 @@ mod tests {
let now = OffsetDateTime::now_utc(); let now = OffsetDateTime::now_utc();
assert_eq!(date.year(), now.year() - 1); assert_eq!(date.year(), now.year() - 1);
} }
/// Slicing a string at the byte offset directly after its last character
/// is valid and yields an empty string (it must not panic).
#[test]
fn tx() {
    let s = "Abcdef";
    let (last_idx, last_char) = s
        .char_indices()
        .last()
        .expect("test string is not empty");
    // One past the final character, i.e. exactly `s.len()`.
    let tail = &s[(last_idx + last_char.len_utf8())..];
    // Assert the result instead of only printing it with `dbg!`, so the test
    // can actually fail and does not leave debug output in the test run.
    assert_eq!(tail, "");
}
} }