refactor: add iterators for parsing tokens
This commit is contained in:
parent
b3331b36a7
commit
0008e305c2
3 changed files with 153 additions and 124 deletions
40
README.md
40
README.md
|
|
@ -1,5 +1,7 @@
|
||||||
# RustyPipe
|
# RustyPipe
|
||||||
|
|
||||||
|
[Build status](https://ci.thetadev.de/ThetaDev/rustypipe)
|
||||||
|
|
||||||
Client for the public YouTube / YouTube Music API (Innertube),
|
Client for the public YouTube / YouTube Music API (Innertube),
|
||||||
inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
|
inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
|
||||||
|
|
||||||
|
|
@ -7,25 +9,25 @@ inspired by [NewPipe](https://github.com/TeamNewPipe/NewPipeExtractor).
|
||||||
|
|
||||||
### YouTube
|
### YouTube
|
||||||
|
|
||||||
- [X] **Player** (video/audio streams, subtitles)
|
- **Player** (video/audio streams, subtitles)
|
||||||
- [X] **Playlist**
|
- **Playlist**
|
||||||
- [X] **VideoDetails** (metadata, comments, recommended videos)
|
- **VideoDetails** (metadata, comments, recommended videos)
|
||||||
- [X] **Channel** (videos, shorts, livestreams, playlists, info, search)
|
- **Channel** (videos, shorts, livestreams, playlists, info, search)
|
||||||
- [X] **ChannelRSS**
|
- **ChannelRSS**
|
||||||
- [X] **Search** (with filters)
|
- **Search** (with filters)
|
||||||
- [X] **Search suggestions**
|
- **Search suggestions**
|
||||||
- [X] **Trending**
|
- **Trending**
|
||||||
- [X] **URL resolver**
|
- **URL resolver**
|
||||||
|
|
||||||
### YouTube Music
|
### YouTube Music
|
||||||
|
|
||||||
- [X] **Playlist**
|
- **Playlist**
|
||||||
- [X] **Album**
|
- **Album**
|
||||||
- [X] **Artist**
|
- **Artist**
|
||||||
- [X] **Search**
|
- **Search**
|
||||||
- [X] **Search suggestions**
|
- **Search suggestions**
|
||||||
- [X] **Radio**
|
- **Radio**
|
||||||
- [X] **Track details** (lyrics, recommendations)
|
- **Track details** (lyrics, recommendations)
|
||||||
- [X] **Moods/Genres**
|
- **Moods/Genres**
|
||||||
- [X] **Charts**
|
- **Charts**
|
||||||
- [X] **New**
|
- **New** (albums, music videos)
|
||||||
|
|
|
||||||
112
src/util/mod.rs
112
src/util/mod.rs
|
|
@ -10,7 +10,7 @@ pub use protobuf::{string_from_pb, ProtoBuilder};
|
||||||
use std::{
|
use std::{
|
||||||
borrow::{Borrow, Cow},
|
borrow::{Borrow, Cow},
|
||||||
collections::BTreeMap,
|
collections::BTreeMap,
|
||||||
str::FromStr,
|
str::{FromStr, SplitWhitespace},
|
||||||
};
|
};
|
||||||
|
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
|
|
@ -331,36 +331,18 @@ where
|
||||||
}
|
}
|
||||||
|
|
||||||
if digits.is_empty() {
|
if digits.is_empty() {
|
||||||
if by_char {
|
SplitTokens::new(&filtered, by_char)
|
||||||
filtered
|
.find_map(|token| dict_entry.number_nd_tokens.get(token))
|
||||||
.chars()
|
.and_then(|n| (*n as u64).try_into().ok())
|
||||||
.find_map(|c| dict_entry.number_nd_tokens.get(&c.to_string()))
|
|
||||||
.and_then(|n| (*n as u64).try_into().ok())
|
|
||||||
} else {
|
|
||||||
filtered
|
|
||||||
.split_whitespace()
|
|
||||||
.find_map(|token| dict_entry.number_nd_tokens.get(token))
|
|
||||||
.and_then(|n| (*n as u64).try_into().ok())
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
let num = digits.parse::<u64>().ok()?;
|
let num = digits.parse::<u64>().ok()?;
|
||||||
|
|
||||||
let lookup_token = |token: &str| match token {
|
exp += SplitTokens::new(&filtered, by_char)
|
||||||
"k" => Some(3),
|
.filter_map(|token| match token {
|
||||||
_ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
|
"k" => Some(3),
|
||||||
};
|
_ => dict_entry.number_tokens.get(token).map(|t| *t as i32),
|
||||||
|
})
|
||||||
if by_char {
|
.sum::<i32>();
|
||||||
exp += filtered
|
|
||||||
.chars()
|
|
||||||
.filter_map(|token| lookup_token(&token.to_string()))
|
|
||||||
.sum::<i32>();
|
|
||||||
} else {
|
|
||||||
exp += filtered
|
|
||||||
.split_whitespace()
|
|
||||||
.filter_map(lookup_token)
|
|
||||||
.sum::<i32>();
|
|
||||||
}
|
|
||||||
|
|
||||||
F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok()
|
F::try_from(num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)?).ok()
|
||||||
}
|
}
|
||||||
|
|
@ -415,6 +397,62 @@ pub fn b64_decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, base64::DecodeErr
|
||||||
base64::engine::general_purpose::STANDARD.decode(input)
|
base64::engine::general_purpose::STANDARD.decode(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// An iterator over the characters of a string that yields every character
/// as a `&str` slice of the input instead of a `char`.
///
/// Multi-byte characters are kept intact: each item is exactly one character
/// long and borrows from the original string, so no allocation is needed.
#[derive(Debug, Clone)]
pub struct SplitChar<'a> {
    /// The string being split
    txt: &'a str,
    /// Byte offset of the next character to yield
    index: usize,
}

impl<'a> From<&'a str> for SplitChar<'a> {
    fn from(value: &'a str) -> Self {
        Self {
            txt: value,
            index: 0,
        }
    }
}

impl<'a> Iterator for SplitChar<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // Copy the reference out so the returned slice borrows from the input
        // string (lifetime `'a`) and not from the iterator itself.
        let txt = self.txt;
        let rest = txt.get(self.index..)?;
        let c = rest.chars().next()?;
        let (token, _) = rest.split_at(c.len_utf8());
        self.index += token.len();
        Some(token)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // A UTF-8 encoded character occupies between 1 and 4 bytes.
        let remaining = self.txt.len().saturating_sub(self.index);
        ((remaining + 3) / 4, Some(remaining))
    }
}

// Once the end of the string is reached, `next` keeps returning `None`.
impl std::iter::FusedIterator for SplitChar<'_> {}

/// An iterator for parsing strings. It can either iterate over words or characters.
#[derive(Debug, Clone)]
pub enum SplitTokens<'a> {
    /// Iterate over the whitespace-separated words of the string
    Word(SplitWhitespace<'a>),
    /// Iterate over the individual characters of the string
    Char(SplitChar<'a>),
}

impl<'a> SplitTokens<'a> {
    /// Create a new token iterator over `s`.
    ///
    /// If `by_char` is true, every character is a token. Otherwise the string
    /// is split into tokens at whitespace.
    pub fn new(s: &'a str, by_char: bool) -> Self {
        if by_char {
            Self::Char(SplitChar::from(s))
        } else {
            Self::Word(s.split_whitespace())
        }
    }
}

impl<'a> Iterator for SplitTokens<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            SplitTokens::Word(iter) => iter.next(),
            SplitTokens::Char(iter) => iter.next(),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            SplitTokens::Word(iter) => iter.size_hint(),
            SplitTokens::Char(iter) => iter.size_hint(),
        }
    }
}

// Both wrapped iterators are fused, so the wrapper is fused as well.
impl std::iter::FusedIterator for SplitTokens<'_> {}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) mod tests {
|
pub(crate) mod tests {
|
||||||
use std::{fs::File, io::BufReader, path::PathBuf};
|
use std::{fs::File, io::BufReader, path::PathBuf};
|
||||||
|
|
@ -550,4 +588,22 @@ pub(crate) mod tests {
|
||||||
let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
|
let res = parse_large_numstr::<u64>(string, lang).expect(&emsg);
|
||||||
assert_eq!(res, rounded, "{emsg}");
|
assert_eq!(res, rounded, "{emsg}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Splitting by character must produce 10 tokens for the 10-character test
/// string and the tokens must concatenate back to the input.
#[test]
fn split_char() {
    let teststr = "abc今天更新def";
    let tokens: Vec<&str> = SplitTokens::new(teststr, true).collect();
    assert_eq!(tokens.len(), 10);

    let rejoined: String = tokens.into_iter().collect();
    assert_eq!(rejoined, teststr)
}
|
||||||
|
|
||||||
|
/// Splitting by word must produce 3 tokens for the test string and joining
/// them with single spaces must reproduce the input.
#[test]
fn split_words() {
    let teststr = "abc 今天更新 ghi";
    let words: Vec<&str> = SplitTokens::new(teststr, false).collect();
    assert_eq!(words.len(), 3);

    let rejoined = words.join(" ");
    assert_eq!(rejoined, teststr)
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ use time::{Date, Duration, Month, OffsetDateTime};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
param::Language,
|
param::Language,
|
||||||
util::{self, dictionary},
|
util::{self, dictionary, SplitTokens},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Parsed TimeAgo string, contains amount and time unit.
|
/// Parsed TimeAgo string, contains amount and time unit.
|
||||||
|
|
@ -149,79 +149,39 @@ fn filter_str(string: &str) -> String {
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_ta_token(
|
struct TaTokenParser<'a> {
|
||||||
entry: &dictionary::Entry,
|
iter: SplitTokens<'a>,
|
||||||
by_char: bool,
|
tokens: &'a phf::Map<&'static str, TaToken>,
|
||||||
nd: bool,
|
}
|
||||||
filtered_str: &str,
|
|
||||||
) -> Option<TimeAgo> {
|
|
||||||
let tokens = match nd {
|
|
||||||
true => &entry.timeago_nd_tokens,
|
|
||||||
false => &entry.timeago_tokens,
|
|
||||||
};
|
|
||||||
let mut qu = 1;
|
|
||||||
|
|
||||||
if by_char {
|
impl<'a> TaTokenParser<'a> {
|
||||||
filtered_str.chars().find_map(|word| {
|
fn new(entry: &'a dictionary::Entry, by_char: bool, nd: bool, filtered_str: &'a str) -> Self {
|
||||||
tokens.get(&word.to_string()).and_then(|t| match t.unit {
|
let tokens = match nd {
|
||||||
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
|
true => &entry.timeago_nd_tokens,
|
||||||
None => {
|
false => &entry.timeago_tokens,
|
||||||
qu = t.n;
|
};
|
||||||
None
|
Self {
|
||||||
}
|
iter: SplitTokens::new(filtered_str, by_char),
|
||||||
})
|
tokens,
|
||||||
})
|
}
|
||||||
} else {
|
|
||||||
filtered_str.split_whitespace().find_map(|word| {
|
|
||||||
tokens.get(word).and_then(|t| match t.unit {
|
|
||||||
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
|
|
||||||
None => {
|
|
||||||
qu = t.n;
|
|
||||||
None
|
|
||||||
}
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_ta_tokens(
|
impl<'a> Iterator for TaTokenParser<'a> {
|
||||||
entry: &dictionary::Entry,
|
type Item = TimeAgo;
|
||||||
by_char: bool,
|
|
||||||
nd: bool,
|
|
||||||
filtered_str: &str,
|
|
||||||
) -> Vec<TimeAgo> {
|
|
||||||
let tokens = match nd {
|
|
||||||
true => &entry.timeago_nd_tokens,
|
|
||||||
false => &entry.timeago_tokens,
|
|
||||||
};
|
|
||||||
let mut qu = 1;
|
|
||||||
|
|
||||||
if by_char {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
filtered_str
|
// Quantity for parsing separate quantity + unit tokens
|
||||||
.chars()
|
let mut qu = 1;
|
||||||
.filter_map(|word| {
|
self.iter.find_map(|word| {
|
||||||
tokens.get(&word.to_string()).and_then(|t| match t.unit {
|
self.tokens.get(word).and_then(|t| match t.unit {
|
||||||
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
|
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
|
||||||
None => {
|
None => {
|
||||||
qu = t.n;
|
qu = t.n;
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
})
|
|
||||||
})
|
})
|
||||||
.collect()
|
})
|
||||||
} else {
|
|
||||||
filtered_str
|
|
||||||
.split_whitespace()
|
|
||||||
.filter_map(|word| {
|
|
||||||
tokens.get(word).and_then(|t| match t.unit {
|
|
||||||
Some(unit) => Some(TimeAgo { n: t.n * qu, unit }),
|
|
||||||
None => {
|
|
||||||
qu = t.n;
|
|
||||||
None
|
|
||||||
}
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -240,7 +200,9 @@ pub fn parse_timeago(lang: Language, textual_date: &str) -> Option<TimeAgo> {
|
||||||
|
|
||||||
let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
|
let qu: u8 = util::parse_numeric(textual_date).unwrap_or(1);
|
||||||
|
|
||||||
parse_ta_token(&entry, util::lang_by_char(lang), false, &filtered_str).map(|ta| ta * qu)
|
TaTokenParser::new(&entry, util::lang_by_char(lang), false, &filtered_str)
|
||||||
|
.next()
|
||||||
|
.map(|ta| ta * qu)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object.
|
/// Parse a TimeAgo string (e.g. "29 minutes ago") into a Chrono DateTime object.
|
||||||
|
|
@ -273,11 +235,14 @@ pub fn parse_textual_date(lang: Language, textual_date: &str) -> Option<ParsedDa
|
||||||
let nums = util::parse_numeric_vec::<u16>(textual_date);
|
let nums = util::parse_numeric_vec::<u16>(textual_date);
|
||||||
|
|
||||||
match nums.len() {
|
match nums.len() {
|
||||||
0 => match parse_ta_token(&entry, by_char, true, &filtered_str) {
|
0 => match TaTokenParser::new(&entry, by_char, true, &filtered_str).next() {
|
||||||
Some(timeago) => Some(ParsedDate::Relative(timeago)),
|
Some(timeago) => Some(ParsedDate::Relative(timeago)),
|
||||||
None => parse_ta_token(&entry, by_char, false, &filtered_str).map(ParsedDate::Relative),
|
None => TaTokenParser::new(&entry, by_char, false, &filtered_str)
|
||||||
|
.next()
|
||||||
|
.map(ParsedDate::Relative),
|
||||||
},
|
},
|
||||||
1 => parse_ta_token(&entry, by_char, false, &filtered_str)
|
1 => TaTokenParser::new(&entry, by_char, false, &filtered_str)
|
||||||
|
.next()
|
||||||
.map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)),
|
.map(|timeago| ParsedDate::Relative(timeago * nums[0] as u8)),
|
||||||
2..=3 => {
|
2..=3 => {
|
||||||
if nums.len() == entry.date_order.len() {
|
if nums.len() == entry.date_order.len() {
|
||||||
|
|
@ -348,12 +313,10 @@ pub fn parse_video_duration(lang: Language, video_duration: &str) -> Option<u32>
|
||||||
} else {
|
} else {
|
||||||
part.digits.parse::<u32>().ok()?
|
part.digits.parse::<u32>().ok()?
|
||||||
};
|
};
|
||||||
let tokens = parse_ta_tokens(&entry, by_char, false, &part.word);
|
let mut tokens = TaTokenParser::new(&entry, by_char, false, &part.word).peekable();
|
||||||
if tokens.is_empty() {
|
tokens.peek()?;
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokens.iter().for_each(|ta| {
|
tokens.for_each(|ta| {
|
||||||
secs += n * ta.secs() as u32;
|
secs += n * ta.secs() as u32;
|
||||||
n = 1;
|
n = 1;
|
||||||
});
|
});
|
||||||
|
|
@ -805,4 +768,12 @@ mod tests {
|
||||||
let now = OffsetDateTime::now_utc();
|
let now = OffsetDateTime::now_utc();
|
||||||
assert_eq!(date.year(), now.year() - 1);
|
assert_eq!(date.year(), now.year() - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Slicing a string directly behind its last character is valid and yields
/// an empty string.
#[test]
fn tx() {
    let s = "Abcdef";
    let (last_pos, last_char) = s
        .char_indices()
        .last()
        .expect("test string is not empty");
    let end = last_pos + last_char.len_utf8();

    // The byte offset behind the last character is the length of the string
    assert_eq!(end, s.len());
    assert_eq!(&s[end..], "");
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Reference in a new issue