fix: shorts duration parsing, playlist dates (no), number_nd_tokens (is)

This commit is contained in:
ThetaDev 2023-05-11 17:40:53 +02:00
parent b862d2d1f9
commit ef1cdbc91a
19 changed files with 302 additions and 230 deletions

View file

@ -208,7 +208,9 @@ pub(crate) struct CoverMusicItem {
///
/// `"2022"` Artist singles
///
/// `"Playlist", " • ", <"ThetaDev"> " • ", "26 songs"`
/// `"Playlist", " • ", <"YouTube Music"> " • ", "53 songs"`
///
/// `"Playlist", " • ", <"Vevo Playlists"> " • ", "13M views"`
///
/// `"Playlist", " • ", "YouTube Music"` Featured on
#[serde(default)]
@ -737,8 +739,9 @@ impl MusicListMapper {
let channel = channel_p.and_then(|p| {
p.0.into_iter().find_map(|c| ChannelId::try_from(c).ok())
});
let track_count =
tcount_p.and_then(|p| util::parse_numeric(p.first_str()).ok());
let track_count = tcount_p
.filter(|_| from_ytm)
.and_then(|p| util::parse_numeric(p.first_str()).ok());
self.items.push(MusicItem::Playlist(MusicPlaylistItem {
id,
@ -772,7 +775,6 @@ impl MusicListMapper {
let mut subtitle_parts = item.subtitle.split(util::DOT_SEPARATOR).into_iter();
let subtitle_p1 = subtitle_parts.next();
let subtitle_p2 = subtitle_parts.next();
let subtitle_p3 = subtitle_parts.next();
match item.navigation_endpoint.music_page() {
Some((page_type, id)) => match page_type {
@ -879,15 +881,13 @@ impl MusicListMapper {
let channel = subtitle_p2.and_then(|p| {
p.0.into_iter().find_map(|c| ChannelId::try_from(c).ok())
});
let track_count =
subtitle_p3.and_then(|p| util::parse_numeric(p.first_str()).ok());
self.items.push(MusicItem::Playlist(MusicPlaylistItem {
id,
name: item.title,
thumbnail: item.thumbnail_renderer.into(),
channel,
track_count,
track_count: None,
from_ytm,
}));
Ok(Some(MusicItemType::Playlist))

View file

@ -1,3 +1,5 @@
use once_cell::sync::Lazy;
use regex::Regex;
use serde::Deserialize;
use serde_with::{
json::JsonString, rust::deserialize_ignore_any, serde_as, DefaultOnError, VecSkipError,
@ -382,6 +384,10 @@ impl IsShort for Vec<TimeOverlay> {
}
}
/// Matches the separators found between the segments of a video's
/// accessibility text, so the text can be split into its parts
/// (the duration being one of them).
///
/// The pattern is an alternation of four separator forms:
///
/// - a space or no-break space (U+00A0), followed by a hyphen-minus,
///   en dash (U+2013) or em dash (U+2014), followed by a space
/// - a bare en dash (U+2013) without surrounding spaces
/// - an Armenian comma (U+055D) followed by a space
/// - a double quote followed by a comma and a space (`", `)
///
/// The spaced-dash alternative is listed before the bare en dash so that
/// ` – ` is consumed as one separator instead of leaving stray spaces on the
/// neighbouring segments.
///
/// The regex is compiled lazily on first access.
static ACCESSIBILITY_SEP_REGEX: Lazy<Regex> = Lazy::new(|| {
// The pattern is a constant literal, so `unwrap` can only panic if the literal itself is invalid.
Regex::new("(?:[ \u{00a0}][-\u{2013}\u{2014}] )|\u{2013}|(?:\u{055d} )|(?:\", )").unwrap()
});
/// Result of mapping a list of different YouTube entities
/// (videos, channels, playlists)
#[derive(Debug)]
@ -496,14 +502,29 @@ impl<T> YouTubeListMapper<T> {
.timestamp_text
});
let length = video.accessibility.and_then(|acc| {
let parts = ACCESSIBILITY_SEP_REGEX.split(&acc).collect::<Vec<_>>();
if parts.len() > 2 {
let i = match lang {
Language::Ru => 1,
_ => 2,
};
timeago::parse_video_duration_or_warn(
self.lang,
parts[parts.len() - i],
&mut self.warnings,
)
} else {
self.warnings
.push(format!("could not split video duration `{acc}`"));
None
}
});
VideoItem {
id: video.video_id,
name: video.headline,
length: video.accessibility.and_then(|acc| {
acc.rsplit(" - ").nth(1).and_then(|s| {
timeago::parse_video_duration_or_warn(self.lang, s, &mut self.warnings)
})
}),
length,
thumbnail: video.thumbnail.into(),
channel: self.channel.clone(),
publish_date: pub_date_txt.as_ref().and_then(|txt| {
@ -704,3 +725,50 @@ impl YouTubeListMapper<PlaylistItem> {
res.c.into_iter().for_each(|item| self.map_item(item));
}
}
#[cfg(test)]
mod tests {
use super::ACCESSIBILITY_SEP_REGEX;
use rstest::rstest;
#[rstest]
#[case::af(
"BTS - Permission to Dance Cover #shorts #pinkfong 50 sekondes speel video",
"50 sekondes"
)]
#[case::de(
"Point of view: Me VS My mom #shorts  8 Sekunden  Video wiedergeben",
"8 Sekunden"
)]
#[case::be(
"Point of view: Me VS My mom #shorts8 секунд прайграць відэа",
"8 секунд"
)]
#[case::fil("do u wanna get swole? - 53 segundo - i-play ang video", "53 segundo")]
#[case::ar(
"«the holy trinity of korean street food»՝ 1 րոպե՝ նվագարկել տեսանյութը",
"1 րոպե"
)]
#[case::lv(
"what i ate in google japan — 1 minūte — atskaņot videoklipu",
"1 minūte"
)]
#[case::sq("When you impulse buy... - 1 minutë - luaj videon", "1 minutë")]
#[case::uk(
"\"Point of view: Me VS My mom #shorts\", 8 секунд відтворити відео",
"8 секунд"
)]
// INFO: sw is unparseable "coming soonsekunde 58 - cheza video"
fn split_duration_txt(#[case] s: &str, #[case] expect: &str) {
    // In the languages covered by the cases, the duration is the
    // second-to-last segment once the text is split on the separator regex.
    let segments: Vec<&str> = ACCESSIBILITY_SEP_REGEX.split(s).collect();
    let duration = segments[segments.len() - 2];
    assert_eq!(duration, expect);
}
#[test]
fn split_duration_txt_ru() {
    // The Russian text ends with the duration, so the expected value is the
    // last segment rather than the second-to-last one.
    let text = "Воспроизвести видео – \"the holy trinity of korean street food\". Его продолжительность – 1 минута.";
    let segments: Vec<&str> = ACCESSIBILITY_SEP_REGEX.split(text).collect();
    let duration = segments[segments.len() - 1];
    assert_eq!(duration, "1 минута.");
}
}