feat(codegen): collected video duration samples
This commit is contained in:
parent
19781eab36
commit
800073df48
14 changed files with 7325 additions and 1449 deletions
|
|
@ -1,120 +1,50 @@
|
|||
use std::{
|
||||
collections::BTreeMap,
|
||||
fs::File,
|
||||
io::BufReader,
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
};
|
||||
use std::{collections::BTreeMap, fs::File, io::BufReader, path::PathBuf, str::FromStr};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use path_macro::path;
|
||||
use rustypipe::{client::YTContext, model::AlbumType, param::Language};
|
||||
use regex::Regex;
|
||||
use rustypipe::param::Language;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
static DICT_PATH: Lazy<PathBuf> = Lazy::new(|| path!("testfiles" / "dict" / "dictionary.json"));
|
||||
static DICT_OVERRIDE_PATH: Lazy<PathBuf> =
|
||||
Lazy::new(|| path!("testfiles" / "dict" / "dictionary_override.json"));
|
||||
use crate::model::DictEntry;
|
||||
|
||||
/// Get the path of the `testfiles` directory
|
||||
pub static TESTFILES_DIR: Lazy<PathBuf> = Lazy::new(|| {
|
||||
path!(env!("CARGO_MANIFEST_DIR") / ".." / "testfiles")
|
||||
.canonicalize()
|
||||
.unwrap()
|
||||
});
|
||||
/// Get the path of the `dict` directory
|
||||
pub static DICT_DIR: Lazy<PathBuf> = Lazy::new(|| path!(*TESTFILES_DIR / "dict"));
|
||||
/// Get the path of the `src` directory
|
||||
pub static SRC_DIR: Lazy<PathBuf> = Lazy::new(|| path!(env!("CARGO_MANIFEST_DIR") / ".." / "src"));
|
||||
|
||||
type Dictionary = BTreeMap<Language, DictEntry>;
|
||||
type DictionaryOverride = BTreeMap<Language, DictOverrideEntry>;
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DictEntry {
|
||||
/// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
|
||||
pub equivalent: Vec<Language>,
|
||||
/// Should the language be parsed by character instead of by word?
|
||||
/// (e.g. Chinese/Japanese)
|
||||
pub by_char: bool,
|
||||
/// Tokens for parsing timeago strings.
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
///
|
||||
/// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
|
||||
/// `h`(our), `m`(inute), `s`(econd)
|
||||
pub timeago_tokens: BTreeMap<String, String>,
|
||||
/// Order in which to parse numeric date components. Formatted as
|
||||
/// a string of date identifiers (Y, M, D).
|
||||
///
|
||||
/// Examples:
|
||||
///
|
||||
/// - 03.01.2020 => `"DMY"`
|
||||
/// - Jan 3, 2020 => `"DY"`
|
||||
pub date_order: String,
|
||||
/// Tokens for parsing month names.
|
||||
///
|
||||
/// Format: Parsed token -> Month number (starting from 1)
|
||||
pub months: BTreeMap<String, u8>,
|
||||
/// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
|
||||
///
|
||||
/// Format: Parsed token -> \[Quantity\] Identifier
|
||||
pub timeago_nd_tokens: BTreeMap<String, String>,
|
||||
/// Are commas (instead of points) used as decimal separators?
|
||||
pub comma_decimal: bool,
|
||||
/// Tokens for parsing decimal prefixes (K, M, B, ...)
|
||||
///
|
||||
/// Format: Parsed token -> decimal power
|
||||
pub number_tokens: BTreeMap<String, u8>,
|
||||
/// Tokens for parsing number strings with no digits (e.g. "No videos")
|
||||
///
|
||||
/// Format: Parsed token -> value
|
||||
pub number_nd_tokens: BTreeMap<String, u8>,
|
||||
/// Names of album types (Album, Single, ...)
|
||||
///
|
||||
/// Format: Parsed text -> Album type
|
||||
pub album_types: BTreeMap<String, AlbumType>,
|
||||
struct DictOverrideEntry {
|
||||
number_tokens: BTreeMap<String, Option<u8>>,
|
||||
number_nd_tokens: BTreeMap<String, Option<u8>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct DictOverrideEntry {
|
||||
pub number_tokens: BTreeMap<String, Option<u8>>,
|
||||
pub number_nd_tokens: BTreeMap<String, Option<u8>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct QBrowse<'a> {
|
||||
pub context: YTContext<'a>,
|
||||
pub browse_id: &'a str,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub params: Option<&'a str>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct QCont<'a> {
|
||||
pub context: YTContext<'a>,
|
||||
pub continuation: &'a str,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct TextRuns {
|
||||
pub runs: Vec<Text>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct Text {
|
||||
#[serde(alias = "simpleText")]
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
pub fn read_dict(project_root: &Path) -> Dictionary {
|
||||
let json_path = path!(project_root / *DICT_PATH);
|
||||
pub fn read_dict() -> Dictionary {
|
||||
let json_path = path!(*DICT_DIR / "dictionary.json");
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
serde_json::from_reader(BufReader::new(json_file)).unwrap()
|
||||
}
|
||||
|
||||
pub fn read_dict_override(project_root: &Path) -> DictionaryOverride {
|
||||
let json_path = path!(project_root / *DICT_OVERRIDE_PATH);
|
||||
fn read_dict_override() -> DictionaryOverride {
|
||||
let json_path = path!(*DICT_DIR / "dictionary_override.json");
|
||||
let json_file = File::open(json_path).unwrap();
|
||||
serde_json::from_reader(BufReader::new(json_file)).unwrap()
|
||||
}
|
||||
|
||||
pub fn write_dict(project_root: &Path, dict: Dictionary) {
|
||||
let dict_override = read_dict_override(project_root);
|
||||
pub fn write_dict(dict: Dictionary) {
|
||||
let dict_override = read_dict_override();
|
||||
|
||||
let json_path = path!(project_root / *DICT_PATH);
|
||||
let json_path = path!(*DICT_DIR / "dictionary.json");
|
||||
let json_file = File::create(json_path).unwrap();
|
||||
|
||||
fn apply_map<K: Clone + Ord, V: Clone>(map: &mut BTreeMap<K, V>, or: &BTreeMap<K, Option<V>>) {
|
||||
|
|
@ -251,3 +181,26 @@ pub fn parse_largenum_en(string: &str) -> Option<u64> {
|
|||
|
||||
num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)
|
||||
}
|
||||
|
||||
/// Parse textual video length (e.g. `0:49`, `2:02` or `1:48:18`)
|
||||
/// and return the duration in seconds.
|
||||
pub fn parse_video_length(text: &str) -> Option<u32> {
|
||||
static VIDEO_LENGTH_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})"#).unwrap());
|
||||
VIDEO_LENGTH_REGEX.captures(text).map(|cap| {
|
||||
let hrs = cap
|
||||
.get(1)
|
||||
.and_then(|x| x.as_str().parse::<u32>().ok())
|
||||
.unwrap_or_default();
|
||||
let min = cap
|
||||
.get(2)
|
||||
.and_then(|x| x.as_str().parse::<u32>().ok())
|
||||
.unwrap_or_default();
|
||||
let sec = cap
|
||||
.get(3)
|
||||
.and_then(|x| x.as_str().parse::<u32>().ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
hrs * 3600 + min * 60 + sec
|
||||
})
|
||||
}
|
||||
|
|
|
|||
Reference in a new issue