feat(codegen): collected video duration samples

2023-05-06 21:12:49 +02:00 · 2023-05-06 21:12:49 +02:00 · 800073df48
commit 800073df48
parent 19781eab36
14 changed files with 7325 additions and 1449 deletions
--- a/codegen/src/util.rs
+++ b/codegen/src/util.rs
@ -1,120 +1,50 @@
-use std::{
-    collections::BTreeMap,
-    fs::File,
-    io::BufReader,
-    path::{Path, PathBuf},
-    str::FromStr,
-};
+use std::{collections::BTreeMap, fs::File, io::BufReader, path::PathBuf, str::FromStr};

 use once_cell::sync::Lazy;
 use path_macro::path;
-use rustypipe::{client::YTContext, model::AlbumType, param::Language};
+use regex::Regex;
+use rustypipe::param::Language;
 use serde::{Deserialize, Serialize};

-static DICT_PATH: Lazy<PathBuf> = Lazy::new(|| path!("testfiles" / "dict" / "dictionary.json"));
-static DICT_OVERRIDE_PATH: Lazy<PathBuf> =
-    Lazy::new(|| path!("testfiles" / "dict" / "dictionary_override.json"));
+use crate::model::DictEntry;
+
+/// Canonicalized path of the `testfiles` directory, resolved as
+/// `$CARGO_MANIFEST_DIR/../testfiles` (the manifest directory is baked in
+/// at compile time via `env!`).
+///
+/// # Panics
+/// Panics on first access if the path cannot be canonicalized
+/// (for example because the directory does not exist).
+pub static TESTFILES_DIR: Lazy<PathBuf> = Lazy::new(|| {
+    path!(env!("CARGO_MANIFEST_DIR") / ".." / "testfiles")
+        .canonicalize()
+        .unwrap()
+});
+/// Path of the `dict` directory, i.e. the `dict` subdirectory of
+/// [`TESTFILES_DIR`]. Accessing it forces [`TESTFILES_DIR`] and therefore
+/// shares its panic condition.
+pub static DICT_DIR: Lazy<PathBuf> = Lazy::new(|| path!(*TESTFILES_DIR / "dict"));
+/// Path of the `src` directory, built as `$CARGO_MANIFEST_DIR/../src`.
+/// Unlike [`TESTFILES_DIR`] this path is not canonicalized, so it keeps the
+/// `..` component and is not checked for existence.
+pub static SRC_DIR: Lazy<PathBuf> = Lazy::new(|| path!(env!("CARGO_MANIFEST_DIR") / ".." / "src"));

 type Dictionary = BTreeMap<Language, DictEntry>;
 type DictionaryOverride = BTreeMap<Language, DictOverrideEntry>;

 #[derive(Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
-pub struct DictEntry {
-    /// List of languages that should be treated equally (e.g. EnUs/EnGb/EnIn)
-    pub equivalent: Vec<Language>,
-    /// Should the language be parsed by character instead of by word?
-    /// (e.g. Chinese/Japanese)
-    pub by_char: bool,
-    /// Tokens for parsing timeago strings.
-    ///
-    /// Format: Parsed token -> \[Quantity\] Identifier
-    ///
-    /// Identifiers: `Y`(ear), `M`(month), `W`(eek), `D`(ay),
-    /// `h`(our), `m`(inute), `s`(econd)
-    pub timeago_tokens: BTreeMap<String, String>,
-    /// Order in which to parse numeric date components. Formatted as
-    /// a string of date identifiers (Y, M, D).
-    ///
-    /// Examples:
-    ///
-    /// - 03.01.2020 => `"DMY"`
-    /// - Jan 3, 2020 => `"DY"`
-    pub date_order: String,
-    /// Tokens for parsing month names.
-    ///
-    /// Format: Parsed token -> Month number (starting from 1)
-    pub months: BTreeMap<String, u8>,
-    /// Tokens for parsing date strings with no digits (e.g. Today, Tomorrow)
-    ///
-    /// Format: Parsed token -> \[Quantity\] Identifier
-    pub timeago_nd_tokens: BTreeMap<String, String>,
-    /// Are commas (instead of points) used as decimal separators?
-    pub comma_decimal: bool,
-    /// Tokens for parsing decimal prefixes (K, M, B, ...)
-    ///
-    /// Format: Parsed token -> decimal power
-    pub number_tokens: BTreeMap<String, u8>,
-    /// Tokens for parsing number strings with no digits (e.g. "No videos")
-    ///
-    /// Format: Parsed token -> value
-    pub number_nd_tokens: BTreeMap<String, u8>,
-    /// Names of album types (Album, Single, ...)
-    ///
-    /// Format: Parsed text -> Album type
-    pub album_types: BTreeMap<String, AlbumType>,
+/// Per-language entry of the dictionary override file
+/// (`dictionary_override.json`), keyed by language in `DictionaryOverride`.
+///
+/// NOTE(review): a `None` value presumably removes the token from the
+/// generated dictionary while `Some(v)` replaces its value; confirm against
+/// `apply_map` in `write_dict`.
+struct DictOverrideEntry {
+    /// Overrides for the dictionary's `number_tokens` map (token -> value).
+    number_tokens: BTreeMap<String, Option<u8>>,
+    /// Overrides for the dictionary's `number_nd_tokens` map (token -> value).
+    number_nd_tokens: BTreeMap<String, Option<u8>>,
 }

-#[derive(Debug, Default, Serialize, Deserialize)]
-#[serde(default)]
-pub struct DictOverrideEntry {
-    pub number_tokens: BTreeMap<String, Option<u8>>,
-    pub number_nd_tokens: BTreeMap<String, Option<u8>>,
-}
-
-#[derive(Debug, Serialize)]
-#[serde(rename_all = "camelCase")]
-pub struct QBrowse<'a> {
-    pub context: YTContext<'a>,
-    pub browse_id: &'a str,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub params: Option<&'a str>,
-}
-
-#[derive(Serialize)]
-#[serde(rename_all = "camelCase")]
-pub struct QCont<'a> {
-    pub context: YTContext<'a>,
-    pub continuation: &'a str,
-}
-
-#[derive(Clone, Debug, Deserialize)]
-pub struct TextRuns {
-    pub runs: Vec<Text>,
-}
-
-#[derive(Clone, Debug, Deserialize)]
-pub struct Text {
-    #[serde(alias = "simpleText")]
-    pub text: String,
-}
-
-pub fn read_dict(project_root: &Path) -> Dictionary {
-    let json_path = path!(project_root / *DICT_PATH);
+/// Read the dictionary from `dictionary.json` inside [`DICT_DIR`].
+///
+/// # Panics
+/// Panics if the file cannot be opened or if its contents cannot be
+/// deserialized into a `Dictionary`.
+pub fn read_dict() -> Dictionary {
+    let json_path = path!(*DICT_DIR / "dictionary.json");
    let json_file = File::open(json_path).unwrap();
    serde_json::from_reader(BufReader::new(json_file)).unwrap()
 }

-pub fn read_dict_override(project_root: &Path) -> DictionaryOverride {
-    let json_path = path!(project_root / *DICT_OVERRIDE_PATH);
+/// Read the dictionary overrides from `dictionary_override.json` inside
+/// [`DICT_DIR`].
+///
+/// # Panics
+/// Panics if the file cannot be opened or if its contents cannot be
+/// deserialized into a `DictionaryOverride`.
+fn read_dict_override() -> DictionaryOverride {
+    let json_path = path!(*DICT_DIR / "dictionary_override.json");
    let json_file = File::open(json_path).unwrap();
    serde_json::from_reader(BufReader::new(json_file)).unwrap()
 }

-pub fn write_dict(project_root: &Path, dict: Dictionary) {
-    let dict_override = read_dict_override(project_root);
+pub fn write_dict(dict: Dictionary) {
+    let dict_override = read_dict_override();

-    let json_path = path!(project_root / *DICT_PATH);
+    let json_path = path!(*DICT_DIR / "dictionary.json");
    let json_file = File::create(json_path).unwrap();

    fn apply_map<K: Clone + Ord, V: Clone>(map: &mut BTreeMap<K, V>, or: &BTreeMap<K, Option<V>>) {
@ -251,3 +181,26 @@ pub fn parse_largenum_en(string: &str) -> Option<u64> {

    num.checked_mul((10_u64).checked_pow(exp.try_into().ok()?)?)
 }
+
+/// Parse textual video length (e.g. `0:49`, `2:02` or `1:48:18`)
+/// and return the duration in seconds.
+///
+/// Components may be separated by `:` or `.`; the hours part is optional.
+/// The pattern is not anchored, so the first matching span anywhere in
+/// `text` is used.
+///
+/// Returns `None` if `text` contains no such pattern, if a matched
+/// component cannot be parsed as a `u32`, or if the total number of
+/// seconds does not fit into a `u32`.
+pub fn parse_video_length(text: &str) -> Option<u32> {
+    static VIDEO_LENGTH_REGEX: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r#"(?:(\d+)[:.])?(\d{1,2})[:.](\d{2})"#).unwrap());
+
+    let cap = VIDEO_LENGTH_REGEX.captures(text)?;
+
+    // A group that did not take part in the match (only the optional hours
+    // group can be absent) counts as zero. A group that matched but cannot
+    // be parsed (value too large for `u32`, or non-ASCII digits accepted by
+    // `\d`) is reported as `None` instead of silently becoming zero.
+    let component =
+        |idx: usize| -> Option<u32> { cap.get(idx).map_or(Some(0), |m| m.as_str().parse().ok()) };
+
+    let hrs = component(1)?;
+    let min = component(2)?;
+    let sec = component(3)?;
+
+    // `min` and `sec` have at most two digits, so only the hours term can
+    // push the sum past `u32::MAX`; checked arithmetic avoids the overflow.
+    hrs.checked_mul(3600)?
+        .checked_add(min * 60)?
+        .checked_add(sec)
+}