From 75bc7dc6bf931075395b73154e93b4fae3cfb0d2 Mon Sep 17 00:00:00 2001 From: Kayos Date: Tue, 26 May 2026 22:52:27 -0700 Subject: [PATCH] Replace hand-rolled urlencoded_decode with url::form_urlencoded::parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous decoder treated each %XX as an isolated code point via `out.push(v as char)`. For UTF-8 multi-byte sequences (e.g. %E2%9C%93 for ✓) that produced three garbage chars at U+00E2 / U+009C / U+0093 instead of the proper U+2713. YT cipher strings are typically ASCII-only so this was latent, but the function was named generically and nothing in the type system prevented a non-ASCII input from reaching it. `url::form_urlencoded::parse` is the canonical &-separated query-pair parser — handles %-decode as UTF-8, handles + → space, and the url crate is already a transitive dep. parse_cipher_string collapses to one line; the bespoke 20-line decoder goes. --- src/youtube/stream_extractor.rs | 44 ++++++++------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/src/youtube/stream_extractor.rs b/src/youtube/stream_extractor.rs index 4840787..ced32a6 100644 --- a/src/youtube/stream_extractor.rs +++ b/src/youtube/stream_extractor.rs @@ -558,40 +558,16 @@ fn process_url( } fn parse_cipher_string(s: &str) -> std::collections::BTreeMap<String, String> { - let mut out = std::collections::BTreeMap::new(); - for pair in s.split('&') { - if let Some((k, v)) = pair.split_once('=') { - out.insert( - urlencoded_decode(k), - urlencoded_decode(v), - ); - } - } - out -} - -fn urlencoded_decode(s: &str) -> String { - let mut out = String::with_capacity(s.len()); - let bytes = s.as_bytes(); - let mut i = 0; - while i < bytes.len() { - let b = bytes[i]; - if b == b'%' && i + 2 < bytes.len() { - let hex = std::str::from_utf8(&bytes[i + 1..i + 3]).unwrap_or(""); - if let Ok(v) = u8::from_str_radix(hex, 16) { - out.push(v as char); - i += 3; - continue; - } - } - if b == b'+' { 
- out.push(' '); - } else { - out.push(b as char); - } - i += 1; - } - out + // `url::form_urlencoded::parse` decodes percent-escapes as UTF-8 + // multi-byte sequences and handles `+` → space. The prior + // hand-rolled `urlencoded_decode` handled `+` but treated each + // %XX as an isolated code point, so `%E2%9C%93` rendered as three + // garbage chars instead of ✓. YT cipher strings are typically + // ASCII-only, but pulling in the canonical parser closes the + // surface and removes 20 lines. + url::form_urlencoded::parse(s.as_bytes()) + .map(|(k, v)| (k.into_owned(), v.into_owned())) + .collect() } fn build_video_progressive(