diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index a18afa5..cd697eb 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -91,14 +91,22 @@ pub async fn run( // the Kokoro server only ever sees real voice ids. Only kicks // in for kokoro-routed renders; F5 voice-tag handling isn't // implemented and any tags pass through unchanged. + // Two pre-processing passes (kokoro only). Order matters: + // 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id]. + // This must run BEFORE pronunciation overrides so we don't + // accidentally try to respell character slugs. + // 2. Pronunciation overrides word-substitute proper nouns + // (Pripyat, Dyatlov, etc.) with English-readable respellings + // so Kokoro's small phonemizer doesn't mangle them. let gen_text = if voice.source.starts_with("kokoro") { - substitute_speaker_voices( + let voiced = substitute_speaker_voices( &pool, chapter.story_id, &chapter.body_for_tts, ref_audio_path.as_str(), ) - .await? + .await?; + apply_pronunciation_overrides(&pool, chapter.story_id, &voiced).await? } else { chapter.body_for_tts.clone() }; @@ -280,6 +288,91 @@ async fn substitute_speaker_voices( Ok(out.into_owned()) } +/// Apply pronunciation_overrides where phoneme_format='respelling' +/// — literal English-readable approximations like "Prip-yat" for +/// "Pripyat". Word-boundary case-insensitive substitution, with +/// case preservation on the first letter so that a "Pripyat" at +/// sentence start keeps its capital. +/// +/// Story-scoped overrides take precedence over global ones (same +/// word). Other phoneme_format values (ipa, arpabet) are skipped +/// for now — kokoro's misaki tokenizer doesn't expose a stable +/// lexicon-injection API from the request boundary in v0.1. Future +/// work: pass IPA forms through a new server-side request field +/// and inject into the pipeline's g2p lexicon. +async fn apply_pronunciation_overrides( + pool: &PgPool, + story_id: Uuid, + text: &str, +) -> anyhow::Result { + let rows: Vec<(String, String, Option)> = sqlx::query_as( + "SELECT word, phonemes, story_id FROM pronunciation_overrides + WHERE phoneme_format = 'respelling' + AND (story_id = $1 OR story_id IS NULL) + ORDER BY (story_id IS NULL) ASC, length(word) DESC", + ) + .bind(story_id) + .fetch_all(pool) + .await?; + if rows.is_empty() { + return Ok(text.to_string()); + } + + // De-dup: story-scoped wins over global. We sorted with story + // overrides first, so the first occurrence of a word is the + // authoritative one. + let mut seen = std::collections::HashSet::::new(); + let mut active: Vec<(String, String)> = Vec::new(); + for (word, phonemes, _) in rows { + let key = word.to_lowercase(); + if seen.insert(key) { + active.push((word, phonemes)); + } + } + if active.is_empty() { + return Ok(text.to_string()); + } + + tracing::info!( + story_id = %story_id, + override_count = active.len(), + "applying pronunciation overrides", + ); + + let mut out = text.to_string(); + for (word, respelling) in &active { + // Word-boundary, case-insensitive. (?i) at the start of the + // pattern enables i flag; \b at both ends prevents matching + // substrings ("Pripyat" inside "Pripyatchanin" stays). + let pat = format!(r"(?i)\b{}\b", regex::escape(word)); + if let Ok(re) = regex::Regex::new(&pat) { + out = re + .replace_all(&out, |caps: ®ex::Captures<'_>| { + // Preserve case of the FIRST letter: if the + // matched text starts with an uppercase letter, + // capitalize the respelling's first letter too. + let matched = &caps[0]; + let first = matched.chars().next().unwrap_or('a'); + if first.is_uppercase() { + let mut chars = respelling.chars(); + match chars.next() { + Some(c) => format!( + "{}{}", + c.to_uppercase().collect::(), + chars.collect::() + ), + None => respelling.clone(), + } + } else { + respelling.clone() + } + }) + .into_owned(); + } + } + Ok(out) +} + /// Pick the engine base URL for a given voice.source. Voices whose /// source starts with "kokoro" route to KOKORO_URL; everything else /// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.