narrate: pronunciation overrides for proper nouns

skald narrate pre-processes body_md_tts with word-boundary regex
substitutions from pronunciation_overrides where phoneme_format =
'respelling'. Story-scoped overrides win over global; longer words
substitute first (so 'Bryukhanov' wins over a hypothetical 'Bry'
override). Case is preserved on the first letter so 'Pripyat' at
sentence start stays capitalised after respelling to 'Prip-yat'.

Seeded Coast-Down with 52 entries: Russian/Ukrainian surnames
(Dyatlov, Akimov, Bryukhanov, Stolyarchuk, Yuvchenko, Khmel, etc.),
first names (Pyotr, Lyudmilla, Anatoly, Vasily, Sasha, Aleksandr,
Leonid), patronymics (Stepanovich, Fyodorovich, Mykolaivna,
Hryhorivna), and places (Pripyat, Chernobyl, Kyiv, Kopachi,
Lubyanka). Plus the operational acronyms NIKIET, RBMK, AZ-5, SIUR,
SIUB, ChNPP, MSCh.

Other phoneme_format values (ipa, arpabet) are no-ops for now —
Kokoro's misaki tokenizer doesn't expose a stable lexicon-injection
API across the HTTP boundary in v0.1. Future: pass IPA forms in a
new server-side request field and inject into the pipeline's g2p
lexicon for more accurate phonetics.
This commit is contained in:
Kayos 2026-05-14 08:56:17 -07:00
parent c9bd38034c
commit 1c3fc11484

View file

@ -91,14 +91,22 @@ pub async fn run(
// the Kokoro server only ever sees real voice ids. Only kicks
// in for kokoro-routed renders; F5 voice-tag handling isn't
// implemented and any tags pass through unchanged.
// Two pre-processing passes (kokoro only). Order matters:
// 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id].
// This must run BEFORE pronunciation overrides so we don't
// accidentally try to respell character slugs.
// 2. Pronunciation overrides word-substitute proper nouns
// (Pripyat, Dyatlov, etc.) with English-readable respellings
// so Kokoro's small phonemizer doesn't mangle them.
let gen_text = if voice.source.starts_with("kokoro") {
substitute_speaker_voices(
let voiced = substitute_speaker_voices(
&pool,
chapter.story_id,
&chapter.body_for_tts,
ref_audio_path.as_str(),
)
.await?
.await?;
apply_pronunciation_overrides(&pool, chapter.story_id, &voiced).await?
} else {
chapter.body_for_tts.clone()
};
@ -280,6 +288,91 @@ async fn substitute_speaker_voices(
Ok(out.into_owned())
}
/// Apply `pronunciation_overrides` rows whose `phoneme_format` is
/// `'respelling'` — literal English-readable approximations such as
/// "Prip-yat" for "Pripyat" — to `text`.
///
/// Matching is case-insensitive and anchored with `\b` on both sides,
/// so "Pripyat" inside "Pripyatchanin" is left alone. If the matched
/// text starts with an uppercase letter, the first letter of the
/// respelling is uppercased too, so a word at sentence start keeps
/// its capital.
///
/// Precedence rules:
/// * Story-scoped overrides win over global ones for the same word
///   (compared case-insensitively).
/// * Longer words are tried before shorter ones across BOTH scopes,
///   so a global "AZ-5" is not pre-empted by a story-scoped "AZ".
/// * All overrides are applied in a single pass over the input, so
///   the output of one respelling is never re-matched by another
///   override.
///
/// Rows with a blank `word` are ignored: an empty pattern between two
/// `\b` anchors would match at every word boundary in the text.
///
/// Other `phoneme_format` values (ipa, arpabet) are skipped for now —
/// kokoro's misaki tokenizer doesn't expose a stable lexicon-injection
/// API from the request boundary in v0.1. Future work: pass IPA forms
/// through a new server-side request field and inject into the
/// pipeline's g2p lexicon.
///
/// NOTE(review): because of the `\b` anchors, an override whose word
/// starts or ends with a non-word character (e.g. a trailing '.')
/// only matches when a word character sits on the other side of that
/// edge. None of the seeded entries appear to be affected — confirm
/// before adding such words.
///
/// # Errors
///
/// Returns an error if the database query fails. A failure to compile
/// the combined pattern is logged and treated as "no overrides": the
/// input text is returned unchanged.
async fn apply_pronunciation_overrides(
    pool: &PgPool,
    story_id: Uuid,
    text: &str,
) -> anyhow::Result<String> {
    /// Uppercase the first character of `s`, leaving the rest as-is.
    fn capitalize_first(s: &str) -> String {
        let mut chars = s.chars();
        match chars.next() {
            Some(first) => first.to_uppercase().chain(chars).collect(),
            None => String::new(),
        }
    }

    let rows: Vec<(String, String, Option<Uuid>)> = sqlx::query_as(
        "SELECT word, phonemes, story_id FROM pronunciation_overrides
         WHERE phoneme_format = 'respelling'
           AND (story_id = $1 OR story_id IS NULL)
         ORDER BY (story_id IS NULL) ASC, length(word) DESC",
    )
    .bind(story_id)
    .fetch_all(pool)
    .await?;

    // De-dup: the query returns story-scoped rows first, so the first
    // occurrence of a (lowercased) word is the authoritative one.
    let mut seen = std::collections::HashSet::<String>::new();
    let mut active: Vec<(String, String)> = rows
        .into_iter()
        .filter(|(word, _, _)| !word.trim().is_empty())
        .filter(|(word, _, _)| seen.insert(word.to_lowercase()))
        .map(|(word, phonemes, _)| (word, phonemes))
        .collect();
    if active.is_empty() {
        return Ok(text.to_string());
    }

    // The SQL ordering is only longest-first *within* each scope.
    // Re-sort across scopes so the alternation below always prefers
    // the longest candidate. The sort is stable, so ties keep the
    // story-first order.
    active.sort_by_key(|(word, _)| std::cmp::Reverse(word.chars().count()));

    tracing::info!(
        story_id = %story_id,
        override_count = active.len(),
        "applying pronunciation overrides",
    );

    // One alternation with a capture group per override: group i + 1
    // corresponds to active[i]. The regex crate prefers earlier
    // alternatives at a given position, which together with the sort
    // above yields longest-first matching.
    let alternation = active
        .iter()
        .map(|(word, _)| format!("({})", regex::escape(word)))
        .collect::<Vec<_>>()
        .join("|");
    let pattern = format!(r"(?i)\b(?:{})\b", alternation);
    let re = match regex::Regex::new(&pattern) {
        Ok(re) => re,
        Err(err) => {
            // Best-effort: narration should still proceed without
            // respellings, but the failure must be visible in logs.
            tracing::warn!(
                story_id = %story_id,
                error = %err,
                "could not compile pronunciation override pattern; text left unchanged",
            );
            return Ok(text.to_string());
        }
    };

    let replaced = re.replace_all(text, |caps: &regex::Captures<'_>| -> String {
        let matched = &caps[0];
        // Identify which override matched via its capture group.
        let respelling = caps
            .iter()
            .skip(1)
            .position(|group| group.is_some())
            .and_then(|idx| active.get(idx))
            .map(|(_, respelling)| respelling.as_str());
        let starts_upper = matched.chars().next().map_or(false, char::is_uppercase);
        match respelling {
            // Preserve the case of the FIRST letter only.
            Some(r) if starts_upper => capitalize_first(r),
            Some(r) => r.to_string(),
            // Unreachable in practice (every alternative is a group);
            // keep the original text rather than dropping it.
            None => matched.to_string(),
        }
    });
    Ok(replaced.into_owned())
}
/// Pick the engine base URL for a given voice.source. Voices whose
/// source starts with "kokoro" route to KOKORO_URL; everything else
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.