narrate: pronunciation overrides for proper nouns
skald narrate pre-processes body_md_tts with word-boundary regex substitutions from pronunciation_overrides where phoneme_format = 'respelling'. Story-scoped overrides win over global; longer words substitute first (so 'Bryukhanov' wins over a hypothetical 'Bry' override). Case is preserved on the first letter so 'Pripyat' at sentence start stays capitalised after respelling to 'Prip-yat'. Seeded Coast-Down with 52 entries: Russian/Ukrainian surnames (Dyatlov, Akimov, Bryukhanov, Stolyarchuk, Yuvchenko, Khmel, etc.), first names (Pyotr, Lyudmilla, Anatoly, Vasily, Sasha, Aleksandr, Leonid), patronymics (Stepanovich, Fyodorovich, Mykolaivna, Hryhorivna), and places (Pripyat, Chernobyl, Kyiv, Kopachi, Lubyanka). Plus the operational acronyms NIKIET, RBMK, AZ-5, SIUR, SIUB, ChNPP, MSCh. Other phoneme_format values (ipa, arpabet) are no-ops for now — Kokoro's misaki tokenizer doesn't expose a stable lexicon-injection API across the HTTP boundary in v0.1. Future: pass IPA forms in a new server-side request field and inject into the pipeline's g2p lexicon for more accurate phonetics.
This commit is contained in:
parent
c9bd38034c
commit
1c3fc11484
1 changed file with 95 additions and 2 deletions
|
|
@ -91,14 +91,22 @@ pub async fn run(
|
|||
// the Kokoro server only ever sees real voice ids. Only kicks
|
||||
// in for kokoro-routed renders; F5 voice-tag handling isn't
|
||||
// implemented and any tags pass through unchanged.
|
||||
// Two pre-processing passes (kokoro only). Order matters:
|
||||
// 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id].
|
||||
// This must run BEFORE pronunciation overrides so we don't
|
||||
// accidentally try to respell character slugs.
|
||||
// 2. Pronunciation overrides word-substitute proper nouns
|
||||
// (Pripyat, Dyatlov, etc.) with English-readable respellings
|
||||
// so Kokoro's small phonemizer doesn't mangle them.
|
||||
let gen_text = if voice.source.starts_with("kokoro") {
|
||||
substitute_speaker_voices(
|
||||
let voiced = substitute_speaker_voices(
|
||||
&pool,
|
||||
chapter.story_id,
|
||||
&chapter.body_for_tts,
|
||||
ref_audio_path.as_str(),
|
||||
)
|
||||
.await?
|
||||
.await?;
|
||||
apply_pronunciation_overrides(&pool, chapter.story_id, &voiced).await?
|
||||
} else {
|
||||
chapter.body_for_tts.clone()
|
||||
};
|
||||
|
|
@ -280,6 +288,91 @@ async fn substitute_speaker_voices(
|
|||
Ok(out.into_owned())
|
||||
}
|
||||
|
||||
/// Apply pronunciation_overrides where phoneme_format='respelling'
|
||||
/// — literal English-readable approximations like "Prip-yat" for
|
||||
/// "Pripyat". Word-boundary case-insensitive substitution, with
|
||||
/// case preservation on the first letter so that a "Pripyat" at
|
||||
/// sentence start keeps its capital.
|
||||
///
|
||||
/// Story-scoped overrides take precedence over global ones (same
|
||||
/// word). Other phoneme_format values (ipa, arpabet) are skipped
|
||||
/// for now — kokoro's misaki tokenizer doesn't expose a stable
|
||||
/// lexicon-injection API from the request boundary in v0.1. Future
|
||||
/// work: pass IPA forms through a new server-side request field
|
||||
/// and inject into the pipeline's g2p lexicon.
|
||||
async fn apply_pronunciation_overrides(
|
||||
pool: &PgPool,
|
||||
story_id: Uuid,
|
||||
text: &str,
|
||||
) -> anyhow::Result<String> {
|
||||
let rows: Vec<(String, String, Option<Uuid>)> = sqlx::query_as(
|
||||
"SELECT word, phonemes, story_id FROM pronunciation_overrides
|
||||
WHERE phoneme_format = 'respelling'
|
||||
AND (story_id = $1 OR story_id IS NULL)
|
||||
ORDER BY (story_id IS NULL) ASC, length(word) DESC",
|
||||
)
|
||||
.bind(story_id)
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
if rows.is_empty() {
|
||||
return Ok(text.to_string());
|
||||
}
|
||||
|
||||
// De-dup: story-scoped wins over global. We sorted with story
|
||||
// overrides first, so the first occurrence of a word is the
|
||||
// authoritative one.
|
||||
let mut seen = std::collections::HashSet::<String>::new();
|
||||
let mut active: Vec<(String, String)> = Vec::new();
|
||||
for (word, phonemes, _) in rows {
|
||||
let key = word.to_lowercase();
|
||||
if seen.insert(key) {
|
||||
active.push((word, phonemes));
|
||||
}
|
||||
}
|
||||
if active.is_empty() {
|
||||
return Ok(text.to_string());
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
story_id = %story_id,
|
||||
override_count = active.len(),
|
||||
"applying pronunciation overrides",
|
||||
);
|
||||
|
||||
let mut out = text.to_string();
|
||||
for (word, respelling) in &active {
|
||||
// Word-boundary, case-insensitive. (?i) at the start of the
|
||||
// pattern enables i flag; \b at both ends prevents matching
|
||||
// substrings ("Pripyat" inside "Pripyatchanin" stays).
|
||||
let pat = format!(r"(?i)\b{}\b", regex::escape(word));
|
||||
if let Ok(re) = regex::Regex::new(&pat) {
|
||||
out = re
|
||||
.replace_all(&out, |caps: ®ex::Captures<'_>| {
|
||||
// Preserve case of the FIRST letter: if the
|
||||
// matched text starts with an uppercase letter,
|
||||
// capitalize the respelling's first letter too.
|
||||
let matched = &caps[0];
|
||||
let first = matched.chars().next().unwrap_or('a');
|
||||
if first.is_uppercase() {
|
||||
let mut chars = respelling.chars();
|
||||
match chars.next() {
|
||||
Some(c) => format!(
|
||||
"{}{}",
|
||||
c.to_uppercase().collect::<String>(),
|
||||
chars.collect::<String>()
|
||||
),
|
||||
None => respelling.clone(),
|
||||
}
|
||||
} else {
|
||||
respelling.clone()
|
||||
}
|
||||
})
|
||||
.into_owned();
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Pick the engine base URL for a given voice.source. Voices whose
|
||||
/// source starts with "kokoro" route to KOKORO_URL; everything else
|
||||
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue