narrate: route tortoise_* voices to TORTOISE_URL

Engine dispatch used to be a binary kokoro-vs-everything-else split,
which meant tortoise voices were sent to F5. It now has three branches:
  kokoro_*    → KOKORO_URL    (Lucy default :7794)
  tortoise_*  → TORTOISE_URL  (Lucy default :7795)
  *           → F5_TTS_URL    (Lucy default :7792)

substitute_speaker_voices now also runs for tortoise voices, so
multi-voice [voice:slug] tags survive in the prose. Characters
without a tortoise-voice mapping fall back to the narrator voice
server-side (tortoise_server.py logs the fallback).
This commit is contained in:
Kayos 2026-05-14 09:46:16 -07:00
parent d1631ddffe
commit 303b6c73f4

View file

@ -65,12 +65,16 @@ pub async fn run(
let run_id = Uuid::new_v4();
let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id);
// Engine + version threaded from the voice row's source/license
// pair. lj_speech-style PD voices live behind f5-tts; kokoro_*
// voices live behind kokoro. Future: a dedicated voices.engine
// column to make this explicit.
// Engine + version threaded from the voice row's source. Three
// engines on Lucy currently:
// kokoro_* → kokoro 82M
// tortoise_* → tortoise-tts
// anything else (lj_speech etc.) → f5-tts
// Future: a dedicated voices.engine column to make this explicit.
let (engine, engine_version) = if voice.source.starts_with("kokoro") {
("kokoro-82m", "0.9")
} else if voice.source.starts_with("tortoise") {
("tortoise-tts", "3.0")
} else {
("f5-tts", "1.1.20")
};
@ -91,14 +95,20 @@ pub async fn run(
// the Kokoro server only ever sees real voice ids. Only kicks
// in for kokoro-routed renders; F5 voice-tag handling isn't
// implemented and any tags pass through unchanged.
// Two pre-processing passes (kokoro only). Order matters:
// 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id].
// This must run BEFORE pronunciation overrides so we don't
// accidentally try to respell character slugs.
// Two pre-processing passes (kokoro + tortoise — engines that
// parse [voice:X] dialogue tags). Order matters:
// 1. Speaker voice substitution rewrites [voice:slug] → the
// engine's named voice id. Must run BEFORE pronunciation
// overrides so we don't try to respell character slugs.
// Tortoise: characters with no tortoise-voice mapping
// gracefully fall back to the narrator voice server-side.
// 2. Pronunciation overrides word-substitute proper nouns
// (Pripyat, Dyatlov, etc.) with English-readable respellings
// so Kokoro's small phonemizer doesn't mangle them.
let gen_text = if voice.source.starts_with("kokoro") {
// (Pripyat, Dyatlov, etc.) with English-readable
// respellings. The respellings were tuned for kokoro/misaki,
// but tortoise's g2p_en handles them well enough that they
// are applied for tortoise renders too.
let routes_to_engine_with_voice_tags =
voice.source.starts_with("kokoro") || voice.source.starts_with("tortoise");
let gen_text = if routes_to_engine_with_voice_tags {
let voiced = substitute_speaker_voices(
&pool,
chapter.story_id,
@ -373,13 +383,18 @@ async fn apply_pronunciation_overrides(
Ok(out)
}
/// Pick the engine base URL for a given voice.source. Voices whose
/// source starts with "kokoro" route to KOKORO_URL; everything else
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.
/// Pick the engine base URL for a given voice.source.
/// kokoro_* → KOKORO_URL
/// tortoise_* → TORTOISE_URL
/// anything else (lj_speech etc.) → F5_TTS_URL
/// Each env var has a LAN-default for Lucy.
fn engine_url_for(source: &str) -> anyhow::Result<String> {
if source.starts_with("kokoro") {
Ok(std::env::var("KOKORO_URL")
.unwrap_or_else(|_| "http://192.168.0.5:7794".into()))
} else if source.starts_with("tortoise") {
Ok(std::env::var("TORTOISE_URL")
.unwrap_or_else(|_| "http://192.168.0.5:7795".into()))
} else {
Ok(std::env::var("F5_TTS_URL")
.unwrap_or_else(|_| "http://192.168.0.5:7792".into()))