diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index cd697eb..98418c1 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -65,12 +65,16 @@ pub async fn run( let run_id = Uuid::new_v4(); let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id); - // Engine + version threaded from the voice row's source/license - // pair. lj_speech-style PD voices live behind f5-tts; kokoro_* - // voices live behind kokoro. Future: a dedicated voices.engine - // column to make this explicit. + // Engine + version threaded from the voice row's source. Three + // engines on Lucy currently: + // kokoro_* → kokoro 82M + // tortoise_* → tortoise-tts + // anything else (lj_speech etc.) → f5-tts + // Future: a dedicated voices.engine column to make this explicit. let (engine, engine_version) = if voice.source.starts_with("kokoro") { ("kokoro-82m", "0.9") + } else if voice.source.starts_with("tortoise") { + ("tortoise-tts", "3.0") } else { ("f5-tts", "1.1.20") }; @@ -91,14 +95,20 @@ pub async fn run( // the Kokoro server only ever sees real voice ids. Only kicks // in for kokoro-routed renders; F5 voice-tag handling isn't // implemented and any tags pass through unchanged. - // Two pre-processing passes (kokoro only). Order matters: - // 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id]. - // This must run BEFORE pronunciation overrides so we don't - // accidentally try to respell character slugs. + // Two pre-processing passes (kokoro + tortoise — engines that + // parse [voice:X] dialogue tags). Order matters: + // 1. Speaker voice substitution rewrites [voice:slug] → the + // engine's named voice id. Must run BEFORE pronunciation + // overrides so we don't try to respell character slugs. + // Tortoise: characters with no tortoise-voice mapping + // gracefully fall back to the narrator voice server-side. // 2. Pronunciation overrides word-substitute proper nouns - // (Pripyat, Dyatlov, etc.) with English-readable respellings - // so Kokoro's small phonemizer doesn't mangle them. - let gen_text = if voice.source.starts_with("kokoro") { + // (Pripyat, Dyatlov, etc.) with English-readable + // respellings. The respellings are kokoro/misaki-tuned but + // pass through tortoise's g2p_en well enough to apply. + let routes_to_engine_with_voice_tags = + voice.source.starts_with("kokoro") || voice.source.starts_with("tortoise"); + let gen_text = if routes_to_engine_with_voice_tags { let voiced = substitute_speaker_voices( &pool, chapter.story_id, @@ -373,13 +383,18 @@ async fn apply_pronunciation_overrides( Ok(out) } -/// Pick the engine base URL for a given voice.source. Voices whose -/// source starts with "kokoro" route to KOKORO_URL; everything else -/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy. +/// Pick the engine base URL for a given voice.source. +/// kokoro_* → KOKORO_URL +/// tortoise_* → TORTOISE_URL +/// anything else (lj_speech etc.) → F5_TTS_URL +/// Each env var has a LAN-default for Lucy. fn engine_url_for(source: &str) -> anyhow::Result { if source.starts_with("kokoro") { Ok(std::env::var("KOKORO_URL") .unwrap_or_else(|_| "http://192.168.0.5:7794".into())) + } else if source.starts_with("tortoise") { + Ok(std::env::var("TORTOISE_URL") + .unwrap_or_else(|_| "http://192.168.0.5:7795".into())) } else { Ok(std::env::var("F5_TTS_URL") .unwrap_or_else(|_| "http://192.168.0.5:7792".into()))