From c9bd38034c4b1b33a23a33a385e2ad0761b1e0bb Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 08:35:33 -0700 Subject: [PATCH] multi-voice: per-character dialogue rendering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema: characters.voice_id + characters.slug (migration 0007). voice_id is FK to voices(id); slug is the stable lowercase token the narrate_prep pass uses inside [voice:slug]...[/voice]. Forge::narrate_prep takes &[CharacterSpeaker]. System prompt expanded to instruct the author to wrap dialogue lines in voice tags based on a roster supplied in the user prompt (slug + name + short hint from key_facts). Unattributed dialogue stays unwrapped and inherits the narrator voice. skald narrate substitutes [voice:slug] → [voice:kokoro_voice_name] right before sending to Kokoro, using characters.voice_id JOIN voices.reference_path as the map. Slugs with no voice or no character row fall back to the narrator voice defensively (logged as warn). kokoro_server.py v0.4: splitter recognises [voice:X]...[/voice] blocks at the paragraph level. Each text node carries an optional voice attribution; renderer feeds it to Kokoro per-segment. Outside voice blocks the request's default voice is used. voices_used is reported back so callers can verify multi-voice actually ran. Only kokoro-routed renders pre-process voice tags; F5 paths leave the tags in place (F5 multi-voice not implemented). Defensive fallback: orphan/unclosed [/voice] markers are silently absorbed rather than failing the render. 
--- Cargo.lock | 1 + migrations/0007_characters_voice.sql | 16 ++++++ skald-core/src/forge.rs | 59 +++++++++++++++++--- skald/Cargo.toml | 1 + skald/src/narrate.rs | 82 +++++++++++++++++++++++++++- skald/src/narrate_prep.rs | 37 ++++++++++++- 6 files changed, 186 insertions(+), 10 deletions(-) create mode 100644 migrations/0007_characters_voice.sql diff --git a/Cargo.lock b/Cargo.lock index 9c04f18..a9cd7e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1896,6 +1896,7 @@ dependencies = [ "chrono", "clap", "maud", + "regex", "serde", "serde_json", "skald-core", diff --git a/migrations/0007_characters_voice.sql b/migrations/0007_characters_voice.sql new file mode 100644 index 0000000..f4c8c6b --- /dev/null +++ b/migrations/0007_characters_voice.sql @@ -0,0 +1,16 @@ +-- Per-character voice assignment for multi-voice audiobook rendering. +-- A NULL voice_id means "use the story's default narrator voice for +-- this character's dialogue" (effectively no voice change vs the +-- narrator). A non-NULL voice_id pins the character's dialogue to +-- that specific voice. +-- +-- slug is the stable lowercase token the narrate_prep pass uses +-- inside [voice:]...[/voice] markers. Distinct from name so +-- "Anatoly Dyatlov" can carry slug "dyatlov" without re-deriving on +-- every render. +ALTER TABLE characters + ADD COLUMN voice_id uuid REFERENCES voices(id) ON DELETE SET NULL, + ADD COLUMN slug text; + +CREATE INDEX idx_characters_voice ON characters(voice_id) WHERE voice_id IS NOT NULL; +CREATE UNIQUE INDEX idx_characters_story_slug ON characters(story_id, slug) WHERE slug IS NOT NULL; diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 8231c6d..14f0da6 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -192,14 +192,20 @@ impl Forge { /// Orson Black places beats differently than another author /// would. Replace-mode if author is set; Append otherwise. /// + /// `characters` is the story's character roster. 
When provided, + /// the system prompt instructs the model to wrap dialogue in + /// `[voice:]"..."[/voice]` for multi-voice rendering. The + /// slug is mapped to a Kokoro voice id by skald's narrate path. + /// /// Hard rule the system prompt enforces: do not change a word /// of prose. Tags are additive only. pub async fn narrate_prep( &self, prose: &str, author: Option<&AuthorWithRevision>, + characters: &[CharacterSpeaker], ) -> anyhow::Result { - let user_prompt = narrate_prep_user_prompt(prose); + let user_prompt = narrate_prep_user_prompt(prose, characters); let (system, mode) = match author { Some(a) => { let scaffold = a @@ -339,9 +345,9 @@ const HOUSE_CLEANUP_SYSTEM: &str = "You are a copy editor polishing a draft chap const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }."; -const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. 
Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices."; +const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. 
Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices."; -const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary."; +const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary."; // ─── User-prompt builders ─────────────────────────────────────── @@ -376,14 +382,53 @@ fn gen_user_prompt( out } -fn narrate_prep_user_prompt(prose: &str) -> String { +/// One row of the story's character roster, passed to narrate_prep +/// so the LLM knows what speaker slugs to use in `[voice:slug]` +/// tags. Built from skald's characters table. 
+#[derive(Debug, Clone)] +pub struct CharacterSpeaker { + pub slug: String, + pub name: String, + /// Short note (1 sentence) giving the LLM enough to disambiguate + /// who's speaking when prose says "she said". Pulled from + /// characters.key_facts but trimmed. + pub hint: Option, +} + +fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String { let mut out = String::with_capacity(prose.len() + 512); + + if !characters.is_empty() { + out.push_str("# Speaker roster\n\n"); + out.push_str( + "Use these slugs in `[voice:]\"...\"[/voice]` wrappers on dialogue. \ + Leave dialogue without a clear roster speaker unwrapped (the narrator \ + voice will read it).\n\n", + ); + for c in characters { + out.push_str("- `"); + out.push_str(&c.slug); + out.push_str("` — "); + out.push_str(&c.name); + if let Some(h) = &c.hint { + if !h.trim().is_empty() { + out.push_str(" ("); + out.push_str(h.trim()); + out.push(')'); + } + } + out.push('\n'); + } + out.push('\n'); + } + out.push_str("# Prose to annotate\n\n"); out.push_str(prose); out.push_str( - "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, and \ - `[scene]` markers inserted at natural narration beats. Do not change \ - any word. Do not skip any sentence. Return only the annotated prose.\n", + "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ + `[scene]` markers and `[voice:]\"...\"[/voice]` dialogue wrappers \ + inserted appropriately. Do not change any word. Do not skip any \ + sentence. 
Return only the annotated prose.\n", ); out } diff --git a/skald/Cargo.toml b/skald/Cargo.toml index ae9e280..ecebf62 100644 --- a/skald/Cargo.toml +++ b/skald/Cargo.toml @@ -26,3 +26,4 @@ tracing-subscriber = { workspace = true } chrono = { workspace = true } uuid = { workspace = true } maud = { workspace = true } +regex = { workspace = true } diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index 62abcb0..a18afa5 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -86,9 +86,26 @@ pub async fn run( .fetch_one(&pool) .await?; + // Resolve any [voice:]...[/voice] tags in the + // annotated text into [voice:]...[/voice] so + // the Kokoro server only ever sees real voice ids. Only kicks + // in for kokoro-routed renders; F5 voice-tag handling isn't + // implemented and any tags pass through unchanged. + let gen_text = if voice.source.starts_with("kokoro") { + substitute_speaker_voices( + &pool, + chapter.story_id, + &chapter.body_for_tts, + ref_audio_path.as_str(), + ) + .await? + } else { + chapter.body_for_tts.clone() + }; + let started = Instant::now(); let req = SynthesizeRequest { - gen_text: chapter.body_for_tts.clone(), + gen_text, ref_audio_path, ref_text: voice.reference_text.clone(), output_filename, @@ -200,6 +217,69 @@ async fn resolve_voice( .ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice ")) } +/// Rewrite `[voice:]...[/voice]` tags in the +/// annotated prose to use the actual Kokoro voice name for that +/// character. If a character has no voice_id assigned, or if the +/// slug doesn't match any character in the story, the tag is +/// stripped (dialogue falls back to the narrator voice) rather +/// than failing — defensive default so a missed character row +/// doesn't break a render. 
+async fn substitute_speaker_voices( + pool: &PgPool, + story_id: Uuid, + text: &str, + narrator_voice_name: &str, +) -> anyhow::Result { + // Tag detection is cheap; only run the DB query if at least one + // [voice:...] appears. + if !text.contains("[voice:") { + return Ok(text.to_string()); + } + + let rows: Vec<(String, Option)> = sqlx::query_as( + "SELECT c.slug, v.reference_path + FROM characters c + LEFT JOIN voices v ON v.id = c.voice_id + WHERE c.story_id = $1 AND c.slug IS NOT NULL", + ) + .bind(story_id) + .fetch_all(pool) + .await?; + let mut slug_to_voice = std::collections::HashMap::::new(); + for (slug, ref_path) in rows { + if let Some(p) = ref_path { + // For kokoro voices, reference_path stores the voice + // name directly (e.g. "am_onyx"). F5-style paths start + // with '/', skip those. + if !p.starts_with('/') { + slug_to_voice.insert(slug, p); + } + } + } + tracing::info!( + story_id = %story_id, + mapped_speakers = slug_to_voice.len(), + narrator_voice = narrator_voice_name, + "substituting [voice:slug] markers", + ); + + let re = regex::Regex::new(r"\[voice:([a-z0-9_-]+)\]").unwrap(); + let out = re.replace_all(text, |caps: ®ex::Captures<'_>| { + let slug = &caps[1]; + match slug_to_voice.get(slug) { + Some(voice_name) => format!("[voice:{voice_name}]"), + None => { + tracing::warn!(slug = %slug, "no voice for speaker; dialogue falls back to narrator"); + // Replace with the narrator voice so the segment + // still gets a voice tag (so Kokoro's segment + // parser doesn't see an orphan [/voice]). + format!("[voice:{narrator_voice_name}]") + } + } + }); + Ok(out.into_owned()) +} + /// Pick the engine base URL for a given voice.source. Voices whose /// source starts with "kokoro" route to KOKORO_URL; everything else /// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy. 
diff --git a/skald/src/narrate_prep.rs b/skald/src/narrate_prep.rs index 6d2fe69..bc92228 100644 --- a/skald/src/narrate_prep.rs +++ b/skald/src/narrate_prep.rs @@ -15,7 +15,7 @@ use chrono::Utc; use skald_core::authors::{self, AuthorWithRevision}; use skald_core::config::ForgeConfig; use skald_core::db; -use skald_core::forge::{Forge, PassKind, PassOutput}; +use skald_core::forge::{CharacterSpeaker, Forge, PassKind, PassOutput}; use sqlx::PgPool; use uuid::Uuid; @@ -60,8 +60,15 @@ pub async fn run( .fetch_one(&pool) .await?; + let characters = load_speakers(&pool, chapter.story_id).await?; + if !characters.is_empty() { + tracing::info!(speaker_count = characters.len(), "speaker roster loaded"); + } + let started = Instant::now(); - let out_res = forge.narrate_prep(&chapter.body_md, author.as_ref()).await; + let out_res = forge + .narrate_prep(&chapter.body_md, author.as_ref(), &characters) + .await; let elapsed = started.elapsed(); let out: PassOutput = match out_res { @@ -179,6 +186,32 @@ fn count_beats(s: &str) -> usize { n } +async fn load_speakers(pool: &PgPool, story_id: Uuid) -> anyhow::Result> { + // Only characters with a slug get into the speaker roster — a + // null slug means we haven't curated the character yet, and we + // don't want the LLM inventing dialogue-attribution slugs. + let rows: Vec<(String, String, Option)> = sqlx::query_as( + "SELECT slug, name, key_facts FROM characters + WHERE story_id = $1 AND slug IS NOT NULL + ORDER BY name", + ) + .bind(story_id) + .fetch_all(pool) + .await?; + Ok(rows + .into_iter() + .map(|(slug, name, key_facts)| { + // Keep the hint short; the LLM mostly needs the name + + // a tiny disambiguator. ~120 chars of key_facts is + // plenty. 
+ let hint = key_facts + .filter(|kf| !kf.trim().is_empty()) + .map(|kf| kf.chars().take(120).collect::()); + CharacterSpeaker { slug, name, hint } + }) + .collect()) +} + fn load_forge_config() -> anyhow::Result { let base_url = std::env::var("CLAWDFORGE_URL") .context("CLAWDFORGE_URL not set")?;