multi-voice: per-character dialogue rendering

Schema: characters.voice_id + characters.slug (migration 0007). voice_id is an FK to voices(id); slug is the stable lowercase token the narrate_prep pass uses inside [voice:slug]...[/voice]. Forge::narrate_prep now takes &[CharacterSpeaker]. The system prompt is expanded to instruct the author to wrap dialogue lines in voice tags based on a roster supplied in the user prompt (slug + name + short hint from key_facts). Unattributed dialogue stays unwrapped and inherits the narrator voice. skald narrate substitutes [voice:<character-slug>] → [voice:<kokoro-voice-name>] right before sending to Kokoro, using characters.voice_id joined to voices.reference_path as the map. Slugs with no voice or no character row fall back to the narrator voice defensively (logged as warn). kokoro_server.py v0.4: the splitter recognises [voice:X]...[/voice] blocks at the paragraph level. Each text node carries an optional voice attribution; the renderer feeds it to Kokoro per-segment. Outside voice blocks the request's default voice is used. voices_used is reported back so callers can verify multi-voice actually ran. Only kokoro-routed renders pre-process voice tags; F5 paths leave the tags in place (F5 multi-voice not implemented). Defensive fallback: orphan [/voice] or unclosed [voice:X] markers are silently absorbed rather than failing the render.
2026-05-14 08:35:33 -07:00 · 2026-05-14 08:35:33 -07:00 · c9bd38034c
commit c9bd38034c
parent 330bc8bde2
6 changed files with 186 additions and 10 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1896,6 +1896,7 @@ dependencies = [
 "chrono",
 "clap",
 "maud",
 "regex",
 "serde",
 "serde_json",
 "skald-core",
--- a/migrations/0007_characters_voice.sql
+++ b/migrations/0007_characters_voice.sql
@ -0,0 +1,16 @@
 -- Per-character voice assignment for multi-voice audiobook rendering.
 -- A NULL voice_id means "use the story's default narrator voice for
 -- this character's dialogue" (effectively no voice change vs the
 -- narrator). A non-NULL voice_id pins the character's dialogue to
 -- that specific voice. ON DELETE SET NULL: deleting a voice demotes
 -- its characters back to the narrator voice instead of blocking the
 -- delete.
 --
 -- slug is the stable lowercase token the narrate_prep pass uses
 -- inside [voice:<slug>]...[/voice] markers. Distinct from name so
 -- "Anatoly Dyatlov" can carry slug "dyatlov" without re-deriving on
 -- every render.
 --
 -- The CHECK restricts slugs to lowercase ASCII letters, digits, '_'
 -- and '-' so a slug is always unambiguous inside a [voice:<slug>]
 -- marker; a slug outside that alphabet is rejected at write time
 -- rather than silently never resolving at render time. A NULL slug
 -- (character not curated yet) still passes the CHECK.
 ALTER TABLE characters
    ADD COLUMN voice_id uuid REFERENCES voices(id) ON DELETE SET NULL,
    ADD COLUMN slug text CHECK (slug ~ '^[a-z0-9_-]+$');
 -- Partial indexes: only rows that actually carry a voice / a slug
 -- are indexed. The unique index makes a slug unique per story while
 -- still allowing any number of un-slugged characters.
 CREATE INDEX idx_characters_voice ON characters(voice_id) WHERE voice_id IS NOT NULL;
 CREATE UNIQUE INDEX idx_characters_story_slug ON characters(story_id, slug) WHERE slug IS NOT NULL;
--- a/skald-core/src/forge.rs
+++ b/skald-core/src/forge.rs
@ -192,14 +192,20 @@ impl Forge {
    /// Orson Black places beats differently than another author
    /// would. Replace-mode if author is set; Append otherwise.
    ///
    /// `characters` is the story's character roster. When provided,
    /// the system prompt instructs the model to wrap dialogue in
    /// `[voice:<slug>]"..."[/voice]` for multi-voice rendering. The
    /// slug is mapped to a Kokoro voice id by skald's narrate path.
    ///
    /// Hard rule the system prompt enforces: do not change a word
    /// of prose. Tags are additive only.
    pub async fn narrate_prep(
        &self,
        prose: &str,
        author: Option<&AuthorWithRevision>,
        characters: &[CharacterSpeaker],
    ) -> anyhow::Result<PassOutput> {
-        let user_prompt = narrate_prep_user_prompt(prose);
+        let user_prompt = narrate_prep_user_prompt(prose, characters);
        let (system, mode) = match author {
            Some(a) => {
                let scaffold = a
@ -339,9 +345,9 @@ const HOUSE_CLEANUP_SYSTEM: &str = "You are a copy editor polishing a draft chap
 const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }.";
-const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices.";
+const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:<slug>]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices.";
-const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary.";
+const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:<slug>]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary.";
 // ─── User-prompt builders ───────────────────────────────────────
@ -376,14 +382,53 @@ fn gen_user_prompt(
    out
 }
-fn narrate_prep_user_prompt(prose: &str) -> String {
+/// One row of the story's character roster, passed to narrate_prep
 /// so the LLM knows what speaker slugs to use in `[voice:slug]`
 /// tags. Built from skald's characters table.
 ///
 /// Each entry is rendered as one bullet of the "# Speaker roster"
 /// section of the narrate_prep user prompt.
 #[derive(Debug, Clone)]
 pub struct CharacterSpeaker {
    /// Token the model is asked to put inside `[voice:<slug>]`;
    /// shown in backticks at the start of the roster bullet.
    pub slug: String,
    /// Character name, shown after the slug in the roster bullet.
    pub name: String,
    /// Short note (1 sentence) giving the LLM enough to disambiguate
    /// who's speaking when prose says "she said". Pulled from
    /// characters.key_facts but shortened by the loader. Rendered in
    /// parentheses after the name; omitted when `None` or blank.
    pub hint: Option<String>,
 }
 /// Build the user prompt for the narrate_prep pass.
 ///
 /// Layout, in order:
 /// 1. `# Speaker roster` — only when `characters` is non-empty; one
 ///    bullet per speaker in the form "- `slug` — name (hint)".
 /// 2. `# Prose to annotate` — `prose`, inserted verbatim.
 /// 3. `# Task` — the fixed closing instruction.
 fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String {
    // Pre-size for the prose plus headroom for the fixed headings and
    // a small roster, to avoid repeated regrowth while appending.
    let mut out = String::with_capacity(prose.len() + 512);
    if !characters.is_empty() {
        out.push_str("# Speaker roster\n\n");
        out.push_str(
            "Use these slugs in `[voice:<slug>]\"...\"[/voice]` wrappers on dialogue. \
             Leave dialogue without a clear roster speaker unwrapped (the narrator \
             voice will read it).\n\n",
        );
        for c in characters {
            out.push_str("- `");
            out.push_str(&c.slug);
            out.push_str("` — ");
            out.push_str(&c.name);
            // The parenthesised hint is skipped when it is absent or
            // blank after trimming, so no empty "()" is emitted.
            if let Some(h) = &c.hint {
                if !h.trim().is_empty() {
                    out.push_str(" (");
                    out.push_str(h.trim());
                    out.push(')');
                }
            }
            out.push('\n');
        }
        out.push('\n');
    }
    out.push_str("# Prose to annotate\n\n");
    out.push_str(prose);
    // NOTE(review): the task text below asks for `[voice:<slug>]`
    // wrappers unconditionally, i.e. also when `characters` is empty
    // and no roster section was written — confirm that is intended.
    // NOTE(review): "Do not change any word" reads stricter than
    // NARRATE_PREP_DIRECTIVE, which permits sparing narrator
    // stumbles — confirm which instruction should win.
    out.push_str(
-        "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, and \
+        "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
-         `[scene]` markers inserted at natural narration beats. Do not change \
+         `[scene]` markers and `[voice:<slug>]\"...\"[/voice]` dialogue wrappers \
-         any word. Do not skip any sentence. Return only the annotated prose.\n",
+         inserted appropriately. Do not change any word. Do not skip any \
         sentence. Return only the annotated prose.\n",
    );
    out
 }
--- a/skald/Cargo.toml
+++ b/skald/Cargo.toml
@ -26,3 +26,4 @@ tracing-subscriber = { workspace = true }
 chrono = { workspace = true }
 uuid = { workspace = true }
 maud = { workspace = true }
 regex = { workspace = true }
--- a/skald/src/narrate.rs
+++ b/skald/src/narrate.rs
@ -86,9 +86,26 @@ pub async fn run(
    .fetch_one(&pool)
    .await?;
    // Resolve any [voice:<character-slug>]...[/voice] tags in the
    // annotated text into [voice:<kokoro-voice-name>]...[/voice] so
    // the Kokoro server only ever sees real voice ids. Only kicks
    // in for kokoro-routed renders; F5 voice-tag handling isn't
    // implemented and any tags pass through unchanged.
    let gen_text = if voice.source.starts_with("kokoro") {
        substitute_speaker_voices(
            &pool,
            chapter.story_id,
            &chapter.body_for_tts,
            ref_audio_path.as_str(),
        )
        .await?
    } else {
        chapter.body_for_tts.clone()
    };
    let started = Instant::now();
    let req = SynthesizeRequest {
-        gen_text: chapter.body_for_tts.clone(),
+        gen_text,
        ref_audio_path,
        ref_text: voice.reference_text.clone(),
        output_filename,
@ -200,6 +217,69 @@ async fn resolve_voice(
        .ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
 }
 /// Rewrite `[voice:<character-slug>]...[/voice]` tags in the
 /// annotated prose so every opening tag names a Kokoro voice.
 ///
 /// A slug that maps to a character with a Kokoro voice becomes
 /// `[voice:<kokoro-voice-name>]`. Anything else — a character with
 /// no `voice_id`, a character whose voice is not a Kokoro voice, a
 /// slug with no character row in the story, or a malformed slug —
 /// becomes `[voice:<narrator_voice_name>]` and is logged at warn
 /// level. The tag is rewritten rather than removed so the matching
 /// `[/voice]` never ends up orphaned, and a missed character row
 /// never fails a render.
 ///
 /// Slug matching is tolerant of model formatting drift: surrounding
 /// whitespace is ignored and case is folded, so `[voice: Dyatlov]`
 /// resolves the same as `[voice:dyatlov]`.
 ///
 /// `[/voice]` closing tags and all other text are left untouched.
 ///
 /// # Errors
 /// Returns the database error if the roster query fails.
 async fn substitute_speaker_voices(
    pool: &PgPool,
    story_id: Uuid,
    text: &str,
    narrator_voice_name: &str,
 ) -> anyhow::Result<String> {
    // Tag detection is cheap; only run the DB query if at least one
    // [voice:...] appears.
    if !text.contains("[voice:") {
        return Ok(text.to_string());
    }
    let rows: Vec<(String, Option<String>)> = sqlx::query_as(
        "SELECT c.slug, v.reference_path
         FROM characters c
         LEFT JOIN voices v ON v.id = c.voice_id
         WHERE c.story_id = $1 AND c.slug IS NOT NULL",
    )
    .bind(story_id)
    .fetch_all(pool)
    .await?;
    // For kokoro voices, reference_path stores the voice name
    // directly (e.g. "am_onyx"). F5-style paths start with '/';
    // those (and characters with no voice at all) are left out of
    // the map so they take the narrator fallback. Keys are
    // normalised the same way tag slugs are at lookup time.
    let slug_to_voice: std::collections::HashMap<String, String> = rows
        .into_iter()
        .filter_map(|(slug, ref_path)| {
            let voice_name = ref_path?;
            (!voice_name.starts_with('/')).then(|| (slug.trim().to_lowercase(), voice_name))
        })
        .collect();
    tracing::info!(
        story_id = %story_id,
        mapped_speakers = slug_to_voice.len(),
        narrator_voice = narrator_voice_name,
        "substituting [voice:slug] markers",
    );
    Ok(rewrite_voice_tags(text, &slug_to_voice, narrator_voice_name))
 }
 /// Pure text half of [`substitute_speaker_voices`]: replace every
 /// `[voice:<slug>]` opening tag using `slug_to_voice` (keys already
 /// lowercased), falling back to `narrator_voice_name` for any slug
 /// that is not in the map.
 fn rewrite_voice_tags(
    text: &str,
    slug_to_voice: &std::collections::HashMap<String, String>,
    narrator_voice_name: &str,
 ) -> String {
    // Capture whatever sits between "[voice:" and the closing "]" on
    // a single line, minus surrounding whitespace. Excluding '[' and
    // newlines keeps an unterminated "[voice:" from swallowing a
    // later, unrelated marker such as "[breath]".
    let re = regex::Regex::new(r"\[voice:\s*([^\[\]\n]*?)\s*\]")
        .expect("voice-tag pattern is a valid static regex");
    re.replace_all(text, |caps: &regex::Captures<'_>| {
        let slug = caps[1].to_lowercase();
        match slug_to_voice.get(&slug) {
            Some(voice_name) => format!("[voice:{voice_name}]"),
            None => {
                tracing::warn!(slug = %slug, "no voice for speaker; dialogue falls back to narrator");
                // Replace with the narrator voice so the segment
                // still gets a voice tag (so Kokoro's segment
                // parser doesn't see an orphan [/voice]).
                format!("[voice:{narrator_voice_name}]")
            }
        }
    })
    .into_owned()
 }
 /// Pick the engine base URL for a given voice.source. Voices whose
 /// source starts with "kokoro" route to KOKORO_URL; everything else
 /// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.
--- a/skald/src/narrate_prep.rs
+++ b/skald/src/narrate_prep.rs
@ -15,7 +15,7 @@ use chrono::Utc;
 use skald_core::authors::{self, AuthorWithRevision};
 use skald_core::config::ForgeConfig;
 use skald_core::db;
-use skald_core::forge::{Forge, PassKind, PassOutput};
+use skald_core::forge::{CharacterSpeaker, Forge, PassKind, PassOutput};
 use sqlx::PgPool;
 use uuid::Uuid;
@ -60,8 +60,15 @@ pub async fn run(
    .fetch_one(&pool)
    .await?;
    let characters = load_speakers(&pool, chapter.story_id).await?;
    if !characters.is_empty() {
        tracing::info!(speaker_count = characters.len(), "speaker roster loaded");
    }
    let started = Instant::now();
-    let out_res = forge.narrate_prep(&chapter.body_md, author.as_ref()).await;
+    let out_res = forge
        .narrate_prep(&chapter.body_md, author.as_ref(), &characters)
        .await;
    let elapsed = started.elapsed();
    let out: PassOutput = match out_res {
@ -179,6 +186,32 @@ fn count_beats(s: &str) -> usize {
    n
 }
 /// Load the speaker roster for `story_id`, ordered by character
 /// name so the roster is stable between runs.
 ///
 /// Only characters with a slug are returned. Each row's `key_facts`
 /// is condensed into a short single-line hint (see
 /// [`speaker_hint`]); characters with no usable key_facts get
 /// `hint: None`.
 ///
 /// # Errors
 /// Returns the database error if the query fails.
 async fn load_speakers(pool: &PgPool, story_id: Uuid) -> anyhow::Result<Vec<CharacterSpeaker>> {
    // Only characters with a slug get into the speaker roster — a
    // null slug means we haven't curated the character yet, and we
    // don't want the LLM inventing dialogue-attribution slugs.
    let rows: Vec<(String, String, Option<String>)> = sqlx::query_as(
        "SELECT slug, name, key_facts FROM characters
         WHERE story_id = $1 AND slug IS NOT NULL
         ORDER BY name",
    )
    .bind(story_id)
    .fetch_all(pool)
    .await?;
    Ok(rows
        .into_iter()
        .map(|(slug, name, key_facts)| CharacterSpeaker {
            slug,
            name,
            hint: key_facts.as_deref().and_then(speaker_hint),
        })
        .collect())
 }
 /// Condense free-form `key_facts` into a roster hint: runs of
 /// whitespace (including newlines) collapse to single spaces, then
 /// the text is cut to at most 120 characters. Returns `None` when
 /// nothing but whitespace was supplied.
 ///
 /// Collapsing first matters because the hint is embedded in a
 /// one-line roster bullet; a raw newline would split the bullet,
 /// and leading whitespace would otherwise eat into the budget.
 fn speaker_hint(key_facts: &str) -> Option<String> {
    // Keep the hint short; the LLM mostly needs the name + a tiny
    // disambiguator. ~120 chars of key_facts is plenty.
    const MAX_HINT_CHARS: usize = 120;
    let collapsed = key_facts.split_whitespace().collect::<Vec<_>>().join(" ");
    if collapsed.is_empty() {
        return None;
    }
    // Truncate on char boundaries (never mid-codepoint), then drop a
    // space the cut may have left dangling at the end.
    let truncated: String = collapsed.chars().take(MAX_HINT_CHARS).collect();
    Some(truncated.trim_end().to_string())
 }
 fn load_forge_config() -> anyhow::Result<ForgeConfig> {
    let base_url = std::env::var("CLAWDFORGE_URL")
        .context("CLAWDFORGE_URL not set")?;