multi-voice: per-character dialogue rendering

Schema: characters.voice_id + characters.slug (migration 0007).
voice_id is FK to voices(id); slug is the stable lowercase token
the narrate_prep pass uses inside [voice:slug]...[/voice].

Forge::narrate_prep takes &[CharacterSpeaker]. System prompt
expanded to instruct the author to wrap dialogue lines in voice
tags based on a roster supplied in the user prompt (slug + name +
short hint from key_facts). Unattributed dialogue stays unwrapped
and inherits the narrator voice.

skald narrate substitutes [voice:<character-slug>] →
[voice:<kokoro-voice-name>] right before sending to Kokoro, using
characters.voice_id JOIN voices.reference_path as the map. Slugs
with no voice or no character row fall back to the narrator voice
defensively (logged as warn).

kokoro_server.py v0.4: splitter recognises [voice:X]...[/voice]
blocks at the paragraph level. Each text node carries an optional
voice attribution; renderer feeds it to Kokoro per-segment. Outside
voice blocks the request's default voice is used. voices_used is
reported back so callers can verify multi-voice actually ran.

Only kokoro-routed renders pre-process voice tags; F5 paths leave
the tags in place (F5 multi-voice not implemented). Defensive
fallback: orphan [/voice] closers and unclosed [voice:X] openers are
silently absorbed rather than failing the render.
This commit is contained in:
Kayos 2026-05-14 08:35:33 -07:00
parent 330bc8bde2
commit c9bd38034c
6 changed files with 186 additions and 10 deletions

1
Cargo.lock generated
View file

@ -1896,6 +1896,7 @@ dependencies = [
"chrono", "chrono",
"clap", "clap",
"maud", "maud",
"regex",
"serde", "serde",
"serde_json", "serde_json",
"skald-core", "skald-core",

View file

@ -0,0 +1,16 @@
-- Per-character voice assignment for multi-voice audiobook rendering.
-- A NULL voice_id means "use the story's default narrator voice for
-- this character's dialogue" (effectively no voice change vs the
-- narrator). A non-NULL voice_id pins the character's dialogue to
-- that specific voice.
--
-- slug is the stable lowercase token the narrate_prep pass uses
-- inside [voice:<slug>]...[/voice] markers. Distinct from name so
-- "Anatoly Dyatlov" can carry slug "dyatlov" without re-deriving on
-- every render.
ALTER TABLE characters
-- ON DELETE SET NULL: deleting a voice does not delete or block the
-- character; its voice_id reverts to NULL (narrator voice, per above).
ADD COLUMN voice_id uuid REFERENCES voices(id) ON DELETE SET NULL,
-- Nullable, no default: existing rows start with no slug.
ADD COLUMN slug text;
-- Partial index: only characters that actually have a voice assigned
-- are indexed.
CREATE INDEX idx_characters_voice ON characters(voice_id) WHERE voice_id IS NOT NULL;
-- A slug may be used at most once per story. Rows with a NULL slug are
-- left out of the index, so any number of characters may have no slug.
-- NOTE(review): nothing here enforces the "lowercase" convention
-- described above — confirm it is enforced wherever slugs are written.
CREATE UNIQUE INDEX idx_characters_story_slug ON characters(story_id, slug) WHERE slug IS NOT NULL;

View file

@ -192,14 +192,20 @@ impl Forge {
/// Orson Black places beats differently than another author /// Orson Black places beats differently than another author
/// would. Replace-mode if author is set; Append otherwise. /// would. Replace-mode if author is set; Append otherwise.
/// ///
/// `characters` is the story's character roster. When provided,
/// the system prompt instructs the model to wrap dialogue in
/// `[voice:<slug>]"..."[/voice]` for multi-voice rendering. The
/// slug is mapped to a Kokoro voice id by skald's narrate path.
///
/// Hard rule the system prompt enforces: do not change a word /// Hard rule the system prompt enforces: do not change a word
/// of prose. Tags are additive only. /// of prose. Tags are additive only.
pub async fn narrate_prep( pub async fn narrate_prep(
&self, &self,
prose: &str, prose: &str,
author: Option<&AuthorWithRevision>, author: Option<&AuthorWithRevision>,
characters: &[CharacterSpeaker],
) -> anyhow::Result<PassOutput> { ) -> anyhow::Result<PassOutput> {
let user_prompt = narrate_prep_user_prompt(prose); let user_prompt = narrate_prep_user_prompt(prose, characters);
let (system, mode) = match author { let (system, mode) = match author {
Some(a) => { Some(a) => {
let scaffold = a let scaffold = a
@ -339,9 +345,9 @@ const HOUSE_CLEANUP_SYSTEM: &str = "You are a copy editor polishing a draft chap
const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }."; const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }.";
const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices."; const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:<slug>]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. 
If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices.";
const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary."; const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:<slug>]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary.";
// ─── User-prompt builders ─────────────────────────────────────── // ─── User-prompt builders ───────────────────────────────────────
@ -376,14 +382,53 @@ fn gen_user_prompt(
out out
} }
fn narrate_prep_user_prompt(prose: &str) -> String { /// One row of the story's character roster, passed to narrate_prep
/// so the LLM knows what speaker slugs to use in `[voice:slug]`
/// tags. Built from skald's characters table.
#[derive(Debug, Clone)]
pub struct CharacterSpeaker {
    /// Token the model is told to put inside `[voice:<slug>]` tags;
    /// rendered in backticks at the start of the roster line.
    pub slug: String,
    /// Character name shown next to the slug in the roster.
    pub name: String,
    /// Short note (1 sentence) giving the LLM enough to disambiguate
    /// who's speaking when prose says "she said". Pulled from
    /// characters.key_facts but trimmed.
    pub hint: Option<String>,
}

/// Build the user prompt for the narrate_prep pass.
///
/// Layout, in order:
/// 1. `# Speaker roster` — only when `characters` is non-empty; one
///    bullet per speaker: `` - `slug` — name (hint) ``. The hint is
///    trimmed and omitted when blank.
/// 2. `# Prose to annotate` — `prose`, verbatim.
/// 3. `# Task` — the closing instruction.
///
/// With an empty roster the task asks for beat markers only and
/// explicitly forbids `[voice:...]` tags: there are no slugs the model
/// could legitimately use, so asking for dialogue wrappers would only
/// invite invented ones.
fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String {
    let mut out = String::with_capacity(prose.len() + 512);

    if !characters.is_empty() {
        out.push_str("# Speaker roster\n\n");
        out.push_str(
            "Use these slugs in `[voice:<slug>]\"...\"[/voice]` wrappers on dialogue. \
             Leave dialogue without a clear roster speaker unwrapped (the narrator \
             voice will read it).\n\n",
        );
        for c in characters {
            out.push_str("- `");
            out.push_str(&c.slug);
            out.push_str("` — ");
            out.push_str(&c.name);
            // Blank hints are dropped so the bullet never ends in "()".
            if let Some(h) = c.hint.as_deref().map(str::trim).filter(|h| !h.is_empty()) {
                out.push_str(" (");
                out.push_str(h);
                out.push(')');
            }
            out.push('\n');
        }
        out.push('\n');
    }

    out.push_str("# Prose to annotate\n\n");
    out.push_str(prose);

    if characters.is_empty() {
        // No roster: beat markers only, and say so explicitly.
        out.push_str(
            "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, and \
             `[scene]` markers inserted at natural narration beats. No speaker roster \
             was supplied, so do not add any `[voice:...]` tags. Do not change any \
             word. Do not skip any sentence. Return only the annotated prose.\n",
        );
    } else {
        out.push_str(
            "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
             `[scene]` markers and `[voice:<slug>]\"...\"[/voice]` dialogue wrappers \
             inserted appropriately. Do not change any word. Do not skip any \
             sentence. Return only the annotated prose.\n",
        );
    }
    out
}

View file

@ -26,3 +26,4 @@ tracing-subscriber = { workspace = true }
chrono = { workspace = true } chrono = { workspace = true }
uuid = { workspace = true } uuid = { workspace = true }
maud = { workspace = true } maud = { workspace = true }
regex = { workspace = true }

View file

@ -86,9 +86,26 @@ pub async fn run(
.fetch_one(&pool) .fetch_one(&pool)
.await?; .await?;
// Resolve any [voice:<character-slug>]...[/voice] tags in the
// annotated text into [voice:<kokoro-voice-name>]...[/voice] so
// the Kokoro server only ever sees real voice ids. Only kicks
// in for kokoro-routed renders; F5 voice-tag handling isn't
// implemented and any tags pass through unchanged.
let gen_text = if voice.source.starts_with("kokoro") {
substitute_speaker_voices(
&pool,
chapter.story_id,
&chapter.body_for_tts,
ref_audio_path.as_str(),
)
.await?
} else {
chapter.body_for_tts.clone()
};
let started = Instant::now(); let started = Instant::now();
let req = SynthesizeRequest { let req = SynthesizeRequest {
gen_text: chapter.body_for_tts.clone(), gen_text,
ref_audio_path, ref_audio_path,
ref_text: voice.reference_text.clone(), ref_text: voice.reference_text.clone(),
output_filename, output_filename,
@ -200,6 +217,69 @@ async fn resolve_voice(
.ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>")) .ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
} }
/// Rewrite `[voice:<character-slug>]...[/voice]` tags in the
/// annotated prose so each opening tag names the voice assigned to
/// that character.
///
/// The slug → voice map is read from `characters` LEFT JOIN `voices`
/// for `story_id`. A slug is mapped only when its voice has a
/// `reference_path` that does not start with '/' (path-style
/// references are treated as F5 voices and skipped).
///
/// An opening tag whose slug has no mapping — unknown slug, no
/// `voice_id`, or a path-style reference — is rewritten to
/// `[voice:<narrator_voice_name>]` and logged at warn level. The tag
/// is deliberately kept rather than stripped, so its `[/voice]`
/// closer is never left orphaned. Closing tags are not touched.
///
/// Slug matching is ASCII-case-insensitive: the tags are written by
/// an LLM, and a `[voice:Dyatlov]` must still resolve to the
/// `dyatlov` row instead of reaching the TTS server untranslated.
///
/// # Errors
///
/// Returns an error if the roster query fails.
async fn substitute_speaker_voices(
    pool: &PgPool,
    story_id: Uuid,
    text: &str,
    narrator_voice_name: &str,
) -> anyhow::Result<String> {
    // Tag detection is cheap; only run the DB query if at least one
    // [voice:...] appears.
    if !text.contains("[voice:") {
        return Ok(text.to_string());
    }

    let rows: Vec<(String, Option<String>)> = sqlx::query_as(
        "SELECT c.slug, v.reference_path
         FROM characters c
         LEFT JOIN voices v ON v.id = c.voice_id
         WHERE c.story_id = $1 AND c.slug IS NOT NULL",
    )
    .bind(story_id)
    .fetch_all(pool)
    .await?;

    // For kokoro voices, reference_path stores the voice name directly
    // (e.g. "am_onyx"). F5-style paths start with '/', skip those.
    // Keys are lowercased so lookups below are case-insensitive.
    let slug_to_voice: std::collections::HashMap<String, String> = rows
        .into_iter()
        .filter_map(|(slug, ref_path)| {
            let voice_name = ref_path.filter(|p| !p.starts_with('/'))?;
            Some((slug.to_ascii_lowercase(), voice_name))
        })
        .collect();

    tracing::info!(
        story_id = %story_id,
        mapped_speakers = slug_to_voice.len(),
        narrator_voice = narrator_voice_name,
        "substituting [voice:slug] markers",
    );

    let re = regex::Regex::new(r"\[voice:([A-Za-z0-9_-]+)\]")
        .expect("voice-tag pattern is a constant, valid regex");
    let out = re.replace_all(text, |caps: &regex::Captures<'_>| {
        let slug = caps[1].to_ascii_lowercase();
        match slug_to_voice.get(slug.as_str()) {
            Some(voice_name) => format!("[voice:{voice_name}]"),
            None => {
                tracing::warn!(slug = %slug, "no voice for speaker; dialogue falls back to narrator");
                // Replace with the narrator voice so the segment
                // still gets a voice tag (so Kokoro's segment
                // parser doesn't see an orphan [/voice]).
                format!("[voice:{narrator_voice_name}]")
            }
        }
    });
    Ok(out.into_owned())
}
/// Pick the engine base URL for a given voice.source. Voices whose /// Pick the engine base URL for a given voice.source. Voices whose
/// source starts with "kokoro" route to KOKORO_URL; everything else /// source starts with "kokoro" route to KOKORO_URL; everything else
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy. /// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.

View file

@ -15,7 +15,7 @@ use chrono::Utc;
use skald_core::authors::{self, AuthorWithRevision}; use skald_core::authors::{self, AuthorWithRevision};
use skald_core::config::ForgeConfig; use skald_core::config::ForgeConfig;
use skald_core::db; use skald_core::db;
use skald_core::forge::{Forge, PassKind, PassOutput}; use skald_core::forge::{CharacterSpeaker, Forge, PassKind, PassOutput};
use sqlx::PgPool; use sqlx::PgPool;
use uuid::Uuid; use uuid::Uuid;
@ -60,8 +60,15 @@ pub async fn run(
.fetch_one(&pool) .fetch_one(&pool)
.await?; .await?;
let characters = load_speakers(&pool, chapter.story_id).await?;
if !characters.is_empty() {
tracing::info!(speaker_count = characters.len(), "speaker roster loaded");
}
let started = Instant::now(); let started = Instant::now();
let out_res = forge.narrate_prep(&chapter.body_md, author.as_ref()).await; let out_res = forge
.narrate_prep(&chapter.body_md, author.as_ref(), &characters)
.await;
let elapsed = started.elapsed(); let elapsed = started.elapsed();
let out: PassOutput = match out_res { let out: PassOutput = match out_res {
@ -179,6 +186,32 @@ fn count_beats(s: &str) -> usize {
n n
} }
/// Load the speaker roster for `story_id` from the `characters` table.
///
/// Returns one [`CharacterSpeaker`] per character that has a slug,
/// ordered by name. Each hint is derived from `key_facts`: runs of
/// whitespace (including newlines) are collapsed to single spaces and
/// the result is cut to at most 120 characters; blank `key_facts`
/// yield no hint.
///
/// # Errors
///
/// Returns an error if the query fails.
async fn load_speakers(pool: &PgPool, story_id: Uuid) -> anyhow::Result<Vec<CharacterSpeaker>> {
    // Only characters with a slug get into the speaker roster — a
    // null slug means we haven't curated the character yet, and we
    // don't want the LLM inventing dialogue-attribution slugs.
    let rows: Vec<(String, String, Option<String>)> = sqlx::query_as(
        "SELECT slug, name, key_facts FROM characters
         WHERE story_id = $1 AND slug IS NOT NULL
         ORDER BY name",
    )
    .bind(story_id)
    .fetch_all(pool)
    .await?;

    Ok(rows
        .into_iter()
        .map(|(slug, name, key_facts)| {
            // Keep the hint short; the LLM mostly needs the name +
            // a tiny disambiguator. ~120 chars of key_facts is
            // plenty. Whitespace is collapsed first so a multi-line
            // key_facts cannot break the one-line roster bullet and
            // leading blanks don't eat into the 120-char budget.
            let hint = key_facts
                .map(|kf| kf.split_whitespace().collect::<Vec<_>>().join(" "))
                .filter(|kf| !kf.is_empty())
                .map(|kf| kf.chars().take(120).collect::<String>());
            CharacterSpeaker { slug, name, hint }
        })
        .collect())
}
fn load_forge_config() -> anyhow::Result<ForgeConfig> { fn load_forge_config() -> anyhow::Result<ForgeConfig> {
let base_url = std::env::var("CLAWDFORGE_URL") let base_url = std::env::var("CLAWDFORGE_URL")
.context("CLAWDFORGE_URL not set")?; .context("CLAWDFORGE_URL not set")?;