multi-voice: per-character dialogue rendering
Schema: characters.voice_id + characters.slug (migration 0007). voice_id is an FK to voices(id); slug is the stable lowercase token the narrate_prep pass uses inside [voice:slug]...[/voice].

Forge::narrate_prep now takes &[CharacterSpeaker]. The system prompt is expanded to instruct the author to wrap dialogue lines in voice tags based on a roster supplied in the user prompt (slug + name + short hint from key_facts). Unattributed dialogue stays unwrapped and inherits the narrator voice.

skald narrate substitutes [voice:<character-slug>] → [voice:<kokoro-voice-name>] right before sending to Kokoro, using characters.voice_id JOIN voices.reference_path as the map. Slugs with no voice or no character row fall back to the narrator voice defensively (logged as warn).

kokoro_server.py v0.4: the splitter recognises [voice:X]...[/voice] blocks at the paragraph level. Each text node carries an optional voice attribution; the renderer feeds it to Kokoro per-segment. Outside voice blocks the request's default voice is used. voices_used is reported back so callers can verify that multi-voice actually ran.

Only kokoro-routed renders pre-process voice tags; F5 paths leave the tags in place (F5 multi-voice is not implemented). Defensive fallback: orphan/unclosed [/voice] markers are silently absorbed rather than failing the render.
This commit is contained in:
parent
330bc8bde2
commit
c9bd38034c
6 changed files with 186 additions and 10 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -1896,6 +1896,7 @@ dependencies = [
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"maud",
|
"maud",
|
||||||
|
"regex",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"skald-core",
|
"skald-core",
|
||||||
|
|
|
||||||
16
migrations/0007_characters_voice.sql
Normal file
16
migrations/0007_characters_voice.sql
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
-- Per-character voice assignment for multi-voice audiobook rendering.
--
-- voice_id
--   NULL means "use the story's default narrator voice for this character's
--   dialogue" (effectively no voice change vs the narrator). A non-NULL
--   voice_id pins the character's dialogue to that specific voice.
--   ON DELETE SET NULL: deleting a voice reverts its characters to the
--   narrator instead of blocking the delete.
--
-- slug
--   The stable lowercase token the narrate_prep pass uses inside
--   [voice:<slug>]...[/voice] markers. Distinct from name so
--   "Anatoly Dyatlov" can carry slug "dyatlov" without re-deriving on every
--   render. Because the slug is embedded verbatim in a bracketed tag, its
--   alphabet is restricted to lowercase letters, digits, '_' and '-'; a slug
--   containing ']', whitespace or uppercase could never round-trip through
--   the tag syntax. NULL (character not curated yet) satisfies the CHECK.
ALTER TABLE characters
    ADD COLUMN voice_id uuid REFERENCES voices(id) ON DELETE SET NULL,
    ADD COLUMN slug text
        CONSTRAINT characters_slug_format CHECK (slug ~ '^[a-z0-9_-]+$');

-- Partial index: only characters that actually have a voice assigned.
CREATE INDEX idx_characters_voice ON characters(voice_id) WHERE voice_id IS NOT NULL;

-- A slug identifies at most one character per story; NULL slugs are exempt.
CREATE UNIQUE INDEX idx_characters_story_slug ON characters(story_id, slug) WHERE slug IS NOT NULL;
|
||||||
|
|
@ -192,14 +192,20 @@ impl Forge {
|
||||||
/// Orson Black places beats differently than another author
|
/// Orson Black places beats differently than another author
|
||||||
/// would. Replace-mode if author is set; Append otherwise.
|
/// would. Replace-mode if author is set; Append otherwise.
|
||||||
///
|
///
|
||||||
|
/// `characters` is the story's character roster. When provided,
|
||||||
|
/// the system prompt instructs the model to wrap dialogue in
|
||||||
|
/// `[voice:<slug>]"..."[/voice]` for multi-voice rendering. The
|
||||||
|
/// slug is mapped to a Kokoro voice id by skald's narrate path.
|
||||||
|
///
|
||||||
/// Hard rule the system prompt enforces: do not change a word
|
/// Hard rule the system prompt enforces: do not change a word
|
||||||
/// of prose. Tags are additive only.
|
/// of prose. Tags are additive only.
|
||||||
pub async fn narrate_prep(
|
pub async fn narrate_prep(
|
||||||
&self,
|
&self,
|
||||||
prose: &str,
|
prose: &str,
|
||||||
author: Option<&AuthorWithRevision>,
|
author: Option<&AuthorWithRevision>,
|
||||||
|
characters: &[CharacterSpeaker],
|
||||||
) -> anyhow::Result<PassOutput> {
|
) -> anyhow::Result<PassOutput> {
|
||||||
let user_prompt = narrate_prep_user_prompt(prose);
|
let user_prompt = narrate_prep_user_prompt(prose, characters);
|
||||||
let (system, mode) = match author {
|
let (system, mode) = match author {
|
||||||
Some(a) => {
|
Some(a) => {
|
||||||
let scaffold = a
|
let scaffold = a
|
||||||
|
|
@ -339,9 +345,9 @@ const HOUSE_CLEANUP_SYSTEM: &str = "You are a copy editor polishing a draft chap
|
||||||
|
|
||||||
const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }.";
|
const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You compare a parent story and a new chapter against the bible. You flag continuity drift, character voice shift, retconned facts, dropped threads, timeline contradictions. You return STRUCTURED JSON ONLY — no commentary, no preamble. The exact shape: { \"findings\": [ { \"severity\": \"info\"|\"warn\"|\"crit\", \"area\": \"character\"|\"continuity\"|\"tone\"|\"fact\"|\"timeline\"|\"other\", \"body\": \"...\" } ] }. If no findings, return { \"findings\": [] }.";
|
||||||
|
|
||||||
const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices.";
|
const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:<slug>]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices.";
|
||||||
|
|
||||||
const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary.";
|
const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:<slug>]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary.";
|
||||||
|
|
||||||
// ─── User-prompt builders ───────────────────────────────────────
|
// ─── User-prompt builders ───────────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -376,14 +382,53 @@ fn gen_user_prompt(
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
fn narrate_prep_user_prompt(prose: &str) -> String {
|
/// One entry of a story's speaker roster, handed to `narrate_prep` so the
/// LLM knows which slugs it may use in `[voice:<slug>]` tags.
///
/// Rows come from skald's `characters` table. `PartialEq`/`Eq` are derived so
/// rosters can be compared directly (e.g. in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharacterSpeaker {
    /// Token placed inside `[voice:<slug>]` for this speaker.
    pub slug: String,
    /// Display name shown next to the slug in the roster.
    pub name: String,
    /// Short note (about one sentence) that helps the LLM decide who is
    /// speaking when the prose only says "she said". Derived from
    /// `characters.key_facts`, shortened; `None` when there is nothing
    /// useful to say.
    pub hint: Option<String>,
}
|
||||||
|
|
||||||
|
fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String {
|
||||||
let mut out = String::with_capacity(prose.len() + 512);
|
let mut out = String::with_capacity(prose.len() + 512);
|
||||||
|
|
||||||
|
if !characters.is_empty() {
|
||||||
|
out.push_str("# Speaker roster\n\n");
|
||||||
|
out.push_str(
|
||||||
|
"Use these slugs in `[voice:<slug>]\"...\"[/voice]` wrappers on dialogue. \
|
||||||
|
Leave dialogue without a clear roster speaker unwrapped (the narrator \
|
||||||
|
voice will read it).\n\n",
|
||||||
|
);
|
||||||
|
for c in characters {
|
||||||
|
out.push_str("- `");
|
||||||
|
out.push_str(&c.slug);
|
||||||
|
out.push_str("` — ");
|
||||||
|
out.push_str(&c.name);
|
||||||
|
if let Some(h) = &c.hint {
|
||||||
|
if !h.trim().is_empty() {
|
||||||
|
out.push_str(" (");
|
||||||
|
out.push_str(h.trim());
|
||||||
|
out.push(')');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
out.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
out.push_str("# Prose to annotate\n\n");
|
out.push_str("# Prose to annotate\n\n");
|
||||||
out.push_str(prose);
|
out.push_str(prose);
|
||||||
out.push_str(
|
out.push_str(
|
||||||
"\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, and \
|
"\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
|
||||||
`[scene]` markers inserted at natural narration beats. Do not change \
|
`[scene]` markers and `[voice:<slug>]\"...\"[/voice]` dialogue wrappers \
|
||||||
any word. Do not skip any sentence. Return only the annotated prose.\n",
|
inserted appropriately. Do not change any word. Do not skip any \
|
||||||
|
sentence. Return only the annotated prose.\n",
|
||||||
);
|
);
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,3 +26,4 @@ tracing-subscriber = { workspace = true }
|
||||||
chrono = { workspace = true }
|
chrono = { workspace = true }
|
||||||
uuid = { workspace = true }
|
uuid = { workspace = true }
|
||||||
maud = { workspace = true }
|
maud = { workspace = true }
|
||||||
|
regex = { workspace = true }
|
||||||
|
|
|
||||||
|
|
@ -86,9 +86,26 @@ pub async fn run(
|
||||||
.fetch_one(&pool)
|
.fetch_one(&pool)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
// Resolve any [voice:<character-slug>]...[/voice] tags in the
|
||||||
|
// annotated text into [voice:<kokoro-voice-name>]...[/voice] so
|
||||||
|
// the Kokoro server only ever sees real voice ids. Only kicks
|
||||||
|
// in for kokoro-routed renders; F5 voice-tag handling isn't
|
||||||
|
// implemented and any tags pass through unchanged.
|
||||||
|
let gen_text = if voice.source.starts_with("kokoro") {
|
||||||
|
substitute_speaker_voices(
|
||||||
|
&pool,
|
||||||
|
chapter.story_id,
|
||||||
|
&chapter.body_for_tts,
|
||||||
|
ref_audio_path.as_str(),
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
} else {
|
||||||
|
chapter.body_for_tts.clone()
|
||||||
|
};
|
||||||
|
|
||||||
let started = Instant::now();
|
let started = Instant::now();
|
||||||
let req = SynthesizeRequest {
|
let req = SynthesizeRequest {
|
||||||
gen_text: chapter.body_for_tts.clone(),
|
gen_text,
|
||||||
ref_audio_path,
|
ref_audio_path,
|
||||||
ref_text: voice.reference_text.clone(),
|
ref_text: voice.reference_text.clone(),
|
||||||
output_filename,
|
output_filename,
|
||||||
|
|
@ -200,6 +217,69 @@ async fn resolve_voice(
|
||||||
.ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
|
.ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rewrite `[voice:<character-slug>]...[/voice]` tags in the
|
||||||
|
/// annotated prose to use the actual Kokoro voice name for that
|
||||||
|
/// character. If a character has no voice_id assigned, or if the
|
||||||
|
/// slug doesn't match any character in the story, the tag is
|
||||||
|
/// stripped (dialogue falls back to the narrator voice) rather
|
||||||
|
/// than failing — defensive default so a missed character row
|
||||||
|
/// doesn't break a render.
|
||||||
|
async fn substitute_speaker_voices(
|
||||||
|
pool: &PgPool,
|
||||||
|
story_id: Uuid,
|
||||||
|
text: &str,
|
||||||
|
narrator_voice_name: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
// Tag detection is cheap; only run the DB query if at least one
|
||||||
|
// [voice:...] appears.
|
||||||
|
if !text.contains("[voice:") {
|
||||||
|
return Ok(text.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
let rows: Vec<(String, Option<String>)> = sqlx::query_as(
|
||||||
|
"SELECT c.slug, v.reference_path
|
||||||
|
FROM characters c
|
||||||
|
LEFT JOIN voices v ON v.id = c.voice_id
|
||||||
|
WHERE c.story_id = $1 AND c.slug IS NOT NULL",
|
||||||
|
)
|
||||||
|
.bind(story_id)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
let mut slug_to_voice = std::collections::HashMap::<String, String>::new();
|
||||||
|
for (slug, ref_path) in rows {
|
||||||
|
if let Some(p) = ref_path {
|
||||||
|
// For kokoro voices, reference_path stores the voice
|
||||||
|
// name directly (e.g. "am_onyx"). F5-style paths start
|
||||||
|
// with '/', skip those.
|
||||||
|
if !p.starts_with('/') {
|
||||||
|
slug_to_voice.insert(slug, p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::info!(
|
||||||
|
story_id = %story_id,
|
||||||
|
mapped_speakers = slug_to_voice.len(),
|
||||||
|
narrator_voice = narrator_voice_name,
|
||||||
|
"substituting [voice:slug] markers",
|
||||||
|
);
|
||||||
|
|
||||||
|
let re = regex::Regex::new(r"\[voice:([a-z0-9_-]+)\]").unwrap();
|
||||||
|
let out = re.replace_all(text, |caps: ®ex::Captures<'_>| {
|
||||||
|
let slug = &caps[1];
|
||||||
|
match slug_to_voice.get(slug) {
|
||||||
|
Some(voice_name) => format!("[voice:{voice_name}]"),
|
||||||
|
None => {
|
||||||
|
tracing::warn!(slug = %slug, "no voice for speaker; dialogue falls back to narrator");
|
||||||
|
// Replace with the narrator voice so the segment
|
||||||
|
// still gets a voice tag (so Kokoro's segment
|
||||||
|
// parser doesn't see an orphan [/voice]).
|
||||||
|
format!("[voice:{narrator_voice_name}]")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Ok(out.into_owned())
|
||||||
|
}
|
||||||
|
|
||||||
/// Pick the engine base URL for a given voice.source. Voices whose
|
/// Pick the engine base URL for a given voice.source. Voices whose
|
||||||
/// source starts with "kokoro" route to KOKORO_URL; everything else
|
/// source starts with "kokoro" route to KOKORO_URL; everything else
|
||||||
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.
|
/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy.
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ use chrono::Utc;
|
||||||
use skald_core::authors::{self, AuthorWithRevision};
|
use skald_core::authors::{self, AuthorWithRevision};
|
||||||
use skald_core::config::ForgeConfig;
|
use skald_core::config::ForgeConfig;
|
||||||
use skald_core::db;
|
use skald_core::db;
|
||||||
use skald_core::forge::{Forge, PassKind, PassOutput};
|
use skald_core::forge::{CharacterSpeaker, Forge, PassKind, PassOutput};
|
||||||
use sqlx::PgPool;
|
use sqlx::PgPool;
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
|
@ -60,8 +60,15 @@ pub async fn run(
|
||||||
.fetch_one(&pool)
|
.fetch_one(&pool)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
let characters = load_speakers(&pool, chapter.story_id).await?;
|
||||||
|
if !characters.is_empty() {
|
||||||
|
tracing::info!(speaker_count = characters.len(), "speaker roster loaded");
|
||||||
|
}
|
||||||
|
|
||||||
let started = Instant::now();
|
let started = Instant::now();
|
||||||
let out_res = forge.narrate_prep(&chapter.body_md, author.as_ref()).await;
|
let out_res = forge
|
||||||
|
.narrate_prep(&chapter.body_md, author.as_ref(), &characters)
|
||||||
|
.await;
|
||||||
let elapsed = started.elapsed();
|
let elapsed = started.elapsed();
|
||||||
|
|
||||||
let out: PassOutput = match out_res {
|
let out: PassOutput = match out_res {
|
||||||
|
|
@ -179,6 +186,32 @@ fn count_beats(s: &str) -> usize {
|
||||||
n
|
n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn load_speakers(pool: &PgPool, story_id: Uuid) -> anyhow::Result<Vec<CharacterSpeaker>> {
|
||||||
|
// Only characters with a slug get into the speaker roster — a
|
||||||
|
// null slug means we haven't curated the character yet, and we
|
||||||
|
// don't want the LLM inventing dialogue-attribution slugs.
|
||||||
|
let rows: Vec<(String, String, Option<String>)> = sqlx::query_as(
|
||||||
|
"SELECT slug, name, key_facts FROM characters
|
||||||
|
WHERE story_id = $1 AND slug IS NOT NULL
|
||||||
|
ORDER BY name",
|
||||||
|
)
|
||||||
|
.bind(story_id)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(rows
|
||||||
|
.into_iter()
|
||||||
|
.map(|(slug, name, key_facts)| {
|
||||||
|
// Keep the hint short; the LLM mostly needs the name +
|
||||||
|
// a tiny disambiguator. ~120 chars of key_facts is
|
||||||
|
// plenty.
|
||||||
|
let hint = key_facts
|
||||||
|
.filter(|kf| !kf.trim().is_empty())
|
||||||
|
.map(|kf| kf.chars().take(120).collect::<String>());
|
||||||
|
CharacterSpeaker { slug, name, hint }
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
fn load_forge_config() -> anyhow::Result<ForgeConfig> {
|
fn load_forge_config() -> anyhow::Result<ForgeConfig> {
|
||||||
let base_url = std::env::var("CLAWDFORGE_URL")
|
let base_url = std::env::var("CLAWDFORGE_URL")
|
||||||
.context("CLAWDFORGE_URL not set")?;
|
.context("CLAWDFORGE_URL not set")?;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue