narrate: single-voice prep drops voice tags; GC superseded renders
Two fixes:

- narrate_prep in single-voice mode (empty character roster) was still handed the multi-voice directive, so the model invented [voice:<slug>] tags from character names in the prose. The narrate path neutralised them by falling back to the narrator, but it was log spam and a leak of intent. Single-voice now gets directive + house-system variants that forbid voice tags outright, and the user-prompt task line matches.
- Every narrate run wrote a fresh ~80MB WAV and never reclaimed the previous one, so re-renders piled up stale files. A successful render now deletes the WAVs of prior renders of the same chapter and nulls their output_path. Render history rows are kept; only the dead file pointer is cleared. Best-effort — cleanup failure never fails the render.
This commit is contained in:
parent
98233182fd
commit
c8c44a5d23
2 changed files with 123 additions and 10 deletions
|
|
@ -200,10 +200,12 @@ impl Forge {
|
||||||
/// Orson Black places beats differently than another author
|
/// Orson Black places beats differently than another author
|
||||||
/// would. Replace-mode if author is set; Append otherwise.
|
/// would. Replace-mode if author is set; Append otherwise.
|
||||||
///
|
///
|
||||||
/// `characters` is the story's character roster. When provided,
|
/// `characters` is the story's character roster. When non-empty,
|
||||||
/// the system prompt instructs the model to wrap dialogue in
|
/// the system prompt instructs the model to wrap dialogue in
|
||||||
/// `[voice:<slug>]"..."[/voice]` for multi-voice rendering. The
|
/// `[voice:<slug>]"..."[/voice]` for multi-voice rendering; the
|
||||||
/// slug is mapped to a Kokoro voice id by skald's narrate path.
|
/// slug is mapped to a Kokoro voice id by skald's narrate path.
|
||||||
|
/// An EMPTY roster selects single-voice mode — the prompt then
|
||||||
|
/// forbids `[voice:...]` tags entirely (one narrator, no cast).
|
||||||
///
|
///
|
||||||
/// Hard rule the system prompt enforces: do not change a word
|
/// Hard rule the system prompt enforces: do not change a word
|
||||||
/// of prose. Tags are additive only.
|
/// of prose. Tags are additive only.
|
||||||
|
|
@ -213,6 +215,12 @@ impl Forge {
|
||||||
author: Option<&AuthorWithRevision>,
|
author: Option<&AuthorWithRevision>,
|
||||||
characters: &[CharacterSpeaker],
|
characters: &[CharacterSpeaker],
|
||||||
) -> anyhow::Result<PassOutput> {
|
) -> anyhow::Result<PassOutput> {
|
||||||
|
// An empty character roster means single-voice narration —
|
||||||
|
// the whole chapter reads in one voice. In that mode the
|
||||||
|
// prompt must NOT invite `[voice:...]` tags, or the model
|
||||||
|
// invents speaker slugs from names in the prose that the
|
||||||
|
// narrate path then has to detect and neutralize.
|
||||||
|
let single_voice = characters.is_empty();
|
||||||
let user_prompt = narrate_prep_user_prompt(prose, characters);
|
let user_prompt = narrate_prep_user_prompt(prose, characters);
|
||||||
let (system, mode) = match author {
|
let (system, mode) = match author {
|
||||||
Some(a) => {
|
Some(a) => {
|
||||||
|
|
@ -221,13 +229,25 @@ impl Forge {
|
||||||
.system_template
|
.system_template
|
||||||
.as_deref()
|
.as_deref()
|
||||||
.unwrap_or(DEFAULT_AUTHOR_SCAFFOLD);
|
.unwrap_or(DEFAULT_AUTHOR_SCAFFOLD);
|
||||||
|
let directive = if single_voice {
|
||||||
|
NARRATE_PREP_DIRECTIVE_SINGLE
|
||||||
|
} else {
|
||||||
|
NARRATE_PREP_DIRECTIVE
|
||||||
|
};
|
||||||
let composed = scaffold
|
let composed = scaffold
|
||||||
.replace("{{display_name}}", &a.author.display_name)
|
.replace("{{display_name}}", &a.author.display_name)
|
||||||
.replace("{{pass_directive}}", NARRATE_PREP_DIRECTIVE)
|
.replace("{{pass_directive}}", directive)
|
||||||
.replace("{{soul}}", &a.revision.soul);
|
.replace("{{soul}}", &a.revision.soul);
|
||||||
(composed, SystemMode::Replace)
|
(composed, SystemMode::Replace)
|
||||||
}
|
}
|
||||||
None => (HOUSE_NARRATE_PREP_SYSTEM.to_string(), SystemMode::Append),
|
None => {
|
||||||
|
let house = if single_voice {
|
||||||
|
HOUSE_NARRATE_PREP_SYSTEM_SINGLE
|
||||||
|
} else {
|
||||||
|
HOUSE_NARRATE_PREP_SYSTEM
|
||||||
|
};
|
||||||
|
(house.to_string(), SystemMode::Append)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let body = RunRequest {
|
let body = RunRequest {
|
||||||
prompt: user_prompt,
|
prompt: user_prompt,
|
||||||
|
|
@ -395,8 +415,20 @@ const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You c
|
||||||
|
|
||||||
const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:<slug>]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices.";
|
const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:<slug>]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices.";
|
||||||
|
|
||||||
|
/// Pass directive for narrate-prep when the chapter is read by ONE voice.
///
/// This is the single-voice counterpart of [`NARRATE_PREP_DIRECTIVE`]:
/// `narrate_prep` substitutes it for `{{pass_directive}}` in the author
/// scaffold whenever the character roster is empty. Compared with the
/// multi-voice text, the speaker-voice-tag section is removed and an
/// explicit ban on `[voice:...]` tags is added. Merely omitting the
/// section is not enough — the model otherwise makes up
/// `[voice:<slug>]` tags from character names it sees in the prose, and
/// the narrate path then has to detect and neutralize them.
///
/// The literal is wrapped with `\` line continuations purely for
/// readability; the resulting string contains no extra whitespace.
const NARRATE_PREP_DIRECTIVE_SINGLE: &str = "\
This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for a \
SINGLE-narrator audiobook reading — the whole chapter, dialogue included, is read aloud \
in ONE voice. Two kinds of inserts are allowed:\n\n\
1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit \
silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where \
the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, \
on a paragraph that lands with weight.\n\n\
2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally \
stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* \
where the prose's pacing makes them feel right. Patterns: em-dash repetition \
(`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation \
(`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, \
technical terms, or moments where the narrator might genuinely catch herself. Avoid \
stumbling on emotional climaxes — those should land clean.\n\n\
Do NOT add `[voice:...]` speaker tags of any kind — there is one narrator, not a cast. \
Apart from stumbles, do NOT change a word of the original prose. Return the prose with \
beat markers and stumbles inline. No preamble. No commentary about your choices.";
|
||||||
|
|
||||||
const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:<slug>]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary.";
|
const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:<slug>]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary.";
|
||||||
|
|
||||||
|
/// House system prompt for narrate-prep when the chapter is read by ONE
/// voice.
///
/// Single-voice counterpart of [`HOUSE_NARRATE_PREP_SYSTEM`]:
/// `narrate_prep` uses it (in append mode) when no author is supplied
/// and the character roster is empty. It allows beat markers and rare
/// narrator stumbles only, and tells the model outright not to emit
/// `[voice:...]` speaker tags.
///
/// The literal is wrapped with `\` line continuations purely for
/// readability; the resulting string is a single line with no extra
/// whitespace.
const HOUSE_NARRATE_PREP_SYSTEM_SINGLE: &str = "\
You are a senior audiobook director annotating prose for a SINGLE-narrator reading. \
You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled \
narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using \
em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper \
nouns or hard words). Do NOT add `[voice:...]` speaker tags — the whole chapter is one \
voice. Apart from those stumbles you do NOT change a word of the prose. Return the prose \
verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary.";
|
||||||
|
|
||||||
const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite.";
|
const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite.";
|
||||||
|
|
||||||
// ─── User-prompt builders ───────────────────────────────────────
|
// ─── User-prompt builders ───────────────────────────────────────
|
||||||
|
|
@ -487,12 +519,22 @@ fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> Str
|
||||||
|
|
||||||
out.push_str("# Prose to annotate\n\n");
|
out.push_str("# Prose to annotate\n\n");
|
||||||
out.push_str(prose);
|
out.push_str(prose);
|
||||||
|
if characters.is_empty() {
|
||||||
|
out.push_str(
|
||||||
|
"\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
|
||||||
|
`[scene]` beat markers inserted appropriately. Do NOT add any \
|
||||||
|
`[voice:...]` tags — this is a single-voice reading. Do not \
|
||||||
|
change any word. Do not skip any sentence. Return only the \
|
||||||
|
annotated prose.\n",
|
||||||
|
);
|
||||||
|
} else {
|
||||||
out.push_str(
|
out.push_str(
|
||||||
"\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
|
"\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \
|
||||||
`[scene]` markers and `[voice:<slug>]\"...\"[/voice]` dialogue wrappers \
|
`[scene]` markers and `[voice:<slug>]\"...\"[/voice]` dialogue wrappers \
|
||||||
inserted appropriately. Do not change any word. Do not skip any \
|
inserted appropriately. Do not change any word. Do not skip any \
|
||||||
sentence. Return only the annotated prose.\n",
|
sentence. Return only the annotated prose.\n",
|
||||||
);
|
);
|
||||||
|
}
|
||||||
out
|
out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -161,6 +161,12 @@ pub async fn run(
|
||||||
.execute(&pool)
|
.execute(&pool)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
// This chapter now has a fresh canonical render. Prior render
|
||||||
|
// WAVs are dead weight — every re-render otherwise leaves its
|
||||||
|
// predecessor on disk forever. Reclaim it. Best-effort: a
|
||||||
|
// cleanup failure must never fail an otherwise-good render.
|
||||||
|
cleanup_superseded_renders(&pool, chapter_id, run_row_id).await;
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)",
|
"narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)",
|
||||||
chapter.n,
|
chapter.n,
|
||||||
|
|
@ -383,6 +389,71 @@ async fn apply_pronunciation_overrides(
|
||||||
Ok(out)
|
Ok(out)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Delete the WAV files of prior renders of this chapter and clear
|
||||||
|
/// their `output_path`. The newest succeeded render is the canonical
|
||||||
|
/// one; older renders are superseded the moment a new one lands, and
|
||||||
|
/// without this every re-render would leave a stale ~80MB file on
|
||||||
|
/// disk forever.
|
||||||
|
///
|
||||||
|
/// The `narration_runs` rows themselves are KEPT — engine, voice,
|
||||||
|
/// timing and status stay as render history. Only `output_path` is
|
||||||
|
/// nulled, so no row ever points at a file that no longer exists.
|
||||||
|
///
|
||||||
|
/// Best-effort throughout: this runs *after* the current render has
|
||||||
|
/// already been recorded as succeeded, so any failure here (a query
|
||||||
|
/// error, a permission problem on the audio dir) is logged and
|
||||||
|
/// swallowed — it must never turn a good render into a failed one.
|
||||||
|
async fn cleanup_superseded_renders(pool: &PgPool, chapter_id: Uuid, current_run: Uuid) {
|
||||||
|
// output_path is only ever set on the success UPDATE, so
|
||||||
|
// "output_path IS NOT NULL AND id != current" is exactly the set
|
||||||
|
// of prior completed renders.
|
||||||
|
let prior: Vec<(Uuid, String)> = match sqlx::query_as(
|
||||||
|
"SELECT id, output_path FROM narration_runs
|
||||||
|
WHERE chapter_id = $1 AND id <> $2 AND output_path IS NOT NULL",
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(current_run)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(rows) => rows,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = %e, "superseded-render cleanup: query failed, skipping");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for (run_id, output_path) in prior {
|
||||||
|
// output_path is the HTTP-facing path "/audio/<file>"; the
|
||||||
|
// `/audio` bind mount means that is also the on-disk path
|
||||||
|
// inside this container.
|
||||||
|
match std::fs::remove_file(&output_path) {
|
||||||
|
Ok(()) => {
|
||||||
|
tracing::info!(run_id = %run_id, path = %output_path, "removed superseded render");
|
||||||
|
}
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||||
|
// File already gone — still clear the dangling row.
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Could not delete — leave output_path intact rather
|
||||||
|
// than pointing the row at nothing.
|
||||||
|
tracing::warn!(
|
||||||
|
run_id = %run_id, path = %output_path, error = %e,
|
||||||
|
"superseded-render cleanup: could not delete file, leaving row intact",
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Err(e) = sqlx::query("UPDATE narration_runs SET output_path = NULL WHERE id = $1")
|
||||||
|
.bind(run_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
tracing::warn!(run_id = %run_id, error = %e, "superseded-render cleanup: could not null output_path");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Pick the engine base URL for a given voice.source.
|
/// Pick the engine base URL for a given voice.source.
|
||||||
/// kokoro_* → KOKORO_URL
|
/// kokoro_* → KOKORO_URL
|
||||||
/// tortoise_* → TORTOISE_URL
|
/// tortoise_* → TORTOISE_URL
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue