From 36922706a2c33f5d2ff1d5d089d60610eaad752d Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:40:59 -0700 Subject: [PATCH 01/13] engine/kokoro: question doubling + kludges notes Re-applies the Kokoro-specific hacks that main intentionally omits: - _emphasize_questions doubles '?' to '??' so the 82M's flat interrogative prosody gets a rising-pitch cue - engines/kokoro/hacks.md documents this and the other Kokoro- tuned bits (gap durations, lowercase-only respellings) with the 'remove when we move to a bigger model' marker Deploy from this branch to /mnt/cache/appdata/kokoro/build/ when you want the tuned version. Main's vanilla Kokoro is for reference / future cleanup. --- engines/kokoro/hacks.md | 48 ++++++++++++++++++++++++++++++++++++++++ engines/kokoro/server.py | 13 ++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 engines/kokoro/hacks.md diff --git a/engines/kokoro/hacks.md b/engines/kokoro/hacks.md new file mode 100644 index 0000000..0917827 --- /dev/null +++ b/engines/kokoro/hacks.md @@ -0,0 +1,48 @@ +# Kokoro engine — kludges branched off main + +This branch carries the engine-specific tweaks that don't generalise +to F5 / Tortoise. Each one is a real workaround for a real Kokoro-82M +limitation, not a stylistic choice — when we move to a bigger model +these should disappear. + +## 1. Doubled `??` for question prosody + +**File:** `server.py` — `_emphasize_questions` + `_QUESTION_RE`. + +Kokoro-82M's prosody on single `?` is flat — interrogatives read like +declaratives. The 82M parameter cap shows up here. Doubling the mark +to `??` triggers a noticeably stronger rising-pitch contour. + +Tried + works: `??`. Tried + worse: `?!` (sounds shouty), trailing +spaces (no effect). + +Remove when: upgrading to a bigger model OR a Kokoro version with +better prosody control. + +## 2. Paragraph / scene / breath gap durations + +**File:** `server.py` — `PARAGRAPH_GAP_S=0.7`, `SCENE_GAP_S=1.5`, +`BREATH_GAP_S=0.4`. 
+ +These were eyeballed against af_heart's natural pacing for long-form +prose. Other voices (e.g. am_michael's slower delivery) may want +shorter gaps; a per-voice override map would be more correct but +isn't worth the complexity yet. + +The 2026-05-14 feedback was "some pauses are a tad too long" — the +0.7/1.5/0.4 may want to drop to 0.5/1.2/0.3 if confirmed. + +## 3. Pronunciation respellings as ALL-LOWERCASE + +**File:** *(data, not code — pronunciation_overrides DB table)* + +Kokoro's misaki phonemizer treats consecutive uppercase letters as +initialisms ("PRIP-yat" → "P-R-I-P yat"). The seeded respellings in +`pronunciation_overrides WHERE phoneme_format='respelling'` must +therefore use lowercase syllabification: `prip-yat`, `dyat-loff`, +`bryu-hah-noff`. Stress marking is lost. + +For tortoise this constraint may not hold (different phonemizer); +the respelling format is currently kokoro-tuned. Future: per-engine +phoneme_format buckets, or have skald narrate pass the engine name +when selecting overrides. diff --git a/engines/kokoro/server.py b/engines/kokoro/server.py index 169cbbd..b0a6f9d 100644 --- a/engines/kokoro/server.py +++ b/engines/kokoro/server.py @@ -113,12 +113,23 @@ def _parse_tag(match: re.Match) -> float: return dur / 1000.0 if unit == "ms" else dur +# [HACK — engine/kokoro] Kokoro-82M has weak question prosody on a +# single `?`. Doubling the question mark to `??` reliably triggers a +# more interrogative rising-pitch contour without changing semantics. +# Skip if already doubled or part of an interrobang. See hacks.md. +_QUESTION_RE = re.compile(r"(?<![?!])\?(?![?!])") + + +def _emphasize_questions(text: str) -> str: + return _QUESTION_RE.sub("??", text) + + def _expand_inline(text: str, voice: str | None) -> list[Node]: """Expand inline [breath]/[pause]/[scene] tags inside a chunk of text that already has a single voice attribution. 
Voice blocks themselves are handled one level up in split_to_nodes.""" out: list[Node] = [] - text = text.strip() + text = _emphasize_questions(text.strip()) if not text: return out cursor = 0 From 303b6c73f4b6af77b6f33a32cb3656ad828a2d09 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:46:16 -0700 Subject: [PATCH 02/13] narrate: route tortoise_* voices to TORTOISE_URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine dispatch was a binary kokoro-vs-everything-else split that sent tortoise voices to F5. Now three branches: kokoro_* → KOKORO_URL (Lucy default :7794) tortoise_* → TORTOISE_URL (Lucy default :7795) * → F5_TTS_URL (Lucy default :7792) substitute_speaker_voices also runs for tortoise voices, so multi-voice [voice:slug] tags survive in the prose. Tortoise voices without a per-character mapping fall back to the narrator voice server-side (tortoise_server.py logs the fallback). --- skald/src/narrate.rs | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index cd697eb..98418c1 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -65,12 +65,16 @@ pub async fn run( let run_id = Uuid::new_v4(); let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id); - // Engine + version threaded from the voice row's source/license - // pair. lj_speech-style PD voices live behind f5-tts; kokoro_* - // voices live behind kokoro. Future: a dedicated voices.engine - // column to make this explicit. + // Engine + version threaded from the voice row's source. Three + // engines on Lucy currently: + // kokoro_* → kokoro 82M + // tortoise_* → tortoise-tts + // anything else (lj_speech etc.) → f5-tts + // Future: a dedicated voices.engine column to make this explicit. 
let (engine, engine_version) = if voice.source.starts_with("kokoro") { ("kokoro-82m", "0.9") + } else if voice.source.starts_with("tortoise") { + ("tortoise-tts", "3.0") } else { ("f5-tts", "1.1.20") }; @@ -91,14 +95,20 @@ pub async fn run( // the Kokoro server only ever sees real voice ids. Only kicks // in for kokoro-routed renders; F5 voice-tag handling isn't // implemented and any tags pass through unchanged. - // Two pre-processing passes (kokoro only). Order matters: - // 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id]. - // This must run BEFORE pronunciation overrides so we don't - // accidentally try to respell character slugs. + // Two pre-processing passes (kokoro + tortoise — engines that + // parse [voice:X] dialogue tags). Order matters: + // 1. Speaker voice substitution rewrites [voice:slug] → the + // engine's named voice id. Must run BEFORE pronunciation + // overrides so we don't try to respell character slugs. + // Tortoise: characters with no tortoise-voice mapping + // gracefully fall back to the narrator voice server-side. // 2. Pronunciation overrides word-substitute proper nouns - // (Pripyat, Dyatlov, etc.) with English-readable respellings - // so Kokoro's small phonemizer doesn't mangle them. - let gen_text = if voice.source.starts_with("kokoro") { + // (Pripyat, Dyatlov, etc.) with English-readable + // respellings. The respellings are kokoro/misaki-tuned but + // pass through tortoise's g2p_en well enough to apply. + let routes_to_engine_with_voice_tags = + voice.source.starts_with("kokoro") || voice.source.starts_with("tortoise"); + let gen_text = if routes_to_engine_with_voice_tags { let voiced = substitute_speaker_voices( &pool, chapter.story_id, @@ -373,13 +383,18 @@ async fn apply_pronunciation_overrides( Ok(out) } -/// Pick the engine base URL for a given voice.source. Voices whose -/// source starts with "kokoro" route to KOKORO_URL; everything else -/// routes to F5_TTS_URL. 
Each env var has a LAN-default for Lucy. +/// Pick the engine base URL for a given voice.source. +/// kokoro_* → KOKORO_URL +/// tortoise_* → TORTOISE_URL +/// anything else (lj_speech etc.) → F5_TTS_URL +/// Each env var has a LAN-default for Lucy. fn engine_url_for(source: &str) -> anyhow::Result<String> { if source.starts_with("kokoro") { Ok(std::env::var("KOKORO_URL") .unwrap_or_else(|_| "http://192.168.0.5:7794".into())) + } else if source.starts_with("tortoise") { + Ok(std::env::var("TORTOISE_URL") + .unwrap_or_else(|_| "http://192.168.0.5:7795".into())) } else { Ok(std::env::var("F5_TTS_URL") .unwrap_or_else(|_| "http://192.168.0.5:7792".into())) From d2442f0a87c6854bf912a070e005413a5bd83cab Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 21:35:20 -0700 Subject: [PATCH 03/13] =?UTF-8?q?forge:=20rewrite=20pass=20=E2=80=94=20re-?= =?UTF-8?q?author=20prose=20in=20an=20author's=20voice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New Forge::rewrite + PassKind::Rewrite. An author re-authors existing chapter prose entirely in their voice — sentence rhythm, word choice, paragraph shape all become theirs — while canon (names, dates, places, events, order, technical facts) is preserved exactly. Not editing; re-authoring. SystemMode::Replace, max effort. skald rewrite --chapter <uuid> [--author slug] overwrites body_md with the rewritten version. The pre-rewrite prose is stashed in the new chapters.body_md_original column on first rewrite (migration 0008, idempotent) so the original is never lost. body_md_tts is cleared — it was annotated against the old prose and must be regenerated by a fresh prepare-narration. prepare-narration gains --single-voice: skips the character speaker roster so no [voice:X] dialogue tags are inserted, only beat markers. Right for one-voice narration. Migration 0008 also extends generation_runs.kind to allow 'rewrite'. 
--- migrations/0008_chapter_rewrite.sql | 15 ++ skald-core/src/forge.rs | 59 +++++++ skald/src/main.rs | 31 +++- skald/src/narrate_prep.rs | 19 ++- skald/src/rewrite.rs | 252 ++++++++++++++++++++++++++++ 5 files changed, 371 insertions(+), 5 deletions(-) create mode 100644 migrations/0008_chapter_rewrite.sql create mode 100644 skald/src/rewrite.rs diff --git a/migrations/0008_chapter_rewrite.sql b/migrations/0008_chapter_rewrite.sql new file mode 100644 index 0000000..c6ce9b5 --- /dev/null +++ b/migrations/0008_chapter_rewrite.sql @@ -0,0 +1,15 @@ +-- The rewrite pass: an author re-authors existing chapter prose in +-- their own voice (canon preserved, prose reworked). body_md gets +-- overwritten with the rewritten version; body_md_original keeps +-- the pre-rewrite prose so the original is never lost. Populated +-- only on the FIRST rewrite of a chapter (if NULL) — subsequent +-- rewrites leave the original alone. +ALTER TABLE chapters + ADD COLUMN IF NOT EXISTS body_md_original text; + +-- Allow 'rewrite' as a generation_runs.kind. +ALTER TABLE generation_runs + DROP CONSTRAINT generation_runs_kind_check; +ALTER TABLE generation_runs + ADD CONSTRAINT generation_runs_kind_check + CHECK (kind = ANY (ARRAY['gen', 'cleanup', 'audit', 'summary', 'embed', 'narrate_prep', 'rewrite'])); diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 14f0da6..75c22b1 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -74,6 +74,10 @@ pub enum PassKind { /// prose; output should be byte-identical except for the /// tag insertions. NarratePrep, + /// Re-author existing chapter prose in an author's voice. Canon + /// (names, dates, events, places, facts) is preserved exactly; + /// the prose itself is rewritten. Not editing — re-authoring. 
+ Rewrite, } impl PassKind { @@ -84,6 +88,7 @@ impl PassKind { Self::Audit => "audit", Self::Summary => "summary", Self::NarratePrep => "narrate_prep", + Self::Rewrite => "rewrite", } } } @@ -237,6 +242,45 @@ impl Forge { Ok(PassOutput { kind: PassKind::NarratePrep, result: r, duration_ms }) } + /// Re-author existing chapter prose in the author's voice. The + /// model receives prose written by another hand and rewrites it + /// entirely in its own style — sentence rhythm, word choice, + /// paragraph shape all become the author's. Canon is preserved + /// exactly: names, dates, events, places, technical facts, and + /// the sequence of what happens do not change. + /// + /// Author REQUIRED — a rewrite without an author has no target + /// voice. SystemMode::Replace; the model BECOMES the author. + /// Max effort: re-authoring is the heaviest prose-craft task. + pub async fn rewrite( + &self, + prose: &str, + author: &AuthorWithRevision, + ) -> anyhow::Result<PassOutput> { + let scaffold = author + .revision + .system_template + .as_deref() + .unwrap_or(DEFAULT_AUTHOR_SCAFFOLD); + let system = scaffold + .replace("{{display_name}}", &author.author.display_name) + .replace("{{pass_directive}}", REWRITE_DIRECTIVE) + .replace("{{soul}}", &author.revision.soul); + let user_prompt = rewrite_user_prompt(prose); + let body = RunRequest { + prompt: user_prompt, + model: Some(self.model.clone()), + system: Some(system), + system_mode: Some(SystemMode::Replace), + effort: Some(Effort::Max), + timeout_secs: Some(1800), + ..Default::default() + }; + let r = self.client.run(body).await?; + let duration_ms = r.duration_ms; + Ok(PassOutput { kind: PassKind::Rewrite, result: r, duration_ms }) + } + /// Summarize one chapter to ~250 words. 
The summary feeds into /// the continuation context for older chapters so the token /// budget stays sane on long series (book 12 doesn't carry book 1 @@ -349,6 +393,8 @@ const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. 
You r const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary."; +const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. 
No preamble, no commentary about the rewrite."; + // ─── User-prompt builders ─────────────────────────────────────── fn gen_user_prompt( @@ -395,6 +441,19 @@ pub struct CharacterSpeaker { pub hint: Option, } +fn rewrite_user_prompt(prose: &str) -> String { + let mut out = String::with_capacity(prose.len() + 256); + out.push_str("# Chapter to re-author\n\n"); + out.push_str(prose); + out.push_str( + "\n\n# Task\n\nRe-author the chapter above entirely in your voice. \ + Preserve all canon — names, dates, places, events, the order they \ + happen, every technical fact. Change only the prose. Return only \ + the rewritten chapter, starting with its `## Chapter N` heading.\n", + ); + out +} + fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String { let mut out = String::with_capacity(prose.len() + 512); diff --git a/skald/src/main.rs b/skald/src/main.rs index b6b6d7a..503a5f0 100644 --- a/skald/src/main.rs +++ b/skald/src/main.rs @@ -9,6 +9,7 @@ mod continue_story; mod import; mod narrate; mod narrate_prep; +mod rewrite; mod serve; mod show_context; mod summarize; @@ -155,6 +156,23 @@ enum Cmd { /// errors out to avoid clobbering a hand-tuned version. #[arg(long)] overwrite: bool, + /// Single-voice mode: skip the character speaker roster so + /// no [voice:X] dialogue tags are inserted. Use when the + /// whole chapter narrates in one voice. + #[arg(long)] + single_voice: bool, + }, + /// Re-author one chapter's prose in an author's voice. Canon + /// preserved, prose reworked. Overwrites body_md (stashing the + /// original in body_md_original) and clears body_md_tts. + Rewrite { + /// Chapter UUID to re-author. + #[arg(long)] + chapter: Uuid, + /// Author slug to rewrite as. Falls back to the story's + /// bound author if omitted. 
+ #[arg(long)] + author: Option, }, } @@ -230,8 +248,19 @@ async fn run() -> anyhow::Result<()> { chapter, author, overwrite, + single_voice, } => { - narrate_prep::run(&cli.database_url, chapter, author.as_deref(), overwrite).await + narrate_prep::run( + &cli.database_url, + chapter, + author.as_deref(), + overwrite, + single_voice, + ) + .await + } + Cmd::Rewrite { chapter, author } => { + rewrite::run(&cli.database_url, chapter, author.as_deref()).await } } } diff --git a/skald/src/narrate_prep.rs b/skald/src/narrate_prep.rs index bc92228..8f4b806 100644 --- a/skald/src/narrate_prep.rs +++ b/skald/src/narrate_prep.rs @@ -24,6 +24,7 @@ pub async fn run( chapter_id: Uuid, author_slug: Option<&str>, overwrite: bool, + single_voice: bool, ) -> anyhow::Result<()> { let cfg = load_forge_config()?; tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured"); @@ -60,10 +61,20 @@ pub async fn run( .fetch_one(&pool) .await?; - let characters = load_speakers(&pool, chapter.story_id).await?; - if !characters.is_empty() { - tracing::info!(speaker_count = characters.len(), "speaker roster loaded"); - } + // Single-voice mode skips the speaker roster entirely — the + // narrate_prep pass then inserts only [breath]/[pause]/[scene] + // beats, no [voice:X] dialogue tags. Right when the whole + // chapter narrates in one voice. + let characters = if single_voice { + tracing::info!("single-voice mode — skipping speaker roster"); + Vec::new() + } else { + let c = load_speakers(&pool, chapter.story_id).await?; + if !c.is_empty() { + tracing::info!(speaker_count = c.len(), "speaker roster loaded"); + } + c + }; let started = Instant::now(); let out_res = forge diff --git a/skald/src/rewrite.rs b/skald/src/rewrite.rs new file mode 100644 index 0000000..8692826 --- /dev/null +++ b/skald/src/rewrite.rs @@ -0,0 +1,252 @@ +//! `skald rewrite` — re-author one chapter's prose in an author's +//! voice. Canon preserved, prose reworked. Overwrites chapters.body_md +//! 
with the rewritten version; the pre-rewrite prose is stashed in +//! chapters.body_md_original on the first rewrite (if NULL) so the +//! original is never lost. +//! +//! Author resolution: --author flag wins, else the chapter's +//! story.author_id. A rewrite with no author errors — there's no +//! target voice. + +use std::time::Instant; + +use anyhow::{Context, bail}; +use chrono::Utc; +use skald_core::authors::{self, AuthorWithRevision}; +use skald_core::config::ForgeConfig; +use skald_core::db; +use skald_core::forge::{Forge, PassKind, PassOutput}; +use sqlx::PgPool; +use uuid::Uuid; + +pub async fn run( + database_url: &str, + chapter_id: Uuid, + author_slug: Option<&str>, +) -> anyhow::Result<()> { + let cfg = load_forge_config()?; + tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured"); + + let pool = db::connect_and_migrate(database_url).await?; + let forge = Forge::new(&cfg)?; + + let chapter = load_chapter(&pool, chapter_id).await?; + let author = resolve_author(&pool, &chapter, author_slug) + .await? 
+ .ok_or_else(|| { + anyhow::anyhow!( + "rewrite needs an author — pass --author or bind one to the story" + ) + })?; + tracing::info!( + slug = %author.author.slug, + revision_n = author.revision.n, + chapter_n = chapter.n, + word_count_in = word_count(&chapter.body_md), + "re-authoring chapter", + ); + + let run_id: Uuid = sqlx::query_scalar( + "INSERT INTO generation_runs (story_id, kind, status) VALUES ($1, $2, 'running') RETURNING id", + ) + .bind(chapter.story_id) + .bind(PassKind::Rewrite.as_str()) + .fetch_one(&pool) + .await?; + + let started = Instant::now(); + let out_res = forge.rewrite(&chapter.body_md, &author).await; + let elapsed = started.elapsed(); + + let out: PassOutput = match out_res { + Ok(o) => o, + Err(e) => { + sqlx::query( + "UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3", + ) + .bind(format!("{e:#}")) + .bind(Utc::now()) + .bind(run_id) + .execute(&pool) + .await?; + return Err(e); + } + }; + + let rewritten = pass_text(&out)?; + let (_n, title, body) = parse_chapter(&rewritten); + + // Stash the original on first rewrite, then overwrite body_md. + // body_md_tts is cleared — it was annotated against the OLD + // prose and must be regenerated by a fresh prepare-narration. + sqlx::query( + "UPDATE chapters + SET body_md_original = COALESCE(body_md_original, body_md), + body_md = $1, + title = COALESCE($2, title), + body_md_tts = NULL, + word_count = $3, + generated_at = now() + WHERE id = $4", + ) + .bind(&body) + .bind(title.as_deref()) + .bind(word_count(&body)) + .bind(chapter_id) + .execute(&pool) + .await?; + + // Replace passages with the rewritten paragraphs. 
+ sqlx::query("DELETE FROM passages WHERE chapter_id = $1") + .bind(chapter_id) + .execute(&pool) + .await?; + for (i, para) in body.split("\n\n").enumerate() { + let p = para.trim(); + if p.is_empty() || p == "---" { + continue; + } + sqlx::query("INSERT INTO passages (chapter_id, paragraph_n, body) VALUES ($1, $2, $3)") + .bind(chapter_id) + .bind(i as i32 + 1) + .bind(p) + .execute(&pool) + .await?; + } + sqlx::query( + "UPDATE stories SET word_count_actual = (SELECT COALESCE(SUM(word_count), 0) FROM chapters WHERE story_id = $1) WHERE id = $1", + ) + .bind(chapter.story_id) + .execute(&pool) + .await?; + + sqlx::query("UPDATE generation_runs SET status='succeeded', ended_at=$1 WHERE id=$2") + .bind(Utc::now()) + .bind(run_id) + .execute(&pool) + .await?; + + println!( + "rewrote chapter {} of story {} as {} ({} → {} words) in {:.1}s", + chapter.n, + chapter.story_id, + author.author.slug, + word_count(&chapter.body_md), + word_count(&body), + elapsed.as_secs_f32(), + ); + Ok(()) +} + +#[derive(Debug, Clone)] +struct ChapterRow { + story_id: Uuid, + n: i32, + body_md: String, + story_author_id: Option<Uuid>, +} + +async fn load_chapter(pool: &PgPool, id: Uuid) -> anyhow::Result<ChapterRow> { + let row: Option<(Uuid, i32, String, Option<Uuid>)> = sqlx::query_as( + "SELECT c.story_id, c.n, c.body_md, s.author_id + FROM chapters c JOIN stories s ON s.id = c.story_id + WHERE c.id = $1", + ) + .bind(id) + .fetch_optional(pool) + .await?; + let (story_id, n, body_md, story_author_id) = + row.with_context(|| format!("chapter {id} not found"))?; + Ok(ChapterRow { + story_id, + n, + body_md, + story_author_id, + }) +} + +async fn resolve_author( + pool: &PgPool, + chapter: &ChapterRow, + flag_slug: Option<&str>, +) -> anyhow::Result<Option<AuthorWithRevision>> { + if let Some(slug) = flag_slug { + return authors::get_with_current_revision(pool, slug) + .await? 
+ .map(Some) + .with_context(|| format!("author '{slug}' not found")); + } + if let Some(aid) = chapter.story_author_id { + let row: Option<(String,)> = sqlx::query_as("SELECT slug FROM authors WHERE id = $1") + .bind(aid) + .fetch_optional(pool) + .await?; + if let Some((slug,)) = row { + return Ok(authors::get_with_current_revision(pool, &slug).await?); + } + } + Ok(None) +} + +fn pass_text(out: &PassOutput) -> anyhow::Result<String> { + let text = out + .result + .as_text() + .map(|s| s.to_string()) + .or_else(|| out.result.result.as_str().map(|s| s.to_string())) + .unwrap_or_else(|| out.result.result.to_string()); + if text.trim().is_empty() { + bail!("rewrite pass returned empty"); + } + Ok(text) +} + +/// Parse (n, title, body) out of the rewritten chapter. Tolerant of +/// a missing heading — if the first line isn't a heading we keep the +/// whole text as body and return n=0 (caller keeps the existing n). +fn parse_chapter(text: &str) -> (i32, Option<String>, String) { + let trimmed = text.trim_start(); + let first = trimmed.lines().next().unwrap_or("").trim(); + if let Some(heading) = first.strip_prefix('#') { + let heading = heading.trim_start_matches('#').trim(); + let n = heading + .to_lowercase() + .find("chapter") + .and_then(|idx| { + heading[idx + 7..] 
+ .trim_start() + .split([' ', '—', '-', ':', ',']) + .next() + .and_then(|w| w.parse::<i32>().ok()) + }) + .unwrap_or(0); + let title = heading + .split_once(" — ") + .or_else(|| heading.split_once(" - ")) + .map(|(_, t)| t.trim().to_string()) + .filter(|t| !t.is_empty()); + let body = trimmed + .lines() + .skip(1) + .collect::<Vec<_>>() + .join("\n") + .trim_start() + .to_string(); + let body = if body.is_empty() { text.trim().to_string() } else { body }; + return (n, title, body); + } + (0, None, text.trim().to_string()) +} + +fn word_count(s: &str) -> i32 { + s.split_whitespace().count() as i32 +} + +fn load_forge_config() -> anyhow::Result<ForgeConfig> { + let base_url = std::env::var("CLAWDFORGE_URL") + .context("CLAWDFORGE_URL not set")?; + let app_token = std::env::var("CLAWDFORGE_TOKEN") + .context("CLAWDFORGE_TOKEN not set")?; + let model = std::env::var("SKALD_MODEL").unwrap_or_else(|_| "opus".into()); + Ok(ForgeConfig { base_url, app_token, model }) +} From 98233182fd4b54f19edcf211b4eddc1352dd5987 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 22:32:52 -0700 Subject: [PATCH 04/13] forge: high effort for prose-craft passes, max only for audit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gen, cleanup, narrate_prep and rewrite drop from max to high effort. Audit keeps max — it is the one pass doing real reasoning (canon drift, timeline gaps, retcons) rather than prose-craft, so it is worth the frontier spend. Prose-craft is "good enough" at high. This also keeps the all-Opus skald pattern under the $200/month claude -p cap landing next month. --- skald-core/src/forge.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 75c22b1..9812459 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -7,7 +7,7 @@ //! //! 1. **gen** — produces a new chapter draft from an assembled //! 
context blob (parent prose + bible + characters + similarity- -//! matched passages, all from the database). Opus, max effort. +//! matched passages, all from the database). Opus, high effort. //! //! 2. **cleanup** — polishes the draft for prose quality, voice //! consistency, dialogue rhythm, pacing dead spots. Same Opus, @@ -43,8 +43,11 @@ use crate::config::ForgeConfig; pub struct Forge { client: Client, /// The model alias we pass to clawdforge. Skald is opinionated: - /// always opus max effort. (See `project_story_writer_container.md`.) - /// `clawdforge` resolves the alias to the actual claude CLI flag. + /// always opus. Story-writing passes (gen/cleanup/narrate_prep/ + /// rewrite) run at HIGH effort; only the audit pass runs at MAX — + /// audit genuinely needs the frontier reasoning, prose-craft does + /// not, and the $200/mo `claude -p` cap makes max-everywhere + /// unaffordable. `clawdforge` resolves the alias to the CLI flag. model: String, } @@ -136,7 +139,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -161,7 +164,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -231,9 +234,9 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - // Tag placement IS a craft choice; max effort buys - // better beat sense. Same posture as gen/cleanup. - effort: Some(Effort::Max), + // Tag placement IS a craft choice; high effort is + // plenty for beat sense. Same posture as gen/cleanup. + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -251,7 +254,8 @@ impl Forge { /// /// Author REQUIRED — a rewrite without an author has no target /// voice. 
SystemMode::Replace; the model BECOMES the author. - /// Max effort: re-authoring is the heaviest prose-craft task. + /// High effort: re-authoring is heavy prose-craft, but it's + /// still craft, not reasoning — max is reserved for audit. pub async fn rewrite( &self, prose: &str, @@ -272,7 +276,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(SystemMode::Replace), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -516,6 +520,10 @@ fn build_audit_request(model: &str, parent: &str, sequel: &str, bible: &str) -> prompt, model: Some(model.to_string()), system: Some(SYSTEM_AUDIT.to_string()), + // Audit is the one pass that keeps MAX effort — catching + // canon drift, timeline gaps and retcons is reasoning work + // worth the frontier spend; prose-craft passes run at high. + effort: Some(Effort::Max), timeout_secs: Some(600), ..Default::default() } From c8c44a5d23f83e85cf64580ce2590ed2d2b8be1b Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 15 May 2026 07:02:10 -0700 Subject: [PATCH 05/13] narrate: single-voice prep drops voice tags; GC superseded renders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: - narrate_prep in single-voice mode (empty character roster) was still handed the multi-voice directive, so the model invented [voice:] tags from character names in the prose. The narrate path neutralised them by falling back to the narrator, but it was log spam and a leak of intent. Single-voice now gets directive + house-system variants that forbid voice tags outright, and the user-prompt task line matches. - Every narrate run wrote a fresh ~80MB WAV and never reclaimed the previous one, so re-renders piled up stale files. A successful render now deletes the WAVs of prior renders of the same chapter and nulls their output_path. Render history rows are kept; only the dead file pointer is cleared. 
Best-effort — cleanup failure never fails the render. --- skald-core/src/forge.rs | 62 +++++++++++++++++++++++++++++------ skald/src/narrate.rs | 71 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 10 deletions(-) diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 9812459..916f416 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -200,10 +200,12 @@ impl Forge { /// Orson Black places beats differently than another author /// would. Replace-mode if author is set; Append otherwise. /// - /// `characters` is the story's character roster. When provided, + /// `characters` is the story's character roster. When non-empty, /// the system prompt instructs the model to wrap dialogue in - /// `[voice:]"..."[/voice]` for multi-voice rendering. The + /// `[voice:]"..."[/voice]` for multi-voice rendering; the /// slug is mapped to a Kokoro voice id by skald's narrate path. + /// An EMPTY roster selects single-voice mode — the prompt then + /// forbids `[voice:...]` tags entirely (one narrator, no cast). /// /// Hard rule the system prompt enforces: do not change a word /// of prose. Tags are additive only. @@ -213,6 +215,12 @@ impl Forge { author: Option<&AuthorWithRevision>, characters: &[CharacterSpeaker], ) -> anyhow::Result { + // An empty character roster means single-voice narration — + // the whole chapter reads in one voice. In that mode the + // prompt must NOT invite `[voice:...]` tags, or the model + // invents speaker slugs from names in the prose that the + // narrate path then has to detect and neutralize. 
+ let single_voice = characters.is_empty(); let user_prompt = narrate_prep_user_prompt(prose, characters); let (system, mode) = match author { Some(a) => { @@ -221,13 +229,25 @@ impl Forge { .system_template .as_deref() .unwrap_or(DEFAULT_AUTHOR_SCAFFOLD); + let directive = if single_voice { + NARRATE_PREP_DIRECTIVE_SINGLE + } else { + NARRATE_PREP_DIRECTIVE + }; let composed = scaffold .replace("{{display_name}}", &a.author.display_name) - .replace("{{pass_directive}}", NARRATE_PREP_DIRECTIVE) + .replace("{{pass_directive}}", directive) .replace("{{soul}}", &a.revision.soul); (composed, SystemMode::Replace) } - None => (HOUSE_NARRATE_PREP_SYSTEM.to_string(), SystemMode::Append), + None => { + let house = if single_voice { + HOUSE_NARRATE_PREP_SYSTEM_SINGLE + } else { + HOUSE_NARRATE_PREP_SYSTEM + }; + (house.to_string(), SystemMode::Append) + } }; let body = RunRequest { prompt: user_prompt, @@ -395,8 +415,20 @@ const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You c const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. 
NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices."; +/// Single-voice variant of [`NARRATE_PREP_DIRECTIVE`]. Used when the +/// chapter narrates in one voice (no speaker roster). The multi-voice +/// directive's section 2 is dropped entirely AND a hard prohibition +/// is added — without it the model invents `[voice:]` tags from +/// character names in the prose, which the narrate path then has to +/// detect and neutralize. +const NARRATE_PREP_DIRECTIVE_SINGLE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for a SINGLE-narrator audiobook reading — the whole chapter, dialogue included, is read aloud in ONE voice. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. 
Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nDo NOT add `[voice:...]` speaker tags of any kind — there is one narrator, not a cast. Apart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices."; + const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary."; +/// Single-voice variant of [`HOUSE_NARRATE_PREP_SYSTEM`] — no speaker +/// voice tags, one narrator throughout. +const HOUSE_NARRATE_PREP_SYSTEM_SINGLE: &str = "You are a senior audiobook director annotating prose for a SINGLE-narrator reading. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Do NOT add `[voice:...]` speaker tags — the whole chapter is one voice. Apart from those stumbles you do NOT change a word of the prose. 
Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary."; + const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite."; // ─── User-prompt builders ─────────────────────────────────────── @@ -487,12 +519,22 @@ fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> Str out.push_str("# Prose to annotate\n\n"); out.push_str(prose); - out.push_str( - "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ - `[scene]` markers and `[voice:]\"...\"[/voice]` dialogue wrappers \ - inserted appropriately. Do not change any word. Do not skip any \ - sentence. Return only the annotated prose.\n", - ); + if characters.is_empty() { + out.push_str( + "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ + `[scene]` beat markers inserted appropriately. Do NOT add any \ + `[voice:...]` tags — this is a single-voice reading. Do not \ + change any word. Do not skip any sentence. 
Return only the \ + annotated prose.\n", + ); + } else { + out.push_str( + "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ + `[scene]` markers and `[voice:]\"...\"[/voice]` dialogue wrappers \ + inserted appropriately. Do not change any word. Do not skip any \ + sentence. Return only the annotated prose.\n", + ); + } out } diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index 98418c1..73e102e 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -161,6 +161,12 @@ pub async fn run( .execute(&pool) .await?; + // This chapter now has a fresh canonical render. Prior render + // WAVs are dead weight — every re-render otherwise leaves its + // predecessor on disk forever. Reclaim it. Best-effort: a + // cleanup failure must never fail an otherwise-good render. + cleanup_superseded_renders(&pool, chapter_id, run_row_id).await; + println!( "narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)", chapter.n, @@ -383,6 +389,71 @@ async fn apply_pronunciation_overrides( Ok(out) } +/// Delete the WAV files of prior renders of this chapter and clear +/// their `output_path`. The newest succeeded render is the canonical +/// one; older renders are superseded the moment a new one lands, and +/// without this every re-render would leave a stale ~80MB file on +/// disk forever. +/// +/// The `narration_runs` rows themselves are KEPT — engine, voice, +/// timing and status stay as render history. Only `output_path` is +/// nulled, so no row ever points at a file that no longer exists. +/// +/// Best-effort throughout: this runs *after* the current render has +/// already been recorded as succeeded, so any failure here (a query +/// error, a permission problem on the audio dir) is logged and +/// swallowed — it must never turn a good render into a failed one. 
+async fn cleanup_superseded_renders(pool: &PgPool, chapter_id: Uuid, current_run: Uuid) { + // output_path is only ever set on the success UPDATE, so + // "output_path IS NOT NULL AND id != current" is exactly the set + // of prior completed renders. + let prior: Vec<(Uuid, String)> = match sqlx::query_as( + "SELECT id, output_path FROM narration_runs + WHERE chapter_id = $1 AND id <> $2 AND output_path IS NOT NULL", + ) + .bind(chapter_id) + .bind(current_run) + .fetch_all(pool) + .await + { + Ok(rows) => rows, + Err(e) => { + tracing::warn!(error = %e, "superseded-render cleanup: query failed, skipping"); + return; + } + }; + + for (run_id, output_path) in prior { + // output_path is the HTTP-facing path "/audio/"; the + // `/audio` bind mount means that is also the on-disk path + // inside this container. + match std::fs::remove_file(&output_path) { + Ok(()) => { + tracing::info!(run_id = %run_id, path = %output_path, "removed superseded render"); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // File already gone — still clear the dangling row. + } + Err(e) => { + // Could not delete — leave output_path intact rather + // than pointing the row at nothing. + tracing::warn!( + run_id = %run_id, path = %output_path, error = %e, + "superseded-render cleanup: could not delete file, leaving row intact", + ); + continue; + } + } + if let Err(e) = sqlx::query("UPDATE narration_runs SET output_path = NULL WHERE id = $1") + .bind(run_id) + .execute(pool) + .await + { + tracing::warn!(run_id = %run_id, error = %e, "superseded-render cleanup: could not null output_path"); + } + } +} + /// Pick the engine base URL for a given voice.source. 
/// kokoro_* → KOKORO_URL /// tortoise_* → TORTOISE_URL From 575749b7746ea34228d48969c734ff3006aa2a3c Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 15 May 2026 07:30:56 -0700 Subject: [PATCH 06/13] =?UTF-8?q?web:=20audiobook=20player=20=E2=80=94=20s?= =?UTF-8?q?titched-file=20playback=20with=20chapter=20seek?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds GET /stories/{id}/listen: one