From 7a96031aa67bc2c67d753e83f383a7ace9a9c8c6 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:42:09 -0700 Subject: [PATCH 01/14] engine/tortoise: GPU exclusivity wrapper + kludges notes Adds the Tortoise-specific tooling that main intentionally omits: - engines/tortoise/exclusive-gpu.sh wraps any command, stops F5 + Kokoro on the GPU, restarts Tortoise to clear stale CUDA contexts, waits for healthz, runs the command, restarts the engines on EXIT trap. Solves the 8GB OOM that took down the first smoke. - engines/tortoise/hacks.md captures the speed reality (~74x real-time slowdown on the 2070 Super at standard preset) and the pronunciation-overrides cross-engine compatibility note. Deploy from this branch when you want Tortoise's tuning. Main's vanilla Tortoise is for the cross-engine reference + future 'we have more VRAM now' cleanup. --- engines/tortoise/exclusive-gpu.sh | 47 ++++++++++++++++++++ engines/tortoise/hacks.md | 71 +++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100755 engines/tortoise/exclusive-gpu.sh create mode 100644 engines/tortoise/hacks.md diff --git a/engines/tortoise/exclusive-gpu.sh b/engines/tortoise/exclusive-gpu.sh new file mode 100755 index 0000000..fb2b0a4 --- /dev/null +++ b/engines/tortoise/exclusive-gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Tortoise GPU exclusivity wrapper. The 2070 Super (8GB) can't host +# F5 (~4.5GB) + Kokoro (~2.7GB) + Tortoise (~5GB peak) simultaneously, +# so we stop the other two engines for the duration of a Tortoise run +# and restart them after. +# +# Usage: +# exclusive-gpu.sh <command> [args...] +# +# Example: +# exclusive-gpu.sh docker exec skald skald narrate --chapter <chapter-uuid> +# +# Exits with the wrapped command's status. Restarts the engines +# regardless of success/failure (trap on EXIT). +set -euo pipefail + +STOP_ENGINES=(f5-tts kokoro) + +cleanup() { + local rc=$?
+ echo "[exclusive-gpu] restarting engines" + for engine in "${STOP_ENGINES[@]}"; do + docker start "$engine" >/dev/null 2>&1 || \ + echo "[exclusive-gpu] failed to restart $engine — investigate" + done + return "$rc" +} +trap cleanup EXIT + +echo "[exclusive-gpu] stopping engines: ${STOP_ENGINES[*]}" +for engine in "${STOP_ENGINES[@]}"; do + docker stop "$engine" >/dev/null 2>&1 || true +done + +# Restart Tortoise to clean up any cached GPU allocations from the +# now-stopped engines (their CUDA contexts can linger briefly). +docker restart tortoise >/dev/null +echo "[exclusive-gpu] waiting for tortoise healthz..." +for i in {1..30}; do + if curl -sf http://192.168.0.5:7795/healthz | grep -q '"loaded":true'; then + break + fi + if (( i == 30 )); then echo "[exclusive-gpu] tortoise never reported healthy — aborting" >&2; exit 1; fi; sleep 2 +done + +echo "[exclusive-gpu] running: $*" +"$@" diff --git a/engines/tortoise/hacks.md b/engines/tortoise/hacks.md new file mode 100644 index 0000000..ee335b4 --- /dev/null +++ b/engines/tortoise/hacks.md @@ -0,0 +1,71 @@ +# Tortoise engine — kludges branched off main + +This branch carries the engine-specific tweaks that don't generalise +to F5 / Kokoro. Tortoise is the audiobook-quality engine but the +trade-offs are real and need explicit handling — speed and GPU. + +## 1. GPU exclusivity + +**File:** `exclusive-gpu.sh`. + +The 2070 Super has 8GB. F5 (~4.5GB) + Kokoro (~2.7GB) + Tortoise +(~5GB peak) sums to ~12GB — over budget. First Tortoise smoke +caught it: `torch.OutOfMemoryError: ... 9.31 MiB is free`. + +Solution: stop the other two engines for the duration of a Tortoise +run. The script wraps any command, stops `f5-tts` + `kokoro`, +restarts `tortoise` to clean its CUDA context, waits for healthz, +runs the wrapped command, then restarts the engines on EXIT trap +(success or failure). + +```bash +./exclusive-gpu.sh docker exec skald skald narrate --chapter <chapter-uuid> +``` + +Remove when: GPU upgrade (P40 24GB / 3090 24GB / etc) lets all three +engines co-reside. + +## 2.
Speed — slow, batch-only + +Tortoise at `standard` preset is **~74x slower than real-time** on +the 2070 Super (smoke: 6.5s of audio took 478s wall clock). A 33-min +Chapter 2 render would take ~8 hours. Tortoise is acceptable for +overnight batched runs but NOT interactive rendering. + +Quality presets and their approx wall-clock for a 3000-word chapter: +- `ultra_fast` — ~1h, noticeable quality drop +- `fast` — ~2h +- `standard` — ~6-8h, the recommended bar +- `high_quality` — ~24h, marginally better than standard + +For most use, `standard` is right. Reserve `high_quality` for +short prologues or named samples. + +## 3. Voice mapping format + +Tortoise's voice roster (`lj`, `freeman`, `daniel`, etc.) lives +behind `source='tortoise_tts'` in the `voices` table. Character +slug → Tortoise voice mapping is independent of the Kokoro mapping +— a story can have BOTH a Kokoro and Tortoise mapping live in +parallel, picked at render time via story.preferred_voice_id or +the --voice flag. + +Tortoise voices may sometimes warble or stutter at chunk boundaries +— the `tortoise.api.TextToSpeech.tts_with_preset` call is per-chunk +and re-conditions the voice each time. Acceptable for v0.1; future +work could feed `conditioning_latents` directly for tighter cohesion. + +## 4. No respelling overrides for Tortoise (yet) + +The `pronunciation_overrides` rows in the DB are seeded with +lowercase-syllable respellings tuned for Kokoro's misaki tokenizer. +Tortoise uses a different phonemizer (`g2p_en`) which handles many +of those proper nouns better natively — but some still mangle. + +For now, narrate's substitution applies the same overrides regardless +of engine, which means Tortoise sees `prip-yat` for "Pripyat" — same +input, different phonemizer interprets differently. Usually OK but +audit after each batch. + +Future: per-engine override sets, OR an `engine` column on +pronunciation_overrides. 
From 303b6c73f4b6af77b6f33a32cb3656ad828a2d09 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:46:16 -0700 Subject: [PATCH 02/14] narrate: route tortoise_* voices to TORTOISE_URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine dispatch was a binary kokoro-vs-everything-else split that sent tortoise voices to F5. Now three branches: kokoro_* → KOKORO_URL (Lucy default :7794) tortoise_* → TORTOISE_URL (Lucy default :7795) * → F5_TTS_URL (Lucy default :7792) substitute_speaker_voices also runs for tortoise voices, so multi-voice [voice:slug] tags survive in the prose. Tortoise voices without a per-character mapping fall back to the narrator voice server-side (tortoise_server.py logs the fallback). --- skald/src/narrate.rs | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index cd697eb..98418c1 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -65,12 +65,16 @@ pub async fn run( let run_id = Uuid::new_v4(); let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id); - // Engine + version threaded from the voice row's source/license - // pair. lj_speech-style PD voices live behind f5-tts; kokoro_* - // voices live behind kokoro. Future: a dedicated voices.engine - // column to make this explicit. + // Engine + version threaded from the voice row's source. Three + // engines on Lucy currently: + // kokoro_* → kokoro 82M + // tortoise_* → tortoise-tts + // anything else (lj_speech etc.) → f5-tts + // Future: a dedicated voices.engine column to make this explicit. let (engine, engine_version) = if voice.source.starts_with("kokoro") { ("kokoro-82m", "0.9") + } else if voice.source.starts_with("tortoise") { + ("tortoise-tts", "3.0") } else { ("f5-tts", "1.1.20") }; @@ -91,14 +95,20 @@ pub async fn run( // the Kokoro server only ever sees real voice ids. 
Only kicks // in for kokoro-routed renders; F5 voice-tag handling isn't // implemented and any tags pass through unchanged. - // Two pre-processing passes (kokoro only). Order matters: - // 1. Speaker voice substitution rewrites [voice:slug] → [voice:kokoro_id]. - // This must run BEFORE pronunciation overrides so we don't - // accidentally try to respell character slugs. + // Two pre-processing passes (kokoro + tortoise — engines that + // parse [voice:X] dialogue tags). Order matters: + // 1. Speaker voice substitution rewrites [voice:slug] → the + // engine's named voice id. Must run BEFORE pronunciation + // overrides so we don't try to respell character slugs. + // Tortoise: characters with no tortoise-voice mapping + // gracefully fall back to the narrator voice server-side. // 2. Pronunciation overrides word-substitute proper nouns - // (Pripyat, Dyatlov, etc.) with English-readable respellings - // so Kokoro's small phonemizer doesn't mangle them. - let gen_text = if voice.source.starts_with("kokoro") { + // (Pripyat, Dyatlov, etc.) with English-readable + // respellings. The respellings are kokoro/misaki-tuned but + // pass through tortoise's g2p_en well enough to apply. + let routes_to_engine_with_voice_tags = + voice.source.starts_with("kokoro") || voice.source.starts_with("tortoise"); + let gen_text = if routes_to_engine_with_voice_tags { let voiced = substitute_speaker_voices( &pool, chapter.story_id, @@ -373,13 +383,18 @@ async fn apply_pronunciation_overrides( Ok(out) } -/// Pick the engine base URL for a given voice.source. Voices whose -/// source starts with "kokoro" route to KOKORO_URL; everything else -/// routes to F5_TTS_URL. Each env var has a LAN-default for Lucy. +/// Pick the engine base URL for a given voice.source. +/// kokoro_* → KOKORO_URL +/// tortoise_* → TORTOISE_URL +/// anything else (lj_speech etc.) → F5_TTS_URL +/// Each env var has a LAN-default for Lucy. 
fn engine_url_for(source: &str) -> anyhow::Result { if source.starts_with("kokoro") { Ok(std::env::var("KOKORO_URL") .unwrap_or_else(|_| "http://192.168.0.5:7794".into())) + } else if source.starts_with("tortoise") { + Ok(std::env::var("TORTOISE_URL") + .unwrap_or_else(|_| "http://192.168.0.5:7795".into())) } else { Ok(std::env::var("F5_TTS_URL") .unwrap_or_else(|_| "http://192.168.0.5:7792".into())) From 9df378f799f48db7e83635e48dbc3f6bd7cb16ef Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 19:08:43 -0700 Subject: [PATCH 03/14] engine/tortoise: sentence chunking + device fix + pitch/rate modulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catches up engines/tortoise/server.py with what's been deployed on Lucy through tonight's smoke iterations: 0.2 — _chunk_for_tortoise splits text nodes at sentence boundaries (max 220 chars) before each tts_with_preset call. Fixes the end-of-prompt gibberish past tortoise's ~20s reliable horizon. 0.3 — _get_voice now .to(DEVICE) cached samples + latents. Without this, non-lj voices crash with 'Expected all tensors to be on the same device, but found cpu and cuda:0'. 0.4 — [voice:NAME pitch=N rate=R][/voice] tag syntax. librosa pitch_shift + time_stretch applied per-chunk for single-voice multi-character renders. The strategy survived the design table — but the librosa phase-vocoder artifacts at ±5 semitones ate the quality on the 2070 Super. Parked here for the GPU rebuild; modulation works architecturally, just needs better stretching algorithm (rubberband) + more headroom. Production stayed Kokoro. Coast-Down preferred_voice_id reverted to kokoro_af_heart in the live DB after this experiment. 
--- engines/tortoise/server.py | 172 ++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 22 deletions(-) diff --git a/engines/tortoise/server.py b/engines/tortoise/server.py index c39eafe..ef602e5 100644 --- a/engines/tortoise/server.py +++ b/engines/tortoise/server.py @@ -23,6 +23,7 @@ import time import uuid from pathlib import Path +import librosa import numpy as np import soundfile as sf import torch @@ -62,12 +63,31 @@ def _get_tts() -> TextToSpeech: return _tts +def _move_to_device(obj): + """Recursively .to(DEVICE) tensors inside the structure tortoise + returns from load_voice. voice_samples is a list of tensors; + conditioning_latents is a tuple of tensors. Anything else + passes through unchanged (e.g. None, ints).""" + if obj is None: + return obj + if isinstance(obj, torch.Tensor): + return obj.to(DEVICE) + if isinstance(obj, list): + return [_move_to_device(x) for x in obj] + if isinstance(obj, tuple): + return tuple(_move_to_device(x) for x in obj) + return obj + + def _get_voice(name: str) -> tuple: """Cache voice latents to avoid re-loading reference clips on every synthesis call. 
Tortoise's load_voice returns - (voice_samples, conditioning_latents).""" + (voice_samples, conditioning_latents) — but they're created on + CPU; we move them to DEVICE so the autoregressive model (on + CUDA) doesn't fail with cpu/cuda tensor-device mismatch.""" if name not in _voice_cache: - _voice_cache[name] = load_voice(name) + samples, latents = load_voice(name) + _voice_cache[name] = (_move_to_device(samples), _move_to_device(latents)) return _voice_cache[name] @@ -75,15 +95,38 @@ def _get_voice(name: str) -> tuple: class Node: - __slots__ = ("kind", "value", "voice") + __slots__ = ("kind", "value", "voice", "pitch", "rate") - def __init__(self, kind: str, value, voice: str | None = None): + def __init__( + self, + kind: str, + value, + voice: str | None = None, + pitch: float = 0.0, + rate: float = 1.0, + ): + # kind ∈ {"text", "silence"}; value is str for text, float + # seconds for silence. voice/pitch/rate are character-voicing + # modifiers from [voice:NAME pitch=N rate=R] tags. Default: + # request voice, 0 semitones, 1x rate. self.kind = kind self.value = value self.voice = voice + self.pitch = pitch + self.rate = rate -_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]") +# Voice open tag — name + optional pitch (semitones) + optional rate: +# [voice:dyatlov] → voice swap only +# [voice:lj pitch=-3] → same voice, 3 semitones lower +# [voice:lj pitch=2 rate=1.1] → higher + slightly faster (fairy) +# [voice:lj pitch=-4 rate=0.9] → lower + slower (troll) +_VOICE_OPEN_RE = re.compile( + r"\[voice:([A-Za-z0-9_-]+)" + r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?" + r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?" 
+ r"\]" +) _VOICE_CLOSE = "[/voice]" _TAG_RE = re.compile( r"\[(pause:(?P[0-9]+(?:\.[0-9]+)?)(?Ps|ms)?|breath|scene)\]", @@ -102,7 +145,70 @@ def _parse_tag(match: re.Match) -> float: return dur / 1000.0 if unit == "ms" else dur -def _expand_inline(text: str, voice: str | None) -> list[Node]: +# Tortoise's autoregressive head loses coherence past ~20s of generated +# audio per inference call. lj's pace is roughly 14 chars/s, so anything +# past ~280 chars per call risks gibberish at the end. We split inside +# _expand_inline at sentence boundaries to keep each tts_with_preset +# call inside the model's reliable horizon. +TORTOISE_MAX_CHUNK_CHARS = 220 + +# Sentence boundary regex — splits on `.`/`?`/`!` followed by whitespace +# and a capital letter (keeps "Mr. Smith" / "U.S." together) OR at any +# newline. +_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*") + + +def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]: + """Split text into chunks <= max_chars at sentence boundaries. + If a single sentence exceeds max_chars (rare for prose), fall + back to splitting that sentence at commas or just hard-cutting. + """ + sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()] + chunks: list[str] = [] + current = "" + for sent in sentences: + # Long sentence: emit alone, but try sub-splitting at commas. + if len(sent) > max_chars: + if current: + chunks.append(current.strip()) + current = "" + # Split on commas + parts = [p.strip() for p in sent.split(",") if p.strip()] + sub = "" + for p in parts: + add = (sub + ", " if sub else "") + p + if len(add) <= max_chars: + sub = add + else: + if sub: + chunks.append(sub) + # If even the part alone exceeds, hard-cut at max_chars + while len(p) > max_chars: + chunks.append(p[:max_chars]) + p = p[max_chars:] + sub = p + if sub: + chunks.append(sub) + continue + # Sentence fits — accumulate. 
+ candidate = (current + " " if current else "") + sent + if len(candidate) <= max_chars: + current = candidate + else: + if current: + chunks.append(current.strip()) + current = sent + if current: + chunks.append(current.strip()) + return chunks + + +def _expand_inline( + text: str, + voice: str | None, + pitch: float = 0.0, + rate: float = 1.0, +) -> list[Node]: out: list[Node] = [] text = text.strip() if not text: @@ -111,12 +217,12 @@ def _expand_inline(text: str, voice: str | None) -> list[Node]: for m in _TAG_RE.finditer(text): pre = text[cursor : m.start()].strip() if pre: - out.append(Node("text", pre, voice)) + out.append(Node("text", pre, voice, pitch, rate)) out.append(Node("silence", _parse_tag(m))) cursor = m.end() tail = text[cursor:].strip() if tail: - out.append(Node("text", tail, voice)) + out.append(Node("text", tail, voice, pitch, rate)) return out @@ -130,12 +236,14 @@ def _split_paragraph_voices(para: str) -> list[Node]: break out.extend(_expand_inline(para[cursor : m.start()], None)) voice = m.group(1) + pitch = float(m.group(2)) if m.group(2) else 0.0 + rate = float(m.group(3)) if m.group(3) else 1.0 body_start = m.end() close_idx = para.find(_VOICE_CLOSE, body_start) if close_idx < 0: - out.extend(_expand_inline(para[body_start:], voice)) + out.extend(_expand_inline(para[body_start:], voice, pitch, rate)) break - out.extend(_expand_inline(para[body_start:close_idx], voice)) + out.extend(_expand_inline(para[body_start:close_idx], voice, pitch, rate)) cursor = close_idx + len(_VOICE_CLOSE) return out @@ -253,6 +361,7 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: started = time.monotonic() pieces: list[np.ndarray] = [] voices_used: set[str] = set() + tortoise_chunks_rendered = 0 for node in nodes: if node.kind == "silence": pieces.append(_silence_samples(node.value)) @@ -264,18 +373,37 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: except Exception as e: log.warning("voice %s failed to load (%s); falling 
back to default", seg_voice, e) samples, latents = _get_voice(voice) - # Tortoise's tts_with_preset returns a torch.Tensor on the - # configured device. - audio_tensor = tts.tts_with_preset( - text=node.value, - voice_samples=samples, - conditioning_latents=latents, - preset=preset, - ) - if isinstance(audio_tensor, list): - audio_tensor = audio_tensor[0] - arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) - pieces.append(arr) + # Each text node may exceed Tortoise's reliable ~20s horizon — + # split at sentence boundaries before feeding the model. + sub_chunks = _chunk_for_tortoise(node.value) + for sub_idx, sub in enumerate(sub_chunks): + audio_tensor = tts.tts_with_preset( + text=sub, + voice_samples=samples, + conditioning_latents=latents, + preset=preset, + ) + if isinstance(audio_tensor, list): + audio_tensor = audio_tensor[0] + arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) + # Per-character voice modulation via librosa. Apply + # pitch first (preserves duration), then rate (preserves + # pitch). Default pitch=0, rate=1.0 = no-op fast path. 
+ if abs(node.pitch) > 1e-3: + arr = librosa.effects.pitch_shift( + arr, sr=SAMPLE_RATE, n_steps=node.pitch + ) + if abs(node.rate - 1.0) > 1e-3: + arr = librosa.effects.time_stretch(arr, rate=node.rate) + arr = arr.astype(np.float32) + pieces.append(arr) + tortoise_chunks_rendered += 1 + log.info( + "chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)", + sub_idx + 1, len(sub_chunks), len(sub), + node.pitch, node.rate, + sum(len(p) for p in pieces) / SAMPLE_RATE, + ) elapsed_ms = int((time.monotonic() - started) * 1000) if not pieces: From d2442f0a87c6854bf912a070e005413a5bd83cab Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 21:35:20 -0700 Subject: [PATCH 04/14] =?UTF-8?q?forge:=20rewrite=20pass=20=E2=80=94=20re-?= =?UTF-8?q?author=20prose=20in=20an=20author's=20voice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New Forge::rewrite + PassKind::Rewrite. An author re-authors existing chapter prose entirely in their voice — sentence rhythm, word choice, paragraph shape all become theirs — while canon (names, dates, places, events, order, technical facts) is preserved exactly. Not editing; re-authoring. SystemMode::Replace, max effort. skald rewrite --chapter <uuid> [--author slug] overwrites body_md with the rewritten version. The pre-rewrite prose is stashed in the new chapters.body_md_original column on first rewrite (migration 0008, idempotent) so the original is never lost. body_md_tts is cleared — it was annotated against the old prose and must be regenerated by a fresh prepare-narration. prepare-narration gains --single-voice: skips the character speaker roster so no [voice:X] dialogue tags are inserted, only beat markers. Right for one-voice narration. Migration 0008 also extends generation_runs.kind to allow 'rewrite'.
--- migrations/0008_chapter_rewrite.sql | 15 ++ skald-core/src/forge.rs | 59 +++++++ skald/src/main.rs | 31 +++- skald/src/narrate_prep.rs | 19 ++- skald/src/rewrite.rs | 252 ++++++++++++++++++++++++++++ 5 files changed, 371 insertions(+), 5 deletions(-) create mode 100644 migrations/0008_chapter_rewrite.sql create mode 100644 skald/src/rewrite.rs diff --git a/migrations/0008_chapter_rewrite.sql b/migrations/0008_chapter_rewrite.sql new file mode 100644 index 0000000..c6ce9b5 --- /dev/null +++ b/migrations/0008_chapter_rewrite.sql @@ -0,0 +1,15 @@ +-- The rewrite pass: an author re-authors existing chapter prose in +-- their own voice (canon preserved, prose reworked). body_md gets +-- overwritten with the rewritten version; body_md_original keeps +-- the pre-rewrite prose so the original is never lost. Populated +-- only on the FIRST rewrite of a chapter (if NULL) — subsequent +-- rewrites leave the original alone. +ALTER TABLE chapters + ADD COLUMN IF NOT EXISTS body_md_original text; + +-- Allow 'rewrite' as a generation_runs.kind. +ALTER TABLE generation_runs + DROP CONSTRAINT generation_runs_kind_check; +ALTER TABLE generation_runs + ADD CONSTRAINT generation_runs_kind_check + CHECK (kind = ANY (ARRAY['gen', 'cleanup', 'audit', 'summary', 'embed', 'narrate_prep', 'rewrite'])); diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 14f0da6..75c22b1 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -74,6 +74,10 @@ pub enum PassKind { /// prose; output should be byte-identical except for the /// tag insertions. NarratePrep, + /// Re-author existing chapter prose in an author's voice. Canon + /// (names, dates, events, places, facts) is preserved exactly; + /// the prose itself is rewritten. Not editing — re-authoring. 
+ Rewrite, } impl PassKind { @@ -84,6 +88,7 @@ impl PassKind { Self::Audit => "audit", Self::Summary => "summary", Self::NarratePrep => "narrate_prep", + Self::Rewrite => "rewrite", } } } @@ -237,6 +242,45 @@ impl Forge { Ok(PassOutput { kind: PassKind::NarratePrep, result: r, duration_ms }) } + /// Re-author existing chapter prose in the author's voice. The + /// model receives prose written by another hand and rewrites it + /// entirely in its own style — sentence rhythm, word choice, + /// paragraph shape all become the author's. Canon is preserved + /// exactly: names, dates, events, places, technical facts, and + /// the sequence of what happens do not change. + /// + /// Author REQUIRED — a rewrite without an author has no target + /// voice. SystemMode::Replace; the model BECOMES the author. + /// Max effort: re-authoring is the heaviest prose-craft task. + pub async fn rewrite( + &self, + prose: &str, + author: &AuthorWithRevision, + ) -> anyhow::Result<PassOutput> { + let scaffold = author + .revision + .system_template + .as_deref() + .unwrap_or(DEFAULT_AUTHOR_SCAFFOLD); + let system = scaffold + .replace("{{display_name}}", &author.author.display_name) + .replace("{{pass_directive}}", REWRITE_DIRECTIVE) + .replace("{{soul}}", &author.revision.soul); + let user_prompt = rewrite_user_prompt(prose); + let body = RunRequest { + prompt: user_prompt, + model: Some(self.model.clone()), + system: Some(system), + system_mode: Some(SystemMode::Replace), + effort: Some(Effort::Max), + timeout_secs: Some(1800), + ..Default::default() + }; + let r = self.client.run(body).await?; + let duration_ms = r.duration_ms; + Ok(PassOutput { kind: PassKind::Rewrite, result: r, duration_ms }) + } + /// Summarize one chapter to ~250 words. The summary feeds into /// the continuation context for older chapters so the token /// budget stays sane on long series (book 12 doesn't carry book 1 @@ -349,6 +393,8 @@ const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass.
You r const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary."; +const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. 
No preamble, no commentary about the rewrite."; + // ─── User-prompt builders ─────────────────────────────────────── fn gen_user_prompt( @@ -395,6 +441,19 @@ pub struct CharacterSpeaker { pub hint: Option, } +fn rewrite_user_prompt(prose: &str) -> String { + let mut out = String::with_capacity(prose.len() + 256); + out.push_str("# Chapter to re-author\n\n"); + out.push_str(prose); + out.push_str( + "\n\n# Task\n\nRe-author the chapter above entirely in your voice. \ + Preserve all canon — names, dates, places, events, the order they \ + happen, every technical fact. Change only the prose. Return only \ + the rewritten chapter, starting with its `## Chapter N` heading.\n", + ); + out +} + fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> String { let mut out = String::with_capacity(prose.len() + 512); diff --git a/skald/src/main.rs b/skald/src/main.rs index b6b6d7a..503a5f0 100644 --- a/skald/src/main.rs +++ b/skald/src/main.rs @@ -9,6 +9,7 @@ mod continue_story; mod import; mod narrate; mod narrate_prep; +mod rewrite; mod serve; mod show_context; mod summarize; @@ -155,6 +156,23 @@ enum Cmd { /// errors out to avoid clobbering a hand-tuned version. #[arg(long)] overwrite: bool, + /// Single-voice mode: skip the character speaker roster so + /// no [voice:X] dialogue tags are inserted. Use when the + /// whole chapter narrates in one voice. + #[arg(long)] + single_voice: bool, + }, + /// Re-author one chapter's prose in an author's voice. Canon + /// preserved, prose reworked. Overwrites body_md (stashing the + /// original in body_md_original) and clears body_md_tts. + Rewrite { + /// Chapter UUID to re-author. + #[arg(long)] + chapter: Uuid, + /// Author slug to rewrite as. Falls back to the story's + /// bound author if omitted. 
+ #[arg(long)] + author: Option<String>, }, } @@ -230,8 +248,19 @@ async fn run() -> anyhow::Result<()> { chapter, author, overwrite, + single_voice, } => { - narrate_prep::run(&cli.database_url, chapter, author.as_deref(), overwrite).await + narrate_prep::run( + &cli.database_url, + chapter, + author.as_deref(), + overwrite, + single_voice, + ) + .await + } + Cmd::Rewrite { chapter, author } => { + rewrite::run(&cli.database_url, chapter, author.as_deref()).await } } } diff --git a/skald/src/narrate_prep.rs b/skald/src/narrate_prep.rs index bc92228..8f4b806 100644 --- a/skald/src/narrate_prep.rs +++ b/skald/src/narrate_prep.rs @@ -24,6 +24,7 @@ pub async fn run( chapter_id: Uuid, author_slug: Option<&str>, overwrite: bool, + single_voice: bool, ) -> anyhow::Result<()> { let cfg = load_forge_config()?; tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured"); @@ -60,10 +61,20 @@ pub async fn run( .fetch_one(&pool) .await?; - let characters = load_speakers(&pool, chapter.story_id).await?; - if !characters.is_empty() { - tracing::info!(speaker_count = characters.len(), "speaker roster loaded"); - } + // Single-voice mode skips the speaker roster entirely — the + // narrate_prep pass then inserts only [breath]/[pause]/[scene] + // beats, no [voice:X] dialogue tags. Right when the whole + // chapter narrates in one voice. + let characters = if single_voice { + tracing::info!("single-voice mode — skipping speaker roster"); + Vec::new() + } else { + let c = load_speakers(&pool, chapter.story_id).await?; + if !c.is_empty() { + tracing::info!(speaker_count = c.len(), "speaker roster loaded"); + } + c + }; let started = Instant::now(); let out_res = forge diff --git a/skald/src/rewrite.rs b/skald/src/rewrite.rs new file mode 100644 index 0000000..8692826 --- /dev/null +++ b/skald/src/rewrite.rs @@ -0,0 +1,252 @@ +//! `skald rewrite` — re-author one chapter's prose in an author's +//! voice. Canon preserved, prose reworked. Overwrites chapters.body_md +//!
with the rewritten version; the pre-rewrite prose is stashed in +//! chapters.body_md_original on the first rewrite (if NULL) so the +//! original is never lost. +//! +//! Author resolution: --author flag wins, else the chapter's +//! story.author_id. A rewrite with no author errors — there's no +//! target voice. + +use std::time::Instant; + +use anyhow::{Context, bail}; +use chrono::Utc; +use skald_core::authors::{self, AuthorWithRevision}; +use skald_core::config::ForgeConfig; +use skald_core::db; +use skald_core::forge::{Forge, PassKind, PassOutput}; +use sqlx::PgPool; +use uuid::Uuid; + +pub async fn run( + database_url: &str, + chapter_id: Uuid, + author_slug: Option<&str>, +) -> anyhow::Result<()> { + let cfg = load_forge_config()?; + tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured"); + + let pool = db::connect_and_migrate(database_url).await?; + let forge = Forge::new(&cfg)?; + + let chapter = load_chapter(&pool, chapter_id).await?; + let author = resolve_author(&pool, &chapter, author_slug) + .await? 
+ .ok_or_else(|| { + anyhow::anyhow!( + "rewrite needs an author — pass --author or bind one to the story" + ) + })?; + tracing::info!( + slug = %author.author.slug, + revision_n = author.revision.n, + chapter_n = chapter.n, + word_count_in = word_count(&chapter.body_md), + "re-authoring chapter", + ); + + let run_id: Uuid = sqlx::query_scalar( + "INSERT INTO generation_runs (story_id, kind, status) VALUES ($1, $2, 'running') RETURNING id", + ) + .bind(chapter.story_id) + .bind(PassKind::Rewrite.as_str()) + .fetch_one(&pool) + .await?; + + let started = Instant::now(); + let out_res = forge.rewrite(&chapter.body_md, &author).await; + let elapsed = started.elapsed(); + + let out: PassOutput = match out_res { + Ok(o) => o, + Err(e) => { + sqlx::query( + "UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3", + ) + .bind(format!("{e:#}")) + .bind(Utc::now()) + .bind(run_id) + .execute(&pool) + .await?; + return Err(e); + } + }; + + let rewritten = pass_text(&out)?; + let (_n, title, body) = parse_chapter(&rewritten); + + // Stash the original on first rewrite, then overwrite body_md. + // body_md_tts is cleared — it was annotated against the OLD + // prose and must be regenerated by a fresh prepare-narration. + sqlx::query( + "UPDATE chapters + SET body_md_original = COALESCE(body_md_original, body_md), + body_md = $1, + title = COALESCE($2, title), + body_md_tts = NULL, + word_count = $3, + generated_at = now() + WHERE id = $4", + ) + .bind(&body) + .bind(title.as_deref()) + .bind(word_count(&body)) + .bind(chapter_id) + .execute(&pool) + .await?; + + // Replace passages with the rewritten paragraphs. 
+ sqlx::query("DELETE FROM passages WHERE chapter_id = $1") + .bind(chapter_id) + .execute(&pool) + .await?; + for (i, para) in body.split("\n\n").enumerate() { + let p = para.trim(); + if p.is_empty() || p == "---" { + continue; + } + sqlx::query("INSERT INTO passages (chapter_id, paragraph_n, body) VALUES ($1, $2, $3)") + .bind(chapter_id) + .bind(i as i32 + 1) + .bind(p) + .execute(&pool) + .await?; + } + sqlx::query( + "UPDATE stories SET word_count_actual = (SELECT COALESCE(SUM(word_count), 0) FROM chapters WHERE story_id = $1) WHERE id = $1", + ) + .bind(chapter.story_id) + .execute(&pool) + .await?; + + sqlx::query("UPDATE generation_runs SET status='succeeded', ended_at=$1 WHERE id=$2") + .bind(Utc::now()) + .bind(run_id) + .execute(&pool) + .await?; + + println!( + "rewrote chapter {} of story {} as {} ({} → {} words) in {:.1}s", + chapter.n, + chapter.story_id, + author.author.slug, + word_count(&chapter.body_md), + word_count(&body), + elapsed.as_secs_f32(), + ); + Ok(()) +} + +#[derive(Debug, Clone)] +struct ChapterRow { + story_id: Uuid, + n: i32, + body_md: String, + story_author_id: Option, +} + +async fn load_chapter(pool: &PgPool, id: Uuid) -> anyhow::Result { + let row: Option<(Uuid, i32, String, Option)> = sqlx::query_as( + "SELECT c.story_id, c.n, c.body_md, s.author_id + FROM chapters c JOIN stories s ON s.id = c.story_id + WHERE c.id = $1", + ) + .bind(id) + .fetch_optional(pool) + .await?; + let (story_id, n, body_md, story_author_id) = + row.with_context(|| format!("chapter {id} not found"))?; + Ok(ChapterRow { + story_id, + n, + body_md, + story_author_id, + }) +} + +async fn resolve_author( + pool: &PgPool, + chapter: &ChapterRow, + flag_slug: Option<&str>, +) -> anyhow::Result> { + if let Some(slug) = flag_slug { + return authors::get_with_current_revision(pool, slug) + .await? 
+ .map(Some) + .with_context(|| format!("author '{slug}' not found")); + } + if let Some(aid) = chapter.story_author_id { + let row: Option<(String,)> = sqlx::query_as("SELECT slug FROM authors WHERE id = $1") + .bind(aid) + .fetch_optional(pool) + .await?; + if let Some((slug,)) = row { + return Ok(authors::get_with_current_revision(pool, &slug).await?); + } + } + Ok(None) +} + +fn pass_text(out: &PassOutput) -> anyhow::Result { + let text = out + .result + .as_text() + .map(|s| s.to_string()) + .or_else(|| out.result.result.as_str().map(|s| s.to_string())) + .unwrap_or_else(|| out.result.result.to_string()); + if text.trim().is_empty() { + bail!("rewrite pass returned empty"); + } + Ok(text) +} + +/// Parse (n, title, body) out of the rewritten chapter. Tolerant of +/// a missing heading — if the first line isn't a heading we keep the +/// whole text as body and return n=0 (caller keeps the existing n). +fn parse_chapter(text: &str) -> (i32, Option, String) { + let trimmed = text.trim_start(); + let first = trimmed.lines().next().unwrap_or("").trim(); + if let Some(heading) = first.strip_prefix('#') { + let heading = heading.trim_start_matches('#').trim(); + let n = heading + .to_lowercase() + .find("chapter") + .and_then(|idx| { + heading[idx + 7..] 
+ .trim_start() + .split([' ', '—', '-', ':', ',']) + .next() + .and_then(|w| w.parse::().ok()) + }) + .unwrap_or(0); + let title = heading + .split_once(" — ") + .or_else(|| heading.split_once(" - ")) + .map(|(_, t)| t.trim().to_string()) + .filter(|t| !t.is_empty()); + let body = trimmed + .lines() + .skip(1) + .collect::>() + .join("\n") + .trim_start() + .to_string(); + let body = if body.is_empty() { text.trim().to_string() } else { body }; + return (n, title, body); + } + (0, None, text.trim().to_string()) +} + +fn word_count(s: &str) -> i32 { + s.split_whitespace().count() as i32 +} + +fn load_forge_config() -> anyhow::Result { + let base_url = std::env::var("CLAWDFORGE_URL") + .context("CLAWDFORGE_URL not set")?; + let app_token = std::env::var("CLAWDFORGE_TOKEN") + .context("CLAWDFORGE_TOKEN not set")?; + let model = std::env::var("SKALD_MODEL").unwrap_or_else(|_| "opus".into()); + Ok(ForgeConfig { base_url, app_token, model }) +} From 98233182fd4b54f19edcf211b4eddc1352dd5987 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 22:32:52 -0700 Subject: [PATCH 05/14] forge: high effort for prose-craft passes, max only for audit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gen, cleanup, narrate_prep and rewrite drop from max to high effort. Audit keeps max — it is the one pass doing real reasoning (canon drift, timeline gaps, retcons) rather than prose-craft, so it is worth the frontier spend. Prose-craft is "good enough" at high. This also keeps the all-Opus skald pattern under the $200/month claude -p cap landing next month. --- skald-core/src/forge.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 75c22b1..9812459 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -7,7 +7,7 @@ //! //! 1. **gen** — produces a new chapter draft from an assembled //! 
context blob (parent prose + bible + characters + similarity- -//! matched passages, all from the database). Opus, max effort. +//! matched passages, all from the database). Opus, high effort. //! //! 2. **cleanup** — polishes the draft for prose quality, voice //! consistency, dialogue rhythm, pacing dead spots. Same Opus, @@ -43,8 +43,11 @@ use crate::config::ForgeConfig; pub struct Forge { client: Client, /// The model alias we pass to clawdforge. Skald is opinionated: - /// always opus max effort. (See `project_story_writer_container.md`.) - /// `clawdforge` resolves the alias to the actual claude CLI flag. + /// always opus. Story-writing passes (gen/cleanup/narrate_prep/ + /// rewrite) run at HIGH effort; only the audit pass runs at MAX — + /// audit genuinely needs the frontier reasoning, prose-craft does + /// not, and the $200/mo `claude -p` cap makes max-everywhere + /// unaffordable. `clawdforge` resolves the alias to the CLI flag. model: String, } @@ -136,7 +139,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -161,7 +164,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -231,9 +234,9 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(mode), - // Tag placement IS a craft choice; max effort buys - // better beat sense. Same posture as gen/cleanup. - effort: Some(Effort::Max), + // Tag placement IS a craft choice; high effort is + // plenty for beat sense. Same posture as gen/cleanup. + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -251,7 +254,8 @@ impl Forge { /// /// Author REQUIRED — a rewrite without an author has no target /// voice. 
SystemMode::Replace; the model BECOMES the author. - /// Max effort: re-authoring is the heaviest prose-craft task. + /// High effort: re-authoring is heavy prose-craft, but it's + /// still craft, not reasoning — max is reserved for audit. pub async fn rewrite( &self, prose: &str, @@ -272,7 +276,7 @@ impl Forge { model: Some(self.model.clone()), system: Some(system), system_mode: Some(SystemMode::Replace), - effort: Some(Effort::Max), + effort: Some(Effort::High), timeout_secs: Some(1800), ..Default::default() }; @@ -516,6 +520,10 @@ fn build_audit_request(model: &str, parent: &str, sequel: &str, bible: &str) -> prompt, model: Some(model.to_string()), system: Some(SYSTEM_AUDIT.to_string()), + // Audit is the one pass that keeps MAX effort — catching + // canon drift, timeline gaps and retcons is reasoning work + // worth the frontier spend; prose-craft passes run at high. + effort: Some(Effort::Max), timeout_secs: Some(600), ..Default::default() } From c8c44a5d23f83e85cf64580ce2590ed2d2b8be1b Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 15 May 2026 07:02:10 -0700 Subject: [PATCH 06/14] narrate: single-voice prep drops voice tags; GC superseded renders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes: - narrate_prep in single-voice mode (empty character roster) was still handed the multi-voice directive, so the model invented [voice:] tags from character names in the prose. The narrate path neutralised them by falling back to the narrator, but it was log spam and a leak of intent. Single-voice now gets directive + house-system variants that forbid voice tags outright, and the user-prompt task line matches. - Every narrate run wrote a fresh ~80MB WAV and never reclaimed the previous one, so re-renders piled up stale files. A successful render now deletes the WAVs of prior renders of the same chapter and nulls their output_path. Render history rows are kept; only the dead file pointer is cleared. 
Best-effort — cleanup failure never fails the render. --- skald-core/src/forge.rs | 62 +++++++++++++++++++++++++++++------ skald/src/narrate.rs | 71 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 10 deletions(-) diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs index 9812459..916f416 100644 --- a/skald-core/src/forge.rs +++ b/skald-core/src/forge.rs @@ -200,10 +200,12 @@ impl Forge { /// Orson Black places beats differently than another author /// would. Replace-mode if author is set; Append otherwise. /// - /// `characters` is the story's character roster. When provided, + /// `characters` is the story's character roster. When non-empty, /// the system prompt instructs the model to wrap dialogue in - /// `[voice:]"..."[/voice]` for multi-voice rendering. The + /// `[voice:]"..."[/voice]` for multi-voice rendering; the /// slug is mapped to a Kokoro voice id by skald's narrate path. + /// An EMPTY roster selects single-voice mode — the prompt then + /// forbids `[voice:...]` tags entirely (one narrator, no cast). /// /// Hard rule the system prompt enforces: do not change a word /// of prose. Tags are additive only. @@ -213,6 +215,12 @@ impl Forge { author: Option<&AuthorWithRevision>, characters: &[CharacterSpeaker], ) -> anyhow::Result { + // An empty character roster means single-voice narration — + // the whole chapter reads in one voice. In that mode the + // prompt must NOT invite `[voice:...]` tags, or the model + // invents speaker slugs from names in the prose that the + // narrate path then has to detect and neutralize. 
+ let single_voice = characters.is_empty(); let user_prompt = narrate_prep_user_prompt(prose, characters); let (system, mode) = match author { Some(a) => { @@ -221,13 +229,25 @@ impl Forge { .system_template .as_deref() .unwrap_or(DEFAULT_AUTHOR_SCAFFOLD); + let directive = if single_voice { + NARRATE_PREP_DIRECTIVE_SINGLE + } else { + NARRATE_PREP_DIRECTIVE + }; let composed = scaffold .replace("{{display_name}}", &a.author.display_name) - .replace("{{pass_directive}}", NARRATE_PREP_DIRECTIVE) + .replace("{{pass_directive}}", directive) .replace("{{soul}}", &a.revision.soul); (composed, SystemMode::Replace) } - None => (HOUSE_NARRATE_PREP_SYSTEM.to_string(), SystemMode::Append), + None => { + let house = if single_voice { + HOUSE_NARRATE_PREP_SYSTEM_SINGLE + } else { + HOUSE_NARRATE_PREP_SYSTEM + }; + (house.to_string(), SystemMode::Append) + } }; let body = RunRequest { prompt: user_prompt, @@ -395,8 +415,20 @@ const SYSTEM_AUDIT: &str = "You are a canon auditor for long-form fiction. You c const NARRATE_PREP_DIRECTIVE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for an audiobook reading. Three kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. SPEAKER VOICE TAGS (multi-voice dialogue): wrap dialogue lines in `[voice:]\"...\"[/voice]` based on who is speaking. The roster of available speaker slugs is given in the user prompt. The dialogue itself stays verbatim — only the wrapper is added. If a line of dialogue is not clearly attributable to a roster speaker, leave it unwrapped (the narrator voice will read it). Quoted thoughts (italicized interior monologue) stay unwrapped — only spoken aloud dialogue gets a voice tag.\n\n3. 
NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nApart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers, voice tags, and stumbles inline. No preamble. No commentary about your choices."; +/// Single-voice variant of [`NARRATE_PREP_DIRECTIVE`]. Used when the +/// chapter narrates in one voice (no speaker roster). The multi-voice +/// directive's section 2 is dropped entirely AND a hard prohibition +/// is added — without it the model invents `[voice:]` tags from +/// character names in the prose, which the narrate path then has to +/// detect and neutralize. +const NARRATE_PREP_DIRECTIVE_SINGLE: &str = "This is a NARRATION-ANNOTATION pass. You receive your own prose and prepare it for a SINGLE-narrator audiobook reading — the whole chapter, dialogue included, is read aloud in ONE voice. Two kinds of inserts are allowed:\n\n1. BEAT MARKERS (additive, not prose): `[breath]` (~400ms), `[pause:1.2s]` (explicit silence in seconds, e.g. 0.5s, 1.2s, 2s), `[scene]` (~1500ms scene break). Place where the prose's rhythm asks for them — after a hard one-line beat, before a turn in dialogue, on a paragraph that lands with weight.\n\n2. NARRATOR STUMBLES (humanizing prose-level inserts): a real narrator occasionally stumbles on a hard word, catches themselves, repeats. You may add these *sparingly* where the prose's pacing makes them feel right. 
Patterns: em-dash repetition (`Prip— Pripyat`), self-correction (`she — no, the wife — had been told`), hesitation (`the dose, the dose was`). USE SPARINGLY. Maybe 1-3 per chapter. Pick proper nouns, technical terms, or moments where the narrator might genuinely catch herself. Avoid stumbling on emotional climaxes — those should land clean.\n\nDo NOT add `[voice:...]` speaker tags of any kind — there is one narrator, not a cast. Apart from stumbles, do NOT change a word of the original prose. Return the prose with beat markers and stumbles inline. No preamble. No commentary about your choices."; + const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director annotating prose for narration. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, (b) speaker voice tags `[voice:]\"...\"[/voice]` wrapping dialogue based on who is speaking (roster supplied in user prompt; leave unattributed dialogue unwrapped), and (c) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers, voice tags, and (rare) stumbles inline. No preamble, no commentary."; +/// Single-voice variant of [`HOUSE_NARRATE_PREP_SYSTEM`] — no speaker +/// voice tags, one narrator throughout. +const HOUSE_NARRATE_PREP_SYSTEM_SINGLE: &str = "You are a senior audiobook director annotating prose for a SINGLE-narrator reading. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Do NOT add `[voice:...]` speaker tags — the whole chapter is one voice. Apart from those stumbles you do NOT change a word of the prose. 
Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary."; + const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite."; // ─── User-prompt builders ─────────────────────────────────────── @@ -487,12 +519,22 @@ fn narrate_prep_user_prompt(prose: &str, characters: &[CharacterSpeaker]) -> Str out.push_str("# Prose to annotate\n\n"); out.push_str(prose); - out.push_str( - "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ - `[scene]` markers and `[voice:]\"...\"[/voice]` dialogue wrappers \ - inserted appropriately. Do not change any word. Do not skip any \ - sentence. Return only the annotated prose.\n", - ); + if characters.is_empty() { + out.push_str( + "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ + `[scene]` beat markers inserted appropriately. Do NOT add any \ + `[voice:...]` tags — this is a single-voice reading. Do not \ + change any word. Do not skip any sentence. 
Return only the \ + annotated prose.\n", + ); + } else { + out.push_str( + "\n\n# Task\n\nReturn the prose above with `[breath]`, `[pause:Xs]`, \ + `[scene]` markers and `[voice:]\"...\"[/voice]` dialogue wrappers \ + inserted appropriately. Do not change any word. Do not skip any \ + sentence. Return only the annotated prose.\n", + ); + } out } diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs index 98418c1..73e102e 100644 --- a/skald/src/narrate.rs +++ b/skald/src/narrate.rs @@ -161,6 +161,12 @@ pub async fn run( .execute(&pool) .await?; + // This chapter now has a fresh canonical render. Prior render + // WAVs are dead weight — every re-render otherwise leaves its + // predecessor on disk forever. Reclaim it. Best-effort: a + // cleanup failure must never fail an otherwise-good render. + cleanup_superseded_renders(&pool, chapter_id, run_row_id).await; + println!( "narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)", chapter.n, @@ -383,6 +389,71 @@ async fn apply_pronunciation_overrides( Ok(out) } +/// Delete the WAV files of prior renders of this chapter and clear +/// their `output_path`. The newest succeeded render is the canonical +/// one; older renders are superseded the moment a new one lands, and +/// without this every re-render would leave a stale ~80MB file on +/// disk forever. +/// +/// The `narration_runs` rows themselves are KEPT — engine, voice, +/// timing and status stay as render history. Only `output_path` is +/// nulled, so no row ever points at a file that no longer exists. +/// +/// Best-effort throughout: this runs *after* the current render has +/// already been recorded as succeeded, so any failure here (a query +/// error, a permission problem on the audio dir) is logged and +/// swallowed — it must never turn a good render into a failed one. 
+async fn cleanup_superseded_renders(pool: &PgPool, chapter_id: Uuid, current_run: Uuid) { + // output_path is only ever set on the success UPDATE, so + // "output_path IS NOT NULL AND id != current" is exactly the set + // of prior completed renders. + let prior: Vec<(Uuid, String)> = match sqlx::query_as( + "SELECT id, output_path FROM narration_runs + WHERE chapter_id = $1 AND id <> $2 AND output_path IS NOT NULL", + ) + .bind(chapter_id) + .bind(current_run) + .fetch_all(pool) + .await + { + Ok(rows) => rows, + Err(e) => { + tracing::warn!(error = %e, "superseded-render cleanup: query failed, skipping"); + return; + } + }; + + for (run_id, output_path) in prior { + // output_path is the HTTP-facing path "/audio/"; the + // `/audio` bind mount means that is also the on-disk path + // inside this container. + match std::fs::remove_file(&output_path) { + Ok(()) => { + tracing::info!(run_id = %run_id, path = %output_path, "removed superseded render"); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // File already gone — still clear the dangling row. + } + Err(e) => { + // Could not delete — leave output_path intact rather + // than pointing the row at nothing. + tracing::warn!( + run_id = %run_id, path = %output_path, error = %e, + "superseded-render cleanup: could not delete file, leaving row intact", + ); + continue; + } + } + if let Err(e) = sqlx::query("UPDATE narration_runs SET output_path = NULL WHERE id = $1") + .bind(run_id) + .execute(pool) + .await + { + tracing::warn!(run_id = %run_id, error = %e, "superseded-render cleanup: could not null output_path"); + } + } +} + /// Pick the engine base URL for a given voice.source. 
/// kokoro_* → KOKORO_URL /// tortoise_* → TORTOISE_URL From 575749b7746ea34228d48969c734ff3006aa2a3c Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 15 May 2026 07:30:56 -0700 Subject: [PATCH 07/14] =?UTF-8?q?web:=20audiobook=20player=20=E2=80=94=20s?= =?UTF-8?q?titched-file=20playback=20with=20chapter=20seek?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds GET /stories/{id}/listen: one