forge: dedup pass — the fix half of the audit loop

Adds `skald dedup --story <id>`: reads a story's most recent
prose-audit findings and walks every chapter, handing the author
the chapter prose + the findings with instructions to rephrase
ONLY the flagged repetitions (each recurrence made distinct) and
fix flagged continuity errors — everything else stays verbatim.
A surgical dedup, not a rewrite. Overwrites body_md, clears
body_md_tts so the chapter is re-prepped before narration. High
effort (prose-craft). Migration 0011 adds the 'dedup' run kind.

Completes the QC loop: audit (find) -> dedup (fix) -> re-audit.
This commit is contained in:
Kayos 2026-05-15 14:49:08 -07:00
parent 4de484cd35
commit 2820d173e8
4 changed files with 317 additions and 0 deletions

View file

@ -0,0 +1,9 @@
-- The dedup pass: the fix half of the audit loop. Takes a chapter
-- plus the story's prose-audit findings and rephrases only the
-- flagged repetitions, leaving everything else verbatim.
ALTER TABLE generation_runs DROP CONSTRAINT generation_runs_kind_check;
ALTER TABLE generation_runs ADD CONSTRAINT generation_runs_kind_check
CHECK (kind = ANY (ARRAY[
'gen', 'cleanup', 'audit', 'summary', 'embed',
'narrate_prep', 'rewrite', 'prose_audit', 'dedup'
]));

View file

@ -85,6 +85,10 @@ pub enum PassKind {
/// to end and flags repetition, template tics, self-restatement
/// and continuity drift. The QC gate before narration.
ProseAudit,
/// Surgical dedup — takes a chapter plus the story's audit
/// findings and rephrases only the flagged repetitions, leaving
/// everything else verbatim. The fix half of the audit loop.
Dedup,
}
impl PassKind {
@ -97,6 +101,7 @@ impl PassKind {
Self::NarratePrep => "narrate_prep",
Self::Rewrite => "rewrite",
Self::ProseAudit => "prose_audit",
Self::Dedup => "dedup",
}
}
}
@ -232,6 +237,43 @@ impl Forge {
Ok(PassOutput { kind: PassKind::ProseAudit, result: r, duration_ms })
}
/// Dedup pass over a single chapter: the "fix" step that follows a
/// prose audit.
///
/// `prose` is the chapter text and `findings` is the rendered audit
/// findings for the whole story; both are folded into the user prompt
/// by `dedup_user_prompt`. The system prompt is the author's own
/// scaffold (or `DEFAULT_AUTHOR_SCAFFOLD` when the revision carries no
/// template) with its `{{display_name}}`, `{{pass_directive}}` and
/// `{{soul}}` placeholders filled in, in that order, and it is sent
/// with `SystemMode::Replace`. An author is therefore mandatory.
///
/// The request runs at `Effort::High` with a 1800-second timeout.
/// Errors from the client call are propagated unchanged.
pub async fn dedup(
    &self,
    prose: &str,
    findings: &str,
    author: &AuthorWithRevision,
) -> anyhow::Result<PassOutput> {
    // Prefer the revision's own template; fall back to the house default.
    let template = match author.revision.system_template.as_deref() {
        Some(custom) => custom,
        None => DEFAULT_AUTHOR_SCAFFOLD,
    };

    // The soul is substituted last, so placeholder-looking text inside
    // it is never expanded by the earlier steps.
    let with_name = template.replace("{{display_name}}", &author.author.display_name);
    let with_directive = with_name.replace("{{pass_directive}}", DEDUP_DIRECTIVE);
    let system_prompt = with_directive.replace("{{soul}}", &author.revision.soul);

    let request = RunRequest {
        prompt: dedup_user_prompt(prose, findings),
        model: Some(self.model.clone()),
        system: Some(system_prompt),
        system_mode: Some(SystemMode::Replace),
        effort: Some(Effort::High),
        timeout_secs: Some(1800),
        ..Default::default()
    };

    let response = self.client.run(request).await?;
    // Read the duration before `response` is moved into the output.
    let duration_ms = response.duration_ms;
    Ok(PassOutput {
        kind: PassKind::Dedup,
        result: response,
        duration_ms,
    })
}
/// Annotate prose with narration control tags. The model
/// receives the full chapter prose and returns the SAME prose
/// with `[pause:Xs]`, `[breath]`, `[scene]` markers inserted
@ -470,6 +512,8 @@ const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director ann
/// voice tags, one narrator throughout.
const HOUSE_NARRATE_PREP_SYSTEM_SINGLE: &str = "You are a senior audiobook director annotating prose for a SINGLE-narrator reading. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Do NOT add `[voice:...]` speaker tags — the whole chapter is one voice. Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary.";
/// Pass directive for the dedup pass. `Forge::dedup` substitutes it
/// for the `{{pass_directive}}` placeholder of the author scaffold.
/// It tells the model to rephrase only the flagged repetitions that
/// occur in the given chapter, fix flagged continuity errors, leave
/// every unflagged sentence untouched, and return the chapter prose
/// alone. The text ends with a blank line (`\n\n`).
const DEDUP_DIRECTIVE: &str = "This is a DEDUP pass. The user prompt contains ONE chapter of a story you wrote, plus a list of audit findings — repeated phrases, motifs, similes, sentence templates and continuity errors found across the whole book. Your job: return this chapter with every flagged repetition that occurs IN IT rephrased fresh, and everything else byte-identical.\n\nHARD RULES:\n- For any motif, simile, phrase, image or structural tic the findings flag as recurring: if it appears in THIS chapter, render this chapter's occurrence in fresh, distinctive wording. Never reuse the flagged original phrasing. The other chapters' occurrences are being revised separately — do NOT try to coordinate with them; just make yours distinct from the flagged original.\n- Fix any continuity error the findings flag that touches this chapter (a wrong age, number, name, date) — use the correct value the findings identify.\n- Change NOTHING the findings do not flag. Every sentence not implicated by a finding stays EXACTLY as written, word for word. This is not a rewrite, not a polish, not an edit for taste — it is a surgical dedup. When in doubt, leave it.\n- Canon is absolute: names, dates, events, the order they happen, every fact — unchanged. The chapter stays the same length and shape.\n- Return ONLY the chapter prose. No heading unless the source had one. No preamble, no commentary, no list of what you changed.\n\n";
/// Pass directive for the rewrite pass: instructs the model to
/// re-author a chapter entirely in its own voice while keeping every
/// name, date, place, event, event order and fact as written, and to
/// begin its answer with the source's chapter heading line.
/// NOTE(review): presumably substituted into the scaffold's
/// `{{pass_directive}}` placeholder like the other pass directives —
/// the use site is not visible here; confirm.
const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite.";
// ─── User-prompt builders ───────────────────────────────────────
@ -518,6 +562,25 @@ pub struct CharacterSpeaker {
pub hint: Option<String>,
}
/// Assemble the user prompt for the dedup pass.
///
/// The prompt has three sections, in this order: an "Audit findings"
/// header with a one-line instruction followed by `findings`, a
/// "Chapter to dedup" header followed by `prose`, and a closing "Task"
/// section restating the job. Both inputs are inserted verbatim.
fn dedup_user_prompt(prose: &str, findings: &str) -> String {
    const FINDINGS_SECTION: &str = "# Audit findings for this story\n\n\
        These repetitions and errors were found across the whole book. \
        Fix only the ones that occur in the chapter below.\n\n";
    const CHAPTER_SECTION: &str = "\n\n# Chapter to dedup\n\n";
    const TASK_SECTION: &str = "\n\n# Task\n\n\
        Return the chapter above with every flagged repetition \
        that appears in it rephrased fresh, and any flagged continuity error \
        touching it corrected. Leave every unflagged sentence verbatim. \
        Return only the chapter prose.\n";

    [
        FINDINGS_SECTION,
        findings,
        CHAPTER_SECTION,
        prose,
        TASK_SECTION,
    ]
    .concat()
}
fn rewrite_user_prompt(prose: &str) -> String {
let mut out = String::with_capacity(prose.len() + 256);
out.push_str("# Chapter to re-author\n\n");

227
skald/src/dedup.rs Normal file
View file

@ -0,0 +1,227 @@
//! `skald dedup` — the fix half of the audit loop. Reads a story's
//! most recent prose-audit findings and walks the chapters, handing
//! each chapter + the findings to the author with instructions to
//! rephrase ONLY the flagged repetitions and leave everything else
//! verbatim. Overwrites body_md and clears body_md_tts so the
//! chapter gets re-prepped before narration.
//!
//! Run `skald audit` first — dedup needs findings to act on.
use std::time::Instant;
use anyhow::{Context, bail};
use chrono::Utc;
use skald_core::authors::{self, AuthorWithRevision};
use skald_core::config::ForgeConfig;
use skald_core::db;
use skald_core::forge::{Forge, PassKind, PassOutput};
use sqlx::PgPool;
use uuid::Uuid;
pub async fn run(
database_url: &str,
story_id: Uuid,
author_slug: Option<&str>,
) -> anyhow::Result<()> {
let cfg = load_forge_config()?;
tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured");
let pool = db::connect_and_migrate(database_url).await?;
let forge = Forge::new(&cfg)?;
let story: Option<(String, Option<Uuid>)> =
sqlx::query_as("SELECT title, author_id FROM stories WHERE id = $1")
.bind(story_id)
.fetch_optional(&pool)
.await?;
let (title, story_author_id) =
story.with_context(|| format!("story {story_id} not found"))?;
let author = resolve_author(&pool, story_author_id, author_slug)
.await?
.ok_or_else(|| {
anyhow::anyhow!(
"dedup needs an author for the rephrasing — pass --author <slug> \
or bind one to the story"
)
})?;
// Findings from the most recent succeeded prose-audit run.
let findings = load_latest_findings(&pool, story_id).await?;
if findings.is_empty() {
bail!(
"no prose-audit findings for story {story_id} — run `skald audit \
--story {story_id}` first"
);
}
let findings_block = render_findings(&findings);
tracing::info!(
story = %title,
author = %author.author.slug,
finding_count = findings.len(),
"dedup starting",
);
let chapters: Vec<(Uuid, i32)> = sqlx::query_as(
"SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n",
)
.bind(story_id)
.fetch_all(&pool)
.await?;
if chapters.is_empty() {
bail!("story {story_id} has no chapters");
}
for (chapter_id, n) in &chapters {
let body_md: String =
sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1")
.bind(chapter_id)
.fetch_one(&pool)
.await?;
let run_id: Uuid = sqlx::query_scalar(
"INSERT INTO generation_runs (story_id, kind, status) VALUES ($1, $2, 'running') RETURNING id",
)
.bind(story_id)
.bind(PassKind::Dedup.as_str())
.fetch_one(&pool)
.await?;
let started = Instant::now();
let out_res = forge.dedup(&body_md, &findings_block, &author).await;
let elapsed = started.elapsed();
let out: PassOutput = match out_res {
Ok(o) => o,
Err(e) => {
sqlx::query(
"UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
)
.bind(format!("{e:#}"))
.bind(Utc::now())
.bind(run_id)
.execute(&pool)
.await?;
return Err(e).with_context(|| format!("dedup failed on chapter {n}"));
}
};
let deduped = pass_text(&out)?;
// Overwrite body_md and clear body_md_tts — the chapter must be
// re-prepped before it is narrated again. body_md_original is
// left untouched (it belongs to the rewrite pass).
sqlx::query("UPDATE chapters SET body_md = $1, body_md_tts = NULL WHERE id = $2")
.bind(&deduped)
.bind(chapter_id)
.execute(&pool)
.await?;
sqlx::query("UPDATE generation_runs SET status='succeeded', ended_at=$1 WHERE id=$2")
.bind(Utc::now())
.bind(run_id)
.execute(&pool)
.await?;
let before = body_md.len();
let after = deduped.len();
println!(
"deduped chapter {n} ({before}c -> {after}c) in {:.1}s",
elapsed.as_secs_f32(),
);
}
println!(
"dedup complete: \"{title}\" — {} chapter(s), {} finding(s) applied",
chapters.len(),
findings.len(),
);
Ok(())
}
/// One row of `audit_findings`, as loaded by `load_latest_findings`
/// and rendered into the prompt by `render_findings`.
#[derive(Debug, Clone)]
struct Finding {
    /// Severity label; the loader sorts `crit` first, then `warn`,
    /// then anything else.
    severity: String,
    /// The `area` column of the finding; used as the secondary sort
    /// key and shown in the rendered header line.
    area: String,
    /// The finding text itself, inserted verbatim into the prompt.
    body: String,
}
/// Load the findings attached to the story's newest succeeded
/// `prose_audit` run.
///
/// Rows come back ordered by severity (`crit`, then `warn`, then the
/// rest) and then by area. The result is empty when the story has no
/// succeeded prose-audit run or that run produced no findings.
async fn load_latest_findings(pool: &PgPool, story_id: Uuid) -> anyhow::Result<Vec<Finding>> {
    const LATEST_FINDINGS_SQL: &str = "SELECT severity, area, body FROM audit_findings
WHERE story_id = $1
AND run_id = (
SELECT id FROM generation_runs
WHERE story_id = $1 AND kind = 'prose_audit' AND status = 'succeeded'
ORDER BY started_at DESC LIMIT 1
)
ORDER BY
CASE severity WHEN 'crit' THEN 0 WHEN 'warn' THEN 1 ELSE 2 END,
area";

    let rows: Vec<(String, String, String)> = sqlx::query_as(LATEST_FINDINGS_SQL)
        .bind(story_id)
        .fetch_all(pool)
        .await?;

    let mut findings = Vec::with_capacity(rows.len());
    for (severity, area, body) in rows {
        findings.push(Finding {
            severity,
            area,
            body,
        });
    }
    Ok(findings)
}
/// Render findings as plain-text blocks for the dedup prompt.
///
/// Each finding becomes a header line `[SEVERITY · area]` (severity
/// upper-cased), followed by its body and a blank line. An empty
/// slice yields an empty string.
fn render_findings(findings: &[Finding]) -> String {
    findings
        .iter()
        .map(|finding| {
            format!(
                "[{} · {}]\n{}\n\n",
                finding.severity.to_uppercase(),
                finding.area,
                finding.body,
            )
        })
        .collect()
}
/// Pick the author whose voice the dedup pass should use.
///
/// An explicit `flag_slug` takes precedence and must exist — an
/// unknown slug is an error. Without a flag, the story's bound author
/// (`story_author_id`) is looked up by id and then loaded by slug.
/// Returns `Ok(None)` when there is neither a flag nor a usable bound
/// author.
async fn resolve_author(
    pool: &PgPool,
    story_author_id: Option<Uuid>,
    flag_slug: Option<&str>,
) -> anyhow::Result<Option<AuthorWithRevision>> {
    // Explicit --author flag: a miss is a hard error, not a fallback.
    if let Some(slug) = flag_slug {
        let requested = authors::get_with_current_revision(pool, slug)
            .await?
            .with_context(|| format!("author '{slug}' not found"))?;
        return Ok(Some(requested));
    }

    // No flag: fall back to the author bound to the story, if any.
    let bound_id = match story_author_id {
        Some(id) => id,
        None => return Ok(None),
    };
    let bound_slug: Option<(String,)> = sqlx::query_as("SELECT slug FROM authors WHERE id = $1")
        .bind(bound_id)
        .fetch_optional(pool)
        .await?;

    match bound_slug {
        Some((slug,)) => Ok(authors::get_with_current_revision(pool, &slug).await?),
        None => Ok(None),
    }
}
/// Extract the chapter prose from a dedup pass result.
///
/// Uses the result's text form when it has one, otherwise the raw
/// result value when that value is itself a string.
///
/// # Errors
///
/// Fails when the result carries no text at all, or when the text is
/// empty or whitespace-only. A non-text payload is rejected instead of
/// being stringified: the caller overwrites the chapter's `body_md`
/// with whatever this returns, so a serialized `null` or object must
/// never get through.
fn pass_text(out: &PassOutput) -> anyhow::Result<String> {
    let text = out
        .result
        .as_text()
        .map(|s| s.to_string())
        .or_else(|| out.result.result.as_str().map(|s| s.to_string()))
        .context("dedup pass returned a non-text result")?;
    if text.trim().is_empty() {
        bail!("dedup pass returned empty");
    }
    Ok(text)
}
/// Build the forge configuration from environment variables.
///
/// `CLAWDFORGE_URL` and `CLAWDFORGE_TOKEN` are required; a missing
/// one is reported by name. `SKALD_MODEL` is optional and falls back
/// to `"opus"` when it cannot be read.
fn load_forge_config() -> anyhow::Result<ForgeConfig> {
    // Fields are evaluated top to bottom, so a missing URL is reported
    // before a missing token.
    Ok(ForgeConfig {
        base_url: std::env::var("CLAWDFORGE_URL").context("CLAWDFORGE_URL not set")?,
        app_token: std::env::var("CLAWDFORGE_TOKEN").context("CLAWDFORGE_TOKEN not set")?,
        model: match std::env::var("SKALD_MODEL") {
            Ok(model) => model,
            Err(_) => String::from("opus"),
        },
    })
}

View file

@ -7,6 +7,7 @@
mod audit;
mod authors_seed;
mod continue_story;
mod dedup;
mod import;
mod narrate;
mod narrate_prep;
@ -185,6 +186,20 @@ enum Cmd {
#[arg(long)]
story: Uuid,
},
/// Dedup a story against its most recent prose-audit findings.
/// Walks every chapter, rephrasing only the flagged repetitions
/// and fixing flagged continuity errors — everything else stays
/// verbatim. Overwrites body_md and clears body_md_tts. Run
/// `skald audit` first.
Dedup {
/// Story to dedup.
#[arg(long)]
story: Uuid,
/// Author slug for the rephrasing. Falls back to the story's
/// bound author if omitted.
#[arg(long)]
author: Option<String>,
},
}
#[tokio::main]
@ -274,6 +289,9 @@ async fn run() -> anyhow::Result<()> {
rewrite::run(&cli.database_url, chapter, author.as_deref()).await
}
Cmd::Audit { story } => audit::run(&cli.database_url, story).await,
Cmd::Dedup { story, author } => {
dedup::run(&cli.database_url, story, author.as_deref()).await
}
}
}