diff --git a/migrations/0011_dedup_pass.sql b/migrations/0011_dedup_pass.sql
new file mode 100644
index 0000000..fb96504
--- /dev/null
+++ b/migrations/0011_dedup_pass.sql
@@ -0,0 +1,9 @@
+-- The dedup pass: the fix half of the audit loop. Takes a chapter
+-- plus the story's prose-audit findings and rephrases only the
+-- flagged repetitions, leaving everything else verbatim.
+ALTER TABLE generation_runs DROP CONSTRAINT generation_runs_kind_check;
+ALTER TABLE generation_runs ADD CONSTRAINT generation_runs_kind_check
+  CHECK (kind = ANY (ARRAY[
+    'gen', 'cleanup', 'audit', 'summary', 'embed',
+    'narrate_prep', 'rewrite', 'prose_audit', 'dedup'
+  ]));
diff --git a/skald-core/src/forge.rs b/skald-core/src/forge.rs
index c5c206b..a00a400 100644
--- a/skald-core/src/forge.rs
+++ b/skald-core/src/forge.rs
@@ -85,6 +85,10 @@ pub enum PassKind {
     /// to end and flags repetition, template tics, self-restatement
     /// and continuity drift. The QC gate before narration.
     ProseAudit,
+    /// Surgical dedup — takes a chapter plus the story's audit
+    /// findings and rephrases only the flagged repetitions, leaving
+    /// everything else verbatim. The fix half of the audit loop.
+    Dedup,
 }
 
 impl PassKind {
@@ -97,6 +101,7 @@ impl PassKind {
             Self::NarratePrep => "narrate_prep",
             Self::Rewrite => "rewrite",
             Self::ProseAudit => "prose_audit",
+            Self::Dedup => "dedup",
         }
     }
 }
@@ -232,6 +237,43 @@ impl Forge {
         Ok(PassOutput { kind: PassKind::ProseAudit, result: r, duration_ms })
     }
 
+    /// Surgical dedup of one chapter — the fix half of the audit
+    /// loop. Receives the chapter's prose plus the whole story's
+    /// audit findings, and rephrases ONLY the flagged repetitions
+    /// that occur in this chapter; everything the findings do not
+    /// flag stays verbatim. Author REQUIRED — the fresh phrasing
+    /// lands in the author's voice (SystemMode::Replace). High
+    /// effort: it is prose-craft, same posture as rewrite.
+    pub async fn dedup(
+        &self,
+        prose: &str,
+        findings: &str,
+        author: &AuthorWithRevision,
+    ) -> anyhow::Result<PassOutput> {
+        let scaffold = author
+            .revision
+            .system_template
+            .as_deref()
+            .unwrap_or(DEFAULT_AUTHOR_SCAFFOLD);
+        let system = scaffold
+            .replace("{{display_name}}", &author.author.display_name)
+            .replace("{{pass_directive}}", DEDUP_DIRECTIVE)
+            .replace("{{soul}}", &author.revision.soul);
+        let user_prompt = dedup_user_prompt(prose, findings);
+        let body = RunRequest {
+            prompt: user_prompt,
+            model: Some(self.model.clone()),
+            system: Some(system),
+            system_mode: Some(SystemMode::Replace),
+            effort: Some(Effort::High),
+            timeout_secs: Some(1800),
+            ..Default::default()
+        };
+        let r = self.client.run(body).await?;
+        let duration_ms = r.duration_ms;
+        Ok(PassOutput { kind: PassKind::Dedup, result: r, duration_ms })
+    }
+
     /// Annotate prose with narration control tags. The model
     /// receives the full chapter prose and returns the SAME prose
     /// with `[pause:Xs]`, `[breath]`, `[scene]` markers inserted
@@ -470,6 +512,8 @@ const HOUSE_NARRATE_PREP_SYSTEM: &str = "You are a senior audiobook director ann
 /// voice tags, one narrator throughout.
 const HOUSE_NARRATE_PREP_SYSTEM_SINGLE: &str = "You are a senior audiobook director annotating prose for a SINGLE-narrator reading. You insert (a) beat markers — `[breath]`, `[pause:Xs]`, `[scene]` — where a skilled narrator would breathe or pause, and (b) occasional humanizing narrator stumbles using em-dash repetition or self-correction (sparingly — maybe 1-3 per chapter, on proper nouns or hard words). Do NOT add `[voice:...]` speaker tags — the whole chapter is one voice. Apart from those stumbles you do NOT change a word of the prose. Return the prose verbatim plus beat markers and (rare) stumbles inline. No preamble, no commentary.";
 
+const DEDUP_DIRECTIVE: &str = "This is a DEDUP pass. The user prompt contains ONE chapter of a story you wrote, plus a list of audit findings — repeated phrases, motifs, similes, sentence templates and continuity errors found across the whole book. Your job: return this chapter with every flagged repetition that occurs IN IT rephrased fresh, and everything else byte-identical.\n\nHARD RULES:\n- For any motif, simile, phrase, image or structural tic the findings flag as recurring: if it appears in THIS chapter, render this chapter's occurrence in fresh, distinctive wording. Never reuse the flagged original phrasing. The other chapters' occurrences are being revised separately — do NOT try to coordinate with them; just make yours distinct from the flagged original.\n- Fix any continuity error the findings flag that touches this chapter (a wrong age, number, name, date) — use the correct value the findings identify.\n- Change NOTHING the findings do not flag. Every sentence not implicated by a finding stays EXACTLY as written, word for word. This is not a rewrite, not a polish, not an edit for taste — it is a surgical dedup. When in doubt, leave it.\n- Canon is absolute: names, dates, events, the order they happen, every fact — unchanged. The chapter stays the same length and shape.\n- Return ONLY the chapter prose. No heading unless the source had one. No preamble, no commentary, no list of what you changed.\n\n";
+
 const REWRITE_DIRECTIVE: &str = "This is a REWRITE pass. The user prompt contains a chapter of prose written by another hand. Re-author it entirely in YOUR voice — every sentence reworked in your style: your sentence rhythm, your word choice, your paragraph shape, your way of landing a beat. This is not editing or polishing. It is re-authoring. The reader should not be able to tell another writer ever touched it.\n\nHARD CONSTRAINTS — canon is non-negotiable:\n- Every character name, every date, every place name stays exactly as written.\n- Every event, and the ORDER events happen in, stays exactly as written.\n- Every technical or historical fact stays exactly as written.\n- Do not add new scenes, characters, or events. Do not cut any scene or beat. Same story, same shape — your telling.\n\nReturn ONLY the rewritten chapter prose. Begin with the chapter heading line (`## Chapter N — title`) exactly as in the source. No preamble, no commentary about the rewrite.";
 
 // ─── User-prompt builders ───────────────────────────────────────
@@ -518,6 +562,27 @@ pub struct CharacterSpeaker {
     pub hint: Option<String>,
 }
 
+/// Builds the dedup user prompt: the rendered audit findings first, then
+/// the chapter prose, then a closing task restatement.
+fn dedup_user_prompt(prose: &str, findings: &str) -> String {
+    let mut out = String::with_capacity(prose.len() + findings.len() + 512);
+    out.push_str("# Audit findings for this story\n\n");
+    out.push_str(
+        "These repetitions and errors were found across the whole book. \
+         Fix only the ones that occur in the chapter below.\n\n",
+    );
+    out.push_str(findings);
+    out.push_str("\n\n# Chapter to dedup\n\n");
+    out.push_str(prose);
+    out.push_str(
+        "\n\n# Task\n\nReturn the chapter above with every flagged repetition \
+         that appears in it rephrased fresh, and any flagged continuity error \
+         touching it corrected. Leave every unflagged sentence verbatim. \
+         Return only the chapter prose.\n",
+    );
+    out
+}
+
 fn rewrite_user_prompt(prose: &str) -> String {
     let mut out = String::with_capacity(prose.len() + 256);
     out.push_str("# Chapter to re-author\n\n");
diff --git a/skald/src/dedup.rs b/skald/src/dedup.rs
new file mode 100644
index 0000000..9573b8c
--- /dev/null
+++ b/skald/src/dedup.rs
@@ -0,0 +1,270 @@
+//! `skald dedup` — the fix half of the audit loop. Reads a story's
+//! most recent prose-audit findings and walks the chapters, handing
+//! each chapter + the findings to the author with instructions to
+//! rephrase ONLY the flagged repetitions and leave everything else
+//! verbatim. Overwrites body_md and clears body_md_tts so the
+//! chapter gets re-prepped before narration.
+//!
+//! Run `skald audit` first — dedup needs findings to act on.
+
+use std::time::Instant;
+
+use anyhow::{Context, bail};
+use chrono::Utc;
+use skald_core::authors::{self, AuthorWithRevision};
+use skald_core::config::ForgeConfig;
+use skald_core::db;
+use skald_core::forge::{Forge, PassKind, PassOutput};
+use sqlx::PgPool;
+use uuid::Uuid;
+
+/// Dedups every chapter of `story_id`, in chapter order, against the
+/// story's latest succeeded prose-audit findings.
+///
+/// Each chapter gets its own `generation_runs` row of kind `dedup`,
+/// closed as `succeeded` or `failed`. Stops at the first chapter that
+/// fails; chapters already rewritten before it stay rewritten.
+///
+/// # Errors
+///
+/// Fails when the forge env vars are unset, the story does not exist,
+/// no author can be resolved, the story has no findings or no
+/// chapters, or any database or forge call fails.
+pub async fn run(
+    database_url: &str,
+    story_id: Uuid,
+    author_slug: Option<&str>,
+) -> anyhow::Result<()> {
+    let cfg = load_forge_config()?;
+    tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured");
+
+    let pool = db::connect_and_migrate(database_url).await?;
+    let forge = Forge::new(&cfg)?;
+
+    let story: Option<(String, Option<Uuid>)> =
+        sqlx::query_as("SELECT title, author_id FROM stories WHERE id = $1")
+            .bind(story_id)
+            .fetch_optional(&pool)
+            .await?;
+    let (title, story_author_id) =
+        story.with_context(|| format!("story {story_id} not found"))?;
+
+    let author = resolve_author(&pool, story_author_id, author_slug)
+        .await?
+        .ok_or_else(|| {
+            anyhow::anyhow!(
+                "dedup needs an author for the rephrasing — pass --author \
+                 or bind one to the story"
+            )
+        })?;
+
+    // Findings from the most recent succeeded prose-audit run.
+    let findings = load_latest_findings(&pool, story_id).await?;
+    if findings.is_empty() {
+        bail!(
+            "no prose-audit findings for story {story_id} — run `skald audit \
+             --story {story_id}` first"
+        );
+    }
+    let findings_block = render_findings(&findings);
+    tracing::info!(
+        story = %title,
+        author = %author.author.slug,
+        finding_count = findings.len(),
+        "dedup starting",
+    );
+
+    let chapters: Vec<(Uuid, i32)> = sqlx::query_as(
+        "SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n",
+    )
+    .bind(story_id)
+    .fetch_all(&pool)
+    .await?;
+    if chapters.is_empty() {
+        bail!("story {story_id} has no chapters");
+    }
+
+    for (chapter_id, n) in &chapters {
+        let body_md: String =
+            sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1")
+                .bind(chapter_id)
+                .fetch_one(&pool)
+                .await?;
+
+        let run_id: Uuid = sqlx::query_scalar(
+            "INSERT INTO generation_runs (story_id, kind, status) VALUES ($1, $2, 'running') RETURNING id",
+        )
+        .bind(story_id)
+        .bind(PassKind::Dedup.as_str())
+        .fetch_one(&pool)
+        .await?;
+
+        let started = Instant::now();
+        let out_res = forge.dedup(&body_md, &findings_block, &author).await;
+        let elapsed = started.elapsed();
+
+        // The run row is already 'running', so every failure from here on —
+        // the forge call, an empty result, or the chapter write — must close
+        // it as 'failed' rather than leave it dangling.
+        let applied = match out_res {
+            Ok(out) => apply_dedup(&pool, *chapter_id, &out).await,
+            Err(e) => Err(e),
+        };
+        let deduped = match applied {
+            Ok(text) => text,
+            Err(e) => {
+                sqlx::query(
+                    "UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
+                )
+                .bind(format!("{e:#}"))
+                .bind(Utc::now())
+                .bind(run_id)
+                .execute(&pool)
+                .await?;
+                return Err(e).with_context(|| format!("dedup failed on chapter {n}"));
+            }
+        };
+        sqlx::query("UPDATE generation_runs SET status='succeeded', ended_at=$1 WHERE id=$2")
+            .bind(Utc::now())
+            .bind(run_id)
+            .execute(&pool)
+            .await?;
+
+        let before = body_md.len();
+        let after = deduped.len();
+        println!(
+            "deduped chapter {n} ({before}c -> {after}c) in {:.1}s",
+            elapsed.as_secs_f32(),
+        );
+    }
+
+    println!(
+        "dedup complete: \"{title}\" — {} chapter(s), {} finding(s) applied",
+        chapters.len(),
+        findings.len(),
+    );
+    Ok(())
+}
+
+/// Pulls the rephrased prose out of a dedup pass result and writes it
+/// over the chapter. Overwrites body_md and clears body_md_tts — the
+/// chapter must be re-prepped before it is narrated again.
+/// body_md_original is left untouched (it belongs to the rewrite pass).
+/// Returns the new prose; the caller owns the run-status bookkeeping.
+async fn apply_dedup(
+    pool: &PgPool,
+    chapter_id: Uuid,
+    out: &PassOutput,
+) -> anyhow::Result<String> {
+    let deduped = pass_text(out)?;
+    sqlx::query("UPDATE chapters SET body_md = $1, body_md_tts = NULL WHERE id = $2")
+        .bind(&deduped)
+        .bind(chapter_id)
+        .execute(pool)
+        .await?;
+    Ok(deduped)
+}
+
+/// One prose-audit finding, as stored in `audit_findings`.
+#[derive(Debug, Clone)]
+struct Finding {
+    severity: String,
+    area: String,
+    body: String,
+}
+
+/// Loads the findings of the story's most recent succeeded
+/// `prose_audit` run, most severe first (`crit`, `warn`, then anything
+/// else), ties ordered by area. Empty when there is no such run.
+async fn load_latest_findings(pool: &PgPool, story_id: Uuid) -> anyhow::Result<Vec<Finding>> {
+    let rows: Vec<(String, String, String)> = sqlx::query_as(
+        "SELECT severity, area, body FROM audit_findings
+         WHERE story_id = $1
+           AND run_id = (
+             SELECT id FROM generation_runs
+             WHERE story_id = $1 AND kind = 'prose_audit' AND status = 'succeeded'
+             ORDER BY started_at DESC LIMIT 1
+           )
+         ORDER BY
+           CASE severity WHEN 'crit' THEN 0 WHEN 'warn' THEN 1 ELSE 2 END,
+           area",
+    )
+    .bind(story_id)
+    .fetch_all(pool)
+    .await?;
+    Ok(rows
+        .into_iter()
+        .map(|(severity, area, body)| Finding { severity, area, body })
+        .collect())
+}
+
+/// Renders findings as plain-text blocks for the prompt: a
+/// `[SEVERITY · area]` header line, the body, then a blank line.
+fn render_findings(findings: &[Finding]) -> String {
+    let mut out = String::new();
+    for f in findings {
+        out.push_str(&format!(
+            "[{} · {}]\n{}\n\n",
+            f.severity.to_uppercase(),
+            f.area,
+            f.body,
+        ));
+    }
+    out
+}
+
+/// Picks the author for the rephrasing: the `--author` slug when given
+/// (an unknown slug is an error), otherwise the story's bound author.
+/// `Ok(None)` when neither yields an author.
+async fn resolve_author(
+    pool: &PgPool,
+    story_author_id: Option<Uuid>,
+    flag_slug: Option<&str>,
+) -> anyhow::Result<Option<AuthorWithRevision>> {
+    if let Some(slug) = flag_slug {
+        return authors::get_with_current_revision(pool, slug)
+            .await?
+            .map(Some)
+            .with_context(|| format!("author '{slug}' not found"));
+    }
+    if let Some(aid) = story_author_id {
+        let row: Option<(String,)> = sqlx::query_as("SELECT slug FROM authors WHERE id = $1")
+            .bind(aid)
+            .fetch_optional(pool)
+            .await?;
+        if let Some((slug,)) = row {
+            return Ok(authors::get_with_current_revision(pool, &slug).await?);
+        }
+    }
+    Ok(None)
+}
+
+/// Extracts the pass output as text and rejects blank output.
+// NOTE(review): the last fallback stringifies whatever `result.result`
+// holds, and that string then overwrites the chapter — confirm a
+// non-text result should not be an error instead.
+fn pass_text(out: &PassOutput) -> anyhow::Result<String> {
+    let text = out
+        .result
+        .as_text()
+        .map(|s| s.to_string())
+        .or_else(|| out.result.result.as_str().map(|s| s.to_string()))
+        .unwrap_or_else(|| out.result.result.to_string());
+    if text.trim().is_empty() {
+        bail!("dedup pass returned empty");
+    }
+    Ok(text)
+}
+
+/// Reads forge settings from the environment: `CLAWDFORGE_URL` and
+/// `CLAWDFORGE_TOKEN` are required; `SKALD_MODEL` defaults to `opus`.
+fn load_forge_config() -> anyhow::Result<ForgeConfig> {
+    let base_url = std::env::var("CLAWDFORGE_URL").context("CLAWDFORGE_URL not set")?;
+    let app_token = std::env::var("CLAWDFORGE_TOKEN").context("CLAWDFORGE_TOKEN not set")?;
+    let model = std::env::var("SKALD_MODEL").unwrap_or_else(|_| "opus".into());
+    Ok(ForgeConfig {
+        base_url,
+        app_token,
+        model,
+    })
+}
diff --git a/skald/src/main.rs b/skald/src/main.rs
index 7389b71..b802d8a 100644
--- a/skald/src/main.rs
+++ b/skald/src/main.rs
@@ -7,6 +7,7 @@
 mod audit;
 mod authors_seed;
 mod continue_story;
+mod dedup;
 mod import;
 mod narrate;
 mod narrate_prep;
@@ -185,6 +186,20 @@ enum Cmd {
         #[arg(long)]
         story: Uuid,
     },
+    /// Dedup a story against its most recent prose-audit findings.
+    /// Walks every chapter, rephrasing only the flagged repetitions
+    /// and fixing flagged continuity errors — everything else stays
+    /// verbatim. Overwrites body_md and clears body_md_tts. Run
+    /// `skald audit` first.
+    Dedup {
+        /// Story to dedup.
+        #[arg(long)]
+        story: Uuid,
+        /// Author slug for the rephrasing. Falls back to the story's
+        /// bound author if omitted.
+        #[arg(long)]
+        author: Option<String>,
+    },
 }
 
 #[tokio::main]
@@ -274,6 +289,9 @@ async fn run() -> anyhow::Result<()> {
             rewrite::run(&cli.database_url, chapter, author.as_deref()).await
         }
         Cmd::Audit { story } => audit::run(&cli.database_url, story).await,
+        Cmd::Dedup { story, author } => {
+            dedup::run(&cli.database_url, story, author.as_deref()).await
+        }
     }
 }
 