dedup: length guard against corrupted output + --chapter retry

One chapter in a dedup run had the model emit the chapter twice
(18.7k -> 32.6k chars). A surgical dedup only nudges length, so
the pass now rejects any output longer than 1.5x or shorter than
0.6x the input length: it marks that chapter's run failed, leaves
the prose untouched, and moves on rather than silently corrupting
it. New --chapter flag
narrows a run to one chapter so a skipped one can be retried.
This commit is contained in:
Kayos 2026-05-15 15:39:47 -07:00
parent 2820d173e8
commit 4402c53979
2 changed files with 58 additions and 10 deletions

View file

@ -22,6 +22,7 @@ pub async fn run(
database_url: &str, database_url: &str,
story_id: Uuid, story_id: Uuid,
author_slug: Option<&str>, author_slug: Option<&str>,
chapter_filter: Option<i32>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let cfg = load_forge_config()?; let cfg = load_forge_config()?;
tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured"); tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured");
@ -62,7 +63,7 @@ pub async fn run(
"dedup starting", "dedup starting",
); );
let chapters: Vec<(Uuid, i32)> = sqlx::query_as( let mut chapters: Vec<(Uuid, i32)> = sqlx::query_as(
"SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n", "SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n",
) )
.bind(story_id) .bind(story_id)
@ -71,7 +72,16 @@ pub async fn run(
if chapters.is_empty() { if chapters.is_empty() {
bail!("story {story_id} has no chapters"); bail!("story {story_id} has no chapters");
} }
// --chapter narrows the run to one chapter — used to retry a
// chapter the length guard skipped on an earlier run.
if let Some(target) = chapter_filter {
chapters.retain(|(_, n)| *n == target);
if chapters.is_empty() {
bail!("chapter {target} not found in story {story_id}");
}
}
let mut skipped = 0usize;
for (chapter_id, n) in &chapters { for (chapter_id, n) in &chapters {
let body_md: String = let body_md: String =
sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1") sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1")
@ -107,6 +117,34 @@ pub async fn run(
}; };
let deduped = pass_text(&out)?; let deduped = pass_text(&out)?;
// Sanity guard: a surgical dedup nudges a chapter's length by
// a little. An output wildly off the input means the model
// duplicated or ballooned the chapter — reject it, leave the
// chapter untouched, move on. A re-run with --chapter retries
// just the skipped one.
let before = body_md.len();
let after = deduped.len();
if after > before * 3 / 2 || after < before * 3 / 5 {
sqlx::query(
"UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
)
.bind(format!(
"rejected: dedup output {after}c is wildly off input {before}c \
likely a duplicated or ballooned output"
))
.bind(Utc::now())
.bind(run_id)
.execute(&pool)
.await?;
skipped += 1;
println!(
"SKIPPED chapter {n}: dedup returned {after}c from {before}c \
chapter left untouched (retry with --chapter {n})"
);
continue;
}
// Overwrite body_md and clear body_md_tts — the chapter must be // Overwrite body_md and clear body_md_tts — the chapter must be
// re-prepped before it is narrated again. body_md_original is // re-prepped before it is narrated again. body_md_original is
// left untouched (it belongs to the rewrite pass). // left untouched (it belongs to the rewrite pass).
@ -121,19 +159,25 @@ pub async fn run(
.execute(&pool) .execute(&pool)
.await?; .await?;
let before = body_md.len();
let after = deduped.len();
println!( println!(
"deduped chapter {n} ({before}c -> {after}c) in {:.1}s", "deduped chapter {n} ({before}c -> {after}c) in {:.1}s",
elapsed.as_secs_f32(), elapsed.as_secs_f32(),
); );
} }
if skipped > 0 {
println!( println!(
"dedup complete: \"{title}\" — {} chapter(s), {} finding(s) applied", "dedup complete: \"{title}\" — {} chapter(s) processed, {skipped} SKIPPED \
(retry each with --chapter)",
chapters.len(),
);
} else {
println!(
"dedup complete: \"{title}\" — {} chapter(s) deduped against {} finding(s)",
chapters.len(), chapters.len(),
findings.len(), findings.len(),
); );
}
Ok(()) Ok(())
} }

View file

@ -199,6 +199,10 @@ enum Cmd {
/// bound author if omitted. /// bound author if omitted.
#[arg(long)] #[arg(long)]
author: Option<String>, author: Option<String>,
/// Restrict the run to a single chapter number — used to
/// retry a chapter the length guard skipped.
#[arg(long)]
chapter: Option<i32>,
}, },
} }
@ -289,8 +293,8 @@ async fn run() -> anyhow::Result<()> {
rewrite::run(&cli.database_url, chapter, author.as_deref()).await rewrite::run(&cli.database_url, chapter, author.as_deref()).await
} }
Cmd::Audit { story } => audit::run(&cli.database_url, story).await, Cmd::Audit { story } => audit::run(&cli.database_url, story).await,
Cmd::Dedup { story, author } => { Cmd::Dedup { story, author, chapter } => {
dedup::run(&cli.database_url, story, author.as_deref()).await dedup::run(&cli.database_url, story, author.as_deref(), chapter).await
} }
} }
} }