dedup: length guard against corrupted output + --chapter retry
One chapter in a dedup run had the model emit the chapter twice (18.7k -> 32.6k chars). A surgical dedup only nudges a chapter's length, so the pass now rejects any output wildly off the input length (more than 1.5x or less than 0.6x of it): it marks that chapter's generation run as failed, leaves the prose untouched, and moves on rather than silently corrupting it. A new --chapter flag narrows a run to one chapter so a skipped one can be retried.
This commit is contained in:
parent
2820d173e8
commit
4402c53979
2 changed files with 58 additions and 10 deletions
|
|
@ -22,6 +22,7 @@ pub async fn run(
|
||||||
database_url: &str,
|
database_url: &str,
|
||||||
story_id: Uuid,
|
story_id: Uuid,
|
||||||
author_slug: Option<&str>,
|
author_slug: Option<&str>,
|
||||||
|
chapter_filter: Option<i32>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let cfg = load_forge_config()?;
|
let cfg = load_forge_config()?;
|
||||||
tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured");
|
tracing::info!(base_url = %cfg.base_url, model = %cfg.model, "forge configured");
|
||||||
|
|
@ -62,7 +63,7 @@ pub async fn run(
|
||||||
"dedup starting",
|
"dedup starting",
|
||||||
);
|
);
|
||||||
|
|
||||||
let chapters: Vec<(Uuid, i32)> = sqlx::query_as(
|
let mut chapters: Vec<(Uuid, i32)> = sqlx::query_as(
|
||||||
"SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n",
|
"SELECT id, n FROM chapters WHERE story_id = $1 ORDER BY n",
|
||||||
)
|
)
|
||||||
.bind(story_id)
|
.bind(story_id)
|
||||||
|
|
@ -71,7 +72,16 @@ pub async fn run(
|
||||||
if chapters.is_empty() {
|
if chapters.is_empty() {
|
||||||
bail!("story {story_id} has no chapters");
|
bail!("story {story_id} has no chapters");
|
||||||
}
|
}
|
||||||
|
// --chapter narrows the run to one chapter — used to retry a
|
||||||
|
// chapter the length guard skipped on an earlier run.
|
||||||
|
if let Some(target) = chapter_filter {
|
||||||
|
chapters.retain(|(_, n)| *n == target);
|
||||||
|
if chapters.is_empty() {
|
||||||
|
bail!("chapter {target} not found in story {story_id}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut skipped = 0usize;
|
||||||
for (chapter_id, n) in &chapters {
|
for (chapter_id, n) in &chapters {
|
||||||
let body_md: String =
|
let body_md: String =
|
||||||
sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1")
|
sqlx::query_scalar("SELECT body_md FROM chapters WHERE id = $1")
|
||||||
|
|
@ -107,6 +117,34 @@ pub async fn run(
|
||||||
};
|
};
|
||||||
|
|
||||||
let deduped = pass_text(&out)?;
|
let deduped = pass_text(&out)?;
|
||||||
|
|
||||||
|
// Sanity guard: a surgical dedup nudges a chapter's length by
|
||||||
|
// a little. An output wildly off the input means the model
|
||||||
|
// duplicated or ballooned the chapter — reject it, leave the
|
||||||
|
// chapter untouched, move on. A re-run with --chapter retries
|
||||||
|
// just the skipped one.
|
||||||
|
let before = body_md.len();
|
||||||
|
let after = deduped.len();
|
||||||
|
if after > before * 3 / 2 || after < before * 3 / 5 {
|
||||||
|
sqlx::query(
|
||||||
|
"UPDATE generation_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
|
||||||
|
)
|
||||||
|
.bind(format!(
|
||||||
|
"rejected: dedup output {after}c is wildly off input {before}c \
|
||||||
|
— likely a duplicated or ballooned output"
|
||||||
|
))
|
||||||
|
.bind(Utc::now())
|
||||||
|
.bind(run_id)
|
||||||
|
.execute(&pool)
|
||||||
|
.await?;
|
||||||
|
skipped += 1;
|
||||||
|
println!(
|
||||||
|
"SKIPPED chapter {n}: dedup returned {after}c from {before}c \
|
||||||
|
— chapter left untouched (retry with --chapter {n})"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Overwrite body_md and clear body_md_tts — the chapter must be
|
// Overwrite body_md and clear body_md_tts — the chapter must be
|
||||||
// re-prepped before it is narrated again. body_md_original is
|
// re-prepped before it is narrated again. body_md_original is
|
||||||
// left untouched (it belongs to the rewrite pass).
|
// left untouched (it belongs to the rewrite pass).
|
||||||
|
|
@ -121,19 +159,25 @@ pub async fn run(
|
||||||
.execute(&pool)
|
.execute(&pool)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let before = body_md.len();
|
|
||||||
let after = deduped.len();
|
|
||||||
println!(
|
println!(
|
||||||
"deduped chapter {n} ({before}c -> {after}c) in {:.1}s",
|
"deduped chapter {n} ({before}c -> {after}c) in {:.1}s",
|
||||||
elapsed.as_secs_f32(),
|
elapsed.as_secs_f32(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
println!(
|
if skipped > 0 {
|
||||||
"dedup complete: \"{title}\" — {} chapter(s), {} finding(s) applied",
|
println!(
|
||||||
chapters.len(),
|
"dedup complete: \"{title}\" — {} chapter(s) processed, {skipped} SKIPPED \
|
||||||
findings.len(),
|
(retry each with --chapter)",
|
||||||
);
|
chapters.len(),
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
println!(
|
||||||
|
"dedup complete: \"{title}\" — {} chapter(s) deduped against {} finding(s)",
|
||||||
|
chapters.len(),
|
||||||
|
findings.len(),
|
||||||
|
);
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -199,6 +199,10 @@ enum Cmd {
|
||||||
/// bound author if omitted.
|
/// bound author if omitted.
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
author: Option<String>,
|
author: Option<String>,
|
||||||
|
/// Restrict the run to a single chapter number — used to
|
||||||
|
/// retry a chapter the length guard skipped.
|
||||||
|
#[arg(long)]
|
||||||
|
chapter: Option<i32>,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -289,8 +293,8 @@ async fn run() -> anyhow::Result<()> {
|
||||||
rewrite::run(&cli.database_url, chapter, author.as_deref()).await
|
rewrite::run(&cli.database_url, chapter, author.as_deref()).await
|
||||||
}
|
}
|
||||||
Cmd::Audit { story } => audit::run(&cli.database_url, story).await,
|
Cmd::Audit { story } => audit::run(&cli.database_url, story).await,
|
||||||
Cmd::Dedup { story, author } => {
|
Cmd::Dedup { story, author, chapter } => {
|
||||||
dedup::run(&cli.database_url, story, author.as_deref()).await
|
dedup::run(&cli.database_url, story, author.as_deref(), chapter).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue