From 4a91e0738d939e9047467674c2e98639644930d0 Mon Sep 17 00:00:00 2001 From: Kayos Date: Wed, 13 May 2026 10:10:04 -0700 Subject: [PATCH] =?UTF-8?q?schema:=20narration=5Ffindings=20=E2=80=94=20au?= =?UTF-8?q?dio-layer=20audit=20table?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the TTS schema layer. The v0.2 render pipeline auto-runs an audit chain after each chapter narration: F5 render → narration_runs (succeeded) → ffmpeg chunk into ~30s windows → Whisper-large-v3 STT each chunk → word-level diff vs source chapter text → mismatches → narration_findings (kind=pronunciation|skip|insert) → ffmpeg silence/clip detect → narration_findings (kind=glitch) → (optional) Gemini Flash audio review pass → narration_findings (kind=prosody|tone) → unresolved crits trigger automatic re-roll with new seed Distinct from audit_findings: that table is canon/continuity at the text layer, populated by the third-Opus canon-audit pass. narration_findings is audio-quality only, populated by detectors that consume the rendered WAV. The 'detector' field captures which model produced the finding so we can tune thresholds per detector when one over- or under-flags. cobb's audio agent intuition was right: STT-and-diff catches the 'name came out wrong' case airtight, and a separate audio-native LLM call catches the subtler 'this sentence sounded weird' cases Whisper can't see. --- migrations/0003_narration_findings.sql | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 migrations/0003_narration_findings.sql diff --git a/migrations/0003_narration_findings.sql b/migrations/0003_narration_findings.sql new file mode 100644 index 0000000..34cf9a3 --- /dev/null +++ b/migrations/0003_narration_findings.sql @@ -0,0 +1,42 @@ +-- Audio-level audit findings. Populated by the v0.2 audit pipeline: +-- Whisper STT compares the rendered audio against the source text; +-- substantive deltas land here as findings. 
A separate audio-native
+-- LLM pass (Gemini Flash audio etc) may add tone / prosody findings
+-- the STT pass can't see.
+--
+-- Distinct from `audit_findings` which lives at the TEXT layer
+-- (canon drift, character voice, continuity). This table is for
+-- the AUDIO layer (mispronounced names, skipped lines, glitches,
+-- weird inflection).
+
+CREATE TABLE narration_findings (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    run_id UUID NOT NULL REFERENCES narration_runs(id) ON DELETE CASCADE,
+    -- "pronunciation" — wrong word came out (Whisper diff)
+    -- "skip"          — source text absent in audio
+    -- "insert"        — extra word in audio not in source
+    -- "glitch"        — silence, clipping, dropout, etc
+    -- "prosody"       — pacing / rhythm issue (audio-LLM only)
+    -- "tone"          — wrong emotional register (audio-LLM only)
+    kind TEXT NOT NULL
+        CHECK (kind IN ('pronunciation', 'skip', 'insert', 'glitch', 'prosody', 'tone')),
+    -- Window in the chapter audio where the issue lives (unit presumably seconds; TODO confirm).
+    timestamp_start REAL NOT NULL,
+    timestamp_end REAL NOT NULL,
+    -- For STT-vs-source text deltas: what we asked for and what we got (both nullable).
+    expected_text TEXT,
+    heard_text TEXT,
+    severity TEXT NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),
+    notes TEXT,
+    -- Source of the finding: 'whisper' | 'gemini-flash-audio' | 'gpt-4o-audio' | 'qwen2-audio' | etc.
+    detector TEXT NOT NULL,
+    resolved BOOLEAN NOT NULL DEFAULT false,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+    -- A finding's window may be zero-length but must never end before it starts.
+    CONSTRAINT narration_findings_window_check CHECK (timestamp_end >= timestamp_start)
+);
+
+CREATE INDEX idx_narration_findings_run ON narration_findings(run_id);
+CREATE INDEX idx_narration_findings_severity ON narration_findings(severity);
+-- Partial index: only rows that are still unresolved get an index entry.
+CREATE INDEX idx_narration_findings_resolved ON narration_findings(resolved) WHERE NOT resolved;