From 4a91e0738d939e9047467674c2e98639644930d0 Mon Sep 17 00:00:00 2001
From: Kayos <kayos@sulkta.com>
Date: Wed, 13 May 2026 10:10:04 -0700
Subject: [PATCH] =?UTF-8?q?schema:=20narration=5Ffindings=20=E2=80=94=20au?=
 =?UTF-8?q?dio-layer=20audit=20table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the TTS schema layer. The v0.2 render pipeline auto-runs an
audit chain after each chapter narration:

  F5 render → narration_runs (succeeded)
    → ffmpeg chunk into ~30s windows
    → Whisper-large-v3 STT each chunk
    → word-level diff vs source chapter text
    → mismatches → narration_findings (kind=pronunciation|skip|insert)
    → ffmpeg silence/clip detect → narration_findings (kind=glitch)
    → (optional) Gemini Flash audio review pass
      → narration_findings (kind=prosody|tone)
    → unresolved crits trigger automatic re-roll with new seed

Distinct from audit_findings: that table is canon/continuity at the
text layer, populated by the third-Opus canon-audit pass.
narration_findings is audio-quality only, populated by detectors
that consume the rendered WAV.

The 'detector' field captures which model produced the finding so
we can tune thresholds per detector when one over- or under-flags.

cobb's audio agent intuition was right: STT-and-diff catches the
'name came out wrong' case airtight, and a separate audio-native
LLM call catches the subtler 'this sentence sounded weird' cases
Whisper can't see.
---
 migrations/0003_narration_findings.sql | 42 ++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 migrations/0003_narration_findings.sql

diff --git a/migrations/0003_narration_findings.sql b/migrations/0003_narration_findings.sql
new file mode 100644
index 0000000..34cf9a3
--- /dev/null
+++ b/migrations/0003_narration_findings.sql
@@ -0,0 +1,42 @@
+-- Audio-level audit findings, populated by the v0.2 audit pipeline:
+-- Whisper STT transcribes the rendered audio, the transcript is diffed
+-- against the source text, and substantive deltas land here as findings.
+-- A separate audio-native LLM pass (Gemini Flash audio, etc.) may add
+-- tone / prosody findings that the STT pass can't see.
+--
+-- Distinct from `audit_findings` which lives at the TEXT layer
+-- (canon drift, character voice, continuity). This table is for
+-- the AUDIO layer (mispronounced names, skipped lines, glitches,
+-- weird inflection).
+
+CREATE TABLE narration_findings (  -- one row per audio-layer finding on a narration run
+    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    run_id          UUID NOT NULL REFERENCES narration_runs(id) ON DELETE CASCADE,
+    -- "pronunciation"     — wrong word came out (Whisper diff)
+    -- "skip"              — source text absent in audio
+    -- "insert"            — extra word in audio not in source
+    -- "glitch"            — silence, clipping, dropout, etc
+    -- "prosody"           — pacing / rhythm issue (audio-LLM only)
+    -- "tone"              — wrong emotional register (audio-LLM only)
+    kind            TEXT NOT NULL CHECK (kind IN ('pronunciation', 'skip', 'insert',
+                                                  'glitch', 'prosody', 'tone')),
+    -- Window in the chapter audio where the issue lives (0 <= start <= end).
+    timestamp_start REAL NOT NULL,
+    timestamp_end   REAL NOT NULL,
+    -- For STT-diff findings: what we asked for and what we got; may be NULL otherwise.
+    expected_text   TEXT,
+    heard_text      TEXT,
+    severity        TEXT NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),
+    notes           TEXT,
+    -- Source of the finding: 'whisper' | 'gemini-flash-audio' |
+    -- 'gpt-4o-audio' | 'qwen2-audio' | etc.
+    detector        TEXT NOT NULL,
+    resolved        BOOLEAN NOT NULL DEFAULT false,  -- new findings start out open
+    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
+    CONSTRAINT narration_findings_window_check
+        CHECK (timestamp_start >= 0 AND timestamp_end >= timestamp_start)
+);
+
+CREATE INDEX idx_narration_findings_run      ON narration_findings(run_id);
+CREATE INDEX idx_narration_findings_severity ON narration_findings(severity);
+CREATE INDEX idx_narration_findings_resolved ON narration_findings(run_id, severity) WHERE NOT resolved; -- open findings only; keying on the constant 'resolved' cannot narrow a lookup