schema: narration_findings — audio-layer audit table

Closes the TTS schema layer. The v0.2 render pipeline auto-runs an audit chain after each chapter narration: F5 render → narration_runs (succeeded) → ffmpeg chunk into ~30s windows → Whisper-large-v3 STT each chunk → word-level diff vs source chapter text → mismatches → narration_findings (kind=pronunciation|skip|insert) → ffmpeg silence/clip detect → narration_findings (kind=glitch) → (optional) Gemini Flash audio review pass → narration_findings (kind=prosody|tone) → unresolved crits trigger automatic re-roll with new seed. Distinct from audit_findings: that table is canon/continuity at the text layer, populated by the third-Opus canon-audit pass. narration_findings is audio-quality only, populated by detectors that consume the rendered WAV. The 'detector' field captures which model produced the finding so we can tune thresholds per detector when one over- or under-flags. cobb's audio agent intuition was right: STT-and-diff catches the 'name came out wrong' case airtight, and a separate audio-native LLM call catches the subtler 'this sentence sounded weird' cases Whisper can't see.
2026-05-13 10:10:04 -07:00 · 2026-05-13 10:10:04 -07:00 · 4a91e0738d
commit 4a91e0738d
parent 465c94b745
1 changed files with 42 additions and 0 deletions
--- a/migrations/0003_narration_findings.sql
+++ b/migrations/0003_narration_findings.sql
@ -0,0 +1,42 @@
 -- Audio-level audit findings. Populated by the v0.2 audit pipeline:
 -- Whisper STT compares the rendered audio against the source text;
 -- substantive deltas land here as findings. A separate audio-native
 -- LLM pass (Gemini Flash audio etc) may add tone / prosody findings
 -- the STT pass can't see.
 --
 -- Distinct from `audit_findings` which lives at the TEXT layer
 -- (canon drift, character voice, continuity). This table is for
 -- the AUDIO layer (mispronounced names, skipped lines, glitches,
 -- weird inflection).
 -- One row per audio-layer issue found in a rendered narration run.
 CREATE TABLE narration_findings (
     id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
     -- Owning narration run; findings are deleted together with their run.
     run_id          UUID NOT NULL REFERENCES narration_runs(id) ON DELETE CASCADE,
     -- "pronunciation"     — wrong word came out (Whisper diff)
     -- "skip"              — source text absent in audio
     -- "insert"            — extra word in audio not in source
     -- "glitch"            — silence, clipping, dropout, etc
     -- "prosody"           — pacing / rhythm issue (audio-LLM only)
     -- "tone"              — wrong emotional register (audio-LLM only)
     kind            TEXT NOT NULL CHECK (kind IN (
                         'pronunciation', 'skip', 'insert',
                         'glitch', 'prosody', 'tone'
                     )),
     -- Window in the chapter audio where the issue lives.
     -- NOTE(review): the unit is not recorded in this schema; presumably
     -- seconds from the start of the chapter audio — confirm with the detectors.
     timestamp_start REAL NOT NULL,
     timestamp_end   REAL NOT NULL,
     -- For text-layer deltas: what we asked for and what we got.
     -- Both are nullable, so a finding can be stored without a text pair.
     expected_text   TEXT,
     heard_text      TEXT,
     severity        TEXT NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),
     -- Optional free-form detail about the finding.
     notes           TEXT,
     -- Source of the finding: 'whisper' | 'gemini-flash-audio' |
     -- 'gpt-4o-audio' | 'qwen2-audio' | etc.
     -- Deliberately free text (no CHECK) so new detectors need no migration.
     detector        TEXT NOT NULL,
     -- New findings start out open.
     resolved        BOOLEAN NOT NULL DEFAULT false,
     created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
     -- Reject impossible windows at write time: the start may not be negative
     -- and the end may not precede the start. Zero-length windows
     -- (start = end) stay legal for point events.
     CONSTRAINT narration_findings_window_check
         CHECK (timestamp_start >= 0 AND timestamp_end >= timestamp_start)
 );
 -- All findings belonging to one narration run.
 CREATE INDEX idx_narration_findings_run      ON narration_findings(run_id);
 -- Findings by severity, across runs.
 CREATE INDEX idx_narration_findings_severity ON narration_findings(severity);
 -- Partial index covering open findings only. It is keyed on (run_id, severity)
 -- rather than on `resolved`: every row admitted by the predicate already has
 -- resolved = false, so `resolved` as a key is a constant and cannot narrow a
 -- search. With these keys, "unresolved crit findings for this run" is a direct
 -- lookup, and any query filtering on NOT resolved can still use the index.
 CREATE INDEX idx_narration_findings_resolved ON narration_findings(run_id, severity) WHERE NOT resolved;