schema: narration_findings — audio-layer audit table
Closes the TTS schema layer. The v0.2 render pipeline auto-runs an
audit chain after each chapter narration:
F5 render → narration_runs (succeeded)
→ ffmpeg chunk into ~30s windows
→ Whisper-large-v3 STT each chunk
→ word-level diff vs source chapter text
→ mismatches → narration_findings (kind=pronunciation|skip|insert)
→ ffmpeg silence/clip detect → narration_findings (kind=glitch)
→ (optional) Gemini Flash audio review pass
→ narration_findings (kind=prosody|tone)
→ unresolved crits trigger automatic re-roll with new seed
Distinct from audit_findings: that table is canon/continuity at the
text layer, populated by the third-Opus canon-audit pass.
narration_findings is audio-quality only, populated by detectors
that consume the rendered WAV.
The 'detector' field captures which detector (STT model, audio LLM, or
signal-level tool such as the ffmpeg silence/clip pass) produced the
finding, so we can tune thresholds per detector when one over- or
under-flags.
cobb's audio agent intuition was right: STT-and-diff catches the
'name came out wrong' case airtight, and a separate audio-native
LLM call catches the subtler 'this sentence sounded weird' cases
Whisper can't see.
This commit is contained in:
parent
465c94b745
commit
4a91e0738d
1 changed files with 42 additions and 0 deletions
42
migrations/0003_narration_findings.sql
Normal file
42
migrations/0003_narration_findings.sql
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
-- Audio-level audit findings. Populated by the v0.2 audit pipeline:
|
||||||
|
-- Whisper STT compares the rendered audio against the source text;
|
||||||
|
-- substantive deltas land here as findings. A separate audio-native
|
||||||
|
-- LLM pass (Gemini Flash audio etc) may add tone / prosody findings
|
||||||
|
-- the STT pass can't see.
|
||||||
|
--
|
||||||
|
-- Distinct from `audit_findings` which lives at the TEXT layer
|
||||||
|
-- (canon drift, character voice, continuity). This table is for
|
||||||
|
-- the AUDIO layer (mispronounced names, skipped lines, glitches,
|
||||||
|
-- weird inflection).
|
||||||
|
|
||||||
|
-- narration_findings: one row per audio-quality issue found in a rendered
-- narration run. Each row belongs to exactly one narration_runs row and is
-- deleted with it (ON DELETE CASCADE).
CREATE TABLE narration_findings (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    run_id UUID NOT NULL REFERENCES narration_runs(id) ON DELETE CASCADE,

    -- Category of the issue:
    --   "pronunciation" — wrong word came out (Whisper diff)
    --   "skip"          — source text absent in audio
    --   "insert"        — extra word in audio not in source
    --   "glitch"        — silence, clipping, dropout, etc
    --   "prosody"       — pacing / rhythm issue (audio-LLM only)
    --   "tone"          — wrong emotional register (audio-LLM only)
    kind TEXT NOT NULL CHECK (kind IN (
        'pronunciation', 'skip', 'insert',
        'glitch', 'prosody', 'tone'
    )),

    -- Window in the chapter audio where the issue lives.
    -- NOTE(review): units are presumably seconds from the start of the
    -- chapter audio — confirm against the detectors that write these.
    timestamp_start REAL NOT NULL,
    timestamp_end REAL NOT NULL,

    -- For text-layer deltas: what we asked for and what we got.
    -- Both are nullable; presumably left NULL for findings that have no
    -- transcript diff (glitch / prosody / tone) — confirm with the writers.
    expected_text TEXT,
    heard_text TEXT,

    severity TEXT NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),

    -- Optional free-form detail about the finding.
    notes TEXT,

    -- Source of the finding: 'whisper' | 'gemini-flash-audio' |
    -- 'gpt-4o-audio' | 'qwen2-audio' | etc. Deliberately unconstrained
    -- free text so new detectors need no migration.
    detector TEXT NOT NULL,

    resolved BOOLEAN NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    -- Reject malformed windows at write time: offsets cannot be negative
    -- and the window cannot end before it starts. A zero-length window
    -- (start = end) is allowed for point-in-time findings.
    CONSTRAINT narration_findings_timestamp_window_check
        CHECK (timestamp_start >= 0 AND timestamp_end >= timestamp_start)
);
|
||||||
|
|
||||||
|
-- All findings belonging to one narration run (also backs the FK cascade).
CREATE INDEX idx_narration_findings_run ON narration_findings(run_id);

-- Cross-run filtering by severity.
CREATE INDEX idx_narration_findings_severity ON narration_findings(severity);

-- Partial index covering only unresolved findings. It is keyed on
-- (run_id, severity) rather than on `resolved` itself: every row admitted by
-- the predicate already has resolved = false, so indexing that column would
-- give every entry the same key and could not narrow a search. With this key
-- the index still serves "all unresolved" scans and can also seek directly to
-- the unresolved findings of one run at a given severity.
CREATE INDEX idx_narration_findings_resolved
    ON narration_findings(run_id, severity)
    WHERE NOT resolved;
|
||||||
Loading…
Add table
Add a link
Reference in a new issue