schema: voices + pronunciation_overrides + narration_runs (v0.2 prep)

TTS layer landed as schema-only — synthesis pipeline ships in v0.2.
Putting the tables in v0.1 means imports already carry the right
shape; we won't need a 'migrate every existing story' pass later.

Decisions locked 2026-05-13:
- Engine: F5-TTS (best 8GB FOSS option, mid-2026 SOTA)
- Default voice source: LJ Speech (Linda Johnson, PD released
  specifically for TTS training — airtight for sharing/uploading
  generated audio. The 'AI-consent-released' license posture is
  the difference between 'should be fine' and 'definitely fine.')
- Variety voices: Hi-Fi TTS speaker IDs (Apache 2.0, same consent
  shape). LibriVox is optional but never default.
- Pronunciation overrides DB layer (story-scoped + global) to fix
  proper-noun mispronunciation — the actual TTS-quality gap on
  Cobb's bar of 'must not wake me up.' Pre-pass with Opus extracts
  proper nouns + IPA, operator verifies, table caches forever.

Tables:
- voices — name, license, reference_path/text, sample_rate, default flag
- pronunciation_overrides — story-scoped or global, IPA/arpabet
- narration_runs — TTS audit trail mirroring generation_runs
- stories.preferred_voice_id FK

Unique constraints:
- one default voice (partial index)
- one row per (story, word) override
- one global row per word
This commit is contained in:
Kayos 2026-05-13 10:07:32 -07:00
parent f575ad3722
commit 465c94b745

View file

@ -0,0 +1,109 @@
-- TTS layer schema. Synthesis pipeline lands in v0.2; the schema
-- ships now so v0.1 imports already carry the right shape.
--
-- Voice sources are restricted to AI-consent-released training
-- corpora (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R) so generated
-- audio is airtight to share — even if a story eventually lands
-- on a public site. LibriVox narrators are not seed defaults; if
-- the operator adds one they accept the "volunteered for
-- public-domain reading, didn't sign up for AI cloning" caveat.
-- Catalogue of narrator voices for the TTS layer. A row may be
-- registered before its reference audio exists on disk.
CREATE TABLE voices (
    id               UUID        DEFAULT gen_random_uuid(),
    -- Unique per voice (enforced by voices_name_key below).
    name             TEXT        NOT NULL,
    display_name     TEXT        NOT NULL,
    gender           TEXT,
    accent           TEXT,
    -- Source identifier, for example "lj_speech", "hifi_tts:92",
    -- "vctk:p225", "librivox:phil-chenevert" or "self".
    source           TEXT        NOT NULL,
    -- License posture, one of:
    --   "PD-ai-consent"  corpora released specifically for TTS
    --                    training (LJ Speech, Hi-Fi TTS, VCTK,
    --                    LibriTTS-R, etc)
    --   "PD-non-ai"      raw LibriVox, weaker consent
    --   "self"           the operator's own recording
    --   "licensed"       paid voice with explicit cloning rights
    -- Free text: the column itself does not restrict the value.
    license          TEXT        NOT NULL,
    -- Path inside the container to a 10-15s reference WAV. Left
    -- NULL while the row is metadata-only, i.e. until the operator
    -- places the audio.
    reference_path   TEXT,
    -- Transcript of the audio at `reference_path`. F5-TTS needs the
    -- audio and its text together in order to clone the voice.
    reference_text   TEXT,
    sample_rate_hz   INTEGER,
    duration_seconds REAL,
    notes            TEXT,
    is_default       BOOLEAN     NOT NULL DEFAULT FALSE,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Constraint names are spelled out; they equal the names
    -- PostgreSQL would have generated for the inline forms.
    CONSTRAINT voices_pkey PRIMARY KEY (id),
    CONSTRAINT voices_name_key UNIQUE (name)
);
-- Allow at most one default voice: only rows with is_default = true
-- enter this unique index, and they all carry the same key value.
CREATE UNIQUE INDEX idx_voices_one_default
    ON voices (is_default)
    WHERE is_default = TRUE;
-- Per-story (and optionally global) pronunciation overrides. The
-- TTS pipeline pre-processes every chapter text through this map
-- before handing it to F5. Solves the proper-noun problem
-- (Pripyat, Dyatlov, Toptunov, etc) without retraining.
CREATE TABLE pronunciation_overrides (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- NULL story_id = global default across all stories. Story-scoped
    -- rows are deleted together with their story (ON DELETE CASCADE).
    story_id UUID REFERENCES stories(id) ON DELETE CASCADE,
    -- The word whose pronunciation is overridden.
    word TEXT NOT NULL,
    -- The phoneme string, written in `phoneme_format` notation, e.g.
    -- IPA /ˈprɪpjət/ or arpabet "P R IH1 P Y AH0 T" for "Pripyat".
    phonemes TEXT NOT NULL,
    -- Notation used by `phonemes`: "ipa" | "arpabet" | "espeak" |
    -- "raw". Which one to store depends on what F5 accepts in the
    -- rendering pipeline. Enforced with a CHECK, like `source` below,
    -- so a misspelled format is rejected at write time.
    phoneme_format TEXT NOT NULL DEFAULT 'ipa'
        CHECK (phoneme_format IN ('ipa', 'arpabet', 'espeak', 'raw')),
    notes TEXT,
    -- "auto" = LLM-generated (Opus extracted proper nouns + IPA).
    -- "manual" = hand-edited.
    source TEXT NOT NULL DEFAULT 'manual'
        CHECK (source IN ('auto', 'manual')),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Story-scoped overrides: a given story may override a word once.
-- Rows with a NULL story_id are excluded from this index.
CREATE UNIQUE INDEX idx_pron_overrides_story_word
    ON pronunciation_overrides (story_id, word)
    WHERE story_id IS NOT NULL;

-- Global overrides (story_id IS NULL): a word may appear once.
CREATE UNIQUE INDEX idx_pron_overrides_global_word
    ON pronunciation_overrides (word)
    WHERE story_id IS NULL;

-- Plain lookup index on the story foreign key.
-- NOTE(review): lookups by story_id alone can probably be served by
-- idx_pron_overrides_story_word already; confirm with EXPLAIN before
-- deciding whether both indexes are needed.
CREATE INDEX idx_pron_overrides_story
    ON pronunciation_overrides (story_id);
-- Optional per-story voice choice. When the column is NULL the story
-- falls back to the system-default voice (voices.is_default = true).
-- Deleting the referenced voice resets the column to NULL.
ALTER TABLE stories
    ADD COLUMN preferred_voice_id UUID
        CONSTRAINT stories_preferred_voice_id_fkey
        REFERENCES voices (id)
        ON DELETE SET NULL;

-- Index on the new foreign-key column.
CREATE INDEX idx_stories_preferred_voice
    ON stories (preferred_voice_id);
-- Audit trail for TTS jobs, mirroring generation_runs but kept
-- separate from the text-generation trail. One row per chapter
-- render.
CREATE TABLE narration_runs (
    id               UUID        DEFAULT gen_random_uuid(),
    chapter_id       UUID        NOT NULL,
    -- NOTE(review): the voice foreign key below cascades, so deleting
    -- a voice also deletes every run that used it. Confirm that is
    -- the intended behaviour for an audit table.
    voice_id         UUID        NOT NULL,
    -- "f5-tts" | "xtts-v2" | future engines.
    engine           TEXT        NOT NULL,
    engine_version   TEXT,
    -- Rendered WAV/MP3 on disk; stays NULL until the run completes.
    output_path      TEXT,
    duration_seconds REAL,
    seed             BIGINT,
    started_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    ended_at         TIMESTAMPTZ,
    -- New rows start out as 'running'.
    status           TEXT        NOT NULL DEFAULT 'running',
    error            TEXT,
    -- Constraint names are spelled out; they equal the names
    -- PostgreSQL would have generated for the inline forms.
    CONSTRAINT narration_runs_pkey PRIMARY KEY (id),
    CONSTRAINT narration_runs_chapter_id_fkey
        FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE,
    CONSTRAINT narration_runs_voice_id_fkey
        FOREIGN KEY (voice_id) REFERENCES voices (id) ON DELETE CASCADE,
    CONSTRAINT narration_runs_status_check
        CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled'))
);
-- Lookup indexes on the two foreign-key columns of narration_runs.
CREATE INDEX idx_narration_runs_chapter
    ON narration_runs (chapter_id);

CREATE INDEX idx_narration_runs_voice
    ON narration_runs (voice_id);