skald/migrations/0002_voices_and_pronunciation.sql
Kayos 465c94b745 schema: voices + pronunciation_overrides + narration_runs (v0.2 prep)
TTS layer landed as schema-only — synthesis pipeline ships in v0.2.
Putting the tables in v0.1 means imports already carry the right
shape; we won't need a 'migrate every existing story' pass later.

Decisions locked 2026-05-13:
- Engine: F5-TTS (best 8GB FOSS option, mid-2026 SOTA)
- Default voice source: LJ Speech (Linda Johnson, PD released
  specifically for TTS training — airtight for sharing/uploading
  generated audio. The 'AI-consent-released' license posture is
  the difference between 'should be fine' and 'definitely fine.')
- Variety voices: Hi-Fi TTS speaker IDs (Apache 2.0, same consent
  shape). LibriVox is optional but never default.
- Pronunciation overrides DB layer (story-scoped + global) to fix
  proper-noun mispronunciation — the actual TTS-quality gap on
  Cobb's bar of 'must not wake me up.' Pre-pass with Opus extracts
  proper nouns + IPA, operator verifies, table caches forever.

Tables:
- voices — name, license, reference_path/text, sample_rate, default flag
- pronunciation_overrides — story-scoped or global, IPA/arpabet
- narration_runs — TTS audit trail mirroring generation_runs
- stories.preferred_voice_id FK

Unique constraints:
- one default voice (partial index)
- one row per (story, word) override
- one global row per word
2026-05-13 10:07:32 -07:00

109 lines
4.7 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

-- TTS layer schema. Synthesis pipeline lands in v0.2; the schema
-- ships now so v0.1 imports already carry the right shape.
--
-- Voice sources are restricted to AI-consent-released training
-- corpora (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R) so generated
-- audio is airtight to share — even if a story eventually lands
-- on a public site. LibriVox narrators are not seed defaults; if
-- the operator adds one they accept the "volunteered for
-- public-domain reading, didn't sign up for AI cloning" caveat.
CREATE TABLE voices (
    -- Primary key; defaults to a random UUID.
    id               UUID        DEFAULT gen_random_uuid() PRIMARY KEY,
    -- Unique name for the voice; the display label is stored separately.
    name             TEXT        UNIQUE NOT NULL,
    display_name     TEXT        NOT NULL,
    -- Optional descriptive attributes.
    gender           TEXT,
    accent           TEXT,
    -- Where the voice came from, as an identifier such as
    -- "lj_speech", "hifi_tts:92", "vctk:p225",
    -- "librivox:phil-chenevert", or "self".
    source           TEXT        NOT NULL,
    -- License posture, one of:
    --   "PD-ai-consent" : corpora released specifically for TTS
    --                     training (LJ Speech, Hi-Fi TTS, VCTK,
    --                     LibriTTS-R, etc)
    --   "PD-non-ai"     : raw LibriVox, weaker consent
    --   "self"          : the operator's own recording
    --   "licensed"      : paid voice with explicit cloning rights
    license          TEXT        NOT NULL,
    -- Path, on disk inside the container, of a 10-15s reference WAV.
    -- May be NULL: the row can exist as metadata only until the
    -- operator places the audio.
    reference_path   TEXT,
    -- What is spoken in `reference_path`. F5-TTS needs the audio and
    -- its transcript together in order to clone.
    reference_text   TEXT,
    sample_rate_hz   INTEGER,
    duration_seconds REAL,
    notes            TEXT,
    -- Flags the system-default voice; new rows are not the default.
    is_default       BOOLEAN     DEFAULT FALSE NOT NULL,
    created_at       TIMESTAMPTZ DEFAULT now() NOT NULL
);
-- Allow no more than one default voice: only rows with
-- is_default = true are indexed, and the index is unique, so a second
-- row flagged as default is rejected.
CREATE UNIQUE INDEX idx_voices_one_default
    ON voices (is_default)
    WHERE is_default = TRUE;
-- Pronunciation overrides, scoped to a single story or (optionally)
-- global. The TTS pipeline pre-processes every chapter text through
-- this map before handing it to F5. Solves the proper-noun problem
-- (Pripyat, Dyatlov, Toptunov, etc) without retraining.
CREATE TABLE pronunciation_overrides (
    id             UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- NULL story_id = global default across all stories. Story-scoped
    -- rows are deleted together with their story.
    story_id       UUID REFERENCES stories(id) ON DELETE CASCADE,
    -- The word whose pronunciation is being overridden.
    word           TEXT NOT NULL,
    -- The phoneme string, written in `phoneme_format` notation, e.g.
    -- IPA /ˈprɪpjət/ or arpabet "P R IH1 P Y AH0 T".
    phonemes       TEXT NOT NULL,
    -- Notation used by `phonemes`:
    -- "ipa" | "arpabet" | "espeak" | "raw" — which one is used depends
    -- on what F5 accepts in the rendering pipeline. The CHECK keeps
    -- stored values inside this documented set, in the same way
    -- `source` below is constrained.
    phoneme_format TEXT NOT NULL DEFAULT 'ipa'
        CHECK (phoneme_format IN ('ipa', 'arpabet', 'espeak', 'raw')),
    notes          TEXT,
    -- "auto"   = LLM-generated (Opus extracted proper nouns + IPA).
    -- "manual" = hand-edited.
    source         TEXT NOT NULL DEFAULT 'manual'
        CHECK (source IN ('auto', 'manual')),
    created_at     TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Uniqueness is split across two partial indexes because a plain
-- UNIQUE (story_id, word) treats NULL story_ids as distinct and so
-- would not stop duplicate global rows.

-- Story-scoped rows: each story may override a given word only once.
CREATE UNIQUE INDEX idx_pron_overrides_story_word
    ON pronunciation_overrides (story_id, word)
    WHERE story_id IS NOT NULL;

-- Global rows (story_id IS NULL): each word may appear only once.
CREATE UNIQUE INDEX idx_pron_overrides_global_word
    ON pronunciation_overrides (word)
    WHERE story_id IS NULL;

-- Non-unique index on the story foreign key.
-- NOTE(review): idx_pron_overrides_story_word already leads with
-- story_id for every non-NULL row, so this index may be redundant —
-- confirm with EXPLAIN before relying on or dropping it.
CREATE INDEX idx_pron_overrides_story
    ON pronunciation_overrides (story_id);
-- Optional per-story voice choice. A NULL value means "fall back to
-- the system-default voice" (voices.is_default = true). Deleting the
-- referenced voice resets the column to NULL instead of blocking the
-- delete.
ALTER TABLE stories
    ADD COLUMN preferred_voice_id UUID
        REFERENCES voices (id) ON DELETE SET NULL;

-- Index on the new foreign-key column.
CREATE INDEX idx_stories_preferred_voice
    ON stories (preferred_voice_id);
-- TTS counterpart of generation_runs: one row per chapter render.
-- Kept as its own table so the narration audit trail stays separate
-- from the text-generation audit trail.
CREATE TABLE narration_runs (
    id               UUID        DEFAULT gen_random_uuid() PRIMARY KEY,
    -- The chapter that was rendered; rows are deleted with the chapter.
    chapter_id       UUID        NOT NULL
                                 REFERENCES chapters (id) ON DELETE CASCADE,
    -- The voice used for the render.
    -- NOTE(review): ON DELETE CASCADE means deleting a voice also
    -- deletes its run history — confirm that is intended for an
    -- audit trail.
    voice_id         UUID        NOT NULL
                                 REFERENCES voices (id) ON DELETE CASCADE,
    -- Synthesis engine: "f5-tts" | "xtts-v2" | future engines.
    engine           TEXT        NOT NULL,
    engine_version   TEXT,
    -- Location of the rendered WAV/MP3 on disk; stays NULL until the
    -- run completes.
    output_path      TEXT,
    duration_seconds REAL,
    seed             BIGINT,
    started_at       TIMESTAMPTZ DEFAULT now() NOT NULL,
    ended_at         TIMESTAMPTZ,
    -- Run state; a new row starts out as 'running'.
    status           TEXT        DEFAULT 'running' NOT NULL
                                 CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled')),
    error            TEXT
);
-- Indexes on both foreign-key columns of narration_runs.
CREATE INDEX idx_narration_runs_chapter
    ON narration_runs (chapter_id);

CREATE INDEX idx_narration_runs_voice
    ON narration_runs (voice_id);