TTS layer landed as schema-only — synthesis pipeline ships in v0.2. Putting the tables in v0.1 means imports already carry the right shape; we won't need a 'migrate every existing story' pass later.

Decisions locked 2026-05-13:
- Engine: F5-TTS (best 8GB FOSS option, mid-2026 SOTA)
- Default voice source: LJ Speech (Linda Johnson, PD released specifically for TTS training — airtight for sharing/uploading generated audio. The 'AI-consent-released' license posture is the difference between 'should be fine' and 'definitely fine.')
- Variety voices: Hi-Fi TTS speaker IDs (Apache 2.0, same consent shape). LibriVox is optional but never default.
- Pronunciation overrides DB layer (story-scoped + global) to fix proper-noun mispronunciation — the actual TTS-quality gap on Cobb's bar of 'must not wake me up.' Pre-pass with Opus extracts proper nouns + IPA, operator verifies, table caches forever.

Tables:
- voices — name, license, reference_path/text, sample_rate, default flag
- pronunciation_overrides — story-scoped or global, IPA/arpabet
- narration_runs — TTS audit trail mirroring generation_runs
- stories.preferred_voice_id FK

Unique constraints:
- one default voice (partial index)
- one row per (story, word) override
- one global row per word
109 lines
4.7 KiB
SQL
109 lines
4.7 KiB
SQL
-- TTS layer schema. Synthesis pipeline lands in v0.2; the schema
-- ships now so v0.1 imports already carry the right shape.
--
-- Voice sources are restricted to AI-consent-released training
-- corpora (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R) so generated
-- audio is airtight to share — even if a story eventually lands
-- on a public site. LibriVox narrators are not seed defaults; if
-- the operator adds one, they accept the "volunteered for
-- public-domain reading, didn't sign up for AI cloning" caveat.
|
||
|
||
-- Catalogue of voices the TTS layer can narrate with. A row may be
-- metadata-only: the reference audio columns stay NULL until the
-- operator places the clip.
--
-- The table-level constraint names are the ones PostgreSQL generates
-- for the equivalent inline PRIMARY KEY / UNIQUE clauses.
--
-- NOTE(review): LJ Speech is public domain, but Hi-Fi TTS, VCTK and
-- LibriTTS-R appear to be distributed under CC BY 4.0 (attribution
-- required) rather than public domain — confirm before labelling
-- those rows "PD-ai-consent".
CREATE TABLE voices (
    id               UUID        DEFAULT gen_random_uuid(),
    name             TEXT        NOT NULL,
    display_name     TEXT        NOT NULL,
    gender           TEXT,
    accent           TEXT,

    -- Where the voice comes from, e.g. "lj_speech", "hifi_tts:92",
    -- "vctk:p225", "librivox:phil-chenevert", "self".
    source           TEXT        NOT NULL,

    -- License posture:
    --   "PD-ai-consent"  corpora released specifically for TTS training
    --                    (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R, etc)
    --   "PD-non-ai"      raw LibriVox, weaker consent
    --   "self"           operator's own recording
    --   "licensed"       paid voice with explicit cloning rights
    license          TEXT        NOT NULL,

    -- Path (inside the container) of a 10-15s reference WAV.
    -- NULL while the row is metadata-only.
    reference_path   TEXT,

    -- Transcript of the clip at reference_path; F5-TTS needs both the
    -- audio and its text to clone a voice.
    reference_text   TEXT,

    sample_rate_hz   INTEGER,
    duration_seconds REAL,
    notes            TEXT,
    is_default       BOOLEAN     NOT NULL DEFAULT false,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),

    CONSTRAINT voices_pkey     PRIMARY KEY (id),
    CONSTRAINT voices_name_key UNIQUE (name)
);
|
||
|
||
-- Allow at most one default voice. Only rows with is_default = true are
-- indexed, and they all carry the same key value, so a second default
-- row violates uniqueness. Non-default rows are not indexed at all.
CREATE UNIQUE INDEX idx_voices_one_default
    ON voices USING btree (is_default)
    WHERE is_default = true;
|
||
|
||
-- Per-story (and optionally global) pronunciation overrides. The TTS
-- pipeline is meant to run every chapter text through this map before
-- handing it to F5, which addresses the proper-noun problem
-- (Pripyat, Dyatlov, Toptunov, etc) without retraining.
CREATE TABLE pronunciation_overrides (
    id             UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- NULL story_id = global default across all stories. Story-scoped
    -- rows are removed together with their story.
    story_id       UUID REFERENCES stories(id) ON DELETE CASCADE,

    word           TEXT NOT NULL,

    -- The phoneme string, written in `phoneme_format` notation, e.g.
    -- IPA /ˈpɹɪpʲɪt/ or arpabet "P R IH1 P Y AH0 T".
    phonemes       TEXT NOT NULL,

    -- Notation used by `phonemes`. Which one is appropriate depends on
    -- what F5 accepts in the rendering pipeline. The set is enforced so
    -- a typo ('IPA', 'arpa') is rejected at write time instead of being
    -- stored and misinterpreted later, matching how `source` below is
    -- constrained.
    phoneme_format TEXT NOT NULL DEFAULT 'ipa'
        CHECK (phoneme_format IN ('ipa', 'arpabet', 'espeak', 'raw')),

    notes          TEXT,

    -- "auto"   = LLM-generated (Opus extracted proper nouns + IPA).
    -- "manual" = hand-edited.
    source         TEXT NOT NULL DEFAULT 'manual'
        CHECK (source IN ('auto', 'manual')),

    created_at     TIMESTAMPTZ NOT NULL DEFAULT now()
);
|
||
|
||
-- Uniqueness of overrides is enforced by two partial indexes, one per
-- scope. A single UNIQUE (story_id, word) would not stop duplicate
-- global rows, because NULL story_id values are treated as distinct
-- by default.

-- Story scope: one row per (story, word) pair.
CREATE UNIQUE INDEX idx_pron_overrides_story_word
    ON pronunciation_overrides USING btree (story_id, word)
    WHERE story_id IS NOT NULL;

-- Global scope (NULL story_id): one row per word.
CREATE UNIQUE INDEX idx_pron_overrides_global_word
    ON pronunciation_overrides USING btree (word)
    WHERE story_id IS NULL;

-- Plain lookup index on the story foreign key.
-- NOTE(review): equality lookups on story_id can presumably already use
-- idx_pron_overrides_story_word (story_id is its leading column), so
-- this index may be redundant — confirm with EXPLAIN before dropping.
CREATE INDEX idx_pron_overrides_story
    ON pronunciation_overrides USING btree (story_id);
|
||
|
||
-- Optional per-story voice preference. NULL means "use the system
-- default voice" (the row with voices.is_default = true). Deleting a
-- voice resets the preference of any story pointing at it to NULL.
-- The constraint name is the one PostgreSQL generates for the
-- equivalent unnamed column-level REFERENCES clause.
ALTER TABLE stories
    ADD COLUMN preferred_voice_id UUID
        CONSTRAINT stories_preferred_voice_id_fkey
        REFERENCES voices (id)
        ON DELETE SET NULL;

-- Index on the referencing column of the foreign key above.
CREATE INDEX idx_stories_preferred_voice
    ON stories USING btree (preferred_voice_id);
|
||
|
||
-- Audit trail for TTS jobs: one row per chapter render. Mirrors
-- generation_runs but is kept separate, so the narration history does
-- not mix with the text-generation history.
--
-- The table-level constraint names are the ones PostgreSQL generates
-- for the equivalent inline column constraints.
--
-- NOTE(review): both foreign keys cascade, so deleting a voice (or a
-- chapter) also deletes its narration history. If this table must
-- outlive voice removal, ON DELETE RESTRICT on voice_id would be the
-- safer choice — confirm the intended retention policy.
CREATE TABLE narration_runs (
    id               UUID        DEFAULT gen_random_uuid(),
    chapter_id       UUID        NOT NULL,
    voice_id         UUID        NOT NULL,

    -- "f5-tts" | "xtts-v2" | future engines (left unconstrained).
    engine           TEXT        NOT NULL,
    engine_version   TEXT,

    -- Rendered WAV/MP3 on disk; stays NULL until the run completes.
    output_path      TEXT,
    duration_seconds REAL,
    seed             BIGINT,

    started_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    ended_at         TIMESTAMPTZ,
    status           TEXT        NOT NULL DEFAULT 'running',
    error            TEXT,

    CONSTRAINT narration_runs_pkey
        PRIMARY KEY (id),
    CONSTRAINT narration_runs_chapter_id_fkey
        FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE,
    CONSTRAINT narration_runs_voice_id_fkey
        FOREIGN KEY (voice_id) REFERENCES voices (id) ON DELETE CASCADE,
    CONSTRAINT narration_runs_status_check
        CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled'))
);
|
||
|
||
-- Indexes on the referencing columns of both narration_runs foreign
-- keys (chapter and voice).
CREATE INDEX idx_narration_runs_chapter
    ON narration_runs USING btree (chapter_id);

CREATE INDEX idx_narration_runs_voice
    ON narration_runs USING btree (voice_id);
|