From 465c94b745b367edb98aa3b4c12b10102fc67530 Mon Sep 17 00:00:00 2001 From: Kayos Date: Wed, 13 May 2026 10:07:32 -0700 Subject: [PATCH] schema: voices + pronunciation_overrides + narration_runs (v0.2 prep) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TTS layer landed as schema-only — synthesis pipeline ships in v0.2. Putting the tables in v0.1 means imports already carry the right shape; we won't need a 'migrate every existing story' pass later. Decisions locked 2026-05-13: - Engine: F5-TTS (best 8GB FOSS option, mid-2026 SOTA) - Default voice source: LJ Speech (Linda Johnson, PD released specifically for TTS training — airtight for sharing/uploading generated audio. The 'AI-consent-released' license posture is the difference between 'should be fine' and 'definitely fine.') - Variety voices: Hi-Fi TTS speaker IDs (Apache 2.0, same consent shape). LibriVox is optional but never default. - Pronunciation overrides DB layer (story-scoped + global) to fix proper-noun mispronunciation — the actual TTS-quality gap on Cobb's bar of 'must not wake me up.' Pre-pass with Opus extracts proper nouns + IPA, operator verifies, table caches forever. Tables: - voices — name, license, reference_path/text, sample_rate, default flag - pronunciation_overrides — story-scoped or global, IPA/arpabet - narration_runs — TTS audit trail mirroring generation_runs - stories.preferred_voice_id FK Unique constraints: - one default voice (partial index) - one row per (story, word) override - one global row per word --- migrations/0002_voices_and_pronunciation.sql | 109 +++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 migrations/0002_voices_and_pronunciation.sql diff --git a/migrations/0002_voices_and_pronunciation.sql b/migrations/0002_voices_and_pronunciation.sql new file mode 100644 index 0000000..305cb29 --- /dev/null +++ b/migrations/0002_voices_and_pronunciation.sql @@ -0,0 +1,109 @@ +-- TTS layer schema. 
+-- Synthesis pipeline lands in v0.2; the schema ships now so that
+-- v0.1 imports already carry the right shape.
+--
+-- Voice sourcing policy: seed voices come only from corpora released
+-- with AI-training consent (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R),
+-- so generated audio stays airtight to share — even if a story
+-- eventually lands on a public site. LibriVox narrators are never
+-- seed defaults; an operator who adds one accepts the "volunteered
+-- for public-domain reading, didn't sign up for AI cloning" caveat.
+-- This is a convention: no constraint in this table enforces it.
+
+CREATE TABLE voices (
+    id               UUID        DEFAULT gen_random_uuid(),
+    -- Machine-facing handle; uniqueness is declared at the bottom of
+    -- the column list.
+    name             TEXT        NOT NULL,
+    display_name     TEXT        NOT NULL,
+    gender           TEXT,
+    accent           TEXT,
+    -- Where the voice came from, e.g. "lj_speech", "hifi_tts:92",
+    -- "vctk:p225", "librivox:phil-chenevert", "self".
+    source           TEXT        NOT NULL,
+    -- License posture. Documented values:
+    --   "PD-ai-consent"  corpora released specifically for TTS
+    --                    training (LJ Speech, Hi-Fi TTS, VCTK,
+    --                    LibriTTS-R, etc)
+    --   "PD-non-ai"      raw LibriVox, weaker consent
+    --   "self"           the operator's own recording
+    --   "licensed"       paid voice with explicit cloning rights
+    -- Stored as free text; no CHECK validates the value.
+    license          TEXT        NOT NULL,
+    -- Path, inside the container, of a 10-15s reference WAV. Left
+    -- nullable so a row can exist as metadata only until the
+    -- operator places the audio.
+    reference_path   TEXT,
+    -- Transcript of the audio at `reference_path`. F5-TTS needs the
+    -- audio and its text together in order to clone.
+    reference_text   TEXT,
+    sample_rate_hz   INTEGER,
+    duration_seconds REAL,
+    notes            TEXT,
+    is_default       BOOLEAN     NOT NULL DEFAULT FALSE,
+    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
+    PRIMARY KEY (id),
+    UNIQUE (name)
+);
+
+-- At most one default voice. The index is partial: only rows with
+-- is_default = TRUE are indexed, and they all carry the same key, so
+-- a second default row violates uniqueness.
+CREATE UNIQUE INDEX idx_voices_one_default
+    ON voices ((is_default))
+    WHERE is_default = TRUE;
+
+-- Per-story (and optionally global) pronunciation overrides. The
+-- TTS pipeline pre-processes every chapter text through this map
+-- before handing it to F5. Solves the proper-noun problem
+-- (Pripyat, Dyatlov, Toptunov, etc) without retraining.
+CREATE TABLE pronunciation_overrides (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    -- NULL story_id = global default across all stories.
+    -- Story-scoped rows are removed together with their story.
+    story_id UUID REFERENCES stories(id) ON DELETE CASCADE,
+    word TEXT NOT NULL,
+    -- The phonemes string in `phoneme_format` notation. e.g. IPA
+    -- /ˈpɹɪpʲɪt/ or arpabet "P R IH1 P Y AH0 T".
+    phonemes TEXT NOT NULL,
+    -- "ipa" | "arpabet" | "espeak" | "raw" — depends on what F5
+    -- accepts in the rendering pipeline. Deliberately not
+    -- CHECK-constrained here; the value set is free text.
+    phoneme_format TEXT NOT NULL DEFAULT 'ipa',
+    notes TEXT,
+    -- "auto" = LLM-generated (Opus extracted proper nouns + IPA).
+    -- "manual" = hand-edited.
+    source TEXT NOT NULL DEFAULT 'manual'
+        CHECK (source IN ('auto', 'manual')),
+    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Uniqueness is split across two partial indexes because a unique
+-- index treats NULLs as distinct by default: a single
+-- UNIQUE (story_id, word) would allow any number of global
+-- (story_id IS NULL) rows for the same word.
+
+-- One row per (story, word) pair.
+CREATE UNIQUE INDEX idx_pron_overrides_story_word
+    ON pronunciation_overrides(story_id, word)
+    WHERE story_id IS NOT NULL;
+
+-- One global row per word (NULL story_id).
+CREATE UNIQUE INDEX idx_pron_overrides_global_word
+    ON pronunciation_overrides(word)
+    WHERE story_id IS NULL;
+
+-- Non-partial index on story_id; unlike the two indexes above it
+-- covers story-scoped and global rows alike.
+-- NOTE(review): story_id = ? lookups can probably be served by
+-- idx_pron_overrides_story_word already — confirm with EXPLAIN
+-- before treating this index as redundant.
+CREATE INDEX idx_pron_overrides_story ON pronunciation_overrides(story_id);
+
+-- Stories can declare a preferred voice. NULL → fall back to the
+-- system-default voice (voices.is_default = true). Deleting a voice
+-- resets the preference to NULL instead of touching the story.
+ALTER TABLE stories
+    ADD COLUMN preferred_voice_id UUID REFERENCES voices(id) ON DELETE SET NULL;
+
+CREATE INDEX idx_stories_preferred_voice ON stories(preferred_voice_id);
+
+-- A narration_runs table mirroring generation_runs but for TTS
+-- jobs. Each chapter render is a row. Keeps narration audit trail
+-- separate from text-generation audit trail.
+CREATE TABLE narration_runs (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    -- Runs are removed together with their chapter.
+    chapter_id UUID NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
+    -- RESTRICT, not CASCADE: deleting a voice must not silently erase
+    -- its rows from the audit trail (and orphan the files named in
+    -- output_path). A voice with recorded runs can only be deleted
+    -- after those runs have been removed explicitly.
+    voice_id UUID NOT NULL REFERENCES voices(id) ON DELETE RESTRICT,
+    -- "f5-tts" | "xtts-v2" | future engines.
+    engine TEXT NOT NULL,
+    engine_version TEXT,
+    -- Output WAV/MP3 on disk. Nullable until the run completes.
+    output_path TEXT,
+    duration_seconds REAL,
+    seed BIGINT,
+    started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+    -- NULL while the run is still in progress.
+    ended_at TIMESTAMPTZ,
+    status TEXT NOT NULL DEFAULT 'running'
+        CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled')),
+    error TEXT
+);
+
+-- Lookup indexes on both foreign keys; the voice_id index also keeps
+-- the referential check on DELETE FROM voices from scanning the table.
+CREATE INDEX idx_narration_runs_chapter ON narration_runs(chapter_id);
+CREATE INDEX idx_narration_runs_voice ON narration_runs(voice_id);