From 465c94b745b367edb98aa3b4c12b10102fc67530 Mon Sep 17 00:00:00 2001
From: Kayos <kayos@sulkta.com>
Date: Wed, 13 May 2026 10:07:32 -0700
Subject: [PATCH] schema: voices + pronunciation_overrides + narration_runs
 (v0.2 prep)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TTS layer landed as schema-only — synthesis pipeline ships in v0.2.
Putting the tables in v0.1 means imports already carry the right
shape; we won't need a 'migrate every existing story' pass later.

Decisions locked 2026-05-13:
- Engine: F5-TTS (best 8GB FOSS option, mid-2026 SOTA)
- Default voice source: LJ Speech (Linda Johnson, PD released
  specifically for TTS training — airtight for sharing/uploading
  generated audio. The 'AI-consent-released' license posture is
  the difference between 'should be fine' and 'definitely fine.')
- Variety voices: Hi-Fi TTS speaker IDs (Apache 2.0, same consent
  shape). LibriVox is optional but never default.
- Pronunciation overrides DB layer (story-scoped + global) to fix
  proper-noun mispronunciation — the actual TTS-quality gap on
  Cobb's bar of 'must not wake me up.' Pre-pass with Opus extracts
  proper nouns + IPA, operator verifies, table caches forever.

Tables:
- voices — name, license, reference_path/text, sample_rate, default flag
- pronunciation_overrides — story-scoped or global, IPA/arpabet
- narration_runs — TTS audit trail mirroring generation_runs
- stories.preferred_voice_id FK

Unique constraints:
- one default voice (partial index)
- one row per (story, word) override
- one global row per word
---
 migrations/0002_voices_and_pronunciation.sql | 109 +++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 migrations/0002_voices_and_pronunciation.sql

diff --git a/migrations/0002_voices_and_pronunciation.sql b/migrations/0002_voices_and_pronunciation.sql
new file mode 100644
index 0000000..305cb29
--- /dev/null
+++ b/migrations/0002_voices_and_pronunciation.sql
@@ -0,0 +1,109 @@
+-- TTS layer schema. Synthesis pipeline lands in v0.2; the schema
+-- ships now so v0.1 imports already carry the right shape.
+--
+-- Voice sources are restricted to AI-consent-released training
+-- corpora (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R) so generated
+-- audio is airtight to share — even if a story eventually lands
+-- on a public site. LibriVox narrators are not seed defaults; if
+-- the operator adds one they accept the "volunteered for
+-- public-domain reading, didn't sign up for AI cloning" caveat.
+
+CREATE TABLE voices (
+    id UUID CONSTRAINT voices_pkey PRIMARY KEY DEFAULT gen_random_uuid(),
+    name TEXT NOT NULL CONSTRAINT voices_name_key UNIQUE,
+    display_name TEXT NOT NULL,
+    gender TEXT,
+    accent TEXT,
+    -- Provenance tag for the recording, for example "lj_speech",
+    -- "hifi_tts:92", "vctk:p225", "librivox:phil-chenevert" or "self".
+    source TEXT NOT NULL,
+    -- Consent class. "PD-ai-consent": corpus released specifically for
+    -- TTS training (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R, etc).
+    -- "PD-non-ai": raw LibriVox, weaker consent. "self": the operator's
+    -- own recording. "licensed": paid voice with explicit cloning rights.
+    -- NOTE(review): Hi-Fi TTS/VCTK/LibriTTS-R look like CC BY 4.0 — confirm.
+    license TEXT NOT NULL,
+    -- Container-local path to a 10-15s reference WAV. NULL is allowed so
+    -- a row can be metadata-only until the operator places the audio.
+    reference_path TEXT,
+    -- Transcript of the audio at `reference_path`; F5-TTS needs both the
+    -- audio and the text to clone.
+    reference_text TEXT,
+    sample_rate_hz INTEGER,
+    duration_seconds REAL,
+    notes TEXT,
+    -- idx_voices_one_default allows at most one row with this flag set.
+    is_default BOOLEAN NOT NULL DEFAULT FALSE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Partial unique index: only rows flagged is_default are indexed, so at
+-- most one voice can be the default while any number stay non-default.
+CREATE UNIQUE INDEX idx_voices_one_default
+    ON voices (is_default) WHERE is_default = TRUE;
+
+-- Per-story (or global) pronunciation overrides. The TTS pipeline runs
+-- every chapter text through this map before handing it to F5, fixing
+-- proper nouns (Pripyat, Dyatlov, Toptunov, etc) without retraining.
+CREATE TABLE pronunciation_overrides (
+    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    -- NULL story_id = global default across all stories.
+    story_id        UUID REFERENCES stories(id) ON DELETE CASCADE,
+    word            TEXT NOT NULL,
+    -- Phoneme string written in `phoneme_format` notation, e.g. IPA
+    -- /ˈpɹɪpʲɪt/ or arpabet "P R IH1 P Y AH0 T".
+    phonemes        TEXT NOT NULL,
+    -- Notation of `phonemes`; which one to use depends on what F5 accepts.
+    -- CHECKed like `source` so a typo such as 'IPA' fails at write time.
+    phoneme_format  TEXT NOT NULL DEFAULT 'ipa'
+                    CHECK (phoneme_format IN ('ipa', 'arpabet', 'espeak', 'raw')),
+    notes           TEXT,
+    -- "auto" = LLM-generated (Opus extracted proper nouns + IPA).
+    -- "manual" = hand-edited.
+    source          TEXT NOT NULL DEFAULT 'manual'
+                    CHECK (source IN ('auto', 'manual')),
+    created_at      TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- A story may override a given word at most once.
+CREATE UNIQUE INDEX idx_pron_overrides_story_word
+    ON pronunciation_overrides (story_id, word) WHERE story_id IS NOT NULL;
+
+-- Global rows (story_id IS NULL) need their own partial index: NULLs are
+-- distinct to a plain UNIQUE (story_id, word), so it would not dedupe them.
+CREATE UNIQUE INDEX idx_pron_overrides_global_word
+    ON pronunciation_overrides (word) WHERE story_id IS NULL;
+
+-- By-story lookup. NOTE(review): may duplicate idx_pron_overrides_story_word.
+CREATE INDEX idx_pron_overrides_story ON pronunciation_overrides (story_id);
+
+-- Optional per-story voice; NULL means "use the voices.is_default row".
+-- ON DELETE SET NULL: removing a voice resets the story to that fallback.
+ALTER TABLE stories ADD COLUMN preferred_voice_id UUID
+    REFERENCES voices (id) ON DELETE SET NULL;
+
+CREATE INDEX idx_stories_preferred_voice ON stories (preferred_voice_id);
+
+-- TTS audit trail, one row per chapter render. Mirrors generation_runs
+-- but is kept separate from the text-generation audit trail.
+CREATE TABLE narration_runs (
+    id                  UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    chapter_id          UUID NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
+    -- RESTRICT: deleting a voice must not silently erase its audit rows.
+    voice_id            UUID NOT NULL REFERENCES voices(id) ON DELETE RESTRICT,
+    -- "f5-tts" | "xtts-v2" | future engines.
+    engine              TEXT NOT NULL,
+    engine_version      TEXT,
+    -- Output WAV/MP3 on disk. Nullable until the run completes.
+    output_path         TEXT,
+    duration_seconds    REAL,
+    seed                BIGINT,
+    started_at          TIMESTAMPTZ NOT NULL DEFAULT now(),
+    ended_at            TIMESTAMPTZ,
+    status              TEXT NOT NULL DEFAULT 'running'
+                        CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled')),
+    error               TEXT
+);
+
+CREATE INDEX idx_narration_runs_chapter ON narration_runs(chapter_id);
+CREATE INDEX idx_narration_runs_voice ON narration_runs(voice_id);