-- skald/migrations/0002_voices_and_pronunciation.sql

-- TTS layer schema. Synthesis pipeline lands in v0.2; the schema
-- ships now so v0.1 imports already carry the right shape.
--
-- Voice sources are restricted to AI-consent-released training
-- corpora (LJ Speech, Hi-Fi TTS, VCTK, LibriTTS-R) so generated
-- audio is airtight to share — even if a story eventually lands
-- on a public site. LibriVox narrators are not seed defaults; if
-- the operator adds one they accept the "volunteered for
-- public-domain reading, didn't sign up for AI cloning" caveat.

-- Catalogue of voices available to the TTS layer, one row per voice.
-- `name` must be unique across the table.
CREATE TABLE voices (
    id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name              TEXT NOT NULL UNIQUE,
    display_name      TEXT NOT NULL,
    gender            TEXT,
    accent            TEXT,
    -- Source identifier — e.g. "lj_speech", "hifi_tts:92",
    -- "vctk:p225", "librivox:phil-chenevert", "self".
    source            TEXT NOT NULL,
    -- License posture. Documented values:
    --   "PD-ai-consent" — corpora released specifically for TTS
    --                     training (LJ Speech, Hi-Fi TTS, VCTK,
    --                     LibriTTS-R, etc).
    --   "PD-non-ai"     — raw LibriVox, weaker consent.
    --   "self"          — operator's own recording.
    --   "licensed"      — paid voice with explicit cloning rights.
    -- The set is documentation only; no CHECK enforces it.
    license           TEXT NOT NULL,
    -- On-disk path inside the container to a 10-15s reference WAV.
    -- Nullable: a row can be metadata-only until the operator
    -- places the audio.
    reference_path    TEXT,
    -- Transcript of `reference_path`. F5-TTS needs both the audio
    -- and the text to clone.
    reference_text    TEXT,
    -- NULL means "not recorded"; a recorded value must be positive.
    -- (A CHECK passes on NULL, so metadata-only rows stay valid.)
    -- Presumably describes the reference clip — confirm with the importer.
    sample_rate_hz    INTEGER
                      CONSTRAINT voices_sample_rate_hz_check
                      CHECK (sample_rate_hz > 0),
    -- NULL means "not recorded"; a recorded value must be positive.
    duration_seconds  REAL
                      CONSTRAINT voices_duration_seconds_check
                      CHECK (duration_seconds > 0),
    notes             TEXT,
    -- Marks the system-default voice.
    is_default        BOOLEAN NOT NULL DEFAULT false,
    created_at        TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Partial unique index: only rows with is_default = true are indexed,
-- so a second default voice is rejected while any number of
-- non-default rows may coexist.
CREATE UNIQUE INDEX idx_voices_one_default ON voices (is_default)
WHERE is_default = TRUE;

-- Pronunciation overrides, scoped to a single story or (optionally)
-- global. The TTS pipeline pre-processes every chapter text through
-- this map before handing it to F5, which solves the proper-noun
-- problem (Pripyat, Dyatlov, Toptunov, etc) without retraining.
CREATE TABLE pronunciation_overrides (
    id              UUID DEFAULT gen_random_uuid(),
    -- Owning story. NULL marks a global default that applies across
    -- all stories.
    story_id        UUID,
    word            TEXT NOT NULL,
    -- Phoneme string written in the notation named by
    -- `phoneme_format`, e.g. IPA /ˈpɹɪpʲɪt/ or arpabet
    -- "P R IH1 P Y AH0 T".
    phonemes        TEXT NOT NULL,
    -- "ipa" | "arpabet" | "espeak" | "raw" — depends on what F5
    -- accepts in the rendering pipeline.
    phoneme_format  TEXT NOT NULL DEFAULT 'ipa',
    notes           TEXT,
    -- Provenance of the row:
    --   "auto"   = LLM-generated (Opus extracted proper nouns + IPA).
    --   "manual" = hand-edited.
    source          TEXT NOT NULL DEFAULT 'manual',
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),

    PRIMARY KEY (id),
    -- Overrides are deleted together with their story.
    FOREIGN KEY (story_id) REFERENCES stories (id) ON DELETE CASCADE,
    CHECK (source IN ('auto', 'manual'))
);

-- Uniqueness is enforced by two partial indexes rather than a single
-- one on (story_id, word): a plain unique index treats NULL story_ids
-- as distinct from each other, so duplicate global rows would slip
-- through.

-- Story-scoped rows: each word at most once per story.
CREATE UNIQUE INDEX idx_pron_overrides_story_word
ON pronunciation_overrides (story_id, word)
WHERE story_id IS NOT NULL;

-- Global rows (story_id IS NULL): each word at most once.
CREATE UNIQUE INDEX idx_pron_overrides_global_word
ON pronunciation_overrides (word)
WHERE story_id IS NULL;

-- Plain (non-unique) index on story_id.
CREATE INDEX idx_pron_overrides_story
ON pronunciation_overrides (story_id);

-- Optional per-story voice. A NULL value means the story falls back
-- to the system-default voice (voices.is_default = true). Deleting
-- the referenced voice resets the column to NULL; the story row is
-- kept.
ALTER TABLE stories
    ADD COLUMN preferred_voice_id UUID,
    ADD FOREIGN KEY (preferred_voice_id)
        REFERENCES voices (id)
        ON DELETE SET NULL;

-- Index on the new foreign-key column.
CREATE INDEX idx_stories_preferred_voice
    ON stories (preferred_voice_id);

-- TTS counterpart of generation_runs: each chapter render is a row,
-- which keeps the narration audit trail separate from the
-- text-generation audit trail.
--
-- Both foreign keys are ON DELETE CASCADE, so rows disappear together
-- with their chapter or their voice.
-- NOTE(review): cascading on voice deletion also erases the audit
-- rows for that voice — confirm this is intended.
CREATE TABLE narration_runs (
    id                  UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    chapter_id          UUID NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    voice_id            UUID NOT NULL REFERENCES voices(id) ON DELETE CASCADE,
    -- "f5-tts" | "xtts-v2" | future engines.
    engine              TEXT NOT NULL,
    engine_version      TEXT,
    -- Output WAV/MP3 on disk. Nullable until the run completes.
    output_path         TEXT,
    -- NULL means "not recorded"; a recorded value can never be
    -- negative. (A CHECK passes on NULL, so in-flight rows stay valid.)
    duration_seconds    REAL
                        CONSTRAINT narration_runs_duration_seconds_check
                        CHECK (duration_seconds >= 0),
    seed                BIGINT,
    started_at          TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- NULL until a value is written; rows start in status 'running'.
    ended_at            TIMESTAMPTZ,
    status              TEXT NOT NULL DEFAULT 'running'
                        CHECK (status IN ('running', 'succeeded', 'failed', 'rerolled')),
    error               TEXT
);

-- Plain indexes on the two foreign-key columns of narration_runs.
CREATE INDEX idx_narration_runs_chapter
    ON narration_runs (chapter_id);
CREATE INDEX idx_narration_runs_voice
    ON narration_runs (voice_id);