skald/migrations/0001_init.sql
Kayos f575ad3722 scaffold v0.1: postgres+pgvector inside-container, schema, markdown ingest, CLI
Skald is a generic story-writer. The database is the product; the
binary is the tooling. Everything story-specific lives in rows, not
in code. cwho's monorepo + binary-per-role pattern transplanted to
this domain.

What this commit ships:
- Cargo workspace (resolver=3, edition 2024): skald-core (lib) +
  skald (bin)
- Migration 0001: stories, characters, canon_facts, chapters,
  chapter_summaries, passages (vector(1536)), generation_runs,
  audit_findings, tags. pgvector + pg_trgm extensions. ivfflat
  index deferred until we have data (import the first ~1k
  passages, then add the index).
- skald-core::ingest — markdown parser for the cwho/coast-down shape:
  '# Title' → '## Chapter N — date' headings → '# Continuity Bible'
  section with character roster (real + fictional sub-sections) +
  setting / mystery / historical / liberty / hook sub-sections.
  Decomposed into structured rows; original bullet body preserved
  in key_facts/body fields for fidelity. 6 unit tests cover the
  shape.
- skald-core::db — Postgres connection pool + migration runner.
- skald-core::models — row types via sqlx::FromRow.
- skald binary — clap CLI: 'serve' (http + migrations) and
  'import-markdown' (one-shot ingest).
- Dockerfile — multi-stage: rust:1.95-bookworm builder, pgvector/
  pgvector:pg17 runtime, tini under PID 1, custom entrypoint.sh
  that boots embedded postgres then execs skald serve.
- compose.yml — singleton container, postgres data in volume,
  story corpus mounted read-only at /seed.

Decisions locked 2026-05-13:
1. DB in same container 'till we have a real working tool' (cobb)
2. postgres+pgvector (NOT sqlite) — keeps semantic-search story
3. Network-not-socket connection (postgresql://localhost:5432) from
   day one so future split is config-only, not code-rewrite

Not yet wired:
- Web UI
- clawdforge calls (gen → cleanup → canon-audit pipeline)
- Embedding pass
- TTS sidecar
2026-05-13 09:04:28 -07:00

189 lines
7.8 KiB
PL/PgSQL

-- Skald v0.1 schema.
--
-- The database is the source of truth. The writer is generic tooling
-- with nothing hardcoded about any particular story, so every story
-- lives entirely in rows.
--
-- Required extensions (both are no-ops if already installed):

-- pgvector: embedding-based callback search across past prose.
CREATE EXTENSION IF NOT EXISTS vector;

-- pg_trgm: fuzzy (trigram) character-name lookups.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- stories: one row per story (or per sequel).
--   parent_story_id  chains a series together, one link per story.
--   root_story_id    is the head of that chain, denormalized so a
--                    whole series can be scanned cheaply.
-- Both self-references fall back to NULL when the referenced story
-- is deleted.
CREATE TABLE stories (
    id                UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    title             TEXT        NOT NULL,
    -- Restricted to the values listed below; new rows start as 'seed'.
    status            TEXT        NOT NULL DEFAULT 'seed'
        CHECK (status IN (
            'seed', 'draft', 'generating', 'cleaning',
            'auditing', 'complete', 'failed'
        )),
    prompt            TEXT,
    model             TEXT,
    parent_story_id   UUID        REFERENCES stories(id) ON DELETE SET NULL,
    root_story_id     UUID        REFERENCES stories(id) ON DELETE SET NULL,
    series_name       TEXT,
    word_count_target INTEGER,
    word_count_actual INTEGER     NOT NULL DEFAULT 0,
    summary           TEXT,
    created_at        TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at        TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Lookups along the series chain and by status.
CREATE INDEX idx_stories_parent ON stories(parent_story_id);
CREATE INDEX idx_stories_root   ON stories(root_story_id);
CREATE INDEX idx_stories_status ON stories(status);

-- Partial index: only stories that actually carry a series name.
CREATE INDEX idx_stories_series ON stories(series_name)
    WHERE series_name IS NOT NULL;
-- characters: the cast of a story, each either real (historical) or
-- fictional. The bible entry is broken out into columns far enough
-- to be searchable, while the original prose blob is kept verbatim
-- in key_facts for full fidelity.
-- Rows are removed together with their story.
CREATE TABLE characters (
    id                 UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id           UUID        NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    name               TEXT        NOT NULL,
    kind               TEXT        NOT NULL CHECK (kind IN ('real', 'fictional')),
    role               TEXT,
    voice_traits       TEXT,
    key_facts          TEXT        NOT NULL,
    -- Defaults to an empty array rather than NULL.
    aliases            TEXT[]      NOT NULL DEFAULT '{}',
    first_seen_chapter INTEGER,
    state_at_latest    TEXT,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- NOTE(review): (story_id) is a prefix of idx_characters_story_kind
-- below, so this index looks redundant; confirm nothing references
-- it by name before dropping.
CREATE INDEX idx_characters_story ON characters(story_id);

-- Trigram GIN index backing fuzzy name lookups (needs pg_trgm).
CREATE INDEX idx_characters_name_trgm
    ON characters USING gin (name gin_trgm_ops);

CREATE INDEX idx_characters_story_kind ON characters(story_id, kind);
-- canon_facts: every bible-shaped entry that is not a character:
-- setting details, mystery threads, themes, rules, historical
-- anchors, fictional liberties, and suggested hooks for sequels.
-- Rows are removed together with their story.
CREATE TABLE canon_facts (
    id             UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id       UUID        NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    -- Restricted to the values listed below.
    category       TEXT        NOT NULL
        CHECK (category IN (
            'setting', 'event', 'rule', 'theme',
            'mystery', 'liberty', 'hook', 'historical_anchor'
        )),
    title          TEXT        NOT NULL,
    body           TEXT        NOT NULL,
    -- NOTE(review): presumably a relative-importance rank; confirm.
    weight         INTEGER     NOT NULL DEFAULT 1,
    source_chapter INTEGER,
    resolved       BOOLEAN     NOT NULL DEFAULT false,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE INDEX idx_canon_facts_story_category
    ON canon_facts(story_id, category);
-- chapters: the full prose of each chapter, stored in the database
-- as markdown. There is one row per chapter, and UNIQUE (story_id, n)
-- rejects inserting the same chapter number twice for a story.
-- Rows are removed together with their story.
CREATE TABLE chapters (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id     UUID        NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    n            INTEGER     NOT NULL,
    title        TEXT,
    body_md      TEXT        NOT NULL,
    word_count   INTEGER     NOT NULL DEFAULT 0,
    generated_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    UNIQUE (story_id, n)
);

-- NOTE(review): the UNIQUE (story_id, n) constraint already creates
-- an index led by story_id, so this one looks redundant; confirm
-- nothing references it by name before dropping.
CREATE INDEX idx_chapters_story ON chapters(story_id);
-- chapter_summaries: a short summary per chapter. When assembling
-- context for a sequel the writer pulls these instead of the full
-- chapter prose, which is much cheaper on tokens. A separate LLM
-- pass produces the summary once the chapter is finished.
--
-- chapter_id is both the primary key and the foreign key, so a
-- chapter has at most one summary, and the summary is deleted along
-- with its chapter.
CREATE TABLE chapter_summaries (
    chapter_id   UUID        PRIMARY KEY
        REFERENCES chapters(id) ON DELETE CASCADE,
    body         TEXT        NOT NULL,
    generated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- passages: paragraph-level prose carrying an embedding vector for
-- similarity search. The embedding is nullable so that the v0.1
-- import does not need an embedding pass; vectors are filled in
-- lazily once semantic recall is actually needed.
-- Rows are removed together with their chapter.
CREATE TABLE passages (
    id          UUID         PRIMARY KEY DEFAULT gen_random_uuid(),
    chapter_id  UUID         NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    paragraph_n INTEGER      NOT NULL,
    body        TEXT         NOT NULL,
    -- NULL until the passage has been embedded.
    embedding   vector(1536),
    embedded_at TIMESTAMPTZ,

    UNIQUE (chapter_id, paragraph_n)
);

-- NOTE(review): the UNIQUE (chapter_id, paragraph_n) constraint
-- already creates an index led by chapter_id, so this one looks
-- redundant; confirm nothing references it by name before dropping.
CREATE INDEX idx_passages_chapter ON passages(chapter_id);

-- The ivfflat index on `embedding` is deliberately not created here.
-- ivfflat derives its list centroids from the rows present when the
-- index is built, so building it on an empty table gives poor
-- recall. Add it after the first ~1k passages are loaded.
-- generation_runs: a log row for every LLM call we make. Used for
-- cost tracking, forensics, and "why is this chapter weird?"
-- investigations.
-- Rows are removed together with their story.
CREATE TABLE generation_runs (
    id                    UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id              UUID        NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    -- Which pipeline step made the call; restricted to the values below.
    kind                  TEXT        NOT NULL
        CHECK (kind IN (
            'gen', 'cleanup', 'audit',
            'summary', 'embed'
        )),
    clawdforge_session_id TEXT,
    tokens_in             INTEGER,
    tokens_out            INTEGER,
    -- Whole cents, per the column name.
    cost_estimate_cents   INTEGER,
    started_at            TIMESTAMPTZ NOT NULL DEFAULT now(),
    ended_at              TIMESTAMPTZ,
    -- A new row starts out as 'running'.
    status                TEXT        NOT NULL DEFAULT 'running'
        CHECK (status IN ('running', 'succeeded', 'failed')),
    error                 TEXT
);

CREATE INDEX idx_generation_runs_story ON generation_runs(story_id);
CREATE INDEX idx_generation_runs_kind  ON generation_runs(kind);
-- audit_findings: results of the canon audit. Third-Opus reads
-- parent + sequel + bible and flags any continuity drift, character
-- voice shift, retconned facts, or timeline contradictions.
--
-- Findings are removed together with their story. If the generation
-- run that produced a finding is deleted, the finding is kept and
-- its run_id is set to NULL.
CREATE TABLE audit_findings (
    id         UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id   UUID        NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    run_id     UUID        REFERENCES generation_runs(id) ON DELETE SET NULL,
    severity   TEXT        NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),
    -- Restricted to the values listed below.
    area       TEXT        NOT NULL
        CHECK (area IN (
            'character', 'continuity', 'tone',
            'fact', 'timeline', 'other'
        )),
    body       TEXT        NOT NULL,
    resolved   BOOLEAN     NOT NULL DEFAULT false,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE INDEX idx_audit_findings_story ON audit_findings(story_id);

-- Index the referencing side of the run_id foreign key. Without it,
-- every delete from generation_runs (including cascades from deleting
-- a story) has to scan audit_findings to apply ON DELETE SET NULL.
CREATE INDEX idx_audit_findings_run ON audit_findings(run_id);
-- tags: arbitrary user-applied labels on a story (genre, mood,
-- status filters, and so on). A given name can be attached to a
-- story only once, and tags are removed together with their story.
CREATE TABLE tags (
    id       UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    story_id UUID NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
    name     TEXT NOT NULL,

    UNIQUE (story_id, name)
);

-- NOTE(review): the UNIQUE (story_id, name) constraint already
-- creates an index led by story_id, so this one looks redundant;
-- confirm nothing references it by name before dropping.
CREATE INDEX idx_tags_story ON tags(story_id);
-- Keep stories.updated_at current. The row-level BEFORE UPDATE
-- trigger below overwrites updated_at with now() on every UPDATE of
-- a stories row. It is attached to stories only, so writes to child
-- tables (chapters, characters, ...) do not touch the parent story's
-- updated_at.
CREATE OR REPLACE FUNCTION touch_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $touch$
BEGIN
    -- Stamp the row being written, then let the UPDATE proceed with it.
    NEW.updated_at := now();
    RETURN NEW;
END;
$touch$;

CREATE TRIGGER stories_updated_at
    BEFORE UPDATE ON stories
    FOR EACH ROW
    EXECUTE FUNCTION touch_updated_at();