scaffold v0.1: postgres+pgvector inside-container, schema, markdown ingest, CLI

Skald is a generic story-writer. The database is the product; the binary is the tooling. Everything story-specific lives in rows, not in code. cwho's monorepo + binary-per-role pattern transplanted to this domain. What this commit ships: - Cargo workspace (resolver=3, edition 2024): skald-core (lib) + skald (bin) - Migration 0001: stories, characters, canon_facts, chapters, chapter_summaries, passages (vector(1536)), generation_runs, audit_findings, tags. pgvector + pg_trgm extensions. ivfflat index deferred until we have data (import the first ~1k passages, then add the index). - skald-core::ingest — markdown parser for the cwho/coast-down shape: '# Title' → '## Chapter N — date' headings → '# Continuity Bible' section with character roster (real + fictional sub-sections) + setting / mystery / historical / liberty / hook sub-sections. Decomposed into structured rows; original bullet body preserved in key_facts/body fields for fidelity. 6 unit tests cover the shape. - skald-core::db — Postgres connection pool + migration runner. - skald-core::models — row types via sqlx::FromRow. - skald binary — clap CLI: 'serve' (http + migrations) and 'import-markdown' (one-shot ingest). - Dockerfile — multi-stage: rust:1.95-bookworm builder, pgvector/pgvector:pg17 runtime, tini as PID 1, custom entrypoint.sh that boots embedded postgres then execs skald serve. - compose.yml — singleton container, postgres data in volume, story corpus mounted read-only at /seed. Decisions locked 2026-05-13: 1. DB in same container 'till we have a real working tool' (cobb) 2. postgres+pgvector (NOT sqlite) — keeps semantic-search story 3. Network-not-socket connection (postgresql://localhost:5432) from day one so future split is config-only, not code-rewrite Not yet wired: - Web UI - clawdforge calls (gen → cleanup → canon-audit pipeline) - Embedding pass - TTS sidecar
2026-05-13 09:04:28 -07:00 · 2026-05-13 09:04:28 -07:00 · f575ad3722
commit f575ad3722
17 changed files with 4065 additions and 0 deletions
--- a/migrations/0001_init.sql
+++ b/migrations/0001_init.sql
@ -0,0 +1,189 @@
+-- Skald v0.1 schema. Database is the source of truth; the writer is
+-- generic tooling that knows nothing hardcoded about any specific
+-- story. Every story is rows.
+--
+-- pgvector for embedding-based callback search across past prose;
+-- pg_trgm for fuzzy character-name lookups.
+
+CREATE EXTENSION IF NOT EXISTS vector;   -- pgvector: supplies the vector(n) type used by passages.embedding
+CREATE EXTENSION IF NOT EXISTS pg_trgm;  -- supplies gin_trgm_ops, used by the trigram index on characters.name
+
+-- stories: one row per story, sequels included.
+--   * parent_story_id points at the immediate predecessor in a series.
+--   * root_story_id points at the head of that series; it is stored
+--     redundantly so a whole series can be scanned without walking
+--     the parent chain.
+-- Both links are self-references that fall back to NULL when the
+-- referenced story is deleted.
+CREATE TABLE stories (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    title TEXT NOT NULL,
+    -- Lifecycle state; new rows start as 'seed'.
+    status TEXT NOT NULL DEFAULT 'seed' CHECK (
+        status IN (
+            'seed',
+            'draft',
+            'generating',
+            'cleaning',
+            'auditing',
+            'complete',
+            'failed'
+        )
+    ),
+    prompt TEXT,
+    model TEXT,
+    parent_story_id UUID REFERENCES stories (id) ON DELETE SET NULL,
+    root_story_id UUID REFERENCES stories (id) ON DELETE SET NULL,
+    series_name TEXT,
+    word_count_target INTEGER,
+    word_count_actual INTEGER NOT NULL DEFAULT 0,
+    summary TEXT,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+    -- Refreshed on every UPDATE by the stories_updated_at trigger.
+    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Lookup paths: series navigation (parent / root), filtering by
+-- lifecycle state, and by series name. The series_name index is
+-- partial, so rows without a series name are left out of it.
+CREATE INDEX idx_stories_parent
+    ON stories (parent_story_id);
+CREATE INDEX idx_stories_root
+    ON stories (root_story_id);
+CREATE INDEX idx_stories_status
+    ON stories (status);
+CREATE INDEX idx_stories_series
+    ON stories (series_name)
+    WHERE series_name IS NOT NULL;
+
+-- Characters: real (historical) or fictional. The bible blob is
+-- decomposed enough to be searchable but the original prose blob
+-- stays in key_facts for full fidelity. Rows are deleted together
+-- with their story.
+CREATE TABLE characters (
+    id                  UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id            UUID NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
+    name                TEXT NOT NULL,
+    kind                TEXT NOT NULL CHECK (kind IN ('real', 'fictional')),
+    role                TEXT,
+    voice_traits        TEXT,
+    key_facts           TEXT NOT NULL,                 -- original bible prose for this character
+    aliases             TEXT[] NOT NULL DEFAULT '{}',  -- empty array when none are recorded
+    first_seen_chapter  INTEGER,                       -- plain number, not an FK; presumably chapters.n — confirm
+    state_at_latest     TEXT,
+    created_at          TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Trigram GIN index (pg_trgm) backing fuzzy character-name lookups.
+CREATE INDEX idx_characters_name_trgm    ON characters USING gin (name gin_trgm_ops);
+-- Also serves plain story_id lookups: story_id is the leading column,
+-- so a separate btree on (story_id) alone would be redundant and
+-- would only add write and storage cost.
+CREATE INDEX idx_characters_story_kind   ON characters(story_id, kind);
+
+-- canon_facts: bible material that is not a character — setting
+-- details, mystery threads, themes, rules, historical anchors,
+-- fictional liberties, and suggested hooks for sequels. Each fact
+-- belongs to exactly one story and is deleted together with it.
+CREATE TABLE canon_facts (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id UUID NOT NULL REFERENCES stories (id) ON DELETE CASCADE,
+    category TEXT NOT NULL CHECK (
+        category IN (
+            'setting',
+            'event',
+            'rule',
+            'theme',
+            'mystery',
+            'liberty',
+            'hook',
+            'historical_anchor'
+        )
+    ),
+    title TEXT NOT NULL,
+    body TEXT NOT NULL,
+    weight INTEGER NOT NULL DEFAULT 1,
+    -- Plain chapter number, not a foreign key to chapters.
+    source_chapter INTEGER,
+    resolved BOOLEAN NOT NULL DEFAULT false,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+-- Supports lookups by story, optionally narrowed to one category.
+CREATE INDEX idx_canon_facts_story_category
+    ON canon_facts (story_id, category);
+
+-- Chapters: full prose body, stored in DB (markdown). One row per
+-- chapter; UNIQUE(story_id, n) prevents duplicate insertion.
+-- Chapters are deleted together with their story.
+CREATE TABLE chapters (
+    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id        UUID NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
+    n               INTEGER NOT NULL,           -- chapter number within the story
+    title           TEXT,
+    body_md         TEXT NOT NULL,              -- chapter prose as markdown
+    word_count      INTEGER NOT NULL DEFAULT 0,
+    generated_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
+    UNIQUE (story_id, n)
+);
+
+-- No standalone index on story_id: the UNIQUE (story_id, n)
+-- constraint is backed by a btree index whose leading column is
+-- story_id, so per-story lookups are already covered. A second
+-- single-column index would only add write and storage cost.
+
+-- chapter_summaries: at most one short summary per chapter (the
+-- chapter id doubles as the primary key). When assembling context
+-- for a sequel the writer reads these instead of the full chapter
+-- prose, which costs far fewer tokens. Summaries are produced by a
+-- separate LLM pass once the chapter is finished, and are deleted
+-- together with their chapter.
+CREATE TABLE chapter_summaries (
+    chapter_id UUID,
+    body TEXT NOT NULL,
+    generated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+    PRIMARY KEY (chapter_id),
+    FOREIGN KEY (chapter_id) REFERENCES chapters (id) ON DELETE CASCADE
+);
+
+-- Passages: paragraph-level prose with embedding vectors for
+-- similarity search. Embeddings nullable so v0.1 import doesn't
+-- require an embedding pass — we fill them in lazily when we
+-- actually need semantic recall. Passages are deleted together
+-- with their chapter.
+CREATE TABLE passages (
+    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    chapter_id      UUID NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
+    paragraph_n     INTEGER NOT NULL,   -- paragraph position within the chapter
+    body            TEXT NOT NULL,
+    embedding       vector(1536),       -- NULL until an embedding pass fills it in
+    embedded_at     TIMESTAMPTZ,        -- NULL by default; presumably set with embedding — confirm
+    UNIQUE (chapter_id, paragraph_n)
+);
+
+-- No standalone index on chapter_id: the UNIQUE (chapter_id,
+-- paragraph_n) constraint is backed by a btree index whose leading
+-- column is chapter_id, so per-chapter lookups are already covered.
+--
+-- ivfflat index on `embedding` is deferred until we have data —
+-- ivfflat derives its list centroids from the rows present when the
+-- index is built, so an index created on an empty table gives poor
+-- recall. Add after first ~1k passages.
+
+-- generation_runs: a log with one row per LLM call. Kept for cost
+-- tracking, forensics, and "why is this chapter weird?"
+-- investigations. Runs are deleted together with their story.
+CREATE TABLE generation_runs (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id UUID NOT NULL REFERENCES stories (id) ON DELETE CASCADE,
+    kind TEXT NOT NULL CHECK (
+        kind IN ('gen', 'cleanup', 'audit', 'summary', 'embed')
+    ),
+    clawdforge_session_id TEXT,
+    tokens_in INTEGER,
+    tokens_out INTEGER,
+    cost_estimate_cents INTEGER,
+    started_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+    ended_at TIMESTAMPTZ,
+    -- Rows are created as 'running'.
+    status TEXT NOT NULL DEFAULT 'running' CHECK (
+        status IN ('running', 'succeeded', 'failed')
+    ),
+    error TEXT
+);
+
+CREATE INDEX idx_generation_runs_story
+    ON generation_runs (story_id);
+CREATE INDEX idx_generation_runs_kind
+    ON generation_runs (kind);
+
+-- Canon audit findings. Third-Opus reads parent + sequel + bible
+-- and flags any continuity drift, character voice shift, retconned
+-- facts, timeline contradictions. Findings are deleted together
+-- with their story.
+CREATE TABLE audit_findings (
+    id          UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id    UUID NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
+    -- Optional link to a generation_runs row; the finding outlives
+    -- that row (run_id becomes NULL if the run is deleted).
+    run_id      UUID REFERENCES generation_runs(id) ON DELETE SET NULL,
+    severity    TEXT NOT NULL CHECK (severity IN ('info', 'warn', 'crit')),
+    area        TEXT NOT NULL CHECK (area IN (
+                    'character', 'continuity', 'tone',
+                    'fact', 'timeline', 'other'
+                )),
+    body        TEXT NOT NULL,
+    resolved    BOOLEAN NOT NULL DEFAULT false,
+    created_at  TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+
+CREATE INDEX idx_audit_findings_story ON audit_findings(story_id);
+-- Index the run_id foreign key: without it, every delete of a
+-- generation_runs row (including the cascade from deleting a story)
+-- has to scan all of audit_findings to apply ON DELETE SET NULL.
+CREATE INDEX idx_audit_findings_run   ON audit_findings(run_id);
+
+-- Arbitrary user-applied labels. Genre, mood, status filters, etc.
+-- A given name can be applied to a story only once, and tags are
+-- deleted together with their story.
+CREATE TABLE tags (
+    id          UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    story_id    UUID NOT NULL REFERENCES stories(id) ON DELETE CASCADE,
+    name        TEXT NOT NULL,
+    UNIQUE (story_id, name)
+);
+
+-- No standalone index on story_id: the UNIQUE (story_id, name)
+-- constraint is backed by a btree index whose leading column is
+-- story_id, so per-story tag lookups are already covered.
+
+-- Trigger function: overwrites updated_at on the row being written
+-- with now() and hands the row back. It is wired to stories by the
+-- stories_updated_at BEFORE UPDATE trigger, so it runs only when a
+-- stories row itself is updated — writes to child tables do not
+-- touch stories.updated_at.
+CREATE OR REPLACE FUNCTION touch_updated_at()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $touch$
+BEGIN
+    NEW.updated_at := now();
+    RETURN NEW;
+END;
+$touch$;
+
+-- Refresh stories.updated_at on every row-level UPDATE of stories.
+-- The trigger is defined on stories only, so writes to child tables
+-- do not fire it.
+CREATE TRIGGER stories_updated_at
+BEFORE UPDATE ON stories
+FOR EACH ROW EXECUTE FUNCTION touch_updated_at();