scaffold v0.1: postgres+pgvector inside-container, schema, markdown ingest, CLI

Skald is a generic story-writer. The database is the product; the binary is the tooling. Everything story-specific lives in rows, not in code. cwho's monorepo + binary-per-role pattern transplanted to this domain.

What this commit ships:
- Cargo workspace (resolver=3, edition 2024): skald-core (lib) + skald (bin)
- Migration 0001: stories, characters, canon_facts, chapters, chapter_summaries, passages (vector(1536)), generation_runs, audit_findings, tags. pgvector + pg_trgm extensions. ivfflat index deferred until we have data (import the first ~1k passages, then add the index).
- skald-core::ingest — markdown parser for the cwho/coast-down shape: '# Title' → '## Chapter N — date' headings → '# Continuity Bible' section with character roster (real + fictional sub-sections) + setting / mystery / historical / liberty / hook sub-sections. Decomposed into structured rows; original bullet body preserved in key_facts/body fields for fidelity. 6 unit tests cover the shape.
- skald-core::db — Postgres connection pool + migration runner.
- skald-core::models — row types via sqlx::FromRow.
- skald binary — clap CLI: 'serve' (http + migrations) and 'import-markdown' (one-shot ingest).
- Dockerfile — multi-stage: rust:1.95-bookworm builder, pgvector/pgvector:pg17 runtime, tini as PID 1, custom entrypoint.sh that boots embedded postgres then execs skald serve.
- compose.yml — singleton container, postgres data in volume, story corpus mounted read-only at /seed.

Decisions locked 2026-05-13:
1. DB in same container 'till we have a real working tool' (cobb)
2. postgres+pgvector (NOT sqlite) — keeps the semantic-search story
3. Network-not-socket connection (postgresql://localhost:5432) from day one so a future split is config-only, not a code rewrite

Not yet wired:
- Web UI
- clawdforge calls (gen → cleanup → canon-audit pipeline)
- Embedding pass
- TTS sidecar
2026-05-13 09:04:28 -07:00 · 2026-05-13 09:04:28 -07:00 · f575ad3722
commit f575ad3722
17 changed files with 4065 additions and 0 deletions
--- a/skald-core/Cargo.toml
+++ b/skald-core/Cargo.toml
@ -0,0 +1,23 @@
+[package]
+name = "skald-core"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+authors.workspace = true
+repository.workspace = true
+description = "Skald's shared lib: db models, schema migrations, markdown ingest, context assembly."
+
+[dependencies]
+tokio = { workspace = true }
+sqlx = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+anyhow = { workspace = true }
+thiserror = { workspace = true }
+tracing = { workspace = true }
+chrono = { workspace = true }
+uuid = { workspace = true }
+regex = { workspace = true }
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
--- a/skald-core/src/db.rs
+++ b/skald-core/src/db.rs
@ -0,0 +1,22 @@
+//! Postgres connection pool helper.
+
+use sqlx::postgres::{PgConnectOptions, PgPoolOptions};
+use sqlx::{ConnectOptions, PgPool};
+use std::str::FromStr;
+use std::time::Duration;
+
+/// Open a Postgres pool for `url` and bring the schema up to date.
+///
+/// The pool is capped at 10 connections with a 10-second acquire
+/// timeout. [`crate::MIGRATOR`] is run against the pool before it is
+/// handed back to the caller.
+///
+/// # Errors
+///
+/// Fails if `url` cannot be parsed into connect options, if the pool
+/// cannot connect, or if running the migrations fails.
+pub async fn connect_and_migrate(url: &str) -> anyhow::Result<PgPool> {
+    // sqlx logs every query at INFO by default, which drowns production
+    // logs; demote statement logging to debug.
+    let connect_opts =
+        PgConnectOptions::from_str(url)?.log_statements(tracing::log::LevelFilter::Debug);
+
+    let pool_opts = PgPoolOptions::new()
+        .max_connections(10)
+        .acquire_timeout(Duration::from_secs(10));
+    let pool = pool_opts.connect_with(connect_opts).await?;
+
+    crate::MIGRATOR.run(&pool).await?;
+    Ok(pool)
+}
--- a/skald-core/src/ingest.rs
+++ b/skald-core/src/ingest.rs
@ -0,0 +1,510 @@
+//! Parse a long-form story markdown file into the rows we'll store
+//! in the database. The parser knows the shape we generated in the
+//! 2026-05-13 Coast-Down side-quest (chapters as `## Chapter N — date`,
+//! then a `# Continuity Bible` section with structured subsections),
+//! but isn't story-specific — any markdown that follows that shape
+//! parses cleanly. A document with no `# ` title or no `## ` chapters
+//! fails loudly so the operator can adjust the doc, not the code;
+//! bible sections with an unrecognized heading are skipped silently.
+
+use anyhow::{Context, bail};
+use regex::Regex;
+use sqlx::PgPool;
+use std::path::Path;
+use std::sync::OnceLock;
+use uuid::Uuid;
+
+/// What we extract from a story markdown file before touching the
+/// database.
+#[derive(Debug, Clone)]
+pub struct ParsedStory {
+    /// Text of the first non-blank `# ` heading found before the bible.
+    pub title: String,
+    /// Chapters in document order; never empty for a successful parse.
+    pub chapters: Vec<ParsedChapter>,
+    /// Entries collected from the bible's character roster section.
+    pub characters: Vec<ParsedCharacter>,
+    /// One entry per recognized, non-empty bible section.
+    pub canon_facts: Vec<ParsedFact>,
+}
+
+/// One `## ` section from the part of the document before the bible.
+#[derive(Debug, Clone)]
+pub struct ParsedChapter {
+    /// 1-based position among the chapters that were kept (sections
+    /// with an empty body are skipped and do not consume a number).
+    pub n: i32,
+    /// Heading text after `## `, trimmed; `None` when it is empty.
+    pub title: Option<String>,
+    /// Everything between this heading and the next, trimmed.
+    pub body: String,
+    /// `body` split into paragraphs (blank lines and `---` / `***`
+    /// rules act as separators).
+    pub paragraphs: Vec<String>,
+}
+
+/// One bullet from the bible's character roster.
+#[derive(Debug, Clone)]
+pub struct ParsedCharacter {
+    /// The `**bold**` name that opens the bullet, trimmed.
+    pub name: String,
+    /// Taken from the `### ` sub-section the bullet appears under.
+    pub kind: CharacterKind,
+    /// Rest of the bullet plus its continuation lines, joined with
+    /// single spaces.
+    pub key_facts: String,
+}
+
+/// Whether a roster entry was listed under the "real" or the
+/// "fictional" sub-section of the character roster.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CharacterKind {
+    Real,
+    Fictional,
+}
+
+impl CharacterKind {
+    /// Lowercase label for this kind; this is the value the importer
+    /// binds into the `characters.kind` column.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            CharacterKind::Real => "real",
+            CharacterKind::Fictional => "fictional",
+        }
+    }
+}
+
+/// One recognized bible section, kept whole as a single canon fact.
+#[derive(Debug, Clone)]
+pub struct ParsedFact {
+    /// Category derived from the section heading.
+    pub category: FactCategory,
+    /// The section heading as written (text after `## `, trimmed).
+    pub title: String,
+    /// The section's body, trimmed but otherwise unmodified.
+    pub body: String,
+}
+
+/// Category of a canon fact.
+///
+/// The markdown parser in this module only ever produces `Setting`,
+/// `Mystery`, `HistoricalAnchor`, `Liberty` and `Hook`; the remaining
+/// variants exist for facts created by other means.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum FactCategory {
+    Setting,
+    Mystery,
+    HistoricalAnchor,
+    Liberty,
+    Hook,
+    Event,
+    Rule,
+    Theme,
+}
+
+impl FactCategory {
+    /// Snake-case label for this category; this is the value the
+    /// importer binds into the `canon_facts.category` column.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            FactCategory::Setting => "setting",
+            FactCategory::Mystery => "mystery",
+            FactCategory::HistoricalAnchor => "historical_anchor",
+            FactCategory::Liberty => "liberty",
+            FactCategory::Hook => "hook",
+            FactCategory::Event => "event",
+            FactCategory::Rule => "rule",
+            FactCategory::Theme => "theme",
+        }
+    }
+}
+
+/// Read the markdown file at `path` and hand its contents to
+/// [`parse_story`].
+///
+/// # Errors
+///
+/// Returns the read failure wrapped in a `read <path>` context, or
+/// whatever [`parse_story`] reports for the file's contents.
+pub fn parse_story_file(path: &Path) -> anyhow::Result<ParsedStory> {
+    let contents = std::fs::read_to_string(path);
+    let contents = contents.with_context(|| format!("read {}", path.display()))?;
+    parse_story(&contents)
+}
+
+/// Parse a story markdown string. See module-level docs for the
+/// shape it expects.
+///
+/// Everything before the first line that starts with
+/// `# Continuity Bible` supplies the title and chapters; everything
+/// after it is parsed as the bible. A document without that heading
+/// simply yields no characters and no canon facts.
+///
+/// # Errors
+///
+/// Fails when no `# ` title heading is found, or when no chapter with
+/// a non-empty body is found.
+pub fn parse_story(raw: &str) -> anyhow::Result<ParsedStory> {
+    let (pre_bible, bible_body) = raw
+        .split_once("\n# Continuity Bible")
+        .unwrap_or((raw, ""));
+
+    let title = extract_title(pre_bible).context("no title heading found")?;
+
+    let chapters = parse_chapters(pre_bible);
+    if chapters.is_empty() {
+        bail!("no chapters parsed — expected `## Chapter N — …` headings");
+    }
+
+    let (characters, canon_facts) = parse_bible(bible_body);
+    Ok(ParsedStory {
+        title,
+        chapters,
+        characters,
+        canon_facts,
+    })
+}
+
+/// Return the text of the first `# ` heading whose text is not blank.
+///
+/// Only lines that start with `# ` at column 0 count as headings;
+/// the returned text has surrounding whitespace removed. Yields
+/// `None` when no such line exists.
+fn extract_title(pre_bible: &str) -> Option<String> {
+    pre_bible
+        .lines()
+        .filter_map(|line| line.trim_end().strip_prefix("# "))
+        .map(str::trim)
+        .find(|heading| !heading.is_empty())
+        .map(str::to_string)
+}
+
+/// Split the pre-bible text into chapters, one per `## ` heading.
+///
+/// Lines before the first `## ` heading are ignored. Each heading
+/// opens a new chapter and closes the previous one; the collected
+/// lines are handed to `push_chapter`, which decides whether the
+/// chapter is kept.
+fn parse_chapters(pre_bible: &str) -> Vec<ParsedChapter> {
+    let mut parsed: Vec<ParsedChapter> = Vec::new();
+    // Heading text and body lines of the chapter currently being
+    // collected; `None` until the first heading is seen.
+    let mut open: Option<(String, Vec<&str>)> = None;
+
+    for line in pre_bible.lines() {
+        match line.strip_prefix("## ") {
+            Some(heading) => {
+                if let Some((title, body)) = open.take() {
+                    push_chapter(&mut parsed, Some(title), &body);
+                }
+                open = Some((heading.trim().to_string(), Vec::new()));
+            }
+            None => {
+                if let Some((_, body)) = open.as_mut() {
+                    body.push(line);
+                }
+            }
+        }
+    }
+
+    if let Some((title, body)) = open {
+        push_chapter(&mut parsed, Some(title), &body);
+    }
+    parsed
+}
+
+/// Append a chapter built from `title` and `lines` to `out`.
+///
+/// Nothing is appended when the joined lines are blank. The chapter
+/// number is the chapter's 1-based position in `out`, so skipped
+/// chapters do not leave gaps. An empty title is stored as `None`.
+fn push_chapter(out: &mut Vec<ParsedChapter>, title: Option<String>, lines: &[&str]) {
+    let joined = lines.join("\n");
+    let body = joined.trim();
+    if body.is_empty() {
+        return;
+    }
+
+    let chapter = ParsedChapter {
+        n: (out.len() + 1) as i32,
+        title: title.filter(|t| !t.is_empty()),
+        paragraphs: split_paragraphs(body),
+        body: body.to_string(),
+    };
+    out.push(chapter);
+}
+
+/// Break a chapter body into paragraphs.
+///
+/// A line that is blank, or that consists solely of `---` or `***`
+/// (ignoring surrounding whitespace), separates paragraphs and is
+/// itself discarded. The lines of each paragraph are re-joined with
+/// `\n` and the result is trimmed.
+fn split_paragraphs(body: &str) -> Vec<String> {
+    fn is_separator(line: &str) -> bool {
+        matches!(line.trim(), "" | "---" | "***")
+    }
+
+    let lines: Vec<&str> = body.lines().collect();
+    lines
+        .split(|line| is_separator(line))
+        // Adjacent separators produce empty runs; those are not paragraphs.
+        .filter(|run| !run.is_empty())
+        .map(|run| run.join("\n").trim().to_string())
+        .collect()
+}
+
+/// Parse the text that follows the `# Continuity Bible` heading.
+///
+/// `## ` headings partition the bible into named sections; lines
+/// before the first such heading are ignored. Each section is passed
+/// to `flush_bible_section`, which routes it to the character list,
+/// the canon-fact list, or nowhere.
+fn parse_bible(bible_body: &str) -> (Vec<ParsedCharacter>, Vec<ParsedFact>) {
+    let mut roster: Vec<ParsedCharacter> = Vec::new();
+    let mut facts: Vec<ParsedFact> = Vec::new();
+    // Heading text and body lines of the section being collected.
+    let mut open: Option<(String, Vec<&str>)> = None;
+
+    for line in bible_body.lines() {
+        if let Some(heading) = line.strip_prefix("## ") {
+            if let Some((name, body)) = open.take() {
+                flush_bible_section(Some(name), &body, &mut roster, &mut facts);
+            }
+            open = Some((heading.trim().to_string(), Vec::new()));
+        } else if let Some((_, body)) = open.as_mut() {
+            body.push(line);
+        }
+    }
+
+    if let Some((name, body)) = open {
+        flush_bible_section(Some(name), &body, &mut roster, &mut facts);
+    }
+    (roster, facts)
+}
+
+/// Route one finished bible section to the right output list.
+///
+/// Does nothing when `section` is `None` or the body is blank. A
+/// heading that starts with "character roster" (case-insensitive)
+/// feeds `characters`; a heading that `section_to_category`
+/// recognizes becomes a single entry in `canon_facts`.
+fn flush_bible_section(
+    section: Option<String>,
+    body_lines: &[&str],
+    characters: &mut Vec<ParsedCharacter>,
+    canon_facts: &mut Vec<ParsedFact>,
+) {
+    let Some(section) = section else { return };
+
+    let joined = body_lines.join("\n");
+    let body = joined.trim();
+    if body.is_empty() {
+        return;
+    }
+
+    let lower = section.to_lowercase();
+    if lower.starts_with("character roster") {
+        characters.extend(parse_character_roster(body));
+        return;
+    }
+
+    // Headings we don't recognize are dropped without complaint. That
+    // is acceptable for v0.1: the operator can fix the doc and re-import.
+    if let Some(category) = section_to_category(&lower) {
+        canon_facts.push(ParsedFact {
+            category,
+            title: section,
+            body: body.to_string(),
+        });
+    }
+}
+
+/// Map a lowercased bible section heading to a fact category.
+///
+/// Rules are tried in order and the first match wins; `None` means
+/// the heading is not one this parser knows about.
+fn section_to_category(lower_title: &str) -> Option<FactCategory> {
+    let starts = |prefix: &str| lower_title.starts_with(prefix);
+    let has = |needle: &str| lower_title.contains(needle);
+
+    let category = if starts("setting") {
+        FactCategory::Setting
+    } else if starts("open mystery") || starts("mystery") {
+        FactCategory::Mystery
+    } else if starts("verified historical") || has("historical events") {
+        FactCategory::HistoricalAnchor
+    } else if starts("fictional liberties") || starts("liberties") {
+        FactCategory::Liberty
+    } else if has("hook") || has("next-chapter") || has("suggested next") {
+        FactCategory::Hook
+    } else {
+        return None;
+    };
+    Some(category)
+}
+
+/// Parse the body of the "Character Roster" bible section.
+///
+/// `### ` sub-headings select the kind for the bullets that follow
+/// ("real…" or "fictional…", case-insensitive). Each `- ` bullet that
+/// opens with a `**bold**` name starts a new character; non-blank
+/// lines after it are appended to that character's facts. Entries are
+/// emitted in document order.
+///
+/// Silently skipped: bullets under a sub-heading that is neither real
+/// nor fictional (or under no sub-heading at all), and bullets that do
+/// not start with a bold name, together with their continuation lines.
+fn parse_character_roster(body: &str) -> Vec<ParsedCharacter> {
+    let mut out: Vec<ParsedCharacter> = Vec::new();
+    // Kind selected by the most recent `### ` sub-heading.
+    let mut kind: Option<CharacterKind> = None;
+    // Name and accumulated fact fragments of the bullet in progress.
+    let mut cur_name: Option<String> = None;
+    let mut cur_body: Vec<String> = Vec::new();
+
+    // Emit the bullet in progress (if it has a non-empty name and a
+    // known kind) and reset the per-bullet state either way.
+    fn flush(
+        cur_name: &mut Option<String>,
+        cur_body: &mut Vec<String>,
+        kind: Option<CharacterKind>,
+        out: &mut Vec<ParsedCharacter>,
+    ) {
+        if let (Some(name), Some(kind)) = (cur_name.take(), kind)
+            && !name.is_empty()
+        {
+            // Fragments are joined with a single space, so a wrapped
+            // bullet becomes one line of text.
+            let body = cur_body.join(" ").trim().to_string();
+            out.push(ParsedCharacter {
+                name,
+                kind,
+                key_facts: body,
+            });
+        }
+        cur_body.clear();
+    }
+
+    for line in body.lines() {
+        let trimmed = line.trim();
+        // Sub-headings are matched on the raw line, so they must start
+        // at column 0; bullets below are matched after trimming.
+        if let Some(rest) = line.strip_prefix("### ") {
+            // New sub-section → flush current entry first.
+            flush(&mut cur_name, &mut cur_body, kind, &mut out);
+            let s = rest.trim().to_lowercase();
+            kind = if s.starts_with("real") {
+                Some(CharacterKind::Real)
+            } else if s.starts_with("fictional") {
+                Some(CharacterKind::Fictional)
+            } else {
+                None
+            };
+        } else if let Some(stripped) = trimmed.strip_prefix("- ") {
+            // New character bullet → flush previous.
+            flush(&mut cur_name, &mut cur_body, kind, &mut out);
+            if let Some((name, rest)) = split_bold_name(stripped) {
+                cur_name = Some(name);
+                // Drop the separator between the name and its facts.
+                let rest = rest.trim_start_matches([':', '—', '-', ' ']).trim();
+                if !rest.is_empty() {
+                    cur_body.push(rest.to_string());
+                }
+            }
+        } else if !trimmed.is_empty() && cur_name.is_some() {
+            // Continuation of the current bullet.
+            cur_body.push(line.trim_start().to_string());
+        }
+    }
+    // The last bullet has no following line to trigger its flush.
+    flush(&mut cur_name, &mut cur_body, kind, &mut out);
+    out
+}
+
+/// Split a bullet body that opens with a `**bold**` name.
+///
+/// Returns the trimmed name and whatever follows the closing `**`
+/// (with leading whitespace skipped), or `None` when `s` does not
+/// start with a bold span.
+fn split_bold_name(s: &str) -> Option<(String, &str)> {
+    // Compiled once on first use and shared afterwards.
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let pattern = RE.get_or_init(|| Regex::new(r"^\*\*(.+?)\*\*\s*(.*)$").unwrap());
+
+    let caps = pattern.captures(s)?;
+    let name = caps.get(1)?.as_str().trim();
+    // `Match::as_str` borrows from `s`, so the slice outlives `caps`.
+    let rest = caps.get(2)?.as_str();
+    Some((name.to_string(), rest))
+}
+
+/// Insert a parsed story into the database. Returns the story's id.
+///
+/// All rows are written inside one transaction that is committed only
+/// after the last insert: the `stories` row first, then each chapter
+/// with its passages, then characters, then canon facts.
+///
+/// # Errors
+///
+/// Returns the first database error encountered; in that case the
+/// function returns before `commit` is reached.
+// NOTE(review): presumably the uncommitted transaction is rolled back
+// when `tx` is dropped on the error path — confirm against sqlx's
+// `Transaction` documentation.
+pub async fn import_to_db(pool: &PgPool, parsed: ParsedStory) -> anyhow::Result<Uuid> {
+    let mut tx = pool.begin().await?;
+
+    // Story-level word count is the sum of the per-chapter counts.
+    let total_words: i32 = parsed
+        .chapters
+        .iter()
+        .map(|c| word_count(&c.body))
+        .sum();
+
+    // Imported stories are created with status 'seed'.
+    let story_id: Uuid = sqlx::query_scalar(
+        "INSERT INTO stories (title, status, word_count_actual)
+         VALUES ($1, 'seed', $2)
+         RETURNING id",
+    )
+    .bind(&parsed.title)
+    .bind(total_words)
+    .fetch_one(&mut *tx)
+    .await?;
+
+    // root_story_id self-references on the seed row.
+    sqlx::query("UPDATE stories SET root_story_id = id WHERE id = $1")
+        .bind(story_id)
+        .execute(&mut *tx)
+        .await?;
+
+    for chapter in &parsed.chapters {
+        let words = word_count(&chapter.body);
+        // The generated chapter id is needed to attach the passages.
+        let chapter_id: Uuid = sqlx::query_scalar(
+            "INSERT INTO chapters (story_id, n, title, body_md, word_count)
+             VALUES ($1, $2, $3, $4, $5)
+             RETURNING id",
+        )
+        .bind(story_id)
+        .bind(chapter.n)
+        .bind(chapter.title.as_deref())
+        .bind(&chapter.body)
+        .bind(words)
+        .fetch_one(&mut *tx)
+        .await?;
+
+        // One passage row per paragraph; paragraph_n is 1-based.
+        for (i, para) in chapter.paragraphs.iter().enumerate() {
+            sqlx::query(
+                "INSERT INTO passages (chapter_id, paragraph_n, body)
+                 VALUES ($1, $2, $3)",
+            )
+            .bind(chapter_id)
+            .bind(i as i32 + 1)
+            .bind(para)
+            .execute(&mut *tx)
+            .await?;
+        }
+    }
+
+    // Character kinds are stored as their `as_str` labels.
+    for ch in &parsed.characters {
+        sqlx::query(
+            "INSERT INTO characters (story_id, name, kind, key_facts)
+             VALUES ($1, $2, $3, $4)",
+        )
+        .bind(story_id)
+        .bind(&ch.name)
+        .bind(ch.kind.as_str())
+        .bind(&ch.key_facts)
+        .execute(&mut *tx)
+        .await?;
+    }
+
+    // Fact categories are stored as their `as_str` labels.
+    for fact in &parsed.canon_facts {
+        sqlx::query(
+            "INSERT INTO canon_facts (story_id, category, title, body)
+             VALUES ($1, $2, $3, $4)",
+        )
+        .bind(story_id)
+        .bind(fact.category.as_str())
+        .bind(&fact.title)
+        .bind(&fact.body)
+        .execute(&mut *tx)
+        .await?;
+    }
+
+    tx.commit().await?;
+    Ok(story_id)
+}
+
+/// Count the whitespace-separated words in `s`.
+///
+/// The result saturates at `i32::MAX` rather than wrapping to a
+/// negative number if the count does not fit in an `i32`.
+fn word_count(s: &str) -> i32 {
+    i32::try_from(s.split_whitespace().count()).unwrap_or(i32::MAX)
+}
+
+/// Unit tests for the markdown parser; none of them touch a database.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Minimal document in the expected shape: a title, two chapters
+    /// of two paragraphs each, and a bible with a roster (one real,
+    /// one fictional character), a setting section and a mystery
+    /// section.
+    const SAMPLE: &str = r#"# Sample Tale
+
+## Chapter One — Monday, May 1
+
+The morning was bright. The bread was warm. The cat sat in the sun.
+
+She drank her coffee slowly.
+
+## Chapter Two — Tuesday, May 2
+
+The cat moved to the windowsill.
+
+She watched the rain.
+
+# Continuity Bible
+
+## Character Roster
+
+### Real historical figures
+
+- **Anya Petrov** — 34, baker. Real. Husband died in 1985.
+  Two children.
+
+### Fictional characters
+
+- **Boris** — 50, the cat. Black with one white paw.
+
+## Setting Bible
+
+A small village in northern Ukraine in May 1985.
+
+## Open Mystery Threads
+
+1. Whose footprints in the flour bin?
+2. Why does Boris meow at midnight?
+"#;
+
+    #[test]
+    fn parses_title() {
+        let p = parse_story(SAMPLE).unwrap();
+        assert_eq!(p.title, "Sample Tale");
+    }
+
+    #[test]
+    fn parses_chapter_count_and_numbering() {
+        let p = parse_story(SAMPLE).unwrap();
+        assert_eq!(p.chapters.len(), 2);
+        assert_eq!(p.chapters[0].n, 1);
+        assert_eq!(p.chapters[1].n, 2);
+        assert!(p.chapters[0].title.as_deref().unwrap().starts_with("Chapter One"));
+    }
+
+    #[test]
+    fn paragraphs_split_on_blank_line_and_hr() {
+        let p = parse_story(SAMPLE).unwrap();
+        // Chapter 1 has 2 paragraphs (the bright-morning one + the
+        // coffee-drinking one).
+        assert_eq!(p.chapters[0].paragraphs.len(), 2);
+
+        // SAMPLE contains no horizontal rule, so exercise that path
+        // separately: `---` and `***` split paragraphs even without
+        // blank lines around them, and are not kept as paragraphs.
+        let with_hr = "# T\n\n## Chapter One\n\nFirst.\n---\nSecond.\n***\nThird.\n";
+        let p = parse_story(with_hr).unwrap();
+        assert_eq!(p.chapters[0].paragraphs, vec!["First.", "Second.", "Third."]);
+    }
+
+    #[test]
+    fn parses_real_and_fictional_characters() {
+        let p = parse_story(SAMPLE).unwrap();
+        assert_eq!(p.characters.len(), 2);
+        let anya = p.characters.iter().find(|c| c.name == "Anya Petrov").unwrap();
+        assert_eq!(anya.kind, CharacterKind::Real);
+        assert!(anya.key_facts.contains("baker"));
+        let boris = p.characters.iter().find(|c| c.name == "Boris").unwrap();
+        assert_eq!(boris.kind, CharacterKind::Fictional);
+    }
+
+    #[test]
+    fn parses_canon_fact_sections() {
+        let p = parse_story(SAMPLE).unwrap();
+        let setting = p.canon_facts.iter().find(|f| f.category == FactCategory::Setting).unwrap();
+        assert!(setting.body.contains("northern Ukraine"));
+        let mystery = p.canon_facts.iter().find(|f| f.category == FactCategory::Mystery).unwrap();
+        assert!(mystery.body.contains("footprints"));
+    }
+
+    #[test]
+    fn missing_chapters_errors() {
+        let bad = "# Title only\n\nSome body text but no chapters.";
+        let err = parse_story(bad).unwrap_err();
+        assert!(err.to_string().contains("no chapters"), "{err}");
+    }
+}
--- a/skald-core/src/lib.rs
+++ b/skald-core/src/lib.rs
@ -0,0 +1,13 @@
+//! Skald's shared kernel.
+//!
+//! Database schema, row types, markdown ingest, and (later) context
+//! assembly for LLM calls. The story-independence rule: nothing in
+//! this crate knows about any specific story. Every story is rows.
+
+pub mod db;
+pub mod ingest;
+pub mod models;
+
+/// Embeds the workspace `migrations/` directory at compile time.
+/// Run via `MIGRATOR.run(&pool).await` at boot.
+///
+/// [`db::connect_and_migrate`] runs it against every pool it opens,
+/// so callers that go through that helper need not run it themselves.
+pub static MIGRATOR: sqlx::migrate::Migrator = sqlx::migrate!("../migrations");
--- a/skald-core/src/models.rs
+++ b/skald-core/src/models.rs
@ -0,0 +1,75 @@
+//! Row types. Mirror the schema in `migrations/0001_init.sql`.
+//!
+//! These are deliberately thin — no business logic. Queries that need
+//! to project subsets of fields can use `sqlx::query_as!` against
+//! their own narrower types; these full structs are for the cases
+//! where we want the whole row.
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use sqlx::FromRow;
+use uuid::Uuid;
+
+/// Full row of the `stories` table.
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct Story {
+    pub id: Uuid,
+    pub title: String,
+    /// The markdown importer writes `'seed'` here.
+    // NOTE(review): other valid status values are not visible from
+    // this crate — confirm against the migration.
+    pub status: String,
+    pub prompt: Option<String>,
+    pub model: Option<String>,
+    pub parent_story_id: Option<Uuid>,
+    /// The markdown importer points this at the row's own `id`.
+    pub root_story_id: Option<Uuid>,
+    pub series_name: Option<String>,
+    pub word_count_target: Option<i32>,
+    /// The markdown importer stores the sum of the chapter word counts.
+    pub word_count_actual: i32,
+    pub summary: Option<String>,
+    pub created_at: DateTime<Utc>,
+    pub updated_at: DateTime<Utc>,
+}
+
+/// Full row of the `characters` table.
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct Character {
+    pub id: Uuid,
+    /// The story this character belongs to.
+    pub story_id: Uuid,
+    pub name: String,
+    /// The markdown importer writes `"real"` or `"fictional"`.
+    pub kind: String,
+    pub role: Option<String>,
+    pub voice_traits: Option<String>,
+    /// Free-text facts; the importer stores the roster bullet's text.
+    pub key_facts: String,
+    pub aliases: Vec<String>,
+    pub first_seen_chapter: Option<i32>,
+    pub state_at_latest: Option<String>,
+    pub created_at: DateTime<Utc>,
+}
+
+/// Full row of the `canon_facts` table.
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct CanonFact {
+    pub id: Uuid,
+    /// The story this fact belongs to.
+    pub story_id: Uuid,
+    /// The markdown importer writes an `ingest::FactCategory::as_str`
+    /// label here (e.g. `"setting"`, `"mystery"`).
+    pub category: String,
+    /// The importer stores the bible section heading.
+    pub title: String,
+    /// The importer stores the bible section body.
+    pub body: String,
+    pub weight: i32,
+    pub source_chapter: Option<i32>,
+    pub resolved: bool,
+    pub created_at: DateTime<Utc>,
+}
+
+/// Full row of the `chapters` table.
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct Chapter {
+    pub id: Uuid,
+    /// The story this chapter belongs to.
+    pub story_id: Uuid,
+    /// Chapter number; the markdown importer numbers from 1.
+    pub n: i32,
+    pub title: Option<String>,
+    /// Chapter text as markdown.
+    pub body_md: String,
+    /// The importer stores the whitespace-separated word count of
+    /// `body_md`.
+    pub word_count: i32,
+    pub generated_at: DateTime<Utc>,
+}
+
+/// Row of the `passages` table: one paragraph of a chapter.
+// NOTE(review): the commit message describes a vector column on
+// `passages`; it is not mapped here — presumably intentional until
+// the embedding pass exists, confirm against the migration.
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct Passage {
+    pub id: Uuid,
+    /// The chapter this paragraph belongs to.
+    pub chapter_id: Uuid,
+    /// Position within the chapter; the markdown importer numbers
+    /// from 1.
+    pub paragraph_n: i32,
+    pub body: String,
+}