From 8a09b8f8bed32b329d92ceee3a9c759689b09589 Mon Sep 17 00:00:00 2001
From: Kayos <kayos@sulkta.com>
Date: Fri, 1 May 2026 00:23:47 -0700
Subject: [PATCH] =?UTF-8?q?discover:=20schema=20migrations=20033/034=20?=
 =?UTF-8?q?=E2=80=94=20scraped=20recipes=20+=20scrape-job=20tracking?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Append-only schema for Discover v0.1. Two tables:

  cauldron_discovered_recipes — corpus of recipes scraped from external
    sources (allrecipes, BBC Good Food, smitten kitchen, etc.) before any
    household has imported them. status walks raw → enriched →
    imported|rejected. source_url is capped at 768 chars to stay under
    InnoDB's 3072-byte unique-key ceiling (768 chars × 4 bytes per
    utf8mb4 char = 3072). FULLTEXT on (name, description) for the
    discover-page search bar.

  cauldron_discover_jobs — daemon state for the scrape runner, same
    pattern as sterilize/enrich/foods-consolidate (state-poll cancel,
    last_progress_at watchdog, finalize updates only while state is
    still non-terminal).

Migrations only — no Python helpers, no scraper, no UI yet. They are
applied the next time the cauldron container restarts.
---
 cauldron/db.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/cauldron/db.py b/cauldron/db.py
index 92bcde1..b3d91c4 100644
--- a/cauldron/db.py
+++ b/cauldron/db.py
@@ -503,6 +503,57 @@ MIGRATIONS = [
     ALTER TABLE cauldron_meal_plans
       ADD COLUMN IF NOT EXISTS hecate_reading TEXT
     """,
+    # 033 — Discover v0.1 corpus: recipes scraped from external sources
+    # (allrecipes, BBC Good Food, smitten kitchen, etc) before any household
+    # has imported them. status walks raw → enriched → imported|rejected.
+    # Once a household clicks "import to Mealie", Mealie's recipe-from-URL
+    # endpoint pulls the recipe into THAT household and our sterilize+enrich
+    # pipelines run on the new Mealie row; the discover row stays as a
+    # provenance breadcrumb. source_url capped at 768 to stay under InnoDB's
+    # 3072-byte unique-key limit (768 × 4-byte utf8mb4 chars = 3072).
+    """
+    CREATE TABLE IF NOT EXISTS cauldron_discovered_recipes (
+        id BIGINT PRIMARY KEY AUTO_INCREMENT,
+        slug VARCHAR(255),
+        source_url VARCHAR(768) NOT NULL,
+        name VARCHAR(500),
+        description TEXT,
+        image_url VARCHAR(1024),
+        scraped_json JSON,
+        meta_json JSON,
+        enrich_version INT NOT NULL DEFAULT 0,
+        status ENUM('raw','enriched','imported','rejected')
+               NOT NULL DEFAULT 'raw',
+        scraped_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        last_action_at DATETIME,
+        UNIQUE KEY uk_source_url (source_url),
+        INDEX idx_status_scraped (status, scraped_at),
+        INDEX idx_slug (slug),
+        FULLTEXT KEY ft_search (name, description)
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+    """,
+    # 034 — Discover scrape-job state. Same daemon-thread runner pattern as
+    # sterilize/enrich/foods-consolidate jobs: one job at a time per source
+    # seed, cancel-respect via state poll, finalize updates only if state
+    # is non-terminal. source_seed is e.g. 'allrecipes.com'.
+    """
+    CREATE TABLE IF NOT EXISTS cauldron_discover_jobs (
+        id BIGINT PRIMARY KEY AUTO_INCREMENT,
+        started_by_sub VARCHAR(190),
+        started_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        last_progress_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        finished_at DATETIME,
+        source_seed VARCHAR(255),
+        pages_scraped INT NOT NULL DEFAULT 0,
+        recipes_added INT NOT NULL DEFAULT 0,
+        skipped_count INT NOT NULL DEFAULT 0,
+        error_count INT NOT NULL DEFAULT 0,
+        last_error VARCHAR(500),
+        state ENUM('running','done','failed','cancelled')
+              NOT NULL DEFAULT 'running',
+        INDEX idx_state (state)
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+    """,
 ]