discover: schema migrations 033/034 — scraped recipes + scrape-job tracking

Append-only schema for Discover v0.1. Two tables:

  cauldron_discovered_recipes — corpus of recipes scraped from external
    sources (allrecipes, BBC Good Food, smitten kitchen, etc.) before any
    household has imported them. status walks raw → enriched →
    (imported | rejected). source_url is capped at 768 chars to fit within
    InnoDB's 3072-byte unique-key ceiling (768 chars × 4 bytes per utf8mb4
    char = 3072 bytes). A FULLTEXT index on (name, description) backs the
    discover-page search bar.

  cauldron_discover_jobs — daemon state for the scrape runner, same
    pattern as sterilize/enrich/foods-consolidate (state-poll cancel,
    last_progress_at watchdog, terminal-only finalize).

Migrations only — no Python helpers, no scraper, no UI yet. The migrations
take effect when the next cauldron container restart picks them up.
This commit is contained in:
Kayos 2026-05-01 00:23:47 -07:00
parent d561a9373e
commit 8a09b8f8be

View file

@ -503,6 +503,57 @@ MIGRATIONS = [
ALTER TABLE cauldron_meal_plans
ADD COLUMN IF NOT EXISTS hecate_reading TEXT
""",
# 033 — Discover v0.1 corpus: recipes scraped from external sources
# (allrecipes, BBC Good Food, smitten kitchen, etc.) before any household
# has imported them. status walks raw → enriched → (imported | rejected);
# new rows default to 'raw'.
#
# Once a household clicks "import to Mealie", Mealie's recipe-from-URL
# endpoint pulls the recipe into THAT household and our sterilize+enrich
# pipelines run on the new Mealie row; the discover row stays as a
# provenance breadcrumb.
#
# Keys and indexes:
#   uk_source_url      — one row per source_url. The column is capped at
#                        768 chars so the unique key fits within InnoDB's
#                        3072-byte key limit (768 chars × 4 bytes per
#                        utf8mb4 char = 3072 bytes, i.e. exactly at the
#                        limit — do not widen the column).
#   idx_status_scraped — composite (status, scraped_at) index.
#   idx_slug           — plain, non-unique index; slug is nullable and
#                        duplicates are allowed by the schema.
#   ft_search          — FULLTEXT over (name, description) for the
#                        discover-page search bar.
#
# scraped_at defaults to the insert time; last_action_at has no default
# and stays NULL until something writes it. enrich_version starts at 0.
# NOTE(review): scraped_json / meta_json contents are not defined by this
# migration — confirm their shape against the scraper once it lands.
"""
CREATE TABLE IF NOT EXISTS cauldron_discovered_recipes (
id BIGINT PRIMARY KEY AUTO_INCREMENT,
slug VARCHAR(255),
source_url VARCHAR(768) NOT NULL,
name VARCHAR(500),
description TEXT,
image_url VARCHAR(1024),
scraped_json JSON,
meta_json JSON,
enrich_version INT NOT NULL DEFAULT 0,
status ENUM('raw','enriched','imported','rejected')
NOT NULL DEFAULT 'raw',
scraped_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_action_at DATETIME,
UNIQUE KEY uk_source_url (source_url),
INDEX idx_status_scraped (status, scraped_at),
INDEX idx_slug (slug),
FULLTEXT KEY ft_search (name, description)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
""",
# 034 — Discover scrape-job state. Same daemon-thread runner pattern as
# sterilize/enrich/foods-consolidate jobs: one job at a time per source
# seed, cancel-respect via state poll, finalize updates only if state
# is non-terminal. source_seed is e.g. 'allrecipes.com'.
#
# A new row starts in state 'running' with started_at and
# last_progress_at both defaulting to the insert time; finished_at has no
# default and stays NULL until something writes it. The four counters
# (pages_scraped, recipes_added, skipped_count, error_count) all start
# at 0.
#
# NOTE(review): the schema itself does not enforce "one job at a time per
# source seed" — the only secondary index is idx_state and there is no
# unique constraint on source_seed — so that rule has to be upheld by the
# runner. Likewise, last_progress_at is only updated by whoever writes to
# it; the watchdog that reads it is not part of this migration.
# NOTE(review): presumably 'done', 'failed' and 'cancelled' are the
# terminal states and started_by_sub is the subject id of the user who
# launched the job — confirm against the runner.
"""
CREATE TABLE IF NOT EXISTS cauldron_discover_jobs (
id BIGINT PRIMARY KEY AUTO_INCREMENT,
started_by_sub VARCHAR(190),
started_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_progress_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
finished_at DATETIME,
source_seed VARCHAR(255),
pages_scraped INT NOT NULL DEFAULT 0,
recipes_added INT NOT NULL DEFAULT 0,
skipped_count INT NOT NULL DEFAULT 0,
error_count INT NOT NULL DEFAULT 0,
last_error VARCHAR(500),
state ENUM('running','done','failed','cancelled')
NOT NULL DEFAULT 'running',
INDEX idx_state (state)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
""",
]