From 8a09b8f8bed32b329d92ceee3a9c759689b09589 Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 1 May 2026 00:23:47 -0700 Subject: [PATCH] =?UTF-8?q?discover:=20schema=20migrations=20033/034=20?= =?UTF-8?q?=E2=80=94=20scraped=20recipes=20+=20scrape-job=20tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Append-only schema for Discover v0.1. Two tables: cauldron_discovered_recipes — corpus of recipes scraped from external sources (allrecipes, BBC Good Food, smitten kitchen, etc) before any household has imported them. status walks raw → enriched → imported | rejected. source_url is capped at 768 chars to stay under InnoDB's 3072-byte unique-key ceiling (768 × 4-byte utf8mb4 = 3072). FULLTEXT on (name, description) for the discover-page search bar. cauldron_discover_jobs — daemon state for the scrape runner, same pattern as sterilize/enrich/foods-consolidate (state-poll cancel, last_progress_at watchdog, terminal-only finalize). Migrations only — no Python helpers, no scraper, no UI yet. Lands when the next cauldron container restart picks them up. --- cauldron/db.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/cauldron/db.py b/cauldron/db.py index 92bcde1..b3d91c4 100644 --- a/cauldron/db.py +++ b/cauldron/db.py @@ -503,6 +503,57 @@ MIGRATIONS = [ ALTER TABLE cauldron_meal_plans ADD COLUMN IF NOT EXISTS hecate_reading TEXT """, + # 033 — Discover v0.1 corpus: recipes scraped from external sources + # (allrecipes, BBC Good Food, smitten kitchen, etc) before any household + # has imported them. status walks raw → enriched → imported|rejected. + # Once a household clicks "import to Mealie", Mealie's recipe-from-URL + # endpoint pulls the recipe into THAT household and our sterilize+enrich + # pipelines run on the new Mealie row; the discover row stays as a + # provenance breadcrumb. 
source_url: 768 × 4-byte utf8mb4 chars = 3072
+    # bytes (InnoDB's index-key max); utf8mb4_bin keeps the key case-sensitive.
+    """
+    CREATE TABLE IF NOT EXISTS cauldron_discovered_recipes (
+        id              BIGINT PRIMARY KEY AUTO_INCREMENT,
+        slug            VARCHAR(255),
+        source_url      VARCHAR(768) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL,
+        name            VARCHAR(500),
+        description     TEXT,
+        image_url       VARCHAR(1024),
+        scraped_json    JSON,
+        meta_json       JSON,
+        enrich_version  INT NOT NULL DEFAULT 0,
+        status          ENUM('raw','enriched','imported','rejected')
+                        NOT NULL DEFAULT 'raw',
+        scraped_at      DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        last_action_at  DATETIME,
+        UNIQUE KEY uk_source_url (source_url),
+        INDEX idx_status_scraped (status, scraped_at),
+        INDEX idx_slug (slug),
+        FULLTEXT KEY ft_search (name, description)
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+    """,
+    # 034 — Discover scrape-job state. Same daemon-thread runner pattern as
+    # sterilize/enrich/foods-consolidate jobs: one job at a time per source
+    # seed, cancel-respect via state poll, finalize updates only if state
+    # is non-terminal. source_seed is e.g. 'allrecipes.com'.
+    """
+    CREATE TABLE IF NOT EXISTS cauldron_discover_jobs (
+        id                BIGINT PRIMARY KEY AUTO_INCREMENT,
+        started_by_sub    VARCHAR(190),
+        started_at        DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        last_progress_at  DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+        finished_at       DATETIME,
+        source_seed       VARCHAR(255),
+        pages_scraped     INT NOT NULL DEFAULT 0,
+        recipes_added     INT NOT NULL DEFAULT 0,
+        skipped_count     INT NOT NULL DEFAULT 0,
+        error_count       INT NOT NULL DEFAULT 0,
+        last_error        VARCHAR(500),
+        state             ENUM('running','done','failed','cancelled')
+                          NOT NULL DEFAULT 'running',
+        INDEX idx_state (state)
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+    """,
 ]