diff --git a/cauldron/db.py b/cauldron/db.py index 92bcde1..b3d91c4 100644 --- a/cauldron/db.py +++ b/cauldron/db.py @@ -503,6 +503,57 @@ MIGRATIONS = [ ALTER TABLE cauldron_meal_plans ADD COLUMN IF NOT EXISTS hecate_reading TEXT """, + # 033 — Discover v0.1 corpus: recipes scraped from external sources + # (allrecipes, BBC Good Food, smitten kitchen, etc.) before any household + # has imported them. status walks raw → enriched → imported|rejected. + # Once a household clicks "import to Mealie", Mealie's recipe-from-URL + # endpoint pulls the recipe into THAT household and our sterilize+enrich + # pipelines run on the new Mealie row; the discover row stays as a + # provenance breadcrumb. source_url capped at 768 to stay within InnoDB's + # 3072-byte unique-key limit (768 × 4-byte utf8mb4 chars = 3072). + """ + CREATE TABLE IF NOT EXISTS cauldron_discovered_recipes ( + id BIGINT PRIMARY KEY AUTO_INCREMENT, + slug VARCHAR(255), + source_url VARCHAR(768) NOT NULL, + name VARCHAR(500), + description TEXT, + image_url VARCHAR(1024), + scraped_json JSON, + meta_json JSON, + enrich_version INT NOT NULL DEFAULT 0, + status ENUM('raw','enriched','imported','rejected') + NOT NULL DEFAULT 'raw', + scraped_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + last_action_at DATETIME, + UNIQUE KEY uk_source_url (source_url), + INDEX idx_status_scraped (status, scraped_at), + INDEX idx_slug (slug), + FULLTEXT KEY ft_search (name, description) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 + """, + # 034 — Discover scrape-job state. Same daemon-thread runner pattern as + # sterilize/enrich/foods-consolidate jobs: one job at a time per source + # seed, cancel-respect via state poll, finalize updates only if state + # is non-terminal. source_seed is e.g. 'allrecipes.com'. 
+ """ + CREATE TABLE IF NOT EXISTS cauldron_discover_jobs ( + id BIGINT PRIMARY KEY AUTO_INCREMENT, + started_by_sub VARCHAR(190), + started_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + last_progress_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + finished_at DATETIME, + source_seed VARCHAR(255), + pages_scraped INT NOT NULL DEFAULT 0, + recipes_added INT NOT NULL DEFAULT 0, + skipped_count INT NOT NULL DEFAULT 0, + error_count INT NOT NULL DEFAULT 0, + last_error VARCHAR(500), + state ENUM('running','done','failed','cancelled') + NOT NULL DEFAULT 'running', + INDEX idx_state (state) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 + """, ]