discover: schema migrations 033/034 — scraped recipes + scrape-job tracking
Append-only schema for Discover v0.1. Two tables:
cauldron_discovered_recipes — corpus of recipes scraped from external
sources (allrecipes, BBC Good Food, smitten kitchen, etc) before any
household has imported them. status walks raw → enriched → imported
| rejected. source_url is capped at 768 chars to stay under InnoDB's
3072-byte unique-key ceiling (768 × 4-byte utf8mb4 = 3072). FULLTEXT
on (name, description) for the discover-page search bar.
cauldron_discover_jobs — daemon state for the scrape runner, same
pattern as sterilize/enrich/foods-consolidate (state-poll cancel,
last_progress_at watchdog, finalize writes only while state is non-terminal).
Migrations only — no Python helpers, no scraper, no UI yet. They land when
the next cauldron container restart picks them up.
This commit is contained in:
parent
d561a9373e
commit
8a09b8f8be
1 changed file with 51 additions and 0 deletions
|
|
@@ -503,6 +503,57 @@ MIGRATIONS = [
|
|||
ALTER TABLE cauldron_meal_plans
|
||||
ADD COLUMN IF NOT EXISTS hecate_reading TEXT
|
||||
""",
|
||||
# 033 — Discover v0.1 corpus: recipes scraped from external sources
|
||||
# (allrecipes, BBC Good Food, smitten kitchen, etc) before any household
|
||||
# has imported them. status walks raw → enriched → imported|rejected.
|
||||
# Once a household clicks "import to Mealie", Mealie's recipe-from-URL
|
||||
# endpoint pulls the recipe into THAT household and our sterilize+enrich
|
||||
# pipelines run on the new Mealie row; the discover row stays as a
|
||||
# provenance breadcrumb. source_url capped at 768 to stay under InnoDB's
|
||||
# 3072-byte unique-key limit (768 × 4-byte utf8mb4 chars = 3072; the limit is 3072 only for ROW_FORMAT=DYNAMIC/COMPRESSED, 767 otherwise).
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS cauldron_discovered_recipes (
|
||||
id BIGINT PRIMARY KEY AUTO_INCREMENT,
|
||||
slug VARCHAR(255),
|
||||
source_url VARCHAR(768) NOT NULL,
|
||||
name VARCHAR(500),
|
||||
description TEXT,
|
||||
image_url VARCHAR(1024),
|
||||
scraped_json JSON,
|
||||
meta_json JSON,
|
||||
enrich_version INT NOT NULL DEFAULT 0,
|
||||
status ENUM('raw','enriched','imported','rejected')
|
||||
NOT NULL DEFAULT 'raw',
|
||||
scraped_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
last_action_at DATETIME,
|
||||
UNIQUE KEY uk_source_url (source_url),
|
||||
INDEX idx_status_scraped (status, scraped_at),
|
||||
INDEX idx_slug (slug),
|
||||
FULLTEXT KEY ft_search (name, description)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
|
||||
""",
|
||||
# 034 — Discover scrape-job state. Same daemon-thread runner pattern as
|
||||
# sterilize/enrich/foods-consolidate jobs: one job at a time per source
|
||||
# seed, cancel-respect via state poll, finalize updates only if state
|
||||
# is non-terminal. source_seed is e.g. 'allrecipes.com'.
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS cauldron_discover_jobs (
|
||||
id BIGINT PRIMARY KEY AUTO_INCREMENT,
|
||||
started_by_sub VARCHAR(190),
|
||||
started_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
last_progress_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
finished_at DATETIME,
|
||||
source_seed VARCHAR(255),
|
||||
pages_scraped INT NOT NULL DEFAULT 0,
|
||||
recipes_added INT NOT NULL DEFAULT 0,
|
||||
skipped_count INT NOT NULL DEFAULT 0,
|
||||
error_count INT NOT NULL DEFAULT 0,
|
||||
last_error VARCHAR(500),
|
||||
state ENUM('running','done','failed','cancelled')
|
||||
NOT NULL DEFAULT 'running',
|
||||
INDEX idx_state (state)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
|
||||
""",
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue