From 3ec120c1d95facfce1d45986715a026021034248 Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 1 May 2026 07:38:27 -0700 Subject: [PATCH] discover v0.1: scrape + search + import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - requirements: add recipe-scrapers 15.6.0 - mealie.import_from_url(): POST /api/recipes/create/url returns slug - db helpers: insert_discovered_recipe, update_discovered_meta, set_discovered_status, list_discovered_recipes (FULLTEXT + JSON filters), count_discovered_by_status, get_discovered_recipe; discover-job CRUD + anti-zombie finalize + stuck-job recovery - discover_recipes.py: daemon-thread runner (mirrors enrich pattern) walks a URL list; scrape_me → reshape to mealie shape → INSERT IGNORE → forge.enrich_recipe → flip raw → enriched. SEED_URLS curated starter packs for allrecipes / bbc / smitten / pinch / hbh. - endpoints: GET /discover, GET /api/discover/search (q + cuisine + complexity + protein + meal_type + kid-fit + max_minutes + status), POST /api/discover/import/<discover_id>, /reject/<discover_id>, /scrape-start (seed or urls list), GET /api/discover/scrape-status, POST /api/discover/scrape-cancel/<job_id> - discover.html: filter row + card grid + collapsible scrape panel with seed chips and url textarea + live progress poll - nav: 'discover' tab on /, link card on /me - boot recovery: fail_stuck_discover_jobs at startup --- cauldron/db.py | 228 ++++++++++++++++++ cauldron/discover_recipes.py | 267 +++++++++++++++++++++ cauldron/mealie.py | 44 ++++ cauldron/server.py | 169 ++++++++++++- cauldron/templates/_base.html | 1 + cauldron/templates/discover.html | 397 +++++++++++++++++++++++++++++++ cauldron/templates/me.html | 3 + requirements.txt | 1 + 8 files changed, 1109 insertions(+), 1 deletion(-) create mode 100644 cauldron/discover_recipes.py create mode 100644 cauldron/templates/discover.html diff --git a/cauldron/db.py b/cauldron/db.py index b3d91c4..fc3fe0d 100644 --- a/cauldron/db.py +++ b/cauldron/db.py @@ -2143,3 +2143,231 @@ class DB: 
(stale_minutes,), ) return cur.rowcount + + # --- discover (Discover v0.1) ------------------------------------------ + + def insert_discovered_recipe( + self, + *, + slug: str | None, + source_url: str, + name: str | None, + description: str | None, + image_url: str | None, + scraped_json: str, + ) -> int | None: + """INSERT a freshly-scraped recipe in 'raw' state. Returns the new + row id, or None if the source_url was already present (UNIQUE + violation = duplicate scrape, treat as skip).""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """INSERT IGNORE INTO cauldron_discovered_recipes + (slug, source_url, name, description, image_url, + scraped_json, status, scraped_at, last_action_at) + VALUES (%s, %s, %s, %s, %s, %s, 'raw', NOW(), NOW())""", + (slug, source_url[:768], name, description, image_url, scraped_json), + ) + return cur.lastrowid or None + + def update_discovered_meta( + self, discover_id: int, *, meta_json: str, version: int + ) -> None: + """Persist enriched metadata + flip status raw → enriched.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """UPDATE cauldron_discovered_recipes + SET meta_json=%s, + enrich_version=%s, + status=CASE WHEN status='raw' THEN 'enriched' + ELSE status END, + last_action_at=NOW() + WHERE id=%s""", + (meta_json, version, discover_id), + ) + + def set_discovered_status(self, discover_id: int, status: str) -> None: + """Move a discovered recipe to 'imported' or 'rejected'.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """UPDATE cauldron_discovered_recipes + SET status=%s, last_action_at=NOW() + WHERE id=%s""", + (status, discover_id), + ) + + def get_discovered_recipe(self, discover_id: int) -> dict | None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + "SELECT * FROM cauldron_discovered_recipes WHERE id=%s", + (discover_id,), + ) + return cur.fetchone() + + def list_discovered_recipes( + self, + *, + status: str | list[str] | None = "enriched", + q: str | 
None = None, + cuisine: str | None = None, + complexity: str | None = None, + primary_protein: str | None = None, + meal_type: str | None = None, + kid_friendly_min: int | None = None, + max_minutes: int | None = None, + limit: int = 60, + offset: int = 0, + ) -> list[dict]: + """Browse discovered recipes with filters. Status defaults to + 'enriched' so the /discover page surfaces only ready-to-import + rows. JSON path filters use MySQL JSON_EXTRACT against meta_json.""" + where = [] + args: list = [] + if status is not None: + if isinstance(status, list): + if not status: + return [] + placeholders = ",".join(["%s"] * len(status)) + where.append(f"status IN ({placeholders})") + args.extend(status) + else: + where.append("status = %s") + args.append(status) + if q: + where.append("MATCH(name, description) AGAINST (%s IN NATURAL LANGUAGE MODE)") + args.append(q) + if cuisine: + where.append("JSON_UNQUOTE(JSON_EXTRACT(meta_json, '$.cuisine')) = %s") + args.append(cuisine) + if complexity: + where.append("JSON_UNQUOTE(JSON_EXTRACT(meta_json, '$.complexity')) = %s") + args.append(complexity) + if primary_protein: + where.append("JSON_UNQUOTE(JSON_EXTRACT(meta_json, '$.primary_protein')) = %s") + args.append(primary_protein) + if meal_type: + where.append("JSON_UNQUOTE(JSON_EXTRACT(meta_json, '$.meal_type')) = %s") + args.append(meal_type) + if kid_friendly_min is not None: + where.append("CAST(JSON_EXTRACT(meta_json, '$.kid_friendly_score') AS UNSIGNED) >= %s") + args.append(kid_friendly_min) + if max_minutes is not None: + where.append("CAST(JSON_EXTRACT(meta_json, '$.estimated_minutes') AS UNSIGNED) <= %s") + args.append(max_minutes) + sql = "SELECT * FROM cauldron_discovered_recipes" + if where: + sql += " WHERE " + " AND ".join(where) + # Relevance-rank when there's a search query, else newest-first + if q: + sql += " ORDER BY MATCH(name, description) AGAINST (%s IN NATURAL LANGUAGE MODE) DESC, scraped_at DESC" + args.append(q) + else: + sql += " ORDER BY 
scraped_at DESC" + sql += " LIMIT %s OFFSET %s" + args.extend([int(limit), int(offset)]) + with self.conn() as c, c.cursor() as cur: + cur.execute(sql, args) + return list(cur.fetchall() or []) + + def count_discovered_by_status(self) -> dict[str, int]: + with self.conn() as c, c.cursor() as cur: + cur.execute( + """SELECT status, COUNT(*) AS n + FROM cauldron_discovered_recipes GROUP BY status""" + ) + return {r["status"]: int(r["n"]) for r in (cur.fetchall() or [])} + + def create_discover_job( + self, *, started_by_sub: str, source_seed: str + ) -> int: + with self.conn() as c, c.cursor() as cur: + cur.execute( + """INSERT INTO cauldron_discover_jobs + (started_by_sub, source_seed, state) + VALUES (%s, %s, 'running')""", + (started_by_sub, source_seed[:255]), + ) + return cur.lastrowid + + def get_discover_job(self, job_id: int) -> dict | None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + "SELECT * FROM cauldron_discover_jobs WHERE id=%s", (job_id,) + ) + return cur.fetchone() + + def get_discover_job_state(self, job_id: int) -> str | None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + "SELECT state FROM cauldron_discover_jobs WHERE id=%s", (job_id,) + ) + row = cur.fetchone() + return row["state"] if row else None + + def latest_discover_job(self) -> dict | None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + """SELECT * FROM cauldron_discover_jobs + ORDER BY started_at DESC LIMIT 1""" + ) + return cur.fetchone() + + def running_discover_job(self) -> dict | None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + """SELECT * FROM cauldron_discover_jobs + WHERE state='running' ORDER BY started_at DESC LIMIT 1""" + ) + return cur.fetchone() + + def update_discover_job_progress( + self, + job_id: int, + *, + pages_delta: int = 0, + added_delta: int = 0, + skipped_delta: int = 0, + error_delta: int = 0, + last_error: str | None = None, + ) -> None: + with self.conn() as c, c.cursor() as cur: + cur.execute( + 
"""UPDATE cauldron_discover_jobs + SET pages_scraped = pages_scraped + %s, + recipes_added = recipes_added + %s, + skipped_count = skipped_count + %s, + error_count = error_count + %s, + last_error = COALESCE(%s, last_error), + last_progress_at = NOW() + WHERE id=%s""", + (pages_delta, added_delta, skipped_delta, error_delta, + last_error[:500] if last_error else None, job_id), + ) + + def finalize_discover_job(self, job_id: int, *, state: str) -> None: + """Anti-zombie guard: only update if the job isn't already in a + terminal state. Mirrors finalize_enrich_job.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """UPDATE cauldron_discover_jobs + SET state=%s, + finished_at = CASE WHEN %s IN ('done','failed','cancelled') + THEN NOW() ELSE finished_at END, + last_progress_at = NOW() + WHERE id=%s + AND state NOT IN ('done','failed','cancelled')""", + (state, state, job_id), + ) + + def fail_stuck_discover_jobs(self, *, stale_minutes: int = 15) -> int: + with self.conn() as c, c.cursor() as cur: + cur.execute( + """UPDATE cauldron_discover_jobs + SET state='failed', + finished_at=NOW(), + last_error=COALESCE(last_error, + 'recovery: worker exited mid-run') + WHERE state='running' + AND last_progress_at < NOW() - INTERVAL %s MINUTE""", + (stale_minutes,), + ) + return cur.rowcount diff --git a/cauldron/discover_recipes.py b/cauldron/discover_recipes.py new file mode 100644 index 0000000..13fe457 --- /dev/null +++ b/cauldron/discover_recipes.py @@ -0,0 +1,267 @@ +"""Discover v0.1 — scrape external recipe URLs into the discover corpus. + +Pipeline per URL: + 1. recipe_scrapers.scrape_me(url) → schema.org structured recipe + 2. Reshape into a Mealie-ish dict (name, description, recipeYield, + recipeIngredient[{note}], recipeInstructions[{text}]) + 3. INSERT IGNORE into cauldron_discovered_recipes (UNIQUE on source_url) + 4. forge.enrich_recipe(reshaped) → Hecate-tier metadata + 5. 
Persist meta_json, flip status raw → enriched + +Same daemon-thread + cancel + stuck-recovery pattern as enrich/sterilize. + +Seed sources are hardcoded URL lists per source_seed (allrecipes-popular, +bbc-popular, smitten-kitchen-recent, ...). Cobb supplies a seed name OR +a literal list of URLs via the admin endpoint. Either way, the runner +walks the list, scrape→insert→enrich each, and emits progress. +""" +from __future__ import annotations + +import json +import logging +import threading +from urllib.parse import urlparse + +from .db import DB +from .forge import Forge, ForgeError + +log = logging.getLogger(__name__) + + +# Curated seed URL lists for v0.1 dogfood. Each is a small starter pack — +# we expand later by adding sitemap/category-page walkers. Keeping these +# manual lets v0.1 ship without a separate site-walker per source. +SEED_URLS: dict[str, list[str]] = { + "allrecipes-popular": [ + "https://www.allrecipes.com/recipe/24074/alyssas-chicken/", + "https://www.allrecipes.com/recipe/229960/world-best-now-veggie-burgers/", + "https://www.allrecipes.com/recipe/16641/old-fashioned-mac-and-cheese/", + "https://www.allrecipes.com/recipe/8499082/instant-pot-pulled-pork/", + "https://www.allrecipes.com/recipe/220854/chef-johns-creamy-mushroom-pasta/", + "https://www.allrecipes.com/recipe/8514308/dr-pepper-pulled-pork/", + "https://www.allrecipes.com/recipe/16700/salisbury-steak/", + "https://www.allrecipes.com/recipe/8536048/oven-baked-bbq-chicken-thighs/", + ], + "bbc-good-food": [ + "https://www.bbcgoodfood.com/recipes/spaghetti-bolognese-recipe", + "https://www.bbcgoodfood.com/recipes/best-spaghetti-carbonara-recipe", + "https://www.bbcgoodfood.com/recipes/easy-chicken-curry", + "https://www.bbcgoodfood.com/recipes/chilli-con-carne-recipe", + "https://www.bbcgoodfood.com/recipes/perfect-roast-chicken", + "https://www.bbcgoodfood.com/recipes/chicken-tikka-masala", + "https://www.bbcgoodfood.com/recipes/sticky-toffee-pudding", + ], + "smitten-kitchen": [ + 
"https://smittenkitchen.com/2023/02/black-pepper-chicken/", + "https://smittenkitchen.com/2024/01/orecchiette-with-broccoli-rabe/", + "https://smittenkitchen.com/2023/09/baked-orzo-with-eggplant-and-mozzarella/", + "https://smittenkitchen.com/2022/12/cacio-e-pepe-soup-with-broccoli-rabe/", + "https://smittenkitchen.com/2022/05/spinach-chickpea-skillet/", + ], + "pinch-of-yum": [ + "https://pinchofyum.com/the-best-soft-chocolate-chip-cookies", + "https://pinchofyum.com/spicy-peanut-soba-noodle-salad", + "https://pinchofyum.com/best-chicken-marinade", + "https://pinchofyum.com/15-minute-meal-prep-cilantro-lime-chicken-and-cauliflower-rice", + "https://pinchofyum.com/pesto-cavatappi", + ], + "half-baked-harvest": [ + "https://www.halfbakedharvest.com/cajun-chicken-pasta/", + "https://www.halfbakedharvest.com/garlic-butter-creamed-spinach-salmon/", + "https://www.halfbakedharvest.com/spicy-pretzel-chicken/", + "https://www.halfbakedharvest.com/crispy-buffalo-chicken-tacos/", + "https://www.halfbakedharvest.com/butter-chicken-meatballs/", + ], +} + + +def list_seeds() -> list[dict]: + """For the /discover admin UI: name + count of curated URLs per seed.""" + return [{"name": k, "count": len(v)} for k, v in SEED_URLS.items()] + + +def _slug_from_url(url: str) -> str | None: + """Cheap slug fallback when the scraper doesn't expose one.""" + try: + parts = [p for p in urlparse(url).path.split("/") if p] + return parts[-1][:255] if parts else None + except Exception: + return None + + +def _safe_call(fn, default=None): + """recipe_scrapers raises various Exception subclasses for missing + fields. Swallow them per-field rather than aborting the whole scrape.""" + try: + return fn() + except Exception: + return default + + +def _to_mealie_shape(scraper, source_url: str) -> dict: + """Reshape a recipe_scrapers.AbstractScraper into the dict shape + forge.enrich_recipe expects (a Mealie recipe). 
Falls back gracefully + when individual fields are unavailable.""" + title = _safe_call(scraper.title) or "" + description = _safe_call(getattr(scraper, "description", lambda: ""), "") or "" + yields = _safe_call(scraper.yields, "") or "" + image = _safe_call(scraper.image, "") or "" + + ings_raw = _safe_call(scraper.ingredients, []) or [] + ingredients = [ + {"note": str(x).strip()} + for x in ings_raw + if x and str(x).strip() + ] + + # Prefer instructions_list when supported; some scrapers only expose + # the joined string. + steps_list: list[str] = [] + instructions_list = _safe_call(getattr(scraper, "instructions_list", lambda: None), None) + if instructions_list: + steps_list = [str(s).strip() for s in instructions_list if s and str(s).strip()] + else: + joined = _safe_call(scraper.instructions, "") or "" + steps_list = [s.strip() for s in joined.split("\n") if s.strip()] + instructions = [{"text": s} for s in steps_list] + + return { + "name": title, + "description": description, + "recipeYield": yields, + "image": image, + "source_url": source_url, + "recipeIngredient": ingredients, + "recipeInstructions": instructions, + } + + +def _scrape_one(url: str) -> tuple[dict, str | None] | None: + """Scrape a single URL. Returns (mealie_shape_dict, image_url) on + success. 
Returns None on any unrecoverable scraper error.""" + try: + from recipe_scrapers import scrape_me # type: ignore + except ImportError: + log.exception("[discover] recipe_scrapers not installed") + return None + + try: + scraper = scrape_me(url, wild_mode=True) + except Exception as e: + log.warning("[discover] scrape_me(%s) failed: %s", url, e) + return None + + shaped = _to_mealie_shape(scraper, url) + image = shaped.get("image") or None + if not shaped.get("name"): + log.warning("[discover] no name extracted from %s", url) + return None + if not shaped.get("recipeIngredient"): + log.warning("[discover] no ingredients extracted from %s", url) + return None + return shaped, image + + +def run_discover( + *, + db: DB, + job_id: int, + forge: Forge, + urls: list[str], +) -> None: + """Walk a list of URLs: scrape → insert → enrich. Runs in a daemon + thread; respects external cancel via state poll.""" + log.info("[discover:%s] start (%d urls)", job_id, len(urls)) + + def _cancelled() -> bool: + s = db.get_discover_job_state(job_id) + return s in ("cancelled", "failed", "done") + + try: + for url in urls: + if _cancelled(): + log.info("[discover:%s] aborted (state changed)", job_id) + return + + db.update_discover_job_progress(job_id, pages_delta=1) + + scraped = _scrape_one(url) + if scraped is None: + db.update_discover_job_progress( + job_id, error_delta=1, last_error=f"scrape failed: {url[:200]}" + ) + continue + shaped, image = scraped + + try: + slug = _slug_from_url(url) + discover_id = db.insert_discovered_recipe( + slug=slug, + source_url=url, + name=shaped.get("name") or None, + description=(shaped.get("description") or "")[:60000] or None, + image_url=image, + scraped_json=json.dumps(shaped, ensure_ascii=False), + ) + except Exception as e: + log.warning("[discover:%s] insert(%s) failed: %s", job_id, url, e) + db.update_discover_job_progress( + job_id, error_delta=1, last_error=f"insert: {str(e)[:200]}" + ) + continue + + if not discover_id: + # UNIQUE 
conflict — already in the corpus from a prior scrape + db.update_discover_job_progress(job_id, skipped_delta=1) + continue + + try: + meta = forge.enrich_recipe(shaped) + except (ForgeError, RuntimeError) as e: + msg = str(e)[:500] + log.warning("[discover:%s] enrich(%s): %s", job_id, url, msg) + db.update_discover_job_progress( + job_id, error_delta=1, last_error=f"enrich: {msg[:200]}" + ) + # Leave the row in 'raw' so we can retry enrichment later. + # The recipe IS in the corpus; just hasn't been classified. + continue + + try: + db.update_discovered_meta( + discover_id, + meta_json=json.dumps(meta, ensure_ascii=False), + version=DB.ENRICH_VERSION, + ) + db.update_discover_job_progress(job_id, added_delta=1) + except Exception as e: + log.warning("[discover:%s] persist meta(%s): %s", job_id, url, e) + db.update_discover_job_progress( + job_id, error_delta=1, last_error=f"persist: {str(e)[:200]}" + ) + + db.finalize_discover_job(job_id, state="done") + log.info("[discover:%s] done", job_id) + except Exception: + log.exception("[discover:%s] crashed", job_id) + try: + db.finalize_discover_job(job_id, state="failed") + except Exception: + pass + + +def spawn_thread( + *, + db: DB, + job_id: int, + forge: Forge, + urls: list[str], +) -> threading.Thread: + t = threading.Thread( + target=run_discover, + kwargs={"db": db, "job_id": job_id, "forge": forge, "urls": urls}, + name=f"discover-recipes-{job_id}", + daemon=True, + ) + t.start() + return t diff --git a/cauldron/mealie.py b/cauldron/mealie.py index c3bb246..0b54866 100644 --- a/cauldron/mealie.py +++ b/cauldron/mealie.py @@ -105,6 +105,50 @@ class Mealie: def update_recipe(self, slug: str, body: dict) -> dict: return self._put(f"/api/recipes/{slug}", body) + def import_from_url( + self, + url: str, + *, + include_tags: bool = False, + include_categories: bool = False, + ) -> str: + """POST /api/recipes/create/url — Mealie scrapes the URL itself + and creates a recipe row in the caller's household. 
Returns the + new recipe slug. After this lands, the household's existing + sterilize+enrich pipelines will pick it up on next walk. + + Mealie does its own scraping with recipe_scrapers internally; we + don't pass our scraped JSON. This keeps the import path canonical + — same code path as the user clicking "Import from URL" in + Mealie's UI.""" + body = { + "url": url, + "includeTags": bool(include_tags), + "includeCategories": bool(include_categories), + } + try: + r = self.session.post( + f"{self.base_url}/api/recipes/create/url", + json=body, + timeout=60, + ) + except requests.RequestException as e: + raise MealieError(f"POST /api/recipes/create/url transport: {e}") from e + if r.status_code >= 400: + raise MealieError( + f"POST /api/recipes/create/url -> {r.status_code}: {r.text[:300]}" + ) + # Mealie returns the new slug as a bare JSON string + try: + slug = r.json() + except Exception: + slug = r.text.strip().strip('"') + if isinstance(slug, dict): + slug = slug.get("slug") or slug.get("id") + if not isinstance(slug, str) or not slug: + raise MealieError(f"create/url returned no slug: {r.text[:200]}") + return slug + def delete_recipe(self, slug: str) -> dict: """DELETE /api/recipes/. Permanently removes the recipe and its recipe_ingredient rows. Permission-scoped per-household. diff --git a/cauldron/server.py b/cauldron/server.py index 89a3bc2..6b9239a 100644 --- a/cauldron/server.py +++ b/cauldron/server.py @@ -33,7 +33,7 @@ from .config import load from .crypto import TokenCrypto from .db import DB from .forge import Forge, ForgeError -from . import aggregator, bulk_sterilize, consolidate_foods, dedupe_recipes, enrich_recipes, foods +from . 
import aggregator, bulk_sterilize, consolidate_foods, dedupe_recipes, discover_recipes, enrich_recipes, foods from .mealie import Mealie, MealieError from .oidc import init_oauth from .recipe_index import flatten_recipe, refresh_household_index, search_index @@ -132,6 +132,13 @@ def create_app() -> Flask: except Exception as e: app.logger.warning("enrich stuck-job recovery failed: %s", e) + try: + n_failed = db.fail_stuck_discover_jobs(stale_minutes=15) + if n_failed: + app.logger.info("failed %d stuck discover jobs at boot", n_failed) + except Exception as e: + app.logger.warning("discover stuck-job recovery failed: %s", e) + oauth = init_oauth( app, issuer=cfg.oidc_issuer, @@ -1865,6 +1872,166 @@ def create_app() -> Flask: db.finalize_consolidate_job(job_id, state="cancelled") return jsonify({"ok": True}) + # ---------- Discover v0.1 (browse external recipes) ------------------ + + @app.get("/discover") + @require_session + def discover_page(): + # Discover is a global, cross-household corpus — no household + # gate. But we still want a connected user before showing the + # import buttons (since import targets the user's Mealie). + counts = db.count_discovered_by_status() + latest = db.latest_discover_job() + seeds = discover_recipes.list_seeds() + return render_template( + "discover.html", + active="discover", + counts=counts, + latest_job=_consolidate_job_payload(latest) if latest else None, + seeds=seeds, + ) + + @app.get("/api/discover/search") + @require_session + def discover_search(): + args = request.args + q = (args.get("q") or "").strip() or None + # 'all' default — show enriched + raw together so newly-scraped + # rows surface even before enrichment finishes. Imported/rejected + # are hidden by default. 
+ status_arg = (args.get("status") or "active").strip() + if status_arg == "active": + status: list[str] | str | None = ["enriched", "raw"] + elif status_arg == "all": + status = None + else: + status = status_arg + + def _opt(name: str) -> str | None: + v = (args.get(name) or "").strip() + return v or None + + def _opt_int(name: str) -> int | None: + v = (args.get(name) or "").strip() + if not v: + return None + try: + return int(v) + except ValueError: + return None + + rows = db.list_discovered_recipes( + status=status, + q=q, + cuisine=_opt("cuisine"), + complexity=_opt("complexity"), + primary_protein=_opt("primary_protein"), + meal_type=_opt("meal_type"), + kid_friendly_min=_opt_int("kid_friendly_min"), + max_minutes=_opt_int("max_minutes"), + limit=min(int(args.get("limit") or 60), 200), + offset=max(int(args.get("offset") or 0), 0), + ) + + out = [] + for r in rows: + meta = r.get("meta_json") + if isinstance(meta, str): + try: + meta = _json_loads(meta) + except Exception: + meta = None + for k in ("scraped_at", "last_action_at"): + v = r.get(k) + if v is not None and hasattr(v, "isoformat"): + r[k] = v.isoformat() + # scraped_json can be heavy — drop it from list responses + r.pop("scraped_json", None) + r["meta_json"] = meta + out.append(r) + return jsonify({"recipes": out, "count": len(out)}) + + @app.post("/api/discover/import/") + @require_session + def discover_import(discover_id: int): + row = db.get_discovered_recipe(discover_id) + if not row: + return jsonify({"error": "not_found"}), 404 + if row.get("status") == "imported": + return jsonify({"error": "already_imported"}), 409 + client = current_user_mealie() + if client is None: + return jsonify({"error": "mealie_not_connected"}), 409 + try: + new_slug = client.import_from_url(row["source_url"]) + except MealieError as e: + return jsonify({"error": "mealie_import_failed", "detail": str(e)[:300]}), 502 + db.set_discovered_status(discover_id, "imported") + return jsonify({"ok": True, "slug": 
new_slug}) + + @app.post("/api/discover/reject/") + @require_session + def discover_reject(discover_id: int): + row = db.get_discovered_recipe(discover_id) + if not row: + return jsonify({"error": "not_found"}), 404 + db.set_discovered_status(discover_id, "rejected") + return jsonify({"ok": True}) + + @app.post("/api/discover/scrape-start") + @require_session + def discover_scrape_start(): + u = session["user"] + active = db.running_discover_job() + if active: + return jsonify({"error": "already_running", "job_id": active["id"]}), 409 + body = request.get_json(silent=True) or {} + urls: list[str] = [] + seed_name = (body.get("seed") or "").strip() + if seed_name: + seeds = discover_recipes.SEED_URLS + if seed_name not in seeds: + return jsonify({"error": "unknown_seed", "available": list(seeds.keys())}), 400 + urls = list(seeds[seed_name]) + else: + url_list = body.get("urls") or [] + if not isinstance(url_list, list): + return jsonify({"error": "urls must be a list"}), 400 + urls = [str(x).strip() for x in url_list if str(x).strip()] + seed_name = "manual" + if not urls: + return jsonify({"error": "no urls supplied"}), 400 + # Light sanity guard: scrub bad entries, cap at 50/job + urls = [x for x in urls if x.startswith(("http://", "https://"))][:50] + if not urls: + return jsonify({"error": "no valid http(s) urls"}), 400 + job_id = db.create_discover_job( + started_by_sub=u["sub"], source_seed=seed_name, + ) + discover_recipes.spawn_thread( + db=db, job_id=job_id, forge=forge, urls=urls, + ) + return jsonify({"ok": True, "job_id": job_id, "urls_queued": len(urls)}) + + @app.get("/api/discover/scrape-status") + @require_session + def discover_scrape_status(): + job = db.latest_discover_job() + if not job: + return jsonify({"job": None}) + return jsonify({"job": _consolidate_job_payload(job)}) + + @app.post("/api/discover/scrape-cancel/") + @require_session + def discover_scrape_cancel(job_id: int): + job = db.get_discover_job(job_id) + if not job: + return 
jsonify({"error": "not_found"}), 404 + if job["state"] != "running": + return jsonify({"error": f"bad_state:{job['state']}"}), 409 + db.finalize_discover_job(job_id, state="cancelled") + return jsonify({"ok": True}) + # ---------- admin sterilizer (bearer-auth, kick off on user's behalf) - @app.post("/api/admin/sterilize/bulk-start") diff --git a/cauldron/templates/_base.html b/cauldron/templates/_base.html index f73b624..edcb145 100644 --- a/cauldron/templates/_base.html +++ b/cauldron/templates/_base.html @@ -482,6 +482,7 @@ button { font-family: inherit; } picks plan list + discover me
diff --git a/cauldron/templates/discover.html b/cauldron/templates/discover.html new file mode 100644 index 0000000..192cf21 --- /dev/null +++ b/cauldron/templates/discover.html @@ -0,0 +1,397 @@ +{% extends "_base.html" %} +{% block title %}Discover · Cauldron{% endblock %} +{% block content %} + + + +
+
// discover · external recipe corpus
+

recipe discover

+
+ a cross-household library of recipes scraped from the open web — + each enriched by hecate so you can filter by cuisine, complexity, + primary protein, kid-friendliness, time. one click imports to your + mealie household; the same sterilize + enrich pipelines you already + trust run on it. +
+
+ +
+
+

browse

+ + {{ counts.get('enriched', 0) }} enriched · {{ counts.get('raw', 0) }} raw · + {{ counts.get('imported', 0) }} imported · {{ counts.get('rejected', 0) }} rejected + +
+ +
+ + + + + + + + +
+ +
+ + scrape new recipes (admin) +

+ kick off a background scrape from a curated seed list, or paste your own + urls (one per line, max 50). each url goes through scrape → + insert → hecate enrich. +

+
+ {% for s in seeds %} + + {% endfor %} +
+ +
+ + +
+ +
+ +
+ +
+ + + +{% endblock %} diff --git a/cauldron/templates/me.html b/cauldron/templates/me.html index 070ac70..b1bee15 100644 --- a/cauldron/templates/me.html +++ b/cauldron/templates/me.html @@ -65,6 +65,9 @@

have hecate generate per-recipe metadata — cuisine, complexity, macros, primary protein/carb, comfort tier, summary. the plan generator reads this so "high protein week" is a real query, not just a vibe.

✨ enrich recipes →

+ +

browse a cross-household corpus of scraped recipes — search by cuisine / protein / time / kid-friendliness. one click sends a recipe to your mealie library; sterilize+enrich pipelines run on it like any other.

+

🌐 discover recipes →

{% endif %} diff --git a/requirements.txt b/requirements.txt index cd2d775..5463b47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ Authlib==1.3.2 PyMySQL==1.1.1 cryptography==43.0.3 rapidfuzz==3.10.1 +recipe-scrapers==15.6.0