discover v0.1: scrape + search + import

- requirements: add recipe-scrapers 15.6.0
- mealie.import_from_url(): POST /api/recipes/create/url returns slug
- db helpers: insert_discovered_recipe, update_discovered_meta,
  set_discovered_status, list_discovered_recipes (FULLTEXT + JSON
  filters), count_discovered_by_status, get_discovered_recipe;
  discover-job CRUD + anti-zombie finalize + stuck-job recovery
- discover_recipes.py: daemon-thread runner (mirrors enrich pattern)
  walks a URL list; scrape_me → reshape to mealie shape → INSERT IGNORE
  → forge.enrich_recipe → flip raw → enriched. SEED_URLS curated
  starter packs for allrecipes / bbc / smitten / pinch / hbh.
- endpoints: GET /discover, GET /api/discover/search (q + cuisine +
  complexity + protein + meal_type + kid-fit + max_minutes + status),
  POST /api/discover/import/<id>, /reject/<id>, /scrape-start (seed
  or urls list), /scrape-status, /scrape-cancel/<id>
- discover.html: filter row + card grid + collapsible scrape panel
  with seed chips and url textarea + live progress poll
- nav: 'discover' tab on /, link card on /me
- boot recovery: fail_stuck_discover_jobs at startup
This commit is contained in:
Kayos 2026-05-01 07:38:27 -07:00
parent 8a09b8f8be
commit 3ec120c1d9
8 changed files with 1109 additions and 1 deletions

View file

@ -2143,3 +2143,231 @@ class DB:
(stale_minutes,),
)
return cur.rowcount
# --- discover (Discover v0.1) ------------------------------------------
def insert_discovered_recipe(
    self,
    *,
    slug: str | None,
    source_url: str,
    name: str | None,
    description: str | None,
    image_url: str | None,
    scraped_json: str,
) -> int | None:
    """Store a freshly scraped recipe with status 'raw'.

    The statement is INSERT IGNORE, so a source_url that is already
    present (UNIQUE violation, i.e. a repeat scrape) is treated as a
    skip rather than an error.

    Returns the new row id, or None when the cursor reports no new id.
    """
    statement = """INSERT IGNORE INTO cauldron_discovered_recipes
        (slug, source_url, name, description, image_url,
        scraped_json, status, scraped_at, last_action_at)
        VALUES (%s, %s, %s, %s, %s, %s, 'raw', NOW(), NOW())"""
    # Only the first 768 characters of the URL are stored.
    bound = (slug, source_url[:768], name, description, image_url, scraped_json)
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
        new_id = cur.lastrowid
        if not new_id:
            return None
        return new_id
def update_discovered_meta(
    self, discover_id: int, *, meta_json: str, version: int
) -> None:
    """Save enrichment metadata for one discovered recipe.

    Writes meta_json and enrich_version and refreshes last_action_at.
    A row whose status is 'raw' is promoted to 'enriched'; any other
    status is left as it is.
    """
    statement = """UPDATE cauldron_discovered_recipes
        SET meta_json=%s,
        enrich_version=%s,
        status=CASE WHEN status='raw' THEN 'enriched'
        ELSE status END,
        last_action_at=NOW()
        WHERE id=%s"""
    bound = (meta_json, version, discover_id)
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
def set_discovered_status(self, discover_id: int, status: str) -> None:
    """Set the status of a discovered recipe (callers use 'imported' or
    'rejected') and refresh its last_action_at timestamp."""
    statement = """UPDATE cauldron_discovered_recipes
        SET status=%s, last_action_at=NOW()
        WHERE id=%s"""
    bound = (status, discover_id)
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
def get_discovered_recipe(self, discover_id: int) -> dict | None:
    """Look up one discovered recipe by primary key.

    Returns whatever the cursor's fetchone() yields for the row.
    """
    query = "SELECT * FROM cauldron_discovered_recipes WHERE id=%s"
    with self.conn() as c, c.cursor() as cur:
        cur.execute(query, (discover_id,))
        row = cur.fetchone()
    return row
def list_discovered_recipes(
    self,
    *,
    status: str | list[str] | None = "enriched",
    q: str | None = None,
    cuisine: str | None = None,
    complexity: str | None = None,
    primary_protein: str | None = None,
    meal_type: str | None = None,
    kid_friendly_min: int | None = None,
    max_minutes: int | None = None,
    limit: int = 60,
    offset: int = 0,
) -> list[dict]:
    """Browse discovered recipes with optional filters.

    Args:
        status: a single status, a collection of statuses (list, tuple
            or set), or None for no status filter. Defaults to
            'enriched' so the /discover page surfaces only
            ready-to-import rows. An empty collection returns [] without
            querying.
        q: full-text query matched against name + description; when
            given, results are ordered by relevance, then newest first.
        cuisine, complexity, primary_protein, meal_type: exact-match
            filters on the corresponding meta_json keys.
        kid_friendly_min: lower bound on meta_json.kid_friendly_score.
        max_minutes: upper bound on meta_json.estimated_minutes.
        limit, offset: paging; negative values are clamped to 0 so they
            can never reach MySQL as an invalid LIMIT/OFFSET.

    Returns:
        The matching rows as a list (empty when nothing matches).
    """
    match_expr = "MATCH(name, description) AGAINST (%s IN NATURAL LANGUAGE MODE)"
    where: list[str] = []
    args: list = []

    if status is not None:
        if isinstance(status, (list, tuple, set, frozenset)):
            wanted = list(status)
            if not wanted:
                return []
            placeholders = ",".join(["%s"] * len(wanted))
            where.append(f"status IN ({placeholders})")
            args.extend(wanted)
        else:
            where.append("status = %s")
            args.append(status)

    if q:
        where.append(match_expr)
        args.append(q)

    # JSON path filters use MySQL JSON_EXTRACT against meta_json. The
    # keys below are hard-coded, never caller input, so interpolating
    # them into the SQL text is safe; values are always bound.
    exact_filters = (
        ("cuisine", cuisine),
        ("complexity", complexity),
        ("primary_protein", primary_protein),
        ("meal_type", meal_type),
    )
    for key, value in exact_filters:
        if value:
            where.append(
                f"JSON_UNQUOTE(JSON_EXTRACT(meta_json, '$.{key}')) = %s"
            )
            args.append(value)

    numeric_filters = (
        ("kid_friendly_score", ">=", kid_friendly_min),
        ("estimated_minutes", "<=", max_minutes),
    )
    for key, op, bound in numeric_filters:
        if bound is not None:
            where.append(
                f"CAST(JSON_EXTRACT(meta_json, '$.{key}') AS UNSIGNED) {op} %s"
            )
            args.append(bound)

    sql = "SELECT * FROM cauldron_discovered_recipes"
    if where:
        sql += " WHERE " + " AND ".join(where)
    # Relevance-rank when there's a search query, else newest-first.
    if q:
        sql += f" ORDER BY {match_expr} DESC, scraped_at DESC"
        args.append(q)
    else:
        sql += " ORDER BY scraped_at DESC"
    sql += " LIMIT %s OFFSET %s"
    args.extend([max(int(limit), 0), max(int(offset), 0)])

    with self.conn() as c, c.cursor() as cur:
        cur.execute(sql, args)
        return list(cur.fetchall() or [])
def count_discovered_by_status(self) -> dict[str, int]:
    """Return a {status: row count} mapping for the discovered-recipes
    table. Statuses with no rows are simply absent from the mapping."""
    with self.conn() as c, c.cursor() as cur:
        cur.execute(
            """SELECT status, COUNT(*) AS n
            FROM cauldron_discovered_recipes GROUP BY status"""
        )
        rows = cur.fetchall() or []
        tally: dict[str, int] = {}
        for row in rows:
            tally[row["status"]] = int(row["n"])
        return tally
def create_discover_job(
    self, *, started_by_sub: str, source_seed: str
) -> int:
    """Insert a discover job in state 'running' and return its row id.

    source_seed is cut to its first 255 characters before it is stored.
    """
    statement = """INSERT INTO cauldron_discover_jobs
        (started_by_sub, source_seed, state)
        VALUES (%s, %s, 'running')"""
    bound = (started_by_sub, source_seed[:255])
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
        job_id = cur.lastrowid
        return job_id
def get_discover_job(self, job_id: int) -> dict | None:
    """Fetch a single discover-job row by id (the cursor's fetchone()
    result is returned unchanged)."""
    query = "SELECT * FROM cauldron_discover_jobs WHERE id=%s"
    with self.conn() as c, c.cursor() as cur:
        cur.execute(query, (job_id,))
        job = cur.fetchone()
    return job
def get_discover_job_state(self, job_id: int) -> str | None:
    """Return just the `state` column of a discover job, or None when
    no row comes back for that id."""
    query = "SELECT state FROM cauldron_discover_jobs WHERE id=%s"
    with self.conn() as c, c.cursor() as cur:
        cur.execute(query, (job_id,))
        row = cur.fetchone()
    if not row:
        return None
    return row["state"]
def latest_discover_job(self) -> dict | None:
    """Return the discover job with the most recent started_at, as
    produced by the cursor's fetchone()."""
    query = """SELECT * FROM cauldron_discover_jobs
        ORDER BY started_at DESC LIMIT 1"""
    with self.conn() as c, c.cursor() as cur:
        cur.execute(query)
        newest = cur.fetchone()
    return newest
def running_discover_job(self) -> dict | None:
    """Return the most recently started discover job whose state is
    'running', as produced by the cursor's fetchone()."""
    query = """SELECT * FROM cauldron_discover_jobs
        WHERE state='running' ORDER BY started_at DESC LIMIT 1"""
    with self.conn() as c, c.cursor() as cur:
        cur.execute(query)
        active = cur.fetchone()
    return active
def update_discover_job_progress(
    self,
    job_id: int,
    *,
    pages_delta: int = 0,
    added_delta: int = 0,
    skipped_delta: int = 0,
    error_delta: int = 0,
    last_error: str | None = None,
) -> None:
    """Add the given deltas to a discover job's counters.

    Every call also refreshes last_progress_at. last_error is only
    overwritten when a non-empty message is passed (COALESCE keeps the
    stored value otherwise); the message is cut to 500 characters.
    """
    error_text = last_error[:500] if last_error else None
    statement = """UPDATE cauldron_discover_jobs
        SET pages_scraped = pages_scraped + %s,
        recipes_added = recipes_added + %s,
        skipped_count = skipped_count + %s,
        error_count = error_count + %s,
        last_error = COALESCE(%s, last_error),
        last_progress_at = NOW()
        WHERE id=%s"""
    bound = (
        pages_delta,
        added_delta,
        skipped_delta,
        error_delta,
        error_text,
        job_id,
    )
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
def finalize_discover_job(self, job_id: int, *, state: str) -> None:
    """Move a discover job to `state` unless it is already terminal.

    Anti-zombie guard (mirrors finalize_enrich_job): the WHERE clause
    skips rows already in 'done', 'failed' or 'cancelled', so a late
    worker cannot overwrite a terminal state. finished_at is stamped
    only when the new state is itself terminal.
    """
    statement = """UPDATE cauldron_discover_jobs
        SET state=%s,
        finished_at = CASE WHEN %s IN ('done','failed','cancelled')
        THEN NOW() ELSE finished_at END,
        last_progress_at = NOW()
        WHERE id=%s
        AND state NOT IN ('done','failed','cancelled')"""
    # `state` is bound twice: once for the SET, once for the CASE test.
    bound = (state, state, job_id)
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, bound)
def fail_stuck_discover_jobs(self, *, stale_minutes: int = 15) -> int:
    """Mark 'running' discover jobs with no recent progress as failed.

    A job qualifies when its last_progress_at is older than
    `stale_minutes`. An existing last_error is kept; otherwise a
    recovery message is stored. Returns the cursor's rowcount.
    """
    statement = """UPDATE cauldron_discover_jobs
        SET state='failed',
        finished_at=NOW(),
        last_error=COALESCE(last_error,
        'recovery: worker exited mid-run')
        WHERE state='running'
        AND last_progress_at < NOW() - INTERVAL %s MINUTE"""
    with self.conn() as c, c.cursor() as cur:
        cur.execute(statement, (stale_minutes,))
        affected = cur.rowcount
        return affected

View file

@ -0,0 +1,267 @@
"""Discover v0.1 — scrape external recipe URLs into the discover corpus.
Pipeline per URL:
1. recipe_scrapers.scrape_me(url) → schema.org structured recipe
2. Reshape into a Mealie-ish dict (name, description, recipeYield,
recipeIngredient[{note}], recipeInstructions[{text}])
3. INSERT IGNORE into cauldron_discovered_recipes (UNIQUE on source_url)
4. forge.enrich_recipe(reshaped) → Hecate-tier metadata
5. Persist meta_json, flip status raw → enriched
Same daemon-thread + cancel + stuck-recovery pattern as enrich/sterilize.
Seed sources are hardcoded URL lists keyed by source_seed (allrecipes-popular,
bbc-good-food, smitten-kitchen, ...). Cobb supplies a seed name OR
a literal list of URLs via the admin endpoint. Either way, the runner
walks the list, runs scrape → insert → enrich on each URL, and emits progress.
"""
from __future__ import annotations
import json
import logging
import threading
from urllib.parse import urlparse
from .db import DB
from .forge import Forge, ForgeError
log = logging.getLogger(__name__)
# Curated seed URL lists for v0.1 dogfood. Each is a small starter pack —
# we expand later by adding sitemap/category-page walkers. Keeping these
# manual lets v0.1 ship without a separate site-walker per source.
#
# Keys are the seed names: list_seeds() reports them with their URL counts,
# and the scrape-start endpoint looks a requested seed up in this dict.
# Values are the recipe page URLs handed to the runner for that seed.
SEED_URLS: dict[str, list[str]] = {
"allrecipes-popular": [
"https://www.allrecipes.com/recipe/24074/alyssas-chicken/",
"https://www.allrecipes.com/recipe/229960/world-best-now-veggie-burgers/",
"https://www.allrecipes.com/recipe/16641/old-fashioned-mac-and-cheese/",
"https://www.allrecipes.com/recipe/8499082/instant-pot-pulled-pork/",
"https://www.allrecipes.com/recipe/220854/chef-johns-creamy-mushroom-pasta/",
"https://www.allrecipes.com/recipe/8514308/dr-pepper-pulled-pork/",
"https://www.allrecipes.com/recipe/16700/salisbury-steak/",
"https://www.allrecipes.com/recipe/8536048/oven-baked-bbq-chicken-thighs/",
],
"bbc-good-food": [
"https://www.bbcgoodfood.com/recipes/spaghetti-bolognese-recipe",
"https://www.bbcgoodfood.com/recipes/best-spaghetti-carbonara-recipe",
"https://www.bbcgoodfood.com/recipes/easy-chicken-curry",
"https://www.bbcgoodfood.com/recipes/chilli-con-carne-recipe",
"https://www.bbcgoodfood.com/recipes/perfect-roast-chicken",
"https://www.bbcgoodfood.com/recipes/chicken-tikka-masala",
"https://www.bbcgoodfood.com/recipes/sticky-toffee-pudding",
],
"smitten-kitchen": [
"https://smittenkitchen.com/2023/02/black-pepper-chicken/",
"https://smittenkitchen.com/2024/01/orecchiette-with-broccoli-rabe/",
"https://smittenkitchen.com/2023/09/baked-orzo-with-eggplant-and-mozzarella/",
"https://smittenkitchen.com/2022/12/cacio-e-pepe-soup-with-broccoli-rabe/",
"https://smittenkitchen.com/2022/05/spinach-chickpea-skillet/",
],
"pinch-of-yum": [
"https://pinchofyum.com/the-best-soft-chocolate-chip-cookies",
"https://pinchofyum.com/spicy-peanut-soba-noodle-salad",
"https://pinchofyum.com/best-chicken-marinade",
"https://pinchofyum.com/15-minute-meal-prep-cilantro-lime-chicken-and-cauliflower-rice",
"https://pinchofyum.com/pesto-cavatappi",
],
"half-baked-harvest": [
"https://www.halfbakedharvest.com/cajun-chicken-pasta/",
"https://www.halfbakedharvest.com/garlic-butter-creamed-spinach-salmon/",
"https://www.halfbakedharvest.com/spicy-pretzel-chicken/",
"https://www.halfbakedharvest.com/crispy-buffalo-chicken-tacos/",
"https://www.halfbakedharvest.com/butter-chicken-meatballs/",
],
}
def list_seeds() -> list[dict]:
    """Summarise SEED_URLS for the /discover admin UI.

    Returns one {"name": ..., "count": ...} dict per seed, in SEED_URLS
    order, where count is the number of curated URLs in that seed.
    """
    summary = []
    for seed_name, seed_urls in SEED_URLS.items():
        summary.append({"name": seed_name, "count": len(seed_urls)})
    return summary
def _slug_from_url(url: str) -> str | None:
    """Derive a fallback slug from a URL: its last non-empty path
    segment, capped at 255 characters.

    Returns None when the path has no segments or the URL cannot be
    parsed.
    """
    try:
        segments = list(filter(None, urlparse(url).path.split("/")))
    except Exception:
        return None
    if not segments:
        return None
    return segments[-1][:255]
def _safe_call(fn, default=None):
    """Call `fn()` and return its result, or `default` if it raises.

    recipe_scrapers raises various Exception subclasses for missing
    fields; swallowing them per field keeps one missing field from
    aborting the whole scrape.
    """
    try:
        outcome = fn()
    except Exception:
        outcome = default
    return outcome
def _to_mealie_shape(scraper, source_url: str) -> dict:
    """Reshape a recipe_scrapers scraper into the dict shape that
    forge.enrich_recipe expects (a Mealie-style recipe).

    Each field is read through _safe_call, so a field the scraper cannot
    provide degrades to an empty string / empty list instead of raising.

    Args:
        scraper: object exposing title(), yields(), image() and
            ingredients(), and optionally description(),
            instructions_list() and instructions().
        source_url: the URL the recipe was scraped from; copied through.

    Returns:
        dict with keys name, description, recipeYield, image,
        source_url, recipeIngredient ([{"note": ...}]) and
        recipeInstructions ([{"text": ...}]).
    """
    title = _safe_call(scraper.title) or ""
    description = _safe_call(getattr(scraper, "description", lambda: ""), "") or ""
    yields = _safe_call(scraper.yields, "") or ""
    image = _safe_call(scraper.image, "") or ""

    raw_ingredients = _safe_call(scraper.ingredients, []) or []
    ingredients = [
        {"note": str(item).strip()}
        for item in raw_ingredients
        if item and str(item).strip()
    ]

    # Prefer instructions_list when supported; some scrapers only expose
    # the joined string.
    steps: list[str] = []
    listed = _safe_call(getattr(scraper, "instructions_list", lambda: None), None)
    if listed:
        steps = [str(s).strip() for s in listed if s and str(s).strip()]
    if not steps:
        # Also reached when instructions_list() returned only blank
        # entries, so such a recipe doesn't end up with zero steps while
        # the joined string still has content.
        joined = _safe_call(getattr(scraper, "instructions", lambda: ""), "") or ""
        steps = [line.strip() for line in joined.split("\n") if line.strip()]

    return {
        "name": title,
        "description": description,
        "recipeYield": yields,
        "image": image,
        "source_url": source_url,
        "recipeIngredient": ingredients,
        "recipeInstructions": [{"text": step} for step in steps],
    }
def _scrape_one(url: str) -> tuple[dict, str | None] | None:
    """Scrape one URL into a (mealie_shape_dict, image_url) pair.

    Returns None, after logging the reason, when recipe_scrapers is not
    importable, when scrape_me raises, or when the reshaped recipe has
    no name or no ingredients.
    """
    # Imported lazily so a missing package is logged per call instead of
    # breaking import of this module.
    try:
        from recipe_scrapers import scrape_me  # type: ignore
    except ImportError:
        log.exception("[discover] recipe_scrapers not installed")
        return None

    try:
        scraper = scrape_me(url, wild_mode=True)
    except Exception as e:
        log.warning("[discover] scrape_me(%s) failed: %s", url, e)
        return None

    recipe = _to_mealie_shape(scraper, url)
    if not recipe.get("name"):
        log.warning("[discover] no name extracted from %s", url)
        return None
    if not recipe.get("recipeIngredient"):
        log.warning("[discover] no ingredients extracted from %s", url)
        return None
    # An empty image string is normalised to None.
    return recipe, (recipe.get("image") or None)
def run_discover(
    *,
    db: DB,
    job_id: int,
    forge: Forge,
    urls: list[str],
) -> None:
    """Walk a list of URLs: scrape → insert → enrich.

    Runs in a daemon thread and respects external cancellation by
    polling the job state before each URL. Per-URL failures are counted
    on the job and do not stop the walk; an unexpected exception marks
    the job 'failed'.

    Args:
        db: database handle used for recipe rows and job bookkeeping.
        job_id: id of the discover job whose counters are updated.
        forge: enrichment client; forge.enrich_recipe(shaped) supplies
            the metadata persisted as meta_json.
        urls: recipe page URLs to process, in order.
    """
    log.info("[discover:%s] start (%d urls)", job_id, len(urls))

    def _cancelled() -> bool:
        # Any terminal state set from outside (cancel endpoint, recovery)
        # stops the walk.
        s = db.get_discover_job_state(job_id)
        return s in ("cancelled", "failed", "done")

    try:
        for url in urls:
            if _cancelled():
                log.info("[discover:%s] aborted (state changed)", job_id)
                return
            # Counted before scraping, so pages_scraped means "URLs walked".
            db.update_discover_job_progress(job_id, pages_delta=1)

            scraped = _scrape_one(url)
            if scraped is None:
                db.update_discover_job_progress(
                    job_id, error_delta=1, last_error=f"scrape failed: {url[:200]}"
                )
                continue
            shaped, image = scraped

            try:
                slug = _slug_from_url(url)
                discover_id = db.insert_discovered_recipe(
                    slug=slug,
                    source_url=url,
                    name=shaped.get("name") or None,
                    description=(shaped.get("description") or "")[:60000] or None,
                    image_url=image,
                    scraped_json=json.dumps(shaped, ensure_ascii=False),
                )
            except Exception as e:
                log.warning("[discover:%s] insert(%s) failed: %s", job_id, url, e)
                db.update_discover_job_progress(
                    job_id, error_delta=1, last_error=f"insert: {str(e)[:200]}"
                )
                continue
            if not discover_id:
                # UNIQUE conflict — already in the corpus from a prior scrape
                db.update_discover_job_progress(job_id, skipped_delta=1)
                continue

            try:
                meta = forge.enrich_recipe(shaped)
            except (ForgeError, RuntimeError) as e:
                msg = str(e)[:500]
                log.warning("[discover:%s] enrich(%s): %s", job_id, url, msg)
                db.update_discover_job_progress(
                    job_id, error_delta=1, last_error=f"enrich: {msg[:200]}"
                )
                # Leave the row in 'raw' so we can retry enrichment later.
                # The recipe IS in the corpus; just hasn't been classified.
                continue

            try:
                db.update_discovered_meta(
                    discover_id,
                    meta_json=json.dumps(meta, ensure_ascii=False),
                    version=DB.ENRICH_VERSION,
                )
                db.update_discover_job_progress(job_id, added_delta=1)
            except Exception as e:
                log.warning("[discover:%s] persist meta(%s): %s", job_id, url, e)
                db.update_discover_job_progress(
                    job_id, error_delta=1, last_error=f"persist: {str(e)[:200]}"
                )

        db.finalize_discover_job(job_id, state="done")
        log.info("[discover:%s] done", job_id)
    except Exception:
        log.exception("[discover:%s] crashed", job_id)
        try:
            db.finalize_discover_job(job_id, state="failed")
        except Exception:
            # Still best-effort (nothing else can be done from this
            # thread), but leave a trace instead of failing silently.
            log.exception("[discover:%s] could not mark job failed", job_id)
def spawn_thread(
    *,
    db: DB,
    job_id: int,
    forge: Forge,
    urls: list[str],
) -> threading.Thread:
    """Start run_discover for `job_id` on a background daemon thread.

    The thread is named "discover-recipes-<job_id>", is already started
    when this returns, and is handed back to the caller.
    """
    runner_kwargs = dict(db=db, job_id=job_id, forge=forge, urls=urls)
    worker = threading.Thread(
        target=run_discover,
        kwargs=runner_kwargs,
        name=f"discover-recipes-{job_id}",
        daemon=True,
    )
    worker.start()
    return worker

View file

@ -105,6 +105,50 @@ class Mealie:
def update_recipe(self, slug: str, body: dict) -> dict:
    """PUT `body` to /api/recipes/<slug> through self._put and return
    whatever that helper returns."""
    endpoint = f"/api/recipes/{slug}"
    return self._put(endpoint, body)
def import_from_url(
    self,
    url: str,
    *,
    include_tags: bool = False,
    include_categories: bool = False,
) -> str:
    """POST /api/recipes/create/url — Mealie scrapes the URL itself
    and creates a recipe row in the caller's household.

    After this lands, the household's existing sterilize+enrich
    pipelines will pick it up on next walk.

    Mealie does its own scraping with recipe_scrapers internally; we
    don't pass our scraped JSON. This keeps the import path canonical —
    the same code path as the user clicking "Import from URL" in
    Mealie's UI.

    Args:
        url: recipe page URL for Mealie to scrape.
        include_tags: sent as includeTags.
        include_categories: sent as includeCategories.

    Returns:
        The new recipe slug.

    Raises:
        MealieError: on a transport failure, an HTTP status >= 400, or a
            response from which no slug can be extracted.
    """
    body = {
        "url": url,
        "includeTags": bool(include_tags),
        "includeCategories": bool(include_categories),
    }
    try:
        r = self.session.post(
            f"{self.base_url}/api/recipes/create/url",
            json=body,
            timeout=60,
        )
    except requests.RequestException as e:
        raise MealieError(f"POST /api/recipes/create/url transport: {e}") from e
    if r.status_code >= 400:
        raise MealieError(
            f"POST /api/recipes/create/url -> {r.status_code}: {r.text[:300]}"
        )

    # Mealie returns the new slug as a bare JSON string. A body that is
    # not valid JSON raises a ValueError subclass; fall back to the raw
    # text with surrounding quotes removed.
    try:
        slug = r.json()
    except ValueError:
        slug = r.text.strip().strip('"')
    # Tolerate an object-shaped response as well.
    if isinstance(slug, dict):
        slug = slug.get("slug") or slug.get("id")
    if not isinstance(slug, str) or not slug:
        raise MealieError(f"create/url returned no slug: {r.text[:200]}")
    return slug
def delete_recipe(self, slug: str) -> dict:
"""DELETE /api/recipes/<slug>. Permanently removes the recipe and
its recipe_ingredient rows. Permission-scoped per-household.

View file

@ -33,7 +33,7 @@ from .config import load
from .crypto import TokenCrypto
from .db import DB
from .forge import Forge, ForgeError
from . import aggregator, bulk_sterilize, consolidate_foods, dedupe_recipes, enrich_recipes, foods
from . import aggregator, bulk_sterilize, consolidate_foods, dedupe_recipes, discover_recipes, enrich_recipes, foods
from .mealie import Mealie, MealieError
from .oidc import init_oauth
from .recipe_index import flatten_recipe, refresh_household_index, search_index
@ -132,6 +132,13 @@ def create_app() -> Flask:
except Exception as e:
app.logger.warning("enrich stuck-job recovery failed: %s", e)
try:
n_failed = db.fail_stuck_discover_jobs(stale_minutes=15)
if n_failed:
app.logger.info("failed %d stuck discover jobs at boot", n_failed)
except Exception as e:
app.logger.warning("discover stuck-job recovery failed: %s", e)
oauth = init_oauth(
app,
issuer=cfg.oidc_issuer,
@ -1865,6 +1872,166 @@ def create_app() -> Flask:
db.finalize_consolidate_job(job_id, state="cancelled")
return jsonify({"ok": True})
# ---------- Discover v0.1 (browse external recipes) ------------------
@app.get("/discover")
@require_session
def discover_page():
    """Render the discover browse page with per-status counts, the
    latest scrape job (if any) and the available seed lists."""
    # Discover is a global, cross-household corpus — no household
    # gate. But we still want a connected user before showing the
    # import buttons (since import targets the user's Mealie).
    status_counts = db.count_discovered_by_status()
    newest_job = db.latest_discover_job()
    seed_summary = discover_recipes.list_seeds()
    job_payload = None
    if newest_job:
        job_payload = _consolidate_job_payload(newest_job)
    return render_template(
        "discover.html",
        active="discover",
        counts=status_counts,
        latest_job=job_payload,
        seeds=seed_summary,
    )
@app.get("/api/discover/search")
@require_session
def discover_search():
    """Search/browse the discover corpus.

    Query parameters: q, status ('active' | 'all' | a literal status),
    cuisine, complexity, primary_protein, meal_type, kid_friendly_min,
    max_minutes, limit (default 60, clamped to 0..200) and offset
    (default 0, never negative). Integer parameters that are missing or
    not parseable are ignored rather than failing the request.

    Responds with {"recipes": [...], "count": n}; each row has meta_json
    decoded, timestamps as ISO strings and scraped_json removed.
    """
    args = request.args

    def _opt(name: str) -> str | None:
        v = (args.get(name) or "").strip()
        return v or None

    def _opt_int(name: str) -> int | None:
        v = (args.get(name) or "").strip()
        if not v:
            return None
        try:
            return int(v)
        except ValueError:
            return None

    q = _opt("q")
    # 'active' default — show enriched + raw together so newly-scraped
    # rows surface even before enrichment finishes. Imported/rejected
    # are hidden unless asked for explicitly (or via 'all').
    status_arg = (args.get("status") or "active").strip()
    if status_arg == "active":
        status: list[str] | str | None = ["enriched", "raw"]
    elif status_arg == "all":
        status = None
    else:
        status = status_arg

    # Parse paging with the same tolerant helper as the filters, so a
    # junk or negative value cannot raise here or reach the SQL LIMIT.
    limit = _opt_int("limit")
    if limit is None:
        limit = 60
    limit = max(0, min(limit, 200))
    offset = max(_opt_int("offset") or 0, 0)

    rows = db.list_discovered_recipes(
        status=status,
        q=q,
        cuisine=_opt("cuisine"),
        complexity=_opt("complexity"),
        primary_protein=_opt("primary_protein"),
        meal_type=_opt("meal_type"),
        kid_friendly_min=_opt_int("kid_friendly_min"),
        max_minutes=_opt_int("max_minutes"),
        limit=limit,
        offset=offset,
    )

    out = []
    for r in rows:
        meta = r.get("meta_json")
        if isinstance(meta, str):
            try:
                meta = _json_loads(meta)
            except Exception:
                # Undecodable metadata is reported as "no metadata".
                meta = None
        for k in ("scraped_at", "last_action_at"):
            v = r.get(k)
            if v is not None and hasattr(v, "isoformat"):
                r[k] = v.isoformat()
        # scraped_json can be heavy — drop it from list responses
        r.pop("scraped_json", None)
        r["meta_json"] = meta
        out.append(r)
    return jsonify({"recipes": out, "count": len(out)})
@app.post("/api/discover/import/<int:discover_id>")
@require_session
def discover_import(discover_id: int):
    """Import a discovered recipe into the current user's Mealie.

    404 if the row does not exist, 409 if it is already imported or the
    user has no Mealie client, 502 if Mealie rejects the import. On
    success the row is marked 'imported' and the new slug is returned.
    """
    recipe = db.get_discovered_recipe(discover_id)
    if not recipe:
        return jsonify({"error": "not_found"}), 404
    if recipe.get("status") == "imported":
        return jsonify({"error": "already_imported"}), 409

    mealie_client = current_user_mealie()
    if mealie_client is None:
        return jsonify({"error": "mealie_not_connected"}), 409

    try:
        created_slug = mealie_client.import_from_url(recipe["source_url"])
    except MealieError as e:
        failure = {"error": "mealie_import_failed", "detail": str(e)[:300]}
        return jsonify(failure), 502
    else:
        db.set_discovered_status(discover_id, "imported")
        return jsonify({"ok": True, "slug": created_slug})
@app.post("/api/discover/reject/<int:discover_id>")
@require_session
def discover_reject(discover_id: int):
    """Mark a discovered recipe as 'rejected'; 404 when the id is
    unknown."""
    if not db.get_discovered_recipe(discover_id):
        return jsonify({"error": "not_found"}), 404
    db.set_discovered_status(discover_id, "rejected")
    return jsonify({"ok": True})
@app.post("/api/discover/scrape-start")
@require_session
def discover_scrape_start():
    """Start a background scrape job.

    JSON body: either {"seed": "<name>"} naming a key of
    discover_recipes.SEED_URLS, or {"urls": [...]}. Only http(s) URLs
    are kept, capped at 50 per job.

    Responses: 409 if a job is already running; 400 for an unknown or
    non-string seed, a non-list `urls`, or no usable URLs; otherwise
    {"ok": True, "job_id": ..., "urls_queued": n}.
    """
    # NOTE(review): the only guard visible here is require_session, while
    # the UI labels this panel "admin". Any signed-in user can make the
    # server fetch arbitrary http(s) URLs — consider an admin check
    # and/or blocking private addresses. Confirm intent.
    u = session["user"]
    active = db.running_discover_job()
    if active:
        return jsonify({"error": "already_running", "job_id": active["id"]}), 409

    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array/scalar body has no fields to read; treat it like
        # an empty object instead of crashing on .get().
        body = {}

    raw_seed = body.get("seed")
    if raw_seed and not isinstance(raw_seed, str):
        return jsonify({"error": "seed must be a string"}), 400
    seed_name = raw_seed.strip() if isinstance(raw_seed, str) else ""

    urls: list[str] = []
    if seed_name:
        seeds = discover_recipes.SEED_URLS
        if seed_name not in seeds:
            return jsonify({"error": "unknown_seed", "available": list(seeds.keys())}), 400
        urls = list(seeds[seed_name])
    else:
        url_list = body.get("urls") or []
        if not isinstance(url_list, list):
            return jsonify({"error": "urls must be a list"}), 400
        urls = [s for s in (str(x).strip() for x in url_list) if s]
        seed_name = "manual"
    if not urls:
        return jsonify({"error": "no urls supplied"}), 400

    # Light sanity guard: scrub bad entries, cap at 50/job
    urls = [x for x in urls if x.startswith(("http://", "https://"))][:50]
    if not urls:
        return jsonify({"error": "no valid http(s) urls"}), 400

    job_id = db.create_discover_job(
        started_by_sub=u["sub"], source_seed=seed_name,
    )
    discover_recipes.spawn_thread(
        db=db, job_id=job_id, forge=forge, urls=urls,
    )
    return jsonify({"ok": True, "job_id": job_id, "urls_queued": len(urls)})
@app.get("/api/discover/scrape-status")
@require_session
def discover_scrape_status():
    """Report the most recent discover job as {"job": payload}, or
    {"job": None} when no job exists yet."""
    newest = db.latest_discover_job()
    payload = _consolidate_job_payload(newest) if newest else None
    return jsonify({"job": payload})
@app.post("/api/discover/scrape-cancel/<int:job_id>")
@require_session
def discover_scrape_cancel(job_id: int):
    """Cancel a running discover job.

    404 when the job does not exist, 409 when it is not in state
    'running'; otherwise the job is finalized as 'cancelled'.
    """
    job = db.get_discover_job(job_id)
    if not job:
        return jsonify({"error": "not_found"}), 404
    current_state = job["state"]
    if current_state != "running":
        return jsonify({"error": f"bad_state:{job['state']}"}), 409
    db.finalize_discover_job(job_id, state="cancelled")
    return jsonify({"ok": True})
# ---------- admin sterilizer (bearer-auth, kick off on user's behalf) -
@app.post("/api/admin/sterilize/bulk-start")

View file

@ -482,6 +482,7 @@ button { font-family: inherit; }
<a href="/picks" class="{% if active == 'picks' %}active{% endif %}">picks</a>
<a href="/plan" class="{% if active == 'plan' %}active{% endif %}">plan</a>
<a href="/list" class="{% if active == 'list' %}active{% endif %}">list</a>
<a href="/discover" class="{% if active == 'discover' %}active{% endif %}">discover</a>
<a href="/me" class="{% if active == 'me' %}active{% endif %}">me</a>
</nav>
<div class="topmeta">

View file

@ -0,0 +1,397 @@
{% extends "_base.html" %}
{% block title %}Discover · Cauldron{% endblock %}
{% block content %}
<style>
.filter-row { display:flex; flex-wrap:wrap; gap:10px; margin:10px 0 18px 0;
align-items:center; }
.filter-row label { font-family:var(--mono); font-size:11px;
color:var(--bone-dim); letter-spacing:.1em; text-transform:uppercase;
display:flex; flex-direction:column; gap:4px; }
.filter-row input[type=text], .filter-row input[type=number],
.filter-row select {
background:var(--bg-2); border:1px solid var(--line); color:var(--bone);
padding:6px 8px; border-radius:6px; font-family:var(--mono);
font-size:13px; min-width:130px; }
.filter-row input[type=text].search { min-width:240px; }
.grid { display:grid; gap:14px;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); }
.dcard { background:var(--bg-2); border:1px solid var(--line);
border-radius:10px; overflow:hidden; display:flex; flex-direction:column; }
.dcard .img { width:100%; aspect-ratio: 16/10;
background:var(--bg-1) center/cover no-repeat;
border-bottom:1px solid var(--line); }
.dcard .img.placeholder { display:flex; align-items:center;
justify-content:center; color:var(--muted); font-size:36px; }
.dcard .body { padding:12px 14px; flex:1; display:flex;
flex-direction:column; gap:8px; }
.dcard h3 { font-family:var(--serif); font-size:1.05em; margin:0;
color:var(--bone); line-height:1.2; }
.dcard .meta-line { font-family:var(--mono); font-size:11px;
color:var(--bone-dim); letter-spacing:.05em; }
.dcard .quip { font-family:var(--serif); font-style:italic;
color:var(--purple-bright); font-size:.92em; line-height:1.35; }
.dcard .desc { color:var(--muted); font-size:.9em; line-height:1.4;
display:-webkit-box; -webkit-line-clamp:3; -webkit-box-orient:vertical;
overflow:hidden; }
.dcard .src { font-family:var(--mono); font-size:10.5px;
color:var(--muted); text-transform:lowercase; }
.dcard .src a { color:var(--green-bright); text-decoration:none; }
.dcard .actions { display:flex; gap:8px; padding:10px 14px;
border-top:1px solid var(--line); background:var(--bg-1); }
.dcard .actions .btn { flex:1; }
.dcard.imported { opacity:.55; }
.dcard.rejected { opacity:.4; }
.dcard .raw-tag { display:inline-block; padding:2px 6px; font-size:10px;
border:1px solid var(--line); border-radius:4px; color:var(--muted);
font-family:var(--mono); letter-spacing:.1em; text-transform:uppercase; }
.empty { color:var(--muted); font-style:italic; padding:32px;
text-align:center; border:1px dashed var(--line); border-radius:8px; }
.seed-row { display:flex; flex-wrap:wrap; gap:6px; margin:8px 0; }
.seed-row .chip { cursor:pointer; }
.progress-rail { width:100%; height:10px; background:var(--bg-2);
border:1px solid var(--line); border-radius:6px; overflow:hidden;
margin:8px 0; }
.progress-fill { height:100%;
background:linear-gradient(90deg, var(--purple-deep), var(--purple-bright));
transition:width .3s ease; }
.progress-meta { color:var(--bone-dim); font-family:var(--mono);
font-size:11px; letter-spacing:.1em; display:flex; gap:14px; flex-wrap:wrap; }
.progress-meta strong { color:var(--bone); }
details.scrape-panel { margin-bottom:14px; }
details.scrape-panel summary { cursor:pointer; font-family:var(--mono);
font-size:12px; letter-spacing:.1em; color:var(--bone-dim);
text-transform:uppercase; padding:6px 0; }
details.scrape-panel summary:hover { color:var(--purple-bright); }
textarea#urls-input { width:100%; min-height:80px;
background:var(--bg-2); border:1px solid var(--line); color:var(--bone);
padding:8px; border-radius:6px; font-family:var(--mono); font-size:12px; }
</style>
<div class="page-head">
<div class="crumb">// discover · external recipe corpus</div>
<h1>recipe <span class="accent">discover</span></h1>
<div class="lede">
a cross-household library of recipes scraped from the open web —
each enriched by hecate so you can filter by cuisine, complexity,
primary protein, kid-friendliness, time. one click imports to your
mealie household; the same sterilize + enrich pipelines you already
trust run on it.
</div>
</div>
<section class="panel">
<div class="panel-head">
<h2>browse</h2>
<span class="ctx" id="status-line">
{{ counts.get('enriched', 0) }} enriched · {{ counts.get('raw', 0) }} raw ·
{{ counts.get('imported', 0) }} imported · {{ counts.get('rejected', 0) }} rejected
</span>
</div>
<div class="filter-row">
<label>search
<input type="text" id="q" class="search" placeholder="chicken, soup, ramen…">
</label>
<label>cuisine
<select id="cuisine">
<option value="">any</option>
<option>american</option><option>italian</option><option>asian</option>
<option>mexican</option><option>mediterranean</option>
<option>indian</option><option>french</option>
<option>middle-eastern</option><option>other</option>
</select>
</label>
<label>complexity
<select id="complexity">
<option value="">any</option>
<option>easy</option><option>medium</option><option>involved</option>
</select>
</label>
<label>protein
<select id="primary_protein">
<option value="">any</option>
<option>chicken</option><option>beef</option><option>pork</option>
<option>fish</option><option>seafood</option><option>tofu</option>
<option>tempeh</option><option>beans</option><option>eggs</option>
<option>cheese</option><option>nuts</option><option>none</option>
<option>mixed</option>
</select>
</label>
<label>meal type
<select id="meal_type">
<option value="">any</option>
<option>breakfast</option><option>lunch</option><option>dinner</option>
<option>snack</option><option>dessert</option><option>side</option>
</select>
</label>
<label>kid-fit ≥
<select id="kid_friendly_min">
<option value="">any</option>
<option value="1">1</option><option value="2">2</option>
<option value="3">3</option><option value="4">4</option>
<option value="5">5</option>
</select>
</label>
<label>max minutes
<input type="number" id="max_minutes" min="1" max="600" placeholder="—">
</label>
<label>status
<select id="status">
<option value="active" selected>enriched + raw</option>
<option value="enriched">enriched only</option>
<option value="all">all (incl imported/rejected)</option>
<option value="imported">imported</option>
<option value="rejected">rejected</option>
</select>
</label>
</div>
<details class="scrape-panel">
<summary>+ scrape new recipes (admin)</summary>
<p class="muted" style="margin-top:8px;">
kick off a background scrape from a curated seed list, or paste your own
urls (one per line, max 50). each url goes through scrape →
insert → hecate enrich.
</p>
<div class="seed-row">
{% for s in seeds %}
<button class="chip" type="button" onclick="seedSet('{{ s.name }}')">
{{ s.name }} <span class="muted">({{ s.count }})</span>
</button>
{% endfor %}
</div>
<textarea id="urls-input" placeholder="https://… one per line"></textarea>
<div class="btn-row" style="margin-top:8px;">
<button class="btn btn-purple" type="button" onclick="startScrape()">▸ start scrape</button>
<button class="btn" type="button" onclick="cancelScrape()" id="cancel-btn" style="display:none;">cancel</button>
</div>
<div id="scrape-progress" style="display:none; margin-top:12px;">
<div class="progress-rail"><div class="progress-fill" id="bar" style="width:0%;"></div></div>
<div class="progress-meta">
<span><strong id="pages">0</strong> walked</span>
<span><strong id="added">0</strong> added</span>
<span><strong id="skipped">0</strong> skipped</span>
<span><strong id="errors">0</strong> errors</span>
<span class="muted" id="last-error"></span>
</div>
</div>
</details>
<div id="grid" class="grid"></div>
<div id="empty-msg" class="empty" style="display:none;">no recipes match. widen filters or scrape some new ones.</div>
</section>
<script>
// Latest scrape job as serialized into the page by the server at render
// time, or null when the template had no job to report. Reassigned by
// startScrape() and fetchScrapeStatus().
let scrapeJob = {{ (latest_job | tojson) if latest_job else 'null' }};
// Handle returned by setInterval() while the status poll is active; null otherwise.
let scrapePoll = null;
// Handle returned by setTimeout() for the pending debounced search, if any.
let searchTimer = null;
/** Shorthand for document.getElementById(id); returns null when no element matches. */
function $(id){
  const node = document.getElementById(id);
  return node;
}
/**
 * HTML-escape a value for interpolation into markup.
 * null/undefined become the empty string; everything else is stringified
 * and has & < > " ' replaced by their entity forms.
 */
function _esc(s){
  if(s === null || s === undefined){
    return '';
  }
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const text = String(s);
  return text.replace(/[&<>"']/g, function(ch){ return entities[ch]; });
}
/**
 * Build the one-line metadata summary shown under a card title.
 * Without metadata, returns the "awaiting enrich" tag markup. Otherwise
 * returns the HTML-escaped, ' · '-joined list of the fields that are set:
 * cuisine (unless 'unknown'), complexity, primary protein (unless 'none'),
 * estimated minutes, and kid-friendly score (0 is a valid score).
 */
function _metaLine(meta){
  if(!meta){
    return '<span class="raw-tag">awaiting enrich</span>';
  }
  // Each entry pairs "should this field be shown?" with its display text.
  const candidates = [
    [meta.cuisine && meta.cuisine !== 'unknown', meta.cuisine],
    [meta.complexity, meta.complexity],
    [meta.primary_protein && meta.primary_protein !== 'none', meta.primary_protein],
    [meta.estimated_minutes, meta.estimated_minutes + ' min'],
    [meta.kid_friendly_score != null, 'kid:' + meta.kid_friendly_score],
  ];
  const shown = candidates
    .filter(function(pair){ return pair[0]; })
    .map(function(pair){ return pair[1]; });
  return _esc(shown.join(' · '));
}
/**
 * Build the HTML string for one discovered-recipe card.
 *
 * Reads r.id, r.name, r.description, r.image_url, r.source_url, r.status and
 * r.meta_json (the enrichment metadata object, if present). All text coming
 * from the scraped record is HTML-escaped before interpolation.
 *
 * Returns the card markup; never throws on a malformed source_url, so one
 * bad row cannot abort rendering of the whole grid.
 */
function _renderCard(r){
  const meta = r.meta_json || null;
  const quip = meta && meta.hecate_quip ? meta.hecate_quip : '';
  const desc = r.description || (meta && meta.summary) || '';
  const imgUrl = r.image_url || '';
  const klass = 'dcard ' + (r.status === 'imported' ? 'imported' :
    r.status === 'rejected' ? 'rejected' : '');

  // The HTML parser decodes entities in the style attribute before the CSS
  // parser runs, so _esc() alone cannot stop a quote or paren from closing
  // url('…'). Percent-encode the CSS-significant characters first.
  const cssUrl = String(imgUrl)
    .replace(/[\s"\\]/g, ch => encodeURIComponent(ch))
    .replace(/['()]/g, ch => '%' + ch.charCodeAt(0).toString(16).toUpperCase());
  const imgHtml = imgUrl
    ? `<div class="img" style="background-image:url('${_esc(cssUrl)}')"></div>`
    : `<div class="img placeholder">🍴</div>`;

  // Only http(s) sources become links: scraped data must not be able to
  // inject a javascript: (or other scheme) href. Unparseable or non-web
  // values are shown as plain escaped text instead of throwing.
  let srcHtml = '';
  if(r.source_url){
    let parsed = null;
    try { parsed = new URL(r.source_url); } catch(e){ parsed = null; }
    if(parsed && (parsed.protocol === 'http:' || parsed.protocol === 'https:')){
      srcHtml = `<a href="${_esc(parsed.href)}" target="_blank" rel="noopener noreferrer">${_esc(parsed.host)}</a>`;
    } else {
      srcHtml = _esc(r.source_url);
    }
  }

  let actionsHtml = '';
  if(r.status === 'imported'){
    actionsHtml = '<span class="muted" style="flex:1; text-align:center; font-family:var(--mono); font-size:11px;">✓ imported</span>';
  } else if(r.status === 'rejected'){
    actionsHtml = '<span class="muted" style="flex:1; text-align:center; font-family:var(--mono); font-size:11px;">✗ rejected</span>';
  } else {
    // Raw/enriched rows are still actionable.
    actionsHtml = `
<button class="btn btn-purple" type="button" onclick="importDiscover(${r.id}, this)">🍳 import</button>
<button class="btn" type="button" onclick="rejectDiscover(${r.id}, this)" title="hide from discover">✗ skip</button>
`;
  }
  return `
<div class="${klass}" data-id="${r.id}">
${imgHtml}
<div class="body">
<h3>${_esc(r.name || '(untitled)')}</h3>
<div class="meta-line">${_metaLine(meta)}</div>
${quip ? `<div class="quip">${_esc(quip)}</div>` : ''}
${desc ? `<div class="desc">${_esc(desc)}</div>` : ''}
<div class="src">${srcHtml}</div>
</div>
<div class="actions">${actionsHtml}</div>
</div>`;
}
/**
 * Query /api/discover/search with the current filter values and repaint
 * the card grid (and the empty-state message).
 *
 * Empty filter inputs are omitted from the query string. Failures are
 * logged to the console and leave the current grid untouched.
 */
async function refreshSearch(){
  const params = new URLSearchParams();
  for(const id of ['q','cuisine','complexity','primary_protein','meal_type','kid_friendly_min','max_minutes','status']){
    const v = $(id).value;
    if(v !== '') params.set(id, v);
  }
  // Tag each request so a slow, older response cannot overwrite the grid
  // after a newer search has already been issued.
  const seq = (refreshSearch._seq = (refreshSearch._seq || 0) + 1);
  try {
    const r = await fetch('/api/discover/search?' + params.toString());
    const d = await r.json().catch(() => ({}));
    // Without this check an error payload would render as "no recipes match".
    if(!r.ok) throw new Error(d.error || r.status);
    if(seq !== refreshSearch._seq) return;
    const recipes = d.recipes || [];
    $('grid').innerHTML = recipes.map(_renderCard).join('');
    $('empty-msg').style.display = recipes.length === 0 ? '' : 'none';
  } catch(e){
    console.error('search failed', e);
  }
}
/** Schedule refreshSearch() 250 ms from now, replacing any search already pending. */
function debouncedSearch(){
  const delayMs = 250;
  clearTimeout(searchTimer);
  searchTimer = setTimeout(refreshSearch, delayMs);
}
// Re-run the (debounced) search whenever any filter control is edited or committed.
['q','cuisine','complexity','primary_protein','meal_type','kid_friendly_min','max_minutes','status'].forEach(function(filterId){
  const control = $(filterId);
  ['input', 'change'].forEach(function(evt){
    control.addEventListener(evt, debouncedSearch);
  });
});
/**
 * POST /api/discover/import/<id> and, on success, mark the card as imported
 * in place. `btn` is the clicked import button; it is disabled while the
 * request is in flight and restored (with an alert) on failure.
 */
async function importDiscover(id, btn){
  btn.disabled = true; btn.textContent = 'importing…';
  try {
    const r = await fetch('/api/discover/import/' + id, { method:'POST' });
    // Error responses are not guaranteed to be JSON; fall back to an empty
    // object so the user sees the HTTP status instead of a parse error
    // (same handling as rejectDiscover).
    const d = await r.json().catch(() => ({}));
    if(!r.ok) throw new Error(d.error || r.status);
    // Mark card as imported in-place
    const card = btn.closest('.dcard');
    if(card){
      card.classList.add('imported');
      const slugHtml = d.slug ? ' as <code>' + _esc(d.slug) + '</code>' : '';
      card.querySelector('.actions').innerHTML =
        '<span class="muted" style="flex:1; text-align:center; font-family:var(--mono); font-size:11px;">✓ imported' + slugHtml + '</span>';
    }
  } catch(e){
    btn.disabled = false; btn.textContent = '🍳 import';
    alert('import failed: ' + e.message);
  }
}
/**
 * POST /api/discover/reject/<id> and remove the card from the grid on
 * success. `btn` is the clicked skip button; it is disabled during the
 * request and restored (with an alert) if the request fails.
 */
async function rejectDiscover(id, btn){
  btn.disabled = true;
  btn.textContent = '…';
  try {
    const resp = await fetch('/api/discover/reject/' + id, { method:'POST' });
    if(!resp.ok){
      // The error body may not be JSON; fall back to the bare status code.
      const payload = await resp.json().catch(function(){ return {}; });
      throw new Error(payload.error || resp.status);
    }
    const cardEl = btn.closest('.dcard');
    if(cardEl){
      cardEl.remove();
    }
  } catch(err){
    btn.disabled = false;
    btn.textContent = '✗ skip';
    alert('reject failed: ' + err.message);
  }
}
/**
 * Select a curated seed list: writes the '__seed:<name>' marker into the
 * url textarea (startScrape() recognizes that prefix) and records the name
 * in the textarea's data-seed attribute.
 */
function seedSet(name){
  const box = $('urls-input');
  box.value = '__seed:' + name;
  box.setAttribute('data-seed', name);
}
/**
 * Start a background scrape from the textarea contents.
 *
 * A value beginning with '__seed:' is sent as { seed: <name> }; anything
 * else is split into non-empty lines and sent as { urls: [...] }. On
 * success the local job is marked running and status polling begins.
 */
async function startScrape(){
  const txt = $('urls-input').value.trim();
  let body;
  if(txt.startsWith('__seed:')){
    body = { seed: txt.slice(7).trim() };
  } else {
    const urls = txt.split(/\r?\n/).map(s => s.trim()).filter(Boolean);
    if(urls.length === 0){ alert('paste some urls or pick a seed'); return; }
    body = { urls };
  }
  try {
    const r = await fetch('/api/discover/scrape-start', {
      method:'POST', headers:{'Content-Type':'application/json'},
      body: JSON.stringify(body),
    });
    // Error responses are not guaranteed to be JSON; fall back to an empty
    // object so the alert shows the HTTP status rather than a parse error.
    const d = await r.json().catch(() => ({}));
    if(!r.ok) throw new Error(d.error || r.status);
    // Optimistically show the job as running; the poll replaces this stub
    // with the server's view of the job.
    scrapeJob = { id: d.job_id, state: 'running' };
    paintScrape();
    pollScrape();
  } catch(e){
    alert('scrape start failed: ' + e.message);
  }
}
/**
 * Ask the server to cancel the current scrape job (after a confirm prompt)
 * and re-sync the panel with the server's job state. No-op when there is
 * no known job.
 */
async function cancelScrape(){
  if(!scrapeJob) return;
  if(!confirm('cancel scrape?')) return;
  try {
    const r = await fetch('/api/discover/scrape-cancel/' + scrapeJob.id, { method:'POST' });
    // Re-sync regardless of the outcome so the panel reflects the real state.
    await fetchScrapeStatus();
    // An HTTP-level rejection used to be silent; surface it like a network failure.
    if(!r.ok){
      const d = await r.json().catch(() => ({}));
      throw new Error(d.error || r.status);
    }
  } catch(e){
    alert('cancel failed: ' + e.message);
  }
}
/**
 * Fetch /api/discover/scrape-status, store the reported job in `scrapeJob`
 * (null when the response carries none) and repaint the scrape panel.
 * Failures are logged and leave the last known job state in place.
 */
async function fetchScrapeStatus(){
  try {
    const r = await fetch('/api/discover/scrape-status');
    // A JSON error payload has no `job`; treating it as "no job" would hide
    // the progress panel and stop polling while a scrape may still be running.
    if(!r.ok) throw new Error('HTTP ' + r.status);
    const d = await r.json();
    scrapeJob = d.job || null;
    paintScrape();
  } catch(e){
    console.error('scrape status failed', e);
  }
}
/**
 * Render the scrape panel from the current `scrapeJob`.
 *
 * - no job: progress panel and cancel button hidden, polling stopped.
 * - job not running: progress panel shown, cancel hidden, polling stopped.
 * - job running: progress panel and cancel button shown.
 * Counters and the last error are copied from the job; when the job is in
 * a terminal state (done / cancelled / failed) the result grid is refreshed.
 */
function paintScrape(){
  const j = scrapeJob;
  const pp = $('scrape-progress');
  const cb = $('cancel-btn');
  if(!j || j.state !== 'running'){
    pp.style.display = j ? '' : 'none';
    cb.style.display = 'none';
    stopPollScrape();
  } else {
    pp.style.display = '';
    cb.style.display = '';
  }
  if(!j) return;
  const pages = j.pages_scraped || 0;
  $('pages').textContent = pages;
  $('added').textContent = j.recipes_added || 0;
  $('skipped').textContent = j.skipped_count || 0;
  $('errors').textContent = j.error_count || 0;
  $('last-error').textContent = j.last_error ? '· ' + j.last_error : '';
  // The job reports no total, so an absolute percentage is impossible.
  // While running, the bar cycles through 0–94% as pages are walked
  // (the modulo already keeps it below 95, so no extra clamp is needed).
  if(j.state === 'running'){
    $('bar').style.width = ((pages * 7) % 95) + '%';
  } else if(j.state === 'done'){
    $('bar').style.width = '100%';
  }
  if(j.state === 'done' || j.state === 'cancelled' || j.state === 'failed'){
    // refresh the grid so any new rows appear
    refreshSearch();
  }
}
/** Begin polling scrape status every 2 seconds; does nothing if a poll is already active. */
function pollScrape(){
  if(scrapePoll){
    return;
  }
  scrapePoll = setInterval(fetchScrapeStatus, 2000);
}
/** Stop the scrape-status poll, if one is active, and clear its handle. */
function stopPollScrape(){
  if(!scrapePoll){
    return;
  }
  clearInterval(scrapePoll);
  scrapePoll = null;
}
// Initial paint: render the scrape panel from the job state embedded in the
// page, resume status polling only if that job is still running, then load
// the first set of search results with the default filter values.
paintScrape();
if(scrapeJob && scrapeJob.state === 'running') pollScrape();
refreshSearch();
</script>
{% endblock %}

View file

@ -65,6 +65,9 @@
<p class="muted" style="margin-top:14px;">have hecate generate per-recipe metadata — cuisine, complexity, macros, primary protein/carb, comfort tier, summary. the plan generator reads this so "high protein week" is a real query, not just a vibe.</p>
<p><a class="btn" href="/enrich-recipes">✨ enrich recipes →</a></p>
<p class="muted" style="margin-top:14px;">browse a cross-household corpus of scraped recipes — search by cuisine / protein / time / kid-friendliness. one click sends a recipe to your mealie library; sterilize+enrich pipelines run on it like any other.</p>
<p><a class="btn" href="/discover">🌐 discover recipes →</a></p>
</section>
{% endif %}

View file

@ -5,3 +5,4 @@ Authlib==1.3.2
PyMySQL==1.1.1
cryptography==43.0.3
rapidfuzz==3.10.1
recipe-scrapers==15.6.0