diff --git a/cauldron/aggregator.py b/cauldron/aggregator.py index 63e4059..6908b9e 100644 --- a/cauldron/aggregator.py +++ b/cauldron/aggregator.py @@ -187,11 +187,20 @@ def _aggregate_one_food( meta: dict, ) -> list[ShoppingLine]: """All ingredients for ONE food → 1+ ShoppingLines.""" - # Bucket by unit class + # Bucket by unit class. Ingredients with qty=None go to a separate + # `no_qty` bucket so they DON'T silently disappear from the shopping + # list when Mealie's parser couldn't extract a number (audit F-15 + # domain, 2026-05-02). The killer feature should surface "buy onion" + # even if the source recipe just said "1 onion, chopped" without a + # parseable quantity. buckets: dict[str, list[tuple[Ingredient, float]]] = { "mass": [], "volume": [], "count": [], "vague": [], "unknown": [], } + no_qty_items: list[Ingredient] = [] for ing in items: + if ing.qty is None and (ing.unit or "").strip() == "": + no_qty_items.append(ing) + continue cls = classify_unit(ing.unit) buckets[cls].append((ing, ing.qty if ing.qty is not None else 0.0)) @@ -202,6 +211,12 @@ def _aggregate_one_food( for i in items if (i.original_text or i.qty is not None or i.note) ] + # Pull no-qty original-text contributors into the contribs list so + # they appear under whatever line we emit (or the standalone fallback) + for ing in no_qty_items: + text = ing.original_text or _render(ing) + if text and text not in contribs: + contribs.append(text) density = float(meta.get("density_g_per_ml") or 0) or None @@ -272,6 +287,25 @@ def _aggregate_one_food( is_split=True, )) + # qty=None safety net: if every contributor was a no-qty ingredient + # (Mealie's parser couldn't extract a number), nothing else above + # produced a line. Emit a placeholder so the food APPEARS on the + # shopping list — Abby still needs to know to buy onions even if the + # recipe just said "1 onion, chopped". UI surfaces this as a + # "qty unspecified" hint, nudging Cobb to run sterilize. 
+ if no_qty_items and not lines: + lines.append(ShoppingLine( + food=food, qty=None, unit="ea", + contributors=contribs, + notes=notes_acc + ["quantity unspecified — re-sterilize for an exact total"], + )) + elif no_qty_items and lines: + # We DID emit a sized line — still flag that some contributors + # had unknown qty so the user knows the total may be incomplete. + lines[0].notes.append( + f"+ {len(no_qty_items)} ingredient(s) with no quantity" + ) + return lines diff --git a/cauldron/db.py b/cauldron/db.py index 9b9329a..eec29fe 100644 --- a/cauldron/db.py +++ b/cauldron/db.py @@ -481,10 +481,12 @@ MIGRATIONS = [ """, # 029 — Old unique key was (plan_id, day) — now needs to include # meal_type so a Monday can have breakfast AND lunch AND dinner. - # Drop-and-add, idempotent: catch the "doesn't exist" if already done. + # Drop-and-add, idempotent: IF EXISTS so a partial-failure retry + # doesn't brick boot if the index was already dropped before + # schema_migrations recorded the version. """ ALTER TABLE cauldron_meal_plan_slots - DROP INDEX uk_plan_day + DROP INDEX IF EXISTS uk_plan_day """, """ ALTER TABLE cauldron_meal_plan_slots @@ -604,6 +606,25 @@ MIGRATIONS = [ FOREIGN KEY (household_id) REFERENCES cauldron_households(id) ON DELETE CASCADE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 """, + # 039 — Per-household discover skips. Replaces the global + # `cauldron_discovered_recipes.status='rejected'` write — that flipped + # the row for EVERY household in EVERY group (audit finding F-2 routes, + # 2026-05-02). Different households have different tastes; skip is + # per-household. The global status column stays for spam URLs an admin + # wants to nuke for everyone (set via a future bearer-only endpoint); + # routine "not interested" goes here. 
+ """ + CREATE TABLE IF NOT EXISTS cauldron_discover_skips ( + discover_id BIGINT NOT NULL, + household_id BIGINT NOT NULL, + skipped_by_sub VARCHAR(190), + skipped_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (discover_id, household_id), + INDEX idx_household (household_id), + FOREIGN KEY (discover_id) REFERENCES cauldron_discovered_recipes(id) ON DELETE CASCADE, + FOREIGN KEY (household_id) REFERENCES cauldron_households(id) ON DELETE CASCADE + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 + """, ] @@ -2338,44 +2359,95 @@ class DB: row["imported_at"] = row["imported_at"].isoformat() return row + def record_discover_skip( + self, + *, + discover_id: int, + household_id: int, + skipped_by_sub: str | None, + ) -> None: + """Per-household 'skip from discover' — INSERT IGNORE so re-clicking + is a no-op. PK is (discover_id, household_id). Only the caller's + household is affected; other households still see the row.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """INSERT IGNORE INTO cauldron_discover_skips + (discover_id, household_id, skipped_by_sub) + VALUES (%s, %s, %s)""", + (discover_id, household_id, skipped_by_sub), + ) + + def unskip_discover( + self, *, discover_id: int, household_id: int + ) -> int: + """Undo a per-household skip. 
Returns rowcount.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """DELETE FROM cauldron_discover_skips + WHERE discover_id=%s AND household_id=%s""", + (discover_id, household_id), + ) + return cur.rowcount + + def get_skipped_discover_ids_for_household( + self, *, household_id: int + ) -> set[int]: + """Used by /api/discover/search to filter out the caller's + household's skipped rows from the default view.""" + with self.conn() as c, c.cursor() as cur: + cur.execute( + """SELECT discover_id FROM cauldron_discover_skips + WHERE household_id=%s""", + (household_id,), + ) + return {r["discover_id"] for r in (cur.fetchall() or [])} + def list_discover_eligible_for_group( - self, *, mealie_group_id: str | None, limit: int = 100 + self, + *, + mealie_group_id: str | None, + skipping_household_id: int | None = None, + limit: int = 100, ) -> list[dict]: """Return enriched discover rows that no household in the given Mealie - group has imported yet. Used by Flavor B's pool builder — these are - the 'new gems' Hecate can suggest alongside library recipes. + group has imported yet — and that the caller's household has not + skipped. Used by Flavor B's pool builder. + + Switched 2026-05-02 from LEFT JOIN to NOT EXISTS subqueries to fix + F-5 data (LIMIT-shrink from cross-group import multiplication) and + to add the per-household skip filter cleanly. 
If `mealie_group_id` is None we still return the unimported global list (no group context yet — first-boot edge case).""" + clauses = ["d.status = 'enriched'"] + args: list = [] + if mealie_group_id: + clauses.append( + "NOT EXISTS (SELECT 1 FROM cauldron_discover_imports i " + "JOIN cauldron_households h ON h.id = i.household_id " + "WHERE i.discover_id = d.id AND h.mealie_group_id = %s)" + ) + args.append(mealie_group_id) + else: + clauses.append( + "NOT EXISTS (SELECT 1 FROM cauldron_discover_imports i " + "WHERE i.discover_id = d.id)" + ) + if skipping_household_id is not None: + clauses.append( + "NOT EXISTS (SELECT 1 FROM cauldron_discover_skips s " + "WHERE s.discover_id = d.id AND s.household_id = %s)" + ) + args.append(int(skipping_household_id)) + sql = ( + "SELECT d.id, d.slug, d.source_url, d.name, d.description, " + "d.image_url, d.meta_json FROM cauldron_discovered_recipes d " + f"WHERE {' AND '.join(clauses)} " + "ORDER BY d.scraped_at DESC LIMIT %s" + ) + args.append(int(limit)) with self.conn() as c, c.cursor() as cur: - if mealie_group_id: - cur.execute( - """SELECT d.id, d.slug, d.source_url, d.name, d.description, - d.image_url, d.meta_json - FROM cauldron_discovered_recipes d - LEFT JOIN cauldron_discover_imports i - ON i.discover_id = d.id - LEFT JOIN cauldron_households h - ON h.id = i.household_id - AND h.mealie_group_id = %s - WHERE d.status = 'enriched' - AND h.id IS NULL - ORDER BY d.scraped_at DESC - LIMIT %s""", - (mealie_group_id, int(limit)), - ) - else: - cur.execute( - """SELECT d.id, d.slug, d.source_url, d.name, d.description, - d.image_url, d.meta_json - FROM cauldron_discovered_recipes d - LEFT JOIN cauldron_discover_imports i ON i.discover_id = d.id - WHERE d.status = 'enriched' - AND i.discover_id IS NULL - ORDER BY d.scraped_at DESC - LIMIT %s""", - (int(limit),), - ) + cur.execute(sql, args) return list(cur.fetchall() or []) def get_discovered_recipe(self, discover_id: int) -> dict | None: diff --git 
a/cauldron/discover_recipes.py b/cauldron/discover_recipes.py index 16692e2..e973035 100644 --- a/cauldron/discover_recipes.py +++ b/cauldron/discover_recipes.py @@ -22,6 +22,9 @@ import logging import threading from urllib.parse import urlparse +import ipaddress as _ipaddr +import socket as _socket + from .db import DB from .forge import Forge, ForgeError @@ -80,6 +83,67 @@ def list_seeds() -> list[dict]: return [{"name": k, "count": len(v)} for k, v in SEED_URLS.items()] +def is_public_url(url: str) -> tuple[bool, str]: + """SSRF guard: validate that `url` resolves to a non-private, + non-loopback, non-link-local IP. Returns (ok, reason). + + Used by both `/api/discover/scrape-start` (pre-queue rejection) and + `_scrape_one` (defense-in-depth before fetch). Audit finding F-1 + (CRIT, 2026-05-02): without this, any session user could queue URLs + pointing at Lucy's LAN, the docker bridge, or cloud metadata + endpoints (169.254.169.254, etc). + + Strategy (fails closed: any lookup error or odd answer is a rejection): + 1. Parse the URL. Reject non-http(s) schemes. + 2. Extract hostname. Reject if IP-literal pointing at private space. + 3. Resolve via getaddrinfo (covers IPv4 + IPv6 + IDNs). + 4. For each resolved address, reject if .is_private, .is_loopback, + .is_link_local, .is_multicast, .is_reserved, .is_unspecified, or not .is_global. + + Note: this is best-effort. A malicious resolver could DNS-rebind + between this check and the actual GET. recipe-scrapers also makes + its own HTTP calls in scrape_me — those bypass this guard. Acceptable + for v0.1 (LAN-only deployment, family OIDC). 
The right answer + long-term is a custom requests transport that re-validates per- + connection like Mealie's pkgs/safehttp.""" + from urllib.parse import urlparse + try: + parsed = urlparse(url) + except Exception as e: + return (False, f"unparseable url: {e}") + if parsed.scheme not in ("http", "https"): + return (False, f"scheme not allowed: {parsed.scheme!r}") + host = parsed.hostname + if not host: + return (False, "no hostname in url") + # Reject IP-literals pointing into private / loopback / link-local / + # multicast / reserved / non-global space directly (saves a DNS roundtrip too). + try: + ip = _ipaddr.ip_address(host) + if (ip.is_private or ip.is_loopback or ip.is_link_local + or ip.is_multicast or ip.is_reserved or ip.is_unspecified or not ip.is_global): + return (False, f"ip in restricted range: {ip}") + return (True, "") + except ValueError: + pass # not an IP literal — resolve via DNS + + # Resolve and check every returned address; any failure rejects (fail closed). + try: + infos = _socket.getaddrinfo(host, None, type=_socket.SOCK_STREAM) + except (OSError, UnicodeError) as e: # gaierror is an OSError; UnicodeError = bad IDNA label + return (False, f"dns resolution failed: {e}") + for info in infos: + addr = info[4][0] + try: + ip = _ipaddr.ip_address(addr) + except ValueError: + return (False, f"unparseable resolved address: {addr!r} (host={host})") + if (ip.is_private or ip.is_loopback or ip.is_link_local + or ip.is_multicast or ip.is_reserved or ip.is_unspecified or not ip.is_global): + return (False, f"resolves to restricted ip: {ip} (host={host})") + return (True, "") if infos else (False, f"dns returned no addresses (host={host})") + + def _slug_from_url(url: str) -> str | None: """Cheap slug fallback when the scraper doesn't expose one.""" try: @@ -139,6 +203,14 @@ def _to_mealie_shape(scraper, source_url: str) -> dict: def _scrape_one(url: str) -> tuple[dict, str | None] | None: """Scrape a single URL. Returns (mealie_shape_dict, image_url) on success. Returns None on any unrecoverable scraper error.""" + # SSRF defense-in-depth: even though /api/discover/scrape-start + # validates URLs at queue time, re-check here so any future caller + # (cron, admin script, future bulk runner) can't bypass it. 
+ ok, reason = is_public_url(url) + if not ok: + log.warning("[discover] refusing private/restricted url %s (%s)", url, reason) + return None + try: from recipe_scrapers import scrape_me # type: ignore except ImportError: diff --git a/cauldron/forge.py b/cauldron/forge.py index c3a127f..ac4cec0 100644 --- a/cauldron/forge.py +++ b/cauldron/forge.py @@ -839,7 +839,14 @@ class Forge: "- When uncertain on a categorical, use 'unknown' or 'other' rather than guessing." ) result = self.run(prompt, model=model or "sonnet", timeout_secs=180) - return _extract_recipe_meta(result) + meta = _extract_recipe_meta(result) + # Anaphylaxis safety belt: regex-override `contains.*` toward TRUE + # for any anaphylaxis-class allergen detected in the raw ingredient + # text. Defends against prompt-injection via discover-scraped fields. + meta["contains"] = apply_allergen_safety_override( + meta.get("contains"), "\n".join(ing_lines) + ) + return meta def verify_allergens( self, @@ -904,9 +911,17 @@ class Forge: try: result = self.run(prompt, model=model or "sonnet", timeout_secs=60) except ForgeError: - # Verification failed — fall back to prior contains; better than nothing - return prior_contains or {} - return _extract_allergen_verification(result, prior_contains or {}) + # Verification failed — fall back to prior contains; better than nothing. + # Still apply the regex safety belt so we don't trust unverified prior + # data on anaphylaxis-class allergens. + return apply_allergen_safety_override( + prior_contains or {}, "\n".join(ing_lines) + ) + verified = _extract_allergen_verification(result, prior_contains or {}) + # Anaphylaxis safety belt — regex-override toward TRUE for the + # seven anaphylaxis-class allergens. Sonnet's TRUEs are preserved; + # only FALSEs that contradict regex evidence get flipped. 
+ return apply_allergen_safety_override(verified, "\n".join(ing_lines)) def suggest_recipes( self, @@ -1064,6 +1079,8 @@ class Forge: '{"suggestions": [{"recipe_slug": "...", "fit_score": 1-5, "reason": "..."}]}\n\n' "Rules:\n" f"- Exactly {target} suggestion(s); each recipe_slug MUST be in the pool above\n" + "- recipe_slug values MUST be returned VERBATIM as shown — INCLUDING the " + "`lib:` or `disc:` prefix when present. Do not strip, simplify, or 'clean' these slugs.\n" "- VARIETY: don't pick 3 of the same cuisine or 3 of the same primary_protein\n" "- BIAS toward recipes whose meta best fits the family's picker profiles " "and any week preference/targets stated above\n" @@ -1140,6 +1157,111 @@ class Forge: return _extract_food_info(result) +# Anaphylaxis-class allergen safety override. +# +# The enrich + verify_allergens prompts interpolate user-controlled fields +# (recipe NAME, DESCRIPTION, INGREDIENTS, STEPS) verbatim into Sonnet's prompt. +# An adversarial discover-scraped recipe can theoretically inject text that +# nudges Sonnet to flip `contains.<allergen>` to FALSE. The audit doc calls +# this F-6 (CRITICAL). +# +# This override is a belt-and-suspenders safety check: after Sonnet produces +# `contains.*` booleans, we run regex pattern-matching on the raw ingredient +# text. For the SEVEN anaphylaxis-class allergens (peanuts, tree nuts, shellfish, +# fish, eggs, sesame, dairy), if a regex match is found, we force the bool to +# TRUE regardless of what Sonnet said. False positives are recoverable; an +# undetected anaphylaxis-class allergen is not. +# +# Pork/soy/gluten are NOT auto-overridden because they're either +# religious/dietary (pork) or so common that false positives would block +# 90 % of recipes (gluten via flour mentions in step text). 
+import re as _re + +_ALLERGEN_SAFETY_PATTERNS: dict[str, _re.Pattern] = { + "peanuts": _re.compile( + r"\b(peanut|peanuts|peanut\s*butter|peanut\s*oil|groundnuts?)\b", + _re.IGNORECASE, + ), + "nuts": _re.compile( + r"\b(almond|almonds|cashew|cashews|pecan|pecans|walnut|walnuts|" + r"pistachio|pistachios|hazelnut|hazelnuts|brazil\s*nuts?|" + r"macadamias?|pine\s*nut|pine\s*nuts|chestnut|chestnuts|" + r"pralines?|nutella|frangipane|marzipan|amaretto|almond\s*flour|" + r"almond\s*meal|almond\s*milk|almond\s*butter|cashew\s*butter|" + r"hazelnut\s*spread|tree\s*nut|tree\s*nuts)\b", + _re.IGNORECASE, + ), + "shellfish": _re.compile( + r"\b(shrimps?|prawn|prawns|lobsters?|crabs?|crabmeat|scallop|scallops|" + r"clam|clams|mussel|mussels|oyster|oysters|crayfish|langoustines?|" + r"crawfish|krill|squid|calamari|octopus|cuttlefish)\b", + _re.IGNORECASE, + ), + "fish": _re.compile( + r"\b(salmon|tuna|cod|tilapia|halibut|anchovy|anchovies|" + r"sardine|sardines|trout|mackerel|snapper|sea\s*bass|" + r"swordfish|monkfish|perch|herring|mahi|mahi-mahi|" + r"flounder|sole|catfish|grouper|haddock|pollock|" + r"sea\s*bream|branzino|barramundi|bonito|kingfish|" + r"fish\s*sauce|nuoc\s*mam|fish\s*stock|fish\s*paste|" + r"worcestershire)\b", + _re.IGNORECASE, + ), + "eggs": _re.compile( + r"\b(egg|eggs|egg\s*yolk|egg\s*yolks|egg\s*white|egg\s*whites|" + r"mayonnaise|mayo|aioli|hollandaise|carbonara|meringue|" + r"egg\s*wash|albumen|ovalbumin)\b", + _re.IGNORECASE, + ), + "sesame": _re.compile( + r"\b(sesame|tahini|gomashio|gomasio|halva|halvah|benne|" + r"sesame\s*oil|sesame\s*seed|sesame\s*seeds|sesame\s*paste)\b", + _re.IGNORECASE, + ), + "dairy": _re.compile( + r"\b(milk|cream|butter|cheese|yogurt|yoghurt|ghee|whey|casein|" + r"mascarpone|ricotta|feta|parmesan|parmigiano|mozzarella|" + r"cheddar|gouda|brie|camembert|gruyere|provolone|asiago|" + r"fontina|manchego|pecorino|goat\s*cheese|cream\s*cheese|" + r"sour\s*cream|crème\s*fraîche|creme\s*fraiche|buttermilk|" + 
r"half-and-half|half\s*and\s*half|condensed\s*milk|" + r"evaporated\s*milk|powdered\s*milk|dry\s*milk|" + r"clotted\s*cream|crema|leche|burrata|halloumi|paneer|" + r"queso|kefir|skyr|labneh|quark|cottage\s*cheese|" + r"ice\s*cream|gelato|custard)\b", + _re.IGNORECASE, + ), +} + + +def apply_allergen_safety_override( + contains: dict | None, raw_ingredient_text: str +) -> dict: + """Belt-and-suspenders: force `contains.<allergen>` TRUE for any + anaphylaxis-class allergen where a regex pattern matches the raw + ingredient text. Mitigation for prompt-injection attacks where Sonnet + is talked into setting a real allergen to FALSE. + + Inputs: + - `contains`: the dict from Sonnet (may be None, partial, or not a dict) + - `raw_ingredient_text`: joined string of all ingredient names/notes + as scraped from the source. Step text is intentionally excluded + because it's noisier (step text mentions ingredients used vs. not). + + Returns: a new dict with anaphylaxis-class allergens forced TRUE on + regex match. Sonnet's other allergen calls (pork/soy/gluten) are + preserved as-is. Original `contains` is not mutated. + """ + out = dict(contains) if isinstance(contains, dict) else {} # malformed model output must not crash the belt + if not raw_ingredient_text: + return out + for allergen, pat in _ALLERGEN_SAFETY_PATTERNS.items(): + if pat.search(raw_ingredient_text): + # Override toward TRUE only — never FALSE-stomp Sonnet's TRUE + out[allergen] = True + return out + + def _extract_allergen_verification(forge_result: dict, prior: dict) -> dict: """Pull the corrected contains dict out of the verify_allergens reply. Falls back to prior on any shape problem — verification is best-effort.""" diff --git a/cauldron/server.py b/cauldron/server.py index dda48b5..7a70b26 100644 --- a/cauldron/server.py +++ b/cauldron/server.py @@ -284,8 +284,13 @@ def create_app() -> Flask: @app.get("/login") def login(): - # Stash where to go after login + # Stash where to go after login. 
Validate same-origin path to + # close the open-redirect surface — `next=https://evil.example/...` + # would otherwise route an authenticated user to an attacker page + # right after OIDC handshake. Audit F-3a routes 2026-05-02. nxt = request.args.get("next") or "/me" + if not nxt.startswith("/") or nxt.startswith("//") or nxt.startswith("/\\"): + nxt = "/me" session["post_login_next"] = nxt return oauth.cauldron.authorize_redirect(cfg.oidc_redirect_uri) @@ -1172,7 +1177,9 @@ def create_app() -> Flask: my_household = db.get_household(hid) my_group_id = (my_household or {}).get("mealie_group_id") discover_rows = db.list_discover_eligible_for_group( - mealie_group_id=my_group_id, limit=80 + mealie_group_id=my_group_id, + skipping_household_id=hid, + limit=80, ) discover_by_id: dict[int, dict] = {} for d in discover_rows: @@ -2080,7 +2087,9 @@ def create_app() -> Flask: status_arg = (args.get("status") or "active").strip() if status_arg == "active": status: list[str] | str | None = ["enriched", "raw"] - elif status_arg == "all": + elif status_arg in ("all", "skipped"): + # `skipped` is a per-household virtual status, not a DB value + # — pull all rows and let the post-filter handle it. status = None else: status = status_arg @@ -2123,9 +2132,24 @@ def create_app() -> Flask: db.get_discover_imports_for_group(mealie_group_id=my_group_id) if my_group_id else {} ) + # Per-household skips (audit F-2 routes — was a global flip). + # Default view filters out rows the caller's household has skipped; + # `?status=skipped` surfaces them so the user can unskip if needed. + my_skipped = ( + db.get_skipped_discover_ids_for_household(household_id=my_hid) + if my_hid else set() + ) + show_skipped = (status_arg == "skipped") out = [] for r in rows: + is_skipped = r["id"] in my_skipped + if show_skipped and not is_skipped: + # `?status=skipped` view — only skipped rows for this household. 
+ continue + if not show_skipped and is_skipped and status_arg in ("active", "enriched"): + # Default browse — hide rows this household has skipped. + continue meta = r.get("meta_json") if isinstance(meta, str): try: @@ -2139,6 +2163,7 @@ def create_app() -> Flask: # scraped_json can be heavy — drop it from list responses r.pop("scraped_json", None) r["meta_json"] = meta + r["skipped_by_my_household"] = is_skipped imp = group_imports.get(r["id"]) if imp: @@ -2195,11 +2220,25 @@ def create_app() -> Flask: @app.post("/api/discover/reject/<int:discover_id>") @require_session def discover_reject(discover_id: int): + """Per-household 'skip from discover'. Audit F-2 routes 2026-05-02: + previously this flipped the GLOBAL `cauldron_discovered_recipes.status + = 'rejected'` field, hiding the recipe from every household in every + group. Now writes to `cauldron_discover_skips(discover_id, household_id)` + — only the caller's household stops seeing it; other households are + unaffected. Different households have different tastes.""" + u = session["user"] row = db.get_discovered_recipe(discover_id) if not row: return jsonify({"error": "not_found"}), 404 - db.set_discovered_status(discover_id, "rejected") - return jsonify({"ok": True}) + hid = current_household_id() + if not hid: + return jsonify({"error": "no_household"}), 409 + db.record_discover_skip( + discover_id=discover_id, + household_id=hid, + skipped_by_sub=u["sub"], + ) + return jsonify({"ok": True, "scope": "household"}) @app.post("/api/discover/scrape-start") @require_session @@ -2228,13 +2267,36 @@ def create_app() -> Flask: urls = [x for x in urls if x.startswith(("http://", "https://"))][:50] if not urls: return jsonify({"error": "no valid http(s) urls"}), 400 + # SSRF guard (audit F-1 routes, CRIT, 2026-05-02): every URL must + # resolve to a public IP. Reject any that hit private / loopback / + # link-local / multicast / reserved space — those are LAN, docker + # bridge, or cloud metadata endpoints. 
Apply BEFORE queueing so + # the caller gets a clean error per bad URL. + accepted: list[str] = [] + rejected: list[dict] = [] + for u_url in urls: + ok, reason = discover_recipes.is_public_url(u_url) + if ok: + accepted.append(u_url) + else: + rejected.append({"url": u_url, "reason": reason}) + if not accepted: + return jsonify({ + "error": "no_safe_urls", + "rejected": rejected, + }), 400 job_id = db.create_discover_job( started_by_sub=u["sub"], source_seed=seed_name, ) discover_recipes.spawn_thread( - db=db, job_id=job_id, forge=forge, urls=urls, + db=db, job_id=job_id, forge=forge, urls=accepted, ) - return jsonify({"ok": True, "job_id": job_id, "urls_queued": len(urls)}) + return jsonify({ + "ok": True, + "job_id": job_id, + "urls_queued": len(accepted), + "rejected": rejected, + }) @app.get("/api/discover/scrape-status") @require_session diff --git a/cauldron/sterilizer.py b/cauldron/sterilizer.py index 1a9cc0e..8674b0b 100644 --- a/cauldron/sterilizer.py +++ b/cauldron/sterilizer.py @@ -220,13 +220,27 @@ class Sterilizer: return {"slug": slug, "updated": 0, "skipped": 0, "created_foods": [], "created_units": []} recipe = self.mealie.get_recipe(slug) + # Mid-flight edit guard: preview was built from one snapshot of the + # recipe; we just re-fetched. If the user added or removed an + # ingredient in between (Sonnet preview can take 60-300s), Python's + # zip would silently truncate to the shorter list, dropping the + # user's edit OR mis-aligning proposals with new ingredient slots. + # Refuse to apply rather than scribble. Bulk runner catches the + # RuntimeError and marks the proposal stale; user retries cleanly. 
+ mealie_ings = recipe.get("recipeIngredient") or [] + if len(mealie_ings) != len(proposals): + raise RuntimeError( + f"recipe edited mid-flight: preview had {len(proposals)} ingredients " + f"but Mealie now has {len(mealie_ings)} — re-run sterilize on this slug" + ) + food_index = self._build_name_index(self.mealie.list_foods()) unit_index = self._build_name_index(self.mealie.list_units()) created_foods: list[str] = [] created_units: list[str] = [] new_ingredients: list[dict] = [] - for orig_ing, prop in zip(recipe.get("recipeIngredient") or [], proposals): + for orig_ing, prop in zip(mealie_ings, proposals): # Each proposal can produce 1+ parsed children (fan-out for # compound inputs like "Toppings (a, b, c)" or "salt and pepper"). # Keep the proposal_json key flexible: prefer parsed_items but