diff --git a/cauldron/consolidate_foods.py b/cauldron/consolidate_foods.py index c0cc136..dcbf323 100644 --- a/cauldron/consolidate_foods.py +++ b/cauldron/consolidate_foods.py @@ -27,7 +27,7 @@ from __future__ import annotations import json import logging import threading -from typing import Optional +from typing import Callable, Optional from rapidfuzz import fuzz, process @@ -87,7 +87,11 @@ def _foods_in_household(mealie: Mealie, household_id: str) -> list[dict]: return out -def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list[dict]]: +def _cluster( + foods: list[dict], + threshold: int = CLUSTER_THRESHOLD, + cancel_check: Callable[[], bool] | None = None, +) -> list[list[dict]]: """Pair-based: emit one 2-food candidate per (i, j) where token_set_ratio >= threshold. Replaces the original single-link agglomerative which produced a 50+ food megacluster on Cobb's catalog by chaining weak @@ -101,16 +105,25 @@ def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list away by an earlier pair. For ~3000 foods this is ~4M comparisons in pure Python — runs in - a few seconds.""" + a few seconds. For larger catalogs (10K+) the inner loop polls + cancel_check every 5K comparisons so a user-initiated cancel can + abort cleanly mid-scan rather than waiting tens of seconds. 
2nd-pass + audit fix CODE-1 (2026-05-02 PM).""" n = len(foods) names = [(f.get("name") or "").strip().lower() for f in foods] pairs: list[list[dict]] = [] + poll_every = 5000 + cmp_count = 0 for i in range(n): if not names[i]: continue for j in range(i + 1, n): if not names[j]: continue + cmp_count += 1 + if cancel_check is not None and cmp_count % poll_every == 0: + if cancel_check(): + return pairs if fuzz.token_set_ratio(names[i], names[j]) >= threshold: pairs.append([foods[i], foods[j]]) return pairs @@ -134,8 +147,11 @@ def run_walk(*, db: DB, job_id: int, mealie: Mealie, forge: Forge) -> None: foods = _foods_in_household(mealie, hh) log.info("[consolidate:%s] household=%s foods=%d", job_id, hh, len(foods)) - clusters = _cluster(foods) + clusters = _cluster(foods, cancel_check=_cancelled) log.info("[consolidate:%s] clusters≥2: %d", job_id, len(clusters)) + if _cancelled(): + log.info("[consolidate:%s] walk aborted during clustering", job_id) + return with db.conn() as c, c.cursor() as cur: cur.execute( diff --git a/cauldron/dedupe_recipes.py b/cauldron/dedupe_recipes.py index c2346dc..0596e7e 100644 --- a/cauldron/dedupe_recipes.py +++ b/cauldron/dedupe_recipes.py @@ -22,7 +22,7 @@ from __future__ import annotations import json import logging import threading -from typing import Optional +from typing import Callable, Optional from rapidfuzz import fuzz @@ -81,39 +81,41 @@ def _filter_to_household(recipes: list[dict], household_id: str) -> list[dict]: return out -def _cluster_by_name(recipes: list[dict], threshold: int = NAME_THRESHOLD) -> list[list[dict]]: - """Single-link agglomerative on rapidfuzz token_set_ratio. Returns - clusters of size >= 2. ~250 recipes = ~30K comparisons, runs instantly.""" +def _cluster_by_name( + recipes: list[dict], + threshold: int = NAME_THRESHOLD, + cancel_check: Callable[[], bool] | None = None, +) -> list[list[dict]]: + """Pair-based: emit one 2-recipe candidate per (i, j) where + token_set_ratio >= threshold. 
2nd-pass audit fix (CODE-2, 2026-05-02 PM): + the previous single-link agglomerative chained weak similarities through + the recipe corpus the same way it did with foods — `chicken alfredo` + → `chicken parm` → `parm chicken cutlets` → ... — collapsing dozens of + unrelated recipes into one megacluster that Sonnet then had to refuse. + Mirrors the pattern used in `consolidate_foods._cluster`. + + cancel_check, when provided, is polled every 5K pair-comparisons so a + user-initiated cancel can abort a long scan early (CODE-1 fix). On + cancel we return the pairs accumulated so far rather than raising — + the caller's _cancelled() in run_walk will catch and exit cleanly.""" n = len(recipes) - parent = list(range(n)) - - def find(x): - while parent[x] != x: - parent[x] = parent[parent[x]] - x = parent[x] - return x - - def union(a, b): - ra, rb = find(a), find(b) - if ra != rb: - parent[ra] = rb - names = [(r.get("name") or "").strip().lower() for r in recipes] + pairs: list[list[dict]] = [] + poll_every = 5000 + cmp_count = 0 for i in range(n): if not names[i]: continue for j in range(i + 1, n): if not names[j]: continue - score = fuzz.token_set_ratio(names[i], names[j]) - if score >= threshold: - union(i, j) - - groups: dict[int, list[dict]] = {} - for i in range(n): - r = find(i) - groups.setdefault(r, []).append(recipes[i]) - return [g for g in groups.values() if len(g) >= 2] + cmp_count += 1 + if cancel_check is not None and cmp_count % poll_every == 0: + if cancel_check(): + return pairs + if fuzz.token_set_ratio(names[i], names[j]) >= threshold: + pairs.append([recipes[i], recipes[j]]) + return pairs def _summarize_recipe(full: dict) -> dict: @@ -155,8 +157,11 @@ def run_walk(*, db: DB, job_id: int, mealie: Mealie, forge: Forge) -> None: slim = _filter_to_household(_all_recipes(mealie), hh) log.info("[dedupe-recipes:%s] household=%s recipes=%d", job_id, hh, len(slim)) - clusters_slim = _cluster_by_name(slim) - log.info("[dedupe-recipes:%s] name-clusters≥2: %d", 
job_id, len(clusters_slim)) + clusters_slim = _cluster_by_name(slim, cancel_check=_cancelled) + log.info("[dedupe-recipes:%s] name-pairs: %d", job_id, len(clusters_slim)) + if _cancelled(): + log.info("[dedupe-recipes:%s] walk aborted during clustering", job_id) + return with db.conn() as c, c.cursor() as cur: cur.execute( diff --git a/cauldron/server.py b/cauldron/server.py index eeff745..9ead02c 100644 --- a/cauldron/server.py +++ b/cauldron/server.py @@ -235,7 +235,10 @@ def create_app() -> Flask: resp.headers.setdefault("X-Frame-Options", "DENY") resp.headers.setdefault("X-Content-Type-Options", "nosniff") resp.headers.setdefault("Referrer-Policy", "same-origin") - resp.headers.setdefault("Permissions-Policy", "interest-cohort=()") + # Opt-out of FLoC (Chrome ≤94) and the Topics API replacement + # (Chrome ≥115). Both directive names are unknown to other + # browsers and silently ignored — no parse-error risk. + resp.headers.setdefault("Permissions-Policy", "interest-cohort=(), browsing-topics=()") resp.headers.setdefault( "Content-Security-Policy", "default-src 'self'; " @@ -460,7 +463,15 @@ def create_app() -> Flask: # redirect surface — `next=https://evil.example/...` would # otherwise route an authenticated user to an attacker page # right after OIDC handshake. - session["post_login_next"] = _safe_next(request.args.get("next")) + nxt = _safe_next(request.args.get("next")) + # Already-authenticated users skip OIDC entirely (CVE-NEW-5 fix, + # 2026-05-02 PM): a malicious cross-origin link + # `<img src="/auth/login?next=...">` would otherwise + # silently re-trigger the OIDC handshake on a logged-in user + # and hand them off to the attacker-supplied next= path. + if session.get("user"): + return redirect(nxt) + session["post_login_next"] = nxt return oauth.cauldron.authorize_redirect(cfg.oidc_redirect_uri) @app.get("/auth/callback") @@ -490,12 +501,18 @@ def create_app() -> Flask: detail="that login link expired (you probably retried after a blip). 
hit login again to start fresh.", ), 400 except OAuthError as e: + # Log the full Authentik error server-side; render only a + # generic detail to the user. Audit CVE-NEW-8 (2026-05-02 PM): + # the prior `f"auth handshake failed: {e}"` echoed Authentik + # error codes (e.g. invalid_client_id) into the auth_retry + # page — anyone who can hit /auth/callback?state=evil could + # probe Authentik internals via the rendered detail. app.logger.warning("OIDC callback: oauth error: %s", e) session.pop("_state_cauldron_authlib", None) return render_template( "auth_retry.html", reason="oauth", - detail=f"auth handshake failed: {e}", + detail="the auth handshake didn't complete. hit login again to start fresh.", ), 400 userinfo = token.get("userinfo") or oauth.cauldron.userinfo(token=token) sub = userinfo.get("sub") or userinfo.get("email") @@ -663,7 +680,10 @@ def create_app() -> Flask: if not client: return jsonify({"error": "not connected"}), 409 u = session["user"] - page = max(1, int(request.args.get("page", "1"))) + try: + page = max(1, int(request.args.get("page", "1") or "1")) + except ValueError: + page = 1 search = (request.args.get("q") or "").strip() sort = request.args.get("sort", "newest") category = (request.args.get("cat") or "").strip() or None @@ -2574,8 +2594,16 @@ def create_app() -> Flask: @app.get("/api/recipes") @require_bearer def list_recipes_api(): - page = int(request.args.get("page", "1")) - per_page = min(int(request.args.get("per_page", "50")), 200) + # Defensive int parse — `?page=foo` previously raised ValueError + # and surfaced a 500 (audit CODE-9, 2026-05-02 PM). + try: + page = max(1, int(request.args.get("page", "1") or "1")) + except ValueError: + page = 1 + try: + per_page = min(max(1, int(request.args.get("per_page", "50") or "50")), 200) + except ValueError: + per_page = 50 return jsonify(system_mealie.list_recipes(page=page, per_page=per_page)) @app.post("/api/sterilize/preview/")