audit-fixes: dedupe megacluster, consolidate cancel-poll, login-CSRF, misc

Continuing through the 2nd-pass audit findings.

dedupe_recipes.py CODE-2 (HIGH): _cluster_by_name replaced its union-find
single-link agglomerative clustering with the same pair-accumulator
pattern consolidate_foods._cluster already uses. Single-link chained weak
similarities through the recipe corpus the same way it did with foods,
producing one giant cluster on a 250+ recipe corpus that Sonnet would
refuse. Now emits one 2-recipe pair per (i, j) scoring at or above
NAME_THRESHOLD.

consolidate_foods.py + dedupe_recipes.py CODE-1 (HIGH): added an
in-loop cancel poll to both clustering passes (_cluster and
_cluster_by_name). Each polls cancel_check every 5K pair comparisons so
a user-initiated cancel can abort cleanly mid-scan instead of waiting
tens of seconds; on cancel the pairs found so far are returned. The
run_walk callers also re-check _cancelled() right after clustering
returns and bail.

server.py CVE-NEW-5 (MED): /login skips OIDC re-init when
session.get('user') is already set and redirects straight to the
sanitized next= target instead. Closes the login-CSRF surface where a
malicious cross-origin link `<a href=…/login?next=/poison>` would
re-trigger the OIDC handshake on a logged-in user and silently change
their post-login landing.

server.py CVE-NEW-7 (LOW): Permissions-Policy now sends both
`interest-cohort=()` (FLoC, Chrome ≤94) and `browsing-topics=()`
(Topics API, Chrome ≥115), so the opt-out covers FLoC as well as its
Topics API replacement.

server.py CVE-NEW-8 (LOW): /auth/callback OAuthError branch no longer
echoes Authentik's raw error string to auth_retry.html. Detail is
logged server-side; users see a generic retry message. Closes the
information-disclosure on Authentik error codes.

server.py CODE-9/CODE-10 (LOW): wrapped int() of ?page= and
?per_page= in try/except so garbage args land on safe defaults
instead of surfacing ValueError as a 500.

Deferred to the rackham-vhost commit:
- CVE-NEW-6 (cauldron bind / ProxyFix trusted-peer filter) needs
  paired Apache vhost config (RequestHeader unset X-Forwarded-*)
  before the docker-side change is safe; landing it solo would
  either break the LAN deploy or leave a half-broken trust chain.
This commit is contained in:
Kayos 2026-05-02 17:36:25 -07:00
parent fdd1102a6f
commit 946abd0322
3 changed files with 87 additions and 38 deletions

View file

@ -27,7 +27,7 @@ from __future__ import annotations
import json
import logging
import threading
from typing import Optional
from typing import Callable, Optional
from rapidfuzz import fuzz, process
@ -87,7 +87,11 @@ def _foods_in_household(mealie: Mealie, household_id: str) -> list[dict]:
return out
def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list[dict]]:
def _cluster(
foods: list[dict],
threshold: int = CLUSTER_THRESHOLD,
cancel_check: Callable[[], bool] | None = None,
) -> list[list[dict]]:
"""Pair-based: emit one 2-food candidate per (i, j) where token_set_ratio
>= threshold. Replaces the original single-link agglomerative which
produced a 50+ food megacluster on Cobb's catalog by chaining weak
@ -101,16 +105,25 @@ def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list
away by an earlier pair.
For ~3000 foods this is ~4M comparisons; in pure Python that runs in
a few seconds."""
a few seconds. For larger catalogs (10K+) the inner loop polls
cancel_check every 5K comparisons so a user-initiated cancel can
abort cleanly mid-scan rather than waiting tens of seconds. 2nd-pass
audit fix CODE-1 (2026-05-02 PM)."""
n = len(foods)
names = [(f.get("name") or "").strip().lower() for f in foods]
pairs: list[list[dict]] = []
poll_every = 5000
cmp_count = 0
for i in range(n):
if not names[i]:
continue
for j in range(i + 1, n):
if not names[j]:
continue
cmp_count += 1
if cancel_check is not None and cmp_count % poll_every == 0:
if cancel_check():
return pairs
if fuzz.token_set_ratio(names[i], names[j]) >= threshold:
pairs.append([foods[i], foods[j]])
return pairs
@ -134,8 +147,11 @@ def run_walk(*, db: DB, job_id: int, mealie: Mealie, forge: Forge) -> None:
foods = _foods_in_household(mealie, hh)
log.info("[consolidate:%s] household=%s foods=%d", job_id, hh, len(foods))
clusters = _cluster(foods)
clusters = _cluster(foods, cancel_check=_cancelled)
log.info("[consolidate:%s] clusters≥2: %d", job_id, len(clusters))
if _cancelled():
log.info("[consolidate:%s] walk aborted during clustering", job_id)
return
with db.conn() as c, c.cursor() as cur:
cur.execute(

View file

@ -22,7 +22,7 @@ from __future__ import annotations
import json
import logging
import threading
from typing import Optional
from typing import Callable, Optional
from rapidfuzz import fuzz
@ -81,39 +81,41 @@ def _filter_to_household(recipes: list[dict], household_id: str) -> list[dict]:
return out
def _cluster_by_name(recipes: list[dict], threshold: int = NAME_THRESHOLD) -> list[list[dict]]:
"""Single-link agglomerative on rapidfuzz token_set_ratio. Returns
clusters of size >= 2. ~250 recipes = ~30K comparisons, runs instantly."""
def _cluster_by_name(
recipes: list[dict],
threshold: int = NAME_THRESHOLD,
cancel_check: Callable[[], bool] | None = None,
) -> list[list[dict]]:
"""Pair-based: emit one 2-recipe candidate per (i, j) where
token_set_ratio >= threshold. 2nd-pass audit fix (CODE-2, 2026-05-02 PM):
the previous single-link agglomerative chained weak similarities through
the recipe corpus the same way it did with foods: `chicken alfredo` →
`chicken parm` → `parm chicken cutlets` → ..., collapsing dozens of
unrelated recipes into one megacluster that Sonnet then had to refuse.
Mirrors the pattern used in `consolidate_foods._cluster`.
cancel_check, when provided, is polled every 5K pair-comparisons so a
user-initiated cancel can abort a long scan early (CODE-1 fix). On
cancel we return the pairs accumulated so far rather than raising;
the caller's _cancelled() check in run_walk will catch it and exit cleanly."""
n = len(recipes)
parent = list(range(n))
def find(x):
while parent[x] != x:
parent[x] = parent[parent[x]]
x = parent[x]
return x
def union(a, b):
ra, rb = find(a), find(b)
if ra != rb:
parent[ra] = rb
names = [(r.get("name") or "").strip().lower() for r in recipes]
pairs: list[list[dict]] = []
poll_every = 5000
cmp_count = 0
for i in range(n):
if not names[i]:
continue
for j in range(i + 1, n):
if not names[j]:
continue
score = fuzz.token_set_ratio(names[i], names[j])
if score >= threshold:
union(i, j)
groups: dict[int, list[dict]] = {}
for i in range(n):
r = find(i)
groups.setdefault(r, []).append(recipes[i])
return [g for g in groups.values() if len(g) >= 2]
cmp_count += 1
if cancel_check is not None and cmp_count % poll_every == 0:
if cancel_check():
return pairs
if fuzz.token_set_ratio(names[i], names[j]) >= threshold:
pairs.append([recipes[i], recipes[j]])
return pairs
def _summarize_recipe(full: dict) -> dict:
@ -155,8 +157,11 @@ def run_walk(*, db: DB, job_id: int, mealie: Mealie, forge: Forge) -> None:
slim = _filter_to_household(_all_recipes(mealie), hh)
log.info("[dedupe-recipes:%s] household=%s recipes=%d", job_id, hh, len(slim))
clusters_slim = _cluster_by_name(slim)
log.info("[dedupe-recipes:%s] name-clusters≥2: %d", job_id, len(clusters_slim))
clusters_slim = _cluster_by_name(slim, cancel_check=_cancelled)
log.info("[dedupe-recipes:%s] name-pairs: %d", job_id, len(clusters_slim))
if _cancelled():
log.info("[dedupe-recipes:%s] walk aborted during clustering", job_id)
return
with db.conn() as c, c.cursor() as cur:
cur.execute(

View file

@ -235,7 +235,10 @@ def create_app() -> Flask:
resp.headers.setdefault("X-Frame-Options", "DENY")
resp.headers.setdefault("X-Content-Type-Options", "nosniff")
resp.headers.setdefault("Referrer-Policy", "same-origin")
resp.headers.setdefault("Permissions-Policy", "interest-cohort=()")
# Opt-out of FLoC (Chrome ≤94) and the Topics API replacement
# (Chrome ≥115). Both directive names are unknown to other
# browsers and silently ignored — no parse-error risk.
resp.headers.setdefault("Permissions-Policy", "interest-cohort=(), browsing-topics=()")
resp.headers.setdefault(
"Content-Security-Policy",
"default-src 'self'; "
@ -460,7 +463,15 @@ def create_app() -> Flask:
# redirect surface — `next=https://evil.example/...` would
# otherwise route an authenticated user to an attacker page
# right after OIDC handshake.
session["post_login_next"] = _safe_next(request.args.get("next"))
nxt = _safe_next(request.args.get("next"))
# Already-authenticated users skip OIDC entirely (CVE-NEW-5 fix,
# 2026-05-02 PM): a malicious cross-origin link
# `<a href="…/login?next=/some-poisoned-path">` would otherwise
# silently re-trigger the OIDC handshake on a logged-in user
# and hand them off to the attacker-supplied next= path.
if session.get("user"):
return redirect(nxt)
session["post_login_next"] = nxt
return oauth.cauldron.authorize_redirect(cfg.oidc_redirect_uri)
@app.get("/auth/callback")
@ -490,12 +501,18 @@ def create_app() -> Flask:
detail="that login link expired (you probably retried after a blip). hit login again to start fresh.",
), 400
except OAuthError as e:
# Log the full Authentik error server-side; render only a
# generic detail to the user. Audit CVE-NEW-8 (2026-05-02 PM):
# the prior `f"auth handshake failed: {e}"` echoed Authentik
# error codes (e.g. invalid_client_id) into the auth_retry
# page — anyone who can hit /auth/callback?state=evil could
# probe Authentik internals via the rendered detail.
app.logger.warning("OIDC callback: oauth error: %s", e)
session.pop("_state_cauldron_authlib", None)
return render_template(
"auth_retry.html",
reason="oauth",
detail=f"auth handshake failed: {e}",
detail="the auth handshake didn't complete. hit login again to start fresh.",
), 400
userinfo = token.get("userinfo") or oauth.cauldron.userinfo(token=token)
sub = userinfo.get("sub") or userinfo.get("email")
@ -663,7 +680,10 @@ def create_app() -> Flask:
if not client:
return jsonify({"error": "not connected"}), 409
u = session["user"]
page = max(1, int(request.args.get("page", "1")))
try:
page = max(1, int(request.args.get("page", "1") or "1"))
except ValueError:
page = 1
search = (request.args.get("q") or "").strip()
sort = request.args.get("sort", "newest")
category = (request.args.get("cat") or "").strip() or None
@ -2574,8 +2594,16 @@ def create_app() -> Flask:
@app.get("/api/recipes")
@require_bearer
def list_recipes_api():
page = int(request.args.get("page", "1"))
per_page = min(int(request.args.get("per_page", "50")), 200)
# Defensive int parse — `?page=foo` previously raised ValueError
# and surfaced a 500 (audit CODE-9, 2026-05-02 PM).
try:
page = max(1, int(request.args.get("page", "1") or "1"))
except ValueError:
page = 1
try:
per_page = min(max(1, int(request.args.get("per_page", "50") or "50")), 200)
except ValueError:
per_page = 50
return jsonify(system_mealie.list_recipes(page=page, per_page=per_page))
@app.post("/api/sterilize/preview/<slug>")