diff --git a/cauldron/consolidate_foods.py b/cauldron/consolidate_foods.py index c7262ff..c0cc136 100644 --- a/cauldron/consolidate_foods.py +++ b/cauldron/consolidate_foods.py @@ -88,38 +88,32 @@ def _foods_in_household(mealie: Mealie, household_id: str) -> list[dict]: def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list[dict]]: - """Single-link agglomerative clustering on token_set_ratio. O(n²) — fine - for ~3000 foods (~9M comparisons). Returns clusters of size ≥ 2.""" + """Pair-based: emit one 2-food candidate per (i, j) where token_set_ratio + >= threshold. Replaces the original single-link agglomerative which + produced a 50+ food megacluster on Cobb's catalog by chaining weak + similarities (`2% milk` → `acai berry` → `acai berry juice` → ...). + + Each emitted pair is a clean Sonnet-decision unit — easier prompt, + higher accuracy, uniform UI cards. The trade-off (3-way dupes get + split into 3 pairs that go through review separately) is fine — + Mealie's merge endpoint is per-pair anyway, and the apply path + defensively skips a pair whose canonical_id was already merged + away by an earlier pair. + + For ~3000 foods this is ~4M comparisons in pure Python — runs in + a few seconds.""" n = len(foods) - parent = list(range(n)) - - def find(x): - while parent[x] != x: - parent[x] = parent[parent[x]] - x = parent[x] - return x - - def union(a, b): - ra, rb = find(a), find(b) - if ra != rb: - parent[ra] = rb - names = [(f.get("name") or "").strip().lower() for f in foods] + pairs: list[list[dict]] = [] for i in range(n): if not names[i]: continue for j in range(i + 1, n): if not names[j]: continue - score = fuzz.token_set_ratio(names[i], names[j]) - if score >= threshold: - union(i, j) - - groups: dict[int, list[dict]] = {} - for i in range(n): - r = find(i) - groups.setdefault(r, []).append(foods[i]) - return [g for g in groups.values() if len(g) >= 2] + if fuzz.token_set_ratio(names[i], names[j]) >= threshold: + pairs.append([foods[i], foods[j]]) + return pairs def _cluster_key(cluster: list[dict]) -> str: @@ -231,6 +225,14 @@ def run_apply(*, db: DB, job_id: int, mealie: Mealie) -> None: try: mealie.merge_foods(from_id=did, to_id=canonical_id) except MealieError as e: + msg = str(e) + # Pair-based clustering can emit overlapping pairs: + # if (A,B) was already approved+merged, a later (A,C) + # pair has stale A. Mealie returns 404 — treat that + # as already-handled, not an error. + if "404" in msg or "not found" in msg.lower(): + log.info("[consolidate:%s] merge %s → %s: stale (already merged elsewhere)", job_id, did, canonical_id) + continue err = f"merge {did} → {canonical_id}: {e}" log.warning("[consolidate:%s] %s", job_id, err) break