From eed7f94c25af49e1521f9b75e02190cbf27a1981 Mon Sep 17 00:00:00 2001
From: Kayos
Date: Thu, 30 Apr 2026 19:51:59 -0700
Subject: [PATCH] consolidate: pair-based clustering instead of single-link
 agglomerative
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Job 1's 131 clusters included a 50+ food megacluster ('2% milk',
'acai berry', 'acai berry juice', 'achiote oil', 'aleppo pepper',
'all purpose flour', ...) that Sonnet correctly rejected as a false
positive. Cause: single-link agglomerative chains weak similarities —
A~B and B~C unite A and C even though A and C aren't actually similar.

Switched to pair-based: emit one 2-food candidate per (i, j) above
threshold, no clustering. Eliminates the megacluster shape entirely.
Sonnet decisions are cleaner on uniform 2-row pairs, UI cards are
uniform, and Mealie's merge endpoint is per-pair anyway.

Trade-off: a true 3-way dupe (A=B=C) now produces 3 separate pairs
(A,B) (B,C) (A,C) that each go through review. Net effect after
approval: same merges happen. Apply path defensively catches the 404
case — once (A,B) merges, the (A,C) pair has stale A and Mealie
returns 404; treat as already-handled, not an error.

For ~3000 foods this is ~4M comparisons in pure Python (a few seconds).

Job 1's data still applies cleanly — 10 historical merges + 121
keep-distinct decisions stay where they are. Future runs use the new
shape.
---
 cauldron/consolidate_foods.py | 50 ++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/cauldron/consolidate_foods.py b/cauldron/consolidate_foods.py
index c7262ff..c0cc136 100644
--- a/cauldron/consolidate_foods.py
+++ b/cauldron/consolidate_foods.py
@@ -88,38 +88,32 @@ def _foods_in_household(mealie: Mealie, household_id: str) -> list[dict]:
 
 
 def _cluster(foods: list[dict], threshold: int = CLUSTER_THRESHOLD) -> list[list[dict]]:
-    """Single-link agglomerative clustering on token_set_ratio. O(n²) — fine
-    for ~3000 foods (~9M comparisons). Returns clusters of size ≥ 2."""
+    """Pair-based: emit one 2-food candidate per (i, j) where token_set_ratio
+    >= threshold. Replaces the original single-link agglomerative which
+    produced a 50+ food megacluster on Cobb's catalog by chaining weak
+    similarities (`2% milk` → `acai berry` → `acai berry juice` → ...).
+
+    Each emitted pair is a clean Sonnet-decision unit — easier prompt,
+    higher accuracy, uniform UI cards. The trade-off (3-way dupes get
+    split into 3 pairs that go through review separately) is fine —
+    Mealie's merge endpoint is per-pair anyway, and the apply path
+    defensively skips a pair whose canonical_id was already merged
+    away by an earlier pair.
+
+    For ~3000 foods this is ~4M comparisons in pure Python — runs in
+    a few seconds."""
     n = len(foods)
-    parent = list(range(n))
-
-    def find(x):
-        while parent[x] != x:
-            parent[x] = parent[parent[x]]
-            x = parent[x]
-        return x
-
-    def union(a, b):
-        ra, rb = find(a), find(b)
-        if ra != rb:
-            parent[ra] = rb
-
     names = [(f.get("name") or "").strip().lower() for f in foods]
+    pairs: list[list[dict]] = []
     for i in range(n):
         if not names[i]:
             continue
         for j in range(i + 1, n):
             if not names[j]:
                 continue
-            score = fuzz.token_set_ratio(names[i], names[j])
-            if score >= threshold:
-                union(i, j)
-
-    groups: dict[int, list[dict]] = {}
-    for i in range(n):
-        r = find(i)
-        groups.setdefault(r, []).append(foods[i])
-    return [g for g in groups.values() if len(g) >= 2]
+            if fuzz.token_set_ratio(names[i], names[j]) >= threshold:
+                pairs.append([foods[i], foods[j]])
+    return pairs
 
 
 def _cluster_key(cluster: list[dict]) -> str:
@@ -231,6 +225,14 @@ def run_apply(*, db: DB, job_id: int, mealie: Mealie) -> None:
             try:
                 mealie.merge_foods(from_id=did, to_id=canonical_id)
             except MealieError as e:
+                msg = str(e)
+                # Pair-based clustering can emit overlapping pairs:
+                # if (A,B) was already approved+merged, a later (A,C)
+                # pair has stale A. Mealie returns 404 — treat that
+                # as already-handled, not an error.
+                if "404" in msg or "not found" in msg.lower():
+                    log.info("[consolidate:%s] merge %s → %s: stale (already merged elsewhere)", job_id, did, canonical_id)
+                    continue
                 err = f"merge {did} → {canonical_id}: {e}"
                 log.warning("[consolidate:%s] %s", job_id, err)
                 break