From 9334d161e4151add4e1091f2d2a5b7a80c3e794b Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 30 Apr 2026 11:48:40 -0700 Subject: [PATCH] sterilize Phase 2: pass Mealie's food catalog into Sonnet's prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sonnet was picking food names blindly. We then tried to match those back to Mealie's catalog post-hoc. When Sonnet's natural pick didn't match Mealie's exact convention, we'd create a duplicate row instead of reusing the existing one. Lucky alignment with the seed kept the dupe rate low, but the architecture had no real "Mealie is source of truth" guarantee. This change makes that guarantee explicit: 1. Sterilizer now lazy-loads Mealie's full food catalog on first _parse_batch call (one fetch per Sterilizer instance, so a bulk job pulls 2895 rows once and reuses across all 226 recipe parses). Uses the underlying mealie._get with per_page=2000 + page-walk for defensive coverage of really large catalogs. 2. STERILIZE_SYSTEM is now STERILIZE_SYSTEM_TEMPLATE — a string with a {foods} placeholder. _system_prompt() splices in a bullet list of every Mealie food (name, plural, aliases) at runtime. 3. New CATALOG RULES in the prompt instruct Sonnet to: - Match against name / pluralName / aliases first, return canonical name verbatim with is_new_food=false - Strip prep modifiers into note - Singularize plurals when canonical is singular - Treat brand variations as canonical+note ("Heinz ketchup" → food: "ketchup", note: "Heinz") - Set is_new_food=true ONLY when no reasonable catalog match exists, since adding aliases to fix mismatches later is way easier than cleaning duplicate food rows after the fact 4. New is_new_food field on IngredientParse and per-item schema. Will eventually drive an "alias suggestion" UI, but for v1 just gives us telemetry on how often Sonnet falls back to inventing names. 
Net effect for the family-internal goal: zero duplicate food creates from convention mismatches, smarter parses that respect the catalog Cobb spent time curating, foundation laid for the Step 2 re-key migration where cauldron_food_metadata gets keyed by Mealie food.id. --- cauldron/sterilizer.py | 134 +++++++++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 19 deletions(-) diff --git a/cauldron/sterilizer.py b/cauldron/sterilizer.py index 16c31f4..a75ab6d 100644 --- a/cauldron/sterilizer.py +++ b/cauldron/sterilizer.py @@ -22,7 +22,7 @@ from .forge import Forge, ForgeError from .mealie import Mealie, MealieError -STERILIZE_SYSTEM = """You are a precise recipe ingredient parser. You ONLY output valid JSON. +STERILIZE_SYSTEM_TEMPLATE = """You are a precise recipe ingredient parser. You ONLY output valid JSON. You receive a list of free-form ingredient strings and return a parallel list of LISTS — one inner list per input. Most inputs map 1→1 (single item inside the list). Compound lines that name multiple distinct foods MUST @@ -30,43 +30,65 @@ fan out into multiple items so each food gets its own row on the shopping list. Per-item schema: -{ +{{ "quantity": , # numeric amount; fractions → decimals (1/2 → 0.5) "unit": , # singular canonical: "cup", "tbsp", "tsp", "oz", "lb", "g", "kg", "ml", "l", "clove", "slice", "can", "package", "piece", "pinch", "dash", "handful". null if no unit (e.g. "1 onion"). - "food": , # core food noun, singular canonical lowercase: "onion", "garlic", "rice", "olive oil". Strip prep state ("chopped", "diced") into note. + "food": , # core food noun. PREFER an exact match from the canonical catalog below; only invent a new name if nothing matches. 
"note": , # prep state, brand, color, modifier: "chopped", "extra virgin", "yellow", "to taste" - "approx": # true if input said "about", "a pinch", "to taste", or otherwise vague -} + "approx": , # true if input said "about", "a pinch", "to taste", or otherwise vague + "is_new_food": # true if `food` is NOT in the canonical catalog and you're proposing a new name. false if `food` matches an existing canonical entry verbatim. +}} + +CANONICAL FOOD CATALOG (use these names verbatim — exact case + spelling +— when the ingredient maps to one of them). Rows shown as: + • [aliases: a, b, c] or • (plural: ) + +{foods} + +CATALOG RULES (most important): +- If the ingredient string matches an entry by name OR pluralName OR alias, + return the canonical NAME (the first column above) verbatim, exact case. + Set is_new_food=false. +- Strip prep modifiers ("chopped", "diced", "minced", "halved") into note; + the catalog name should be the bare food. +- Plural in input → singular in output if the canonical is singular. + e.g. "2 onions" → food: "onion" (when "onion" is in the catalog). +- Branded variations (e.g. "Heinz ketchup") → use canonical "ketchup" (or + whatever's in the catalog), put brand in note. +- Only set is_new_food=true when there's truly no reasonable match. Prefer + matching even if imperfect (e.g. "kosher salt" → "salt" with note + "kosher", is_new_food=false), since adding aliases later is easier than + cleaning duplicate food rows. FAN-OUT RULES — return MULTIPLE items for one input when: - "salt and pepper" / "salt and ground black pepper to taste" → split into 2 items, each - {quantity: 1, unit: "dash", food: "salt"|"black pepper", note: "to taste", approx: true} + {{quantity: 1, unit: "dash", food: , note: "to taste", approx: true}} - "Toppings (cinnamon butter, marshmallows, ground cinnamon, butter, etc)" or "Optional: cilantro, lime, queso fresco" → one item per food in the comma list. 
Drop the wrapper word ("Toppings", "Optional"); leave it OUT of food/note. Skip - filler words like "etc". Each item: quantity=null, unit=null, food=, note=null, approx=true. -- "1 lemon, juice and zest" → 2 items: {qty:1, unit:null, food:"lemon juice"} and {qty:1, unit:null, food:"lemon zest"} + filler words like "etc". Each item: quantity=null, unit=null, food=, note=null, approx=true. +- "1 lemon, juice and zest" → 2 items: {{qty:1, unit:null, food:}} and {{qty:1, unit:null, food:}}. - DO NOT split "salt and vinegar chips" or "macaroni and cheese" — those are compound food names, not multi-food lines. Heuristic: if the words on either side of "and" are a recognized standalone food, split; otherwise keep as one. PARSE RULES (for the common 1→1 case): - Convert fractions: "1/2" → 0.5, "1 1/4" → 1.25 -- "a pinch" / "a dash" alone → {quantity: 1, unit: "pinch"|"dash", approx: true} -- "to taste" alone → {quantity: null, unit: null, food: , note: "to taste", approx: true} -- "1 small onion" → {quantity: 1, unit: null, food: "onion", note: "small"} -- "2 cloves garlic, minced" → {quantity: 2, unit: "clove", food: "garlic", note: "minced"} -- "1.5 cups broccoli (coarsely chopped florets)" → {quantity: 1.5, unit: "cup", food: "broccoli", note: "coarsely chopped florets"} +- "a pinch" / "a dash" alone → {{quantity: 1, unit: "pinch"|"dash", approx: true}} +- "to taste" alone → {{quantity: null, unit: null, food: , note: "to taste", approx: true}} +- "1 small onion" → {{quantity: 1, unit: null, food: "onion", note: "small"}} +- "2 cloves garlic, minced" → {{quantity: 2, unit: "clove", food: "garlic", note: "minced"}} +- "1.5 cups broccoli (coarsely chopped florets)" → {{quantity: 1.5, unit: "cup", food: "broccoli", note: "coarsely chopped florets"}} - Section headers like "For the sauce:" → 1 item with all fields null EXCEPT - note: "
" (so Mealie can preserve the header row) + note: "
", is_new_food: false (so Mealie can preserve the header row) - If you genuinely cannot parse (junk input), return 1 item with all fields null - and the original string in note. + and the original string in note, is_new_food: false. - DO NOT add fields not in the schema. - DO NOT wrap output in markdown fences. - DO NOT include any prose before or after the JSON. -Input shape: {"ingredients": ["str", "str", ...]} -Output shape: {"parses": [[{...}, {...}], [{...}], [{...}, {...}, {...}], ...]} +Input shape: {{"ingredients": ["str", "str", ...]}} +Output shape: {{"parses": [[{{...}}, {{...}}], [{{...}}], [{{...}}, {{...}}, {{...}}], ...]}} The outer list MUST have the same length as the input list. Each inner list MUST contain at least 1 item (use the all-null junk-fallback if needed). """ @@ -79,6 +101,7 @@ class IngredientParse: food: str | None note: str | None approx: bool + is_new_food: bool = False # true when Sonnet proposes a new canonical name @dataclass @@ -100,6 +123,11 @@ class Sterilizer: self.mealie = mealie self.forge = forge self.model = model + # Lazy-loaded canonical food catalog from Mealie. Fetched once + # per Sterilizer instance (so a bulk sterilize job pulls it + # once and reuses across all 226 recipe parses). + self._catalog_cache: list[dict] | None = None + self._catalog_prompt: str | None = None # --- public ------------------------------------------------------------- @@ -343,6 +371,73 @@ class Sterilizer: # --- private ------------------------------------------------------------ + # --- canonical food catalog (Mealie is source of truth) ---------------- + + def _load_catalog(self) -> list[dict]: + """Pull every food row from Mealie in one big request. The user's + session token scopes to their group, so this spans every household + the user can see — fine, we want Sonnet to know all canonical + names. Cached on the instance after first call. 
+ + We use the underlying _get directly (not list_foods) so we can + also pass a page param if a perPage=2000 request doesn't return everything + in one shot.""" + if self._catalog_cache is not None: + return self._catalog_cache + out: list[dict] = [] + page = 1 + while page <= 20: # defensive ceiling + resp = self.mealie._get( + "/api/foods", search="", perPage=2000, page=page + ) + items = resp.get("items") or resp.get("data") or [] + for item in items: + out.append(item) + tp = resp.get("total_pages") or resp.get("totalPages") or 1 + if not items or page >= tp: + break + page += 1 + self._catalog_cache = out + return out + + def _catalog_for_prompt(self) -> str: + """Render the catalog as a bullet list for the system prompt. + Cached on the instance so we don't rebuild this per batch.""" + if self._catalog_prompt is not None: + return self._catalog_prompt + items = self._load_catalog() + lines: list[str] = [] + for it in items: + name = (it.get("name") or "").strip() + if not name: + continue + plural = (it.get("pluralName") or "").strip() + aliases = it.get("aliases") or [] + # Aliases on Mealie can be a list of strings or a list of + # {name, foodId} dicts depending on version. Normalize. 
+ alias_names: list[str] = [] + for a in aliases: + if isinstance(a, str) and a.strip(): + alias_names.append(a.strip()) + elif isinstance(a, dict): + n = (a.get("name") or "").strip() + if n: + alias_names.append(n) + line = f" • {name}" + if plural and plural.lower() != name.lower(): + line += f" (plural: {plural})" + if alias_names: + line += f" [aliases: {', '.join(alias_names)}]" + lines.append(line) + self._catalog_prompt = "\n".join(lines) + return self._catalog_prompt + + def _system_prompt(self) -> str: + """Build the full STERILIZE_SYSTEM prompt with the catalog spliced in.""" + return STERILIZE_SYSTEM_TEMPLATE.format(foods=self._catalog_for_prompt()) + + # --- per-batch Sonnet call --------------------------------------------- + def _parse_batch(self, strings: list[str]) -> list[list[IngredientParse]]: """Returns list-of-lists matching the input length. Each inner list is the parses derived from one input string (1 in normal case, N @@ -352,8 +447,8 @@ class Sterilizer: resp = self.forge.run( prompt=prompt, model=self.model, - system=STERILIZE_SYSTEM, - timeout_secs=120, + system=self._system_prompt(), + timeout_secs=180, ) except ForgeError as e: raise RuntimeError(f"clawdforge failed: {e}") from e @@ -389,6 +484,7 @@ class Sterilizer: food=_clean_str(it.get("food")), note=_clean_str(it.get("note")), approx=bool(it.get("approx")), + is_new_food=bool(it.get("is_new_food")), ) ) if not inner: