diff --git a/cauldron/sterilizer.py b/cauldron/sterilizer.py index 16c31f4..a75ab6d 100644 --- a/cauldron/sterilizer.py +++ b/cauldron/sterilizer.py @@ -22,7 +22,7 @@ from .forge import Forge, ForgeError from .mealie import Mealie, MealieError -STERILIZE_SYSTEM = """You are a precise recipe ingredient parser. You ONLY output valid JSON. +STERILIZE_SYSTEM_TEMPLATE = """You are a precise recipe ingredient parser. You ONLY output valid JSON. You receive a list of free-form ingredient strings and return a parallel list of LISTS — one inner list per input. Most inputs map 1→1 (single item inside the list). Compound lines that name multiple distinct foods MUST @@ -30,43 +30,65 @@ fan out into multiple items so each food gets its own row on the shopping list. Per-item schema: -{ +{{ "quantity": , # numeric amount; fractions → decimals (1/2 → 0.5) "unit": , # singular canonical: "cup", "tbsp", "tsp", "oz", "lb", "g", "kg", "ml", "l", "clove", "slice", "can", "package", "piece", "pinch", "dash", "handful". null if no unit (e.g. "1 onion"). - "food": , # core food noun, singular canonical lowercase: "onion", "garlic", "rice", "olive oil". Strip prep state ("chopped", "diced") into note. + "food": , # core food noun. PREFER an exact match from the canonical catalog below; only invent a new name if nothing matches. "note": , # prep state, brand, color, modifier: "chopped", "extra virgin", "yellow", "to taste" - "approx": # true if input said "about", "a pinch", "to taste", or otherwise vague -} + "approx": , # true if input said "about", "a pinch", "to taste", or otherwise vague + "is_new_food": # true if `food` is NOT in the canonical catalog and you're proposing a new name. false if `food` matches an existing canonical entry verbatim. +}} + +CANONICAL FOOD CATALOG (use these names verbatim — exact case + spelling +— when the ingredient maps to one of them). 
Rows shown as: + • [aliases: a, b, c] or • (plural: ) + +{foods} + +CATALOG RULES (most important): +- If the ingredient string matches an entry by name OR pluralName OR alias, + return the canonical NAME (the first column above) verbatim, exact case. + Set is_new_food=false. +- Strip prep modifiers ("chopped", "diced", "minced", "halved") into note; + the catalog name should be the bare food. +- Plural in input → singular in output if the canonical is singular. + e.g. "2 onions" → food: "onion" (when "onion" is in the catalog). +- Branded variations (e.g. "Heinz ketchup") → use canonical "ketchup" (or + whatever's in the catalog), put brand in note. +- Only set is_new_food=true when there's truly no reasonable match. Prefer + matching even if imperfect (e.g. "kosher salt" → "salt" with note + "kosher", is_new_food=false), since adding aliases later is easier than + cleaning duplicate food rows. FAN-OUT RULES — return MULTIPLE items for one input when: - "salt and pepper" / "salt and ground black pepper to taste" → split into 2 items, each - {quantity: 1, unit: "dash", food: "salt"|"black pepper", note: "to taste", approx: true} + {{quantity: 1, unit: "dash", food: , note: "to taste", approx: true}} - "Toppings (cinnamon butter, marshmallows, ground cinnamon, butter, etc)" or "Optional: cilantro, lime, queso fresco" → one item per food in the comma list. Drop the wrapper word ("Toppings", "Optional"); leave it OUT of food/note. Skip - filler words like "etc". Each item: quantity=null, unit=null, food=, note=null, approx=true. -- "1 lemon, juice and zest" → 2 items: {qty:1, unit:null, food:"lemon juice"} and {qty:1, unit:null, food:"lemon zest"} + filler words like "etc". Each item: quantity=null, unit=null, food=, note=null, approx=true. +- "1 lemon, juice and zest" → 2 items: {{qty:1, unit:null, food:}} and {{qty:1, unit:null, food:}}. - DO NOT split "salt and vinegar chips" or "macaroni and cheese" — those are compound food names, not multi-food lines. 
Heuristic: if the words on either side of "and" are a recognized standalone food, split; otherwise keep as one. PARSE RULES (for the common 1→1 case): - Convert fractions: "1/2" → 0.5, "1 1/4" → 1.25 -- "a pinch" / "a dash" alone → {quantity: 1, unit: "pinch"|"dash", approx: true} -- "to taste" alone → {quantity: null, unit: null, food: , note: "to taste", approx: true} -- "1 small onion" → {quantity: 1, unit: null, food: "onion", note: "small"} -- "2 cloves garlic, minced" → {quantity: 2, unit: "clove", food: "garlic", note: "minced"} -- "1.5 cups broccoli (coarsely chopped florets)" → {quantity: 1.5, unit: "cup", food: "broccoli", note: "coarsely chopped florets"} +- "a pinch" / "a dash" alone → {{quantity: 1, unit: "pinch"|"dash", approx: true}} +- "to taste" alone → {{quantity: null, unit: null, food: , note: "to taste", approx: true}} +- "1 small onion" → {{quantity: 1, unit: null, food: "onion", note: "small"}} +- "2 cloves garlic, minced" → {{quantity: 2, unit: "clove", food: "garlic", note: "minced"}} +- "1.5 cups broccoli (coarsely chopped florets)" → {{quantity: 1.5, unit: "cup", food: "broccoli", note: "coarsely chopped florets"}} - Section headers like "For the sauce:" → 1 item with all fields null EXCEPT - note: "
" (so Mealie can preserve the header row) + note: "
", is_new_food: false (so Mealie can preserve the header row) - If you genuinely cannot parse (junk input), return 1 item with all fields null - and the original string in note. + and the original string in note, is_new_food: false. - DO NOT add fields not in the schema. - DO NOT wrap output in markdown fences. - DO NOT include any prose before or after the JSON. -Input shape: {"ingredients": ["str", "str", ...]} -Output shape: {"parses": [[{...}, {...}], [{...}], [{...}, {...}, {...}], ...]} +Input shape: {{"ingredients": ["str", "str", ...]}} +Output shape: {{"parses": [[{{...}}, {{...}}], [{{...}}], [{{...}}, {{...}}, {{...}}], ...]}} The outer list MUST have the same length as the input list. Each inner list MUST contain at least 1 item (use the all-null junk-fallback if needed). """ @@ -79,6 +101,7 @@ class IngredientParse: food: str | None note: str | None approx: bool + is_new_food: bool = False # true when Sonnet proposes a new canonical name @dataclass @@ -100,6 +123,11 @@ class Sterilizer: self.mealie = mealie self.forge = forge self.model = model + # Lazy-loaded canonical food catalog from Mealie. Fetched once + # per Sterilizer instance (so a bulk sterilize job pulls it + # once and reuses across all 226 recipe parses). + self._catalog_cache: list[dict] | None = None + self._catalog_prompt: str | None = None # --- public ------------------------------------------------------------- @@ -343,6 +371,73 @@ class Sterilizer: # --- private ------------------------------------------------------------ + # --- canonical food catalog (Mealie is source of truth) ---------------- + + def _load_catalog(self) -> list[dict]: + """Pull every food row from Mealie in one big request. The user's + session token scopes to their group, so this spans every household + the user can see — fine, we want Sonnet to know all canonical + names. Cached on the instance after first call. 
+ + We use the underlying _get directly (not list_foods) so we can + also pass a page param if a single perPage=2000 request doesn't return everything + in one shot.""" + if self._catalog_cache is not None: + return self._catalog_cache + out: list[dict] = [] + page = 1 + while page <= 20: # defensive ceiling + resp = self.mealie._get( + "/api/foods", search="", perPage=2000, page=page + ) + items = resp.get("items") or resp.get("data") or [] + for item in items: + out.append(item) + tp = resp.get("total_pages") or resp.get("totalPages") or 1 + if not items or page >= tp: + break + page += 1 + self._catalog_cache = out + return out + + def _catalog_for_prompt(self) -> str: + """Render the catalog as a bullet list for the system prompt. + Cached on the instance so we don't rebuild this per batch.""" + if self._catalog_prompt is not None: + return self._catalog_prompt + items = self._load_catalog() + lines: list[str] = [] + for it in items: + name = (it.get("name") or "").strip() + if not name: + continue + plural = (it.get("pluralName") or "").strip() + aliases = it.get("aliases") or [] + # Aliases on Mealie can be a list of strings or a list of + # {name, foodId} dicts depending on version. Normalize.
+ alias_names: list[str] = [] + for a in aliases: + if isinstance(a, str) and a.strip(): + alias_names.append(a.strip()) + elif isinstance(a, dict): + n = (a.get("name") or "").strip() + if n: + alias_names.append(n) + line = f" • {name}" + if plural and plural.lower() != name.lower(): + line += f" (plural: {plural})" + if alias_names: + line += f" [aliases: {', '.join(alias_names)}]" + lines.append(line) + self._catalog_prompt = "\n".join(lines) + return self._catalog_prompt + + def _system_prompt(self) -> str: + """Build the full system prompt from STERILIZE_SYSTEM_TEMPLATE with the catalog spliced in.""" + return STERILIZE_SYSTEM_TEMPLATE.format(foods=self._catalog_for_prompt()) + + # --- per-batch Sonnet call --------------------------------------------- + def _parse_batch(self, strings: list[str]) -> list[list[IngredientParse]]: """Returns list-of-lists matching the input length. Each inner list is the parses derived from one input string (1 in normal case, N @@ -352,8 +447,8 @@ class Sterilizer: resp = self.forge.run( prompt=prompt, model=self.model, - system=STERILIZE_SYSTEM, - timeout_secs=120, + system=self._system_prompt(), + timeout_secs=180, ) except ForgeError as e: raise RuntimeError(f"clawdforge failed: {e}") from e @@ -389,6 +484,7 @@ class Sterilizer: food=_clean_str(it.get("food")), note=_clean_str(it.get("note")), approx=bool(it.get("approx")), + is_new_food=bool(it.get("is_new_food")), ) ) if not inner: