cauldron/cauldron/sterilizer.py
Kayos f7b30d3b65 sterilize: search-then-create + retry-on-UNIQUE-400 + don't mark errored as applied
Job 1's bulk run applied 184 recipes, and 182 of them failed with the
same error: POST /api/foods -> 400 UNIQUE constraint failed:
ingredient_foods.name, ingredient_foods.group_id. Cause: Mealie's
name_normalized strips punctuation/whitespace/case more aggressively
than our local _build_name_index's plain .lower(), so the cache misses,
create_food fires blindly, and Mealie's UNIQUE constraint rejects
the call. The whole-recipe apply was wrapped in try/except at the bulk
runner, so the recipe got marked errored — but applied_at was still
set to NOW(), making the rerun think the recipe had already been
handled. An attempt had been made, but the recipe was still unparsed.

Two fixes:

1. sterilizer._resolve_food / _resolve_unit replace the inline
   create-on-miss block. Order: local cache → Mealie search-endpoint
   tie-break → create. On any UNIQUE-flavored 400 from create, fall
   back to one more search to adopt whatever Mealie has under the
   normalized form. Mealie's search endpoint applies its own
   name_normalized internally so we don't have to mirror its rules.
   _search_for_match takes "foods" or "units" and looks for an exact
   case-insensitive match against name or pluralName, with a fallback
   to "trust Mealie's ranker" when there's exactly one hit.

2. db.mark_proposal_applied no longer sets applied_at on error. On
   success: applied_at=NOW(), apply_error=NULL. On error: applied_at
   stays NULL, apply_error gets the message.
   list_approved_unapplied_proposals keys off applied_at IS NULL, so a
   rerun naturally retries only the failed recipes.

Net effect: rerun can now successfully apply the 182 failed recipes
without re-walking them, and won't waste calls on the 2 that did go
through.
2026-04-30 06:05:19 -07:00

369 lines
14 KiB
Python

"""Ingredient sterilizer — turn Mealie's free-form ingredient strings into
structured (qty, unit, food, note) so shopping-list aggregation works.
Why this exists: Mealie has its own CRF parser, but it's mediocre and produces
inconsistent results. Cobb's hand-typed recipes have lots of "about 2 cups
cooked white rice" / "1 small handful kale" / "a pinch of salt" etc. that
slip past the parser. We send these to Sonnet via clawdforge and get back
clean structured form.
Flow:
1. Fetch the recipe from Mealie
2. Build a single batched prompt with all ingredients (one Sonnet call/recipe)
3. Get back a parallel array of {quantity, unit, food, note}
4. (preview) return the proposal
5. (apply) link each parse to existing Mealie food/unit (create if missing),
then PUT the updated recipe back
"""
import json
import math
from dataclasses import dataclass, asdict

from .forge import Forge, ForgeError
from .mealie import Mealie, MealieError
# System prompt sent with every batch parse request. It pins the per-item
# output schema and requires a bare JSON object of the form
# {"parses": [...]} with one entry per input ingredient, in input order.
# The text is consumed at runtime by the model, so edits change behavior.
STERILIZE_SYSTEM = """You are a precise recipe ingredient parser. You ONLY output valid JSON.
You receive a list of free-form ingredient strings and must return a parallel
array where each item is parsed into structured form.
Output schema (per item):
{
"quantity": <number or null>, # numeric amount, fractions converted to decimals (1/2 -> 0.5)
"unit": <string or null>, # singular canonical form: "cup", "tbsp", "tsp", "oz", "lb", "g", "kg", "ml", "l", "clove", "slice", "can", "package", "piece", "pinch", "dash", "handful". null if no unit (e.g. "1 onion").
"food": <string or null>, # the core food noun in singular canonical form: "onion", "garlic", "rice", "olive oil". Strip prep state ("chopped", "diced") -- those go in note.
"note": <string or null>, # prep state, brand, color, modifier: "chopped", "extra virgin", "yellow", "to taste"
"approx": <bool> # true if the input said "about" / "a pinch" / "to taste" / vague qty
}
Rules:
- Convert fractions: "1/2" -> 0.5, "1 1/4" -> 1.25
- "a pinch", "a dash", "to taste" -> {quantity: null, approx: true, note: "to taste"}
- "1 small onion" -> {quantity: 1, unit: null, food: "onion", note: "small"}
- "2 cloves garlic, minced" -> {quantity: 2, unit: "clove", food: "garlic", note: "minced"}
- Section headers like "For the sauce:" -> all fields null EXCEPT note: "<header text>"
- If you genuinely cannot parse, set all fields null and put the original in note.
- DO NOT add fields not in the schema.
- DO NOT wrap output in markdown fences.
- DO NOT include any prose before or after the JSON.
You will be given a JSON object: {"ingredients": ["str", "str", ...]}
You return: {"parses": [{...}, {...}, ...]} -- same length, same order.
"""
@dataclass
class IngredientParse:
    """Structured form of one ingredient string as returned by the parser."""

    # Numeric amount, or None when absent or not convertible to a float.
    quantity: float | None
    # Unit name, or None when the ingredient has no unit.
    unit: str | None
    # Food name, or None (the prompt asks for this on section headers).
    food: str | None
    # Free-text remainder such as prep state; None when empty.
    note: str | None
    # Truthiness of the parser's "approx" flag (vague quantity in the input).
    approx: bool
@dataclass
class IngredientProposal:
    """One ingredient before vs after: the Mealie values next to the new parse."""

    # Position of the ingredient in the recipe's recipeIngredient list.
    index: int
    # The ingredient's "display" value from Mealie ("" when missing).
    original_display: str
    # Quantity / unit name / food name / note as currently stored in Mealie.
    original_quantity: float | None
    original_unit_name: str | None
    original_food_name: str | None
    original_note: str | None
    # The proposed structured replacement.
    parsed: IngredientParse
class Sterilizer:
def __init__(self, *, mealie: Mealie, forge: Forge, model: str = "sonnet"):
self.mealie = mealie
self.forge = forge
self.model = model
# --- public -------------------------------------------------------------
def preview_recipe(self, slug: str) -> dict:
    """Dry-run: parse all ingredients, return proposals without writing.

    Args:
        slug: Mealie recipe slug to fetch.

    Returns:
        A dict with ``slug``, ``name``, ``ingredient_count`` and
        ``proposals`` (one plain dict per ingredient, in recipe order).
        A recipe with no ingredients yields the same keys with a count of
        0 and an empty proposal list, so callers can read
        ``ingredient_count`` unconditionally.

    Raises:
        RuntimeError: propagated from ``_parse_batch`` when the parser
            call fails or returns an unexpected shape.
    """
    recipe = self.mealie.get_recipe(slug)
    ingredients = recipe.get("recipeIngredient") or []
    if not ingredients:
        # Nothing to parse: skip the model call entirely.
        return {
            "slug": slug,
            "name": recipe.get("name"),
            "ingredient_count": 0,
            "proposals": [],
        }

    strings = [_render_ingredient_for_parse(ing) for ing in ingredients]
    # _parse_batch guarantees len(parses) == len(strings), so zip is lossless.
    parses = self._parse_batch(strings)
    proposals = [
        IngredientProposal(
            index=i,
            original_display=ing.get("display") or "",
            original_quantity=ing.get("quantity"),
            # `or {}` covers both a missing and a null unit/food.
            original_unit_name=(ing.get("unit") or {}).get("name"),
            original_food_name=(ing.get("food") or {}).get("name"),
            original_note=ing.get("note"),
            parsed=parse,
        )
        for i, (ing, parse) in enumerate(zip(ingredients, parses))
    ]
    return {
        "slug": slug,
        "name": recipe.get("name"),
        "ingredient_count": len(ingredients),
        "proposals": [_proposal_to_dict(p) for p in proposals],
    }
def apply_recipe(self, slug: str, *, create_missing: bool = True) -> dict:
    """Run preview, then write changes back to Mealie.

    For each ingredient we resolve (or create) Mealie food/unit by name,
    then assemble the new recipeIngredient list and PUT the recipe.

    Mealie normalizes food/unit names more aggressively than .lower()
    (its name_normalized strips punctuation + collapses whitespace +
    unicode-folds). So a local-cache miss followed by a blind create
    can hit Mealie's UNIQUE constraint on (name, group_id). We
    ALWAYS try the search endpoint as a tie-break before creating,
    and on a UNIQUE-violation 400 we re-search and adopt whatever
    Mealie has under that normalized form.

    Args:
        slug: Mealie recipe slug.
        create_missing: when False, foods/units that cannot be found are
            not created and the ingredient keeps its existing link.

    Returns:
        A dict with ``slug``, ``updated`` (ingredients written),
        ``skipped`` (ingredients where a parsed food or unit name could
        not be resolved to an id), ``created_foods`` and ``created_units``.

    Raises:
        RuntimeError: if the recipe's ingredient count differs from the
            number of proposals (the recipe changed between the preview
            fetch and the apply fetch), or from ``preview_recipe``.
    """
    preview = self.preview_recipe(slug)
    proposals = preview["proposals"]
    if not proposals:
        return {
            "slug": slug,
            "updated": 0,
            "skipped": 0,
            "created_foods": [],
            "created_units": [],
        }

    # preview_recipe fetched its own copy; this second fetch is the one
    # that gets written back.
    recipe = self.mealie.get_recipe(slug)
    current = recipe.get("recipeIngredient") or []
    if len(current) != len(proposals):
        # zip() would silently truncate and the PUT below would then drop
        # or mis-assign ingredients. Refuse to write instead.
        raise RuntimeError(
            f"recipe {slug!r} changed during apply: "
            f"{len(current)} ingredients now, {len(proposals)} parsed"
        )

    food_index = self._build_name_index(self.mealie.list_foods())
    unit_index = self._build_name_index(self.mealie.list_units())
    created_foods: list[str] = []
    created_units: list[str] = []
    new_ingredients: list[dict] = []
    skipped = 0

    for orig_ing, prop in zip(current, proposals):
        parsed = prop["parsed"]
        new_ing = dict(orig_ing)  # preserve id, refId, original_text
        new_ing["quantity"] = parsed["quantity"]
        unresolved = False

        food_name = (parsed.get("food") or "").strip()
        if food_name:
            food_id = self._resolve_food(
                food_name, food_index,
                create_missing=create_missing,
                created_log=created_foods,
            )
            if food_id:
                new_ing["food"] = {"id": food_id, "name": food_name}
                new_ing["isFood"] = True
            else:
                # No id available: leave the existing food link untouched.
                unresolved = True
        else:
            # Section header style — clear food, mark not-food
            new_ing["food"] = None
            new_ing["isFood"] = False

        unit_name = (parsed.get("unit") or "").strip()
        if unit_name:
            unit_id = self._resolve_unit(
                unit_name, unit_index,
                create_missing=create_missing,
                created_log=created_units,
            )
            if unit_id:
                new_ing["unit"] = {"id": unit_id, "name": unit_name}
            else:
                # No id available: leave the existing unit link untouched.
                unresolved = True
        else:
            new_ing["unit"] = None

        new_ing["note"] = parsed.get("note") or ""
        new_ingredients.append(new_ing)
        skipped += unresolved

    recipe["recipeIngredient"] = new_ingredients
    self.mealie.update_recipe(slug, recipe)
    return {
        "slug": slug,
        "updated": len(new_ingredients),
        "skipped": skipped,
        "created_foods": created_foods,
        "created_units": created_units,
    }
# --- food/unit resolution helpers --------------------------------------
def _resolve_food(
self,
name: str,
index: dict[str, str],
*,
create_missing: bool,
created_log: list[str],
) -> str | None:
"""Find or create a Mealie food row, robust to normalization gaps."""
key = name.lower()
# Step 1: local cache hit (covers name + pluralName from list_foods)
if key in index:
return index[key]
# Step 2: server-side search — Mealie does proper normalization here
existing_id = self._search_for_match(name, "foods")
if existing_id:
index[key] = existing_id
return existing_id
# Step 3: create. If Mealie races us with a UNIQUE-constraint 400,
# search again and use whatever it has under the normalized form.
if not create_missing:
return None
try:
created = self.mealie.create_food(name)
food_id = created.get("id")
except MealieError as e:
msg = str(e)
if "UNIQUE constraint" in msg or "400" in msg:
food_id = self._search_for_match(name, "foods")
if not food_id:
raise # truly couldn't reconcile — let caller record error
else:
raise
if food_id:
index[key] = food_id
created_log.append(name)
return food_id
def _resolve_unit(
self,
name: str,
index: dict[str, str],
*,
create_missing: bool,
created_log: list[str],
) -> str | None:
key = name.lower()
if key in index:
return index[key]
existing_id = self._search_for_match(name, "units")
if existing_id:
index[key] = existing_id
return existing_id
if not create_missing:
return None
try:
created = self.mealie.create_unit(name)
unit_id = created.get("id")
except MealieError as e:
msg = str(e)
if "UNIQUE constraint" in msg or "400" in msg:
unit_id = self._search_for_match(name, "units")
if not unit_id:
raise
else:
raise
if unit_id:
index[key] = unit_id
created_log.append(name)
return unit_id
def _search_for_match(self, name: str, kind: str) -> str | None:
"""Use Mealie's search endpoint to find a foods/units row matching
`name`. Returns the id of the first item whose name or pluralName
matches (case-insensitive) the query, else None."""
target = name.strip().lower()
if not target:
return None
listing = (self.mealie.list_foods(search=name)
if kind == "foods"
else self.mealie.list_units(search=name))
items = listing.get("items") or listing.get("data") or []
# Mealie's search returns ranked results; take the first exact-ish match
for item in items:
for field in ("name", "pluralName"):
v = (item.get(field) or "").strip().lower()
if v and v == target:
return item.get("id")
# Fallback: if there's exactly one search hit, trust Mealie's ranker
if len(items) == 1 and items[0].get("id"):
return items[0]["id"]
return None
# --- private ------------------------------------------------------------
def _parse_batch(self, strings: list[str]) -> list[IngredientParse]:
    """Send all ingredient strings in one request and validate the reply.

    Args:
        strings: ingredient texts, in recipe order.

    Returns:
        One ``IngredientParse`` per input string, in the same order.

    Raises:
        RuntimeError: if the forge call fails, or if the reply is not a
            ``{"parses": [...]}`` object holding exactly ``len(strings)``
            JSON objects.
    """
    prompt = json.dumps({"ingredients": strings}, ensure_ascii=False)
    try:
        resp = self.forge.run(
            prompt=prompt,
            model=self.model,
            system=STERILIZE_SYSTEM,
            timeout_secs=120,
        )
    except ForgeError as e:
        raise RuntimeError(f"clawdforge failed: {e}") from e

    result = resp.get("result")
    if not isinstance(result, dict) or "parses" not in result:
        raise RuntimeError(f"unexpected response shape: {str(result)[:200]}")

    parses_raw = result["parses"]
    # Checked separately from the length test: calling len() on a non-list
    # inside the error message would raise TypeError and hide the real
    # problem.
    if not isinstance(parses_raw, list):
        raise RuntimeError(
            f"unexpected response shape: 'parses' is "
            f"{type(parses_raw).__name__}, expected list"
        )
    if len(parses_raw) != len(strings):
        raise RuntimeError(
            f"parse count mismatch: got {len(parses_raw)}, expected {len(strings)}"
        )

    out: list[IngredientParse] = []
    for position, p in enumerate(parses_raw):
        if not isinstance(p, dict):
            # Keep failures in the RuntimeError family instead of letting
            # `.get` raise AttributeError on a string/null entry.
            raise RuntimeError(
                f"parse {position} is not an object: {str(p)[:200]}"
            )
        out.append(
            IngredientParse(
                quantity=_coerce_float(p.get("quantity")),
                unit=_clean_str(p.get("unit")),
                food=_clean_str(p.get("food")),
                note=_clean_str(p.get("note")),
                approx=bool(p.get("approx")),
            )
        )
    return out
@staticmethod
def _build_name_index(listing: dict) -> dict[str, str]:
index: dict[str, str] = {}
items = listing.get("items") or listing.get("data") or []
for item in items:
if name := item.get("name"):
index[name.lower()] = item["id"]
if plural := item.get("pluralName"):
index[plural.lower()] = item["id"]
return index
def _render_ingredient_for_parse(ing: dict) -> str:
"""Best string representation of a Mealie ingredient for sending to Claude."""
if ing.get("originalText"):
return ing["originalText"]
if ing.get("display"):
return ing["display"]
parts: list[str] = []
if (q := ing.get("quantity")) is not None:
parts.append(str(q))
if u := ing.get("unit"):
parts.append(u.get("name") or "")
if f := ing.get("food"):
parts.append(f.get("name") or "")
if note := ing.get("note"):
parts.append(note)
return " ".join(p for p in parts if p).strip() or "(empty)"
def _coerce_float(v) -> float | None:
if v is None:
return None
try:
return float(v)
except (TypeError, ValueError):
return None
def _clean_str(v) -> str | None:
if v is None:
return None
s = str(v).strip()
return s or None
def _proposal_to_dict(p: IngredientProposal) -> dict:
d = asdict(p)
return d