cauldron/scripts/clean_foods_seed.py
Kayos d649b99aef v0.3 step 5: lean shopping list — claude on-demand foods + game strip
Two changes:

1. foods catalog grows organically. Switch the canonical seed from the
   noisy USDA dump (2462 rows of "'s, classic chicken noodle soup")
   to the Sonnet-curated cut (229 clean rows). search_food() is now
   exact + case-insensitive — Mealie's parser already canonicalizes
   food names household-side, so cauldron just needs to look them up
   verbatim. On miss, the /list view calls forge.fetch_food_info() to
   ask Sonnet for {density_g_per_ml, default_unit_class, common_size_g,
   category}, persists the row with source='claude', and the household's
   actual kitchen catalog builds itself out as Abby uses it.

   Killer case verified end-to-end: "2 cups + 50g + 1.25 lb rice"
   collapses to a single "2.25 lb rice" line on the shopping list once
   rice has a density row.

2. Game system stripped from /plan. Scoreboard panel, streak banner,
   "first to lock takes the week" / "🏆 you locked this one in" copy
   all gone. award_pick_points calls in /api/plan/generate +
   /api/plan/regenerate stopped firing. household_scoreboard /
   household_streak DB methods kept as dead code; cauldron_pick_points
   table left in place — non-destructive, easy to revive later if
   gamification comes back. Goal: get the base flow (pick → plan →
   list) working for Abby first, layer features on after.
2026-04-29 22:02:20 -07:00

209 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Clean the USDA-derived foods seed via clawdforge → Sonnet, in batches.
Input: cauldron/data/foods_seed_usda.json (~2462 noisy rows)
Output: cauldron/data/foods_seed.json (curated, ~500-800 rows)
Why batched: 2462 entries × 200 chars = 577KB prompt; Sonnet hit timeout
on the single-shot curation. Splitting by category (one request per category,
capped at 120 entries, or 80 for "other") keeps each batch small
→ ~50KB prompt → ~30-60s round-trip.
Run with:
CLAWDFORGE_TOKEN=cf_... python3 scripts/clean_foods_seed.py
"""
import json
import os
import sys
import time
from collections import defaultdict
from pathlib import Path
import requests
# Repository root.  The script is run as scripts/clean_foods_seed.py, so the
# root is two levels above this file.  resolve() makes the result independent
# of the current working directory: without it a relative __file__ (e.g.
# running from inside scripts/ on interpreters that do not absolutize it)
# collapses to "." and the data paths below point at the wrong directory.
HERE = Path(__file__).resolve().parent.parent
# Input (raw USDA-derived rows) and output (curated rows) seed files.
RAW_PATH = HERE / "cauldron/data/foods_seed_usda.json"
OUT_PATH = HERE / "cauldron/data/foods_seed.json"
# Base URL of the clawdforge service; override with the CLAWDFORGE_URL env var.
CLAWDFORGE_URL = os.environ.get("CLAWDFORGE_URL", "http://192.168.0.5:8800")
# Bearer token for clawdforge.  Required: a missing variable raises KeyError
# as soon as the module is loaded.
CLAWDFORGE_TOKEN = os.environ["CLAWDFORGE_TOKEN"]
# Per-category list of count-based foods (with a typical weight per item)
# offered to Sonnet as seeds; it remains free to add more of its own.
COUNT_FOODS_HINTS = {
    "produce-vegetable": ", ".join(
        (
            "onion (~150g)",
            "garlic clove (~5g)",
            "tomato (~120g)",
            "potato (~170g)",
            "bell pepper (~120g)",
            "jalapeno (~14g)",
            "shallot (~25g)",
            "carrot (~60g)",
            "leek (~90g)",
        )
    ),
    "produce-fruit": ", ".join(
        (
            "apple (~180g)",
            "banana (~118g)",
            "orange (~130g)",
            "lemon (~60g)",
            "lime (~65g)",
            "avocado (~200g)",
            "peach (~150g)",
        )
    ),
    "dairy": "egg (~50g large, count not mass), slice cheese (~28g)",
    "grain": "slice bread (~28g), tortilla (~50g)",
    # Categories below have no per-item hints; the text explains why.
    "meat": "(no count hints — meat is sold by weight)",
    "legume": "can (~425g drained tomato/bean/etc when relevant)",
    "condiment": "can (~400g for canned tomato/coconut milk)",
    "oil-fat": "(none — sold by volume or weight)",
    "spice": "(none — pinch/dash for to-taste)",
    "baking": "(none unless slice-of-X applies)",
    "beverage": "(none — bought in bottles, treat as volume)",
    "nut-seed": "(none — sold by weight)",
    "other": "(skip count hints)",
}
# System prompt sent with every batch request.  It fixes the output schema
# (a single JSON object with a "foods" list) and the curation rules.  The
# rule-2 examples use "raw name" → "canonical name"; the first three had lost
# that separator, which left the examples unreadable as mappings.
SYSTEM_PROMPT = """You are a culinary database curator. You receive a small batch
of raw USDA-derived food entries (one category at a time) and produce a clean,
useful subset for a family meal-planning app.
OUTPUT: ONE valid JSON object. No markdown fences, no prose, JSON only:
{
"foods": [
{
"canonical_name": "<short singular noun, lowercase>",
"category": "<input_category>",
"density_g_per_ml": <number or null>,
"default_unit_class": "<one of: mass, volume, count, mixed>",
"common_size_g": <number or null>,
"usda_fdc_id": <int or null>,
"notes": <string or null>
}
]
}
RULES:
1. **Drop ruthlessly** — composite/prepared meals, brand-laden entries
(PILLSBURY ..., GERBER ...), babyfood, alcoholic beverages, fast food,
ready-to-eat junk, sulfured/preserved derivatives.
2. **Normalize canonical_name** to the simplest cooking form:
- "Apples, raw" → "apple"
- "Rice, white, long-grain, regular, raw" → "white rice" (or "rice")
- "Pepper, black, ground" → "black pepper"
- "Mayonnaise, reduced fat, with olive oil" → DROP (variant)
- Keep meaningful distinctions ("brown rice" vs "white rice", "salted butter" vs "unsalted butter")
3. **Preserve density_g_per_ml** from the input — don't re-derive.
4. **default_unit_class**:
- mass: dry goods sold by weight (rice, flour, sugar, meat, beans, butter)
- volume: liquids (milk, oil, juice, syrup, vinegar)
- count: discrete items (egg, onion, garlic clove, lemon, slice bread)
- mixed: bought in different forms (cheese — block vs shredded; salt — pinch vs grams)
5. **For count foods, set common_size_g** so the aggregator can convert
"2 onions + 1 cup chopped onion" sensibly.
6. **ADD common count-based foods USDA doesn't track for this category**
if they're missing. Suggested hints will be supplied per category.
7. Cap output at **80 foods per category**. Quality over quantity. Drop
variants — pick the canonical form and skip the rest.
8. JSON only. No markdown fences. No preamble."""
# Per-batch user message, one line per tuple element.  str.format()
# placeholders: {category}, {n}, {hints} and {json}.
USER_PROMPT_TEMPLATE = "\n".join(
    (
        "Curate the **{category}** entries below.",
        "Input: {n} raw entries.",
        "Suggested count-foods to add for this category if missing:",
        "{hints}",
        "Entries:",
        "{json}",
    )
)
# Per-request entry caps that keep the prompt size sane.  'other' is the
# noisiest bucket, so it gets the tighter cap.
_BATCH_CAP = 120
_OTHER_BATCH_CAP = 80


def _parse_result(result):
    """Return the forge ``result`` payload as a parsed JSON value.

    Non-string payloads are returned unchanged.  Strings are parsed as JSON;
    a surrounding Markdown code fence (which the model may emit despite the
    system prompt) is removed first.

    Raises:
        json.JSONDecodeError: if the string is not valid JSON.
    """
    if not isinstance(result, str):
        return result
    text = result.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        text = text.partition("\n")[2].rsplit("```", 1)[0]
    return json.loads(text)


def _merge_foods(foods, cat, all_foods, seen_canonical):
    """Append the new, well-formed entries of *foods* to *all_foods*.

    Entries that are not dicts, have no usable string ``canonical_name``, or
    whose normalized name is already in *seen_canonical* are skipped.  Kept
    entries get their name normalized (stripped, lowercased) and their
    category forced to *cat*.  Returns the number of entries kept.
    """
    kept = 0
    for food in foods:
        if not isinstance(food, dict):
            continue
        name = food.get("canonical_name")
        if not isinstance(name, str):
            continue
        name = name.strip().lower()
        if not name or name in seen_canonical:
            continue
        food["canonical_name"] = name
        food["category"] = cat  # enforce category from outer batch
        seen_canonical.add(name)
        all_foods.append(food)
        kept += 1
    return kept


def main():
    """Curate the raw foods seed one category at a time and write the result.

    Reads RAW_PATH, buckets the rows by their ``category`` (missing/empty
    becomes "other"), sends one capped batch per category to the clawdforge
    ``/run`` endpoint, de-duplicates the returned foods by canonical name and
    writes the sorted list to OUT_PATH.  Progress goes to stderr.

    A batch that fails (transport error, HTTP >= 400, malformed response) is
    reported and skipped; the remaining categories are still processed.

    Returns:
        0 when OUT_PATH was written, 1 when no food was curated at all (in
        which case OUT_PATH is left untouched rather than overwritten with
        an empty list).
    """
    # The seed files contain non-ASCII text (ensure_ascii=False below), so do
    # not depend on the platform's locale encoding.
    raw = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw)} raw foods", file=sys.stderr)

    # Bucket by category
    by_cat: dict[str, list[dict]] = defaultdict(list)
    for row in raw:
        by_cat[row.get("category") or "other"].append(row)
    print(f"Categories: {[(c, len(items)) for c, items in sorted(by_cat.items(), key=lambda x: -len(x[1]))]}", file=sys.stderr)
    print(file=sys.stderr)

    all_foods: list[dict] = []
    seen_canonical: set[str] = set()
    failed: list[str] = []

    endpoint = f"{CLAWDFORGE_URL.rstrip('/')}/run"
    headers = {"Authorization": f"Bearer {CLAWDFORGE_TOKEN}"}

    # 'other' is too noisy + low-priority — process it last so that every
    # real category claims its canonical names first (first writer wins in
    # the de-duplication), and let Sonnet drop ~all of it.
    for cat in sorted(by_cat, key=lambda c: (c == "other", c)):
        items = by_cat[cat]
        cap = _OTHER_BATCH_CAP if cat == "other" else _BATCH_CAP
        slice_items = items[:cap]
        skipped = len(items) - len(slice_items)
        prompt = USER_PROMPT_TEMPLATE.format(
            category=cat,
            n=len(slice_items),
            hints=COUNT_FOODS_HINTS.get(cat, "(none)"),
            json=json.dumps(slice_items, ensure_ascii=False),
        )
        label = f"[{cat}] {len(slice_items)} entries"
        if skipped:
            # Make the truncation visible instead of silently losing rows.
            label += f" (+{skipped} over the cap, not sent)"
        print(f"{label} → ", end="", file=sys.stderr, flush=True)

        t0 = time.monotonic()
        try:
            r = requests.post(
                endpoint,
                headers=headers,
                json={
                    "prompt": prompt,
                    "system": SYSTEM_PROMPT,
                    "model": "sonnet",
                    "timeout_secs": 180,
                },
                # Client timeout is longer than the forge-side one so the
                # service gets the chance to report its own timeout.
                timeout=210,
            )
        except requests.RequestException as e:
            print(f"transport err: {e}", file=sys.stderr)
            failed.append(cat)
            continue
        dur = time.monotonic() - t0

        if r.status_code >= 400:
            print(f"HTTP {r.status_code} ({dur:.1f}s) body={r.text[:200]}", file=sys.stderr)
            failed.append(cat)
            continue
        try:
            body = r.json()
        except ValueError:
            # A 2xx with a non-JSON body must not abort the whole run.
            print(f"non-JSON response ({dur:.1f}s) body={r.text[:200]}", file=sys.stderr)
            failed.append(cat)
            continue
        if not isinstance(body, dict) or not body.get("ok"):
            err = body.get("error", "") if isinstance(body, dict) else ""
            print(f"forge !ok ({dur:.1f}s) {err}", file=sys.stderr)
            failed.append(cat)
            continue
        try:
            result = _parse_result(body.get("result"))
        except json.JSONDecodeError:
            print(f"non-JSON result ({dur:.1f}s)", file=sys.stderr)
            failed.append(cat)
            continue

        foods = result.get("foods") if isinstance(result, dict) else None
        if not isinstance(foods, list):
            foods = []
        kept = _merge_foods(foods, cat, all_foods, seen_canonical)
        # Approximate: Sonnet may also add foods that were not in the input.
        dropped = max(0, len(slice_items) - kept)
        print(f"{kept} kept, ~{dropped} dropped ({dur:.1f}s)", file=sys.stderr)

    print(file=sys.stderr)
    if failed:
        print(
            f"WARNING: {len(failed)} category batch(es) failed: {', '.join(failed)}",
            file=sys.stderr,
        )
    if not all_foods:
        # Never replace an existing seed with an empty list.
        print(f"Nothing curated — leaving {OUT_PATH} untouched", file=sys.stderr)
        return 1

    all_foods.sort(key=lambda x: (x["category"] or "", x["canonical_name"]))
    print(f"Total cleaned: {len(all_foods)} foods", file=sys.stderr)
    OUT_PATH.write_text(
        json.dumps(all_foods, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUT_PATH}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())