#!/usr/bin/env python3
"""Clean the USDA-derived foods seed via clawdforge → Sonnet, in batches.

Input:  cauldron/data/foods_seed_usda.json (~2462 noisy rows)
Output: cauldron/data/foods_seed.json (curated, ~500-800 rows)

Why batched: 2462 entries × 200 chars = 577KB prompt; Sonnet hit timeout on
the single-shot curation. Splitting by category keeps each batch ~30-80
entries → ~50KB prompt → ~30-60s round-trip.

Run with:
    CLAWDFORGE_TOKEN=cf_... python3 scripts/clean_foods_seed.py
"""

import json
import os
import sys
import time
from collections import defaultdict
from pathlib import Path

import requests

# Despite the name, this is the parent of the directory holding this script;
# the data paths below are resolved against it.
HERE = Path(__file__).parent.parent
RAW_PATH = HERE / "cauldron/data/foods_seed_usda.json"
OUT_PATH = HERE / "cauldron/data/foods_seed.json"

CLAWDFORGE_URL = os.environ.get("CLAWDFORGE_URL", "http://clawdforge:8800")
# Required: a missing token raises KeyError before any work is done.
CLAWDFORGE_TOKEN = os.environ["CLAWDFORGE_TOKEN"]

# Per-batch caps on how many raw rows are sent, to keep prompt size sane.
# Rows beyond the cap are NOT sent to the model at all.
MAX_ENTRIES_PER_BATCH = 120
MAX_ENTRIES_OTHER = 80

# Timeout forwarded to clawdforge in the request payload, and the client-side
# HTTP timeout, which is deliberately the longer of the two.
FORGE_TIMEOUT_SECS = 180
HTTP_TIMEOUT_SECS = 210

# Suggested count-foods to add per category — Sonnet uses these as seeds
# but is free to add more.
COUNT_FOODS_HINTS = {
    "produce-vegetable": "onion (~150g), garlic clove (~5g), tomato (~120g), "
                         "potato (~170g), bell pepper (~120g), jalapeno (~14g), "
                         "shallot (~25g), carrot (~60g), leek (~90g)",
    "produce-fruit": "apple (~180g), banana (~118g), orange (~130g), "
                     "lemon (~60g), lime (~65g), avocado (~200g), peach (~150g)",
    "dairy": "egg (~50g large, count not mass), slice cheese (~28g)",
    "grain": "slice bread (~28g), tortilla (~50g)",
    "meat": "(no count hints — meat is sold by weight)",
    "legume": "can (~425g drained tomato/bean/etc when relevant)",
    "condiment": "can (~400g for canned tomato/coconut milk)",
    "oil-fat": "(none — sold by volume or weight)",
    "spice": "(none — pinch/dash for to-taste)",
    "baking": "(none unless slice-of-X applies)",
    "beverage": "(none — bought in bottles, treat as volume)",
    "nut-seed": "(none — sold by weight)",
    "other": "(skip count hints)",
}

# NOTE(review): the <...> value placeholders in the JSON schema below were
# reconstructed from RULES 2-6 of this same prompt; confirm them against the
# intended wording before relying on them.
SYSTEM_PROMPT = """You are a culinary database curator.
You receive a small batch of raw USDA-derived food entries (one category at a time) and produce a clean, useful subset for a family meal-planning app.

OUTPUT: ONE valid JSON object. No markdown fences, no prose, JSON only:

{
  "foods": [
    {
      "canonical_name": "<normalized lowercase name>",
      "category": "<category of this batch>",
      "density_g_per_ml": <number from the input, or null>,
      "default_unit_class": "<mass|volume|count|mixed>",
      "common_size_g": <number for count foods, else null>,
      "usda_fdc_id": <id from the input, or null for added foods>,
      "notes": <string or null>
    }
  ]
}

RULES:
1. **Drop ruthlessly** — composite/prepared meals, brand-laden entries (PILLSBURY ..., GERBER ...), babyfood, alcoholic beverages, fast food, ready-to-eat junk, sulfured/preserved derivatives.
2. **Normalize canonical_name** to the simplest cooking form:
   - "Apples, raw" → "apple"
   - "Rice, white, long-grain, regular, raw" → "white rice" (or "rice")
   - "Pepper, black, ground" → "black pepper"
   - "Mayonnaise, reduced fat, with olive oil" → DROP (variant)
   - Keep meaningful distinctions ("brown rice" vs "white rice", "salted butter" vs "unsalted butter")
3. **Preserve density_g_per_ml** from the input — don't re-derive.
4. **default_unit_class**:
   - mass: dry goods sold by weight (rice, flour, sugar, meat, beans, butter)
   - volume: liquids (milk, oil, juice, syrup, vinegar)
   - count: discrete items (egg, onion, garlic clove, lemon, slice bread)
   - mixed: bought in different forms (cheese — block vs shredded; salt — pinch vs grams)
5. **For count foods, set common_size_g** so the aggregator can convert "2 onions + 1 cup chopped onion" sensibly.
6. **ADD common count-based foods USDA doesn't track for this category** if they're missing. Suggested hints will be supplied per category.
7. Cap output at **80 foods per category**. Quality over quantity. Drop variants — pick the canonical form and skip the rest.
8. JSON only. No markdown fences. No preamble."""

USER_PROMPT_TEMPLATE = """Curate the **{category}** entries below. Input: {n} raw entries.
Suggested count-foods to add for this category if missing: {hints}

Entries:
{json}"""


class _BatchError(Exception):
    """One category batch failed; the message is the text to log for it."""


def _batch_order(categories) -> list:
    """Return the category names alphabetically, with 'other' forced last.

    De-duplication in ``main`` is first-come-first-kept and the category is
    stamped from the batch, so the noisy 'other' bucket must run after every
    real category or it could claim names that belong elsewhere.
    """
    return sorted(categories, key=lambda c: (c == "other", c))


def _run_batch(prompt: str) -> tuple:
    """Send one prompt to clawdforge and return ``(foods, seconds_elapsed)``.

    ``foods`` is the list found under the ``"foods"`` key of the service's
    ``result`` payload (which may arrive as a JSON string or as an already
    decoded object); it is ``[]`` when that key is absent or not a list.

    Raises:
        _BatchError: on a transport failure, an HTTP status >= 400, a
            response body that is not a JSON object, ``ok`` being falsy, or
            a string ``result`` that is not valid JSON.
    """
    t0 = time.monotonic()
    try:
        resp = requests.post(
            f"{CLAWDFORGE_URL.rstrip('/')}/run",
            headers={"Authorization": f"Bearer {CLAWDFORGE_TOKEN}"},
            json={
                "prompt": prompt,
                "system": SYSTEM_PROMPT,
                "model": "sonnet",
                "timeout_secs": FORGE_TIMEOUT_SECS,
            },
            timeout=HTTP_TIMEOUT_SECS,
        )
    except requests.RequestException as e:
        raise _BatchError(f"transport err: {e}") from e
    dur = time.monotonic() - t0

    if resp.status_code >= 400:
        raise _BatchError(
            f"HTTP {resp.status_code} ({dur:.1f}s) body={resp.text[:200]}"
        )

    # A 2xx response with a non-JSON body must fail only this batch, not
    # abort the run and discard every batch already completed.
    try:
        body = resp.json()
    except ValueError as e:
        raise _BatchError(
            f"non-JSON response ({dur:.1f}s) body={resp.text[:200]}"
        ) from e
    if not isinstance(body, dict):
        raise _BatchError(f"forge !ok ({dur:.1f}s) unexpected response shape")
    if not body.get("ok"):
        raise _BatchError(f"forge !ok ({dur:.1f}s) {body.get('error', '')}")

    result = body.get("result")
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError as e:
            raise _BatchError(f"non-JSON result ({dur:.1f}s)") from e

    foods = result.get("foods") if isinstance(result, dict) else None
    return (foods if isinstance(foods, list) else []), dur


def _merge_foods(foods: list, cat: str, all_foods: list, seen_canonical: set) -> int:
    """Append the new, well-formed entries of ``foods`` to ``all_foods``.

    An entry is kept when it is a dict whose ``canonical_name`` is a
    non-blank string not already in ``seen_canonical``. Kept entries are
    mutated in place: the name is stripped and lower-cased, and ``category``
    is overwritten with ``cat``. Malformed entries are skipped, not raised on.

    Returns:
        The number of entries kept.
    """
    kept = 0
    for food in foods:
        if not isinstance(food, dict):
            continue
        name = food.get("canonical_name")
        if not isinstance(name, str):
            continue
        cn = name.strip().lower()
        if not cn or cn in seen_canonical:
            continue
        food["canonical_name"] = cn
        food["category"] = cat  # enforce category from outer batch
        seen_canonical.add(cn)
        all_foods.append(food)
        kept += 1
    return kept


def main():
    """Curate the raw seed category by category and write the cleaned seed.

    Reads RAW_PATH, buckets rows by their ``category`` (falsy → "other"),
    sends one capped batch per category to clawdforge, de-duplicates the
    returned foods by canonical name across batches, and writes the sorted
    result to OUT_PATH. Progress goes to stderr. A failed batch is logged
    and skipped; OUT_PATH is left untouched if nothing at all was kept.
    """
    raw = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw)} raw foods", file=sys.stderr)

    # Bucket by category
    by_cat: dict[str, list[dict]] = defaultdict(list)
    for row in raw:
        by_cat[row.get("category") or "other"].append(row)
    counts = sorted(
        ((c, len(items)) for c, items in by_cat.items()),
        key=lambda pair: -pair[1],
    )
    print(f"Categories: {counts}", file=sys.stderr)
    print(file=sys.stderr)

    all_foods: list[dict] = []
    seen_canonical: set[str] = set()
    failed: list[str] = []
    total_dropped = 0

    for cat in _batch_order(by_cat):
        items = by_cat[cat]
        # 'other' is too noisy + low-priority, so it gets a smaller cap (and
        # runs last, see _batch_order); Sonnet is expected to drop ~all of it.
        cap = MAX_ENTRIES_OTHER if cat == "other" else MAX_ENTRIES_PER_BATCH
        batch = items[:cap]
        prompt = USER_PROMPT_TEMPLATE.format(
            category=cat,
            n=len(batch),
            hints=COUNT_FOODS_HINTS.get(cat, "(none)"),
            json=json.dumps(batch, ensure_ascii=False),
        )
        # Make the truncation visible: rows past the cap never reach the model.
        skipped = len(items) - len(batch)
        note = f" ({skipped} over cap skipped)" if skipped else ""
        print(f"[{cat}] {len(batch)} entries{note} → ", end="",
              file=sys.stderr, flush=True)

        try:
            foods, dur = _run_batch(prompt)
        except _BatchError as e:
            print(e, file=sys.stderr)
            failed.append(cat)
            continue

        kept = _merge_foods(foods, cat, all_foods, seen_canonical)
        # Approximate: the model may also ADD foods, so clamp at zero.
        dropped = max(0, len(batch) - kept)
        total_dropped += dropped
        print(f"{kept} kept, ~{dropped} dropped ({dur:.1f}s)", file=sys.stderr)

    all_foods.sort(key=lambda f: (f["category"], f["canonical_name"]))
    print(file=sys.stderr)
    print(f"Total cleaned: {len(all_foods)} foods", file=sys.stderr)
    print(f"Total dropped: ~{total_dropped}", file=sys.stderr)
    if failed:
        print(f"Failed categories (not curated): {failed}", file=sys.stderr)

    if not all_foods:
        # Never replace an existing curated seed with an empty list.
        print(f"Nothing kept; leaving {OUT_PATH} untouched", file=sys.stderr)
        return

    OUT_PATH.write_text(
        json.dumps(all_foods, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUT_PATH}", file=sys.stderr)


if __name__ == "__main__":
    main()