cauldron/scripts/clean_foods_seed.py

#!/usr/bin/env python3
"""Clean the USDA-derived foods seed via clawdforge → Sonnet, in batches.

Input:  cauldron/data/foods_seed_usda.json (~2462 noisy rows)
Output: cauldron/data/foods_seed.json     (curated, ~500-800 rows)

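Why batched: 2462 entries × 200 chars = 577KB prompt; Sonnet hit timeout
on the single-shot curation. Splitting by category keeps each batch ~30-80
entries → ~50KB prompt → ~30-60s round-trip. Categories larger than the
per-batch cap (120 entries, 80 for "other") are truncated to that cap, so
entries past the cap are never sent for curation.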

Run with:
  CLAWDFORGE_TOKEN=cf_... python3 scripts/clean_foods_seed.py
"""
import json
import os
import sys
import time
from collections import defaultdict
from pathlib import Path

import requests


# Two directory levels above this script; the run instructions invoke it as
# `scripts/clean_foods_seed.py`, so this is the directory containing scripts/.
# resolve() makes the result absolute first: without it a bare relative
# __file__ (e.g. `cd scripts && python3 clean_foods_seed.py` on Python < 3.9)
# collapses to "." and the data paths point into the wrong directory.
HERE = Path(__file__).resolve().parent.parent
# Noisy USDA-derived input and the curated output, both relative to HERE.
RAW_PATH = HERE / "cauldron/data/foods_seed_usda.json"
OUT_PATH = HERE / "cauldron/data/foods_seed.json"

# The bearer token is mandatory: a missing CLAWDFORGE_TOKEN raises KeyError at
# import time. The base URL is optional and falls back to the default below.
CLAWDFORGE_TOKEN = os.environ["CLAWDFORGE_TOKEN"]
CLAWDFORGE_URL = os.getenv("CLAWDFORGE_URL", "http://clawdforge:8800")


# Per-category seed list of count-based foods (with a typical per-item mass)
# that gets interpolated into the user prompt. Sonnet treats these as
# suggestions and is free to add more. Categories missing from this mapping
# fall back to "(none)" at the lookup site.
COUNT_FOODS_HINTS = {
    "produce-vegetable": ", ".join([
        "onion (~150g)",
        "garlic clove (~5g)",
        "tomato (~120g)",
        "potato (~170g)",
        "bell pepper (~120g)",
        "jalapeno (~14g)",
        "shallot (~25g)",
        "carrot (~60g)",
        "leek (~90g)",
    ]),
    "produce-fruit": ", ".join([
        "apple (~180g)",
        "banana (~118g)",
        "orange (~130g)",
        "lemon (~60g)",
        "lime (~65g)",
        "avocado (~200g)",
        "peach (~150g)",
    ]),
    "dairy": ", ".join([
        "egg (~50g large, count not mass)",
        "slice cheese (~28g)",
    ]),
    "grain": ", ".join([
        "slice bread (~28g)",
        "tortilla (~50g)",
    ]),
    "meat": "(no count hints — meat is sold by weight)",
    "legume": "can (~425g drained tomato/bean/etc when relevant)",
    "condiment": "can (~400g for canned tomato/coconut milk)",
    "oil-fat": "(none — sold by volume or weight)",
    "spice": "(none — pinch/dash for to-taste)",
    "baking": "(none unless slice-of-X applies)",
    "beverage": "(none — bought in bottles, treat as volume)",
    "nut-seed": "(none — sold by weight)",
    "other": "(skip count hints)",
}


# System prompt sent unchanged with every batch (the "system" field of the
# clawdforge request). It fixes the output contract — a single JSON object
# with a "foods" list — and the curation rules. The response handling in
# main() reads the "foods" key and each entry's "canonical_name", so keep
# those names in sync with this text.
SYSTEM_PROMPT = """You are a culinary database curator. You receive a small batch
of raw USDA-derived food entries (one category at a time) and produce a clean,
useful subset for a family meal-planning app.

OUTPUT: ONE valid JSON object. No markdown fences, no prose, JSON only:

{
  "foods": [
    {
      "canonical_name": "<short singular noun, lowercase>",
      "category": "<input_category>",
      "density_g_per_ml": <number or null>,
      "default_unit_class": "<one of: mass, volume, count, mixed>",
      "common_size_g": <number or null>,
      "usda_fdc_id": <int or null>,
      "notes": <string or null>
    }
  ]
}

RULES:

1. **Drop ruthlessly** — composite/prepared meals, brand-laden entries
   (PILLSBURY ..., GERBER ...), babyfood, alcoholic beverages, fast food,
   ready-to-eat junk, sulfured/preserved derivatives.

2. **Normalize canonical_name** to the simplest cooking form:
   - "Apples, raw" → "apple"
   - "Rice, white, long-grain, regular, raw" → "white rice" (or "rice")
   - "Pepper, black, ground" → "black pepper"
   - "Mayonnaise, reduced fat, with olive oil" → DROP (variant)
   - Keep meaningful distinctions ("brown rice" vs "white rice", "salted butter" vs "unsalted butter")

3. **Preserve density_g_per_ml** from the input — don't re-derive.

4. **default_unit_class**:
   - mass: dry goods sold by weight (rice, flour, sugar, meat, beans, butter)
   - volume: liquids (milk, oil, juice, syrup, vinegar)
   - count: discrete items (egg, onion, garlic clove, lemon, slice bread)
   - mixed: bought in different forms (cheese — block vs shredded; salt — pinch vs grams)

5. **For count foods, set common_size_g** so the aggregator can convert
   "2 onions + 1 cup chopped onion" sensibly.

6. **ADD common count-based foods USDA doesn't track for this category**
   if they're missing. Suggested hints will be supplied per category.

7. Cap output at **80 foods per category**. Quality over quantity. Drop
   variants — pick the canonical form and skip the rest.

8. JSON only. No markdown fences. No preamble."""


# Per-batch user prompt. Filled via str.format with four fields:
# {category}, {n} (number of entries sent), {hints} (count-food suggestions)
# and {json} (the serialized raw entries).
USER_PROMPT_TEMPLATE = "\n".join([
    "Curate the **{category}** entries below.",
    "",
    "Input: {n} raw entries.",
    "",
    "Suggested count-foods to add for this category if missing:",
    "{hints}",
    "",
    "Entries:",
    "{json}",
])


def _request_foods(prompt):
    """Send one curation prompt to clawdforge and decode the reply.

    Args:
        prompt: The per-category user prompt.

    Returns:
        A ``(foods, error, seconds)`` tuple. On success ``foods`` is the list
        found under the reply's ``"foods"`` key (an empty list when the reply
        carries no usable list) and ``error`` is None. On failure ``foods`` is
        None and ``error`` is a one-line message for the progress log.
        ``seconds`` is the elapsed request time.
    """
    t0 = time.monotonic()
    try:
        resp = requests.post(
            f"{CLAWDFORGE_URL.rstrip('/')}/run",
            headers={"Authorization": f"Bearer {CLAWDFORGE_TOKEN}"},
            json={
                "prompt": prompt,
                "system": SYSTEM_PROMPT,
                "model": "sonnet",
                "timeout_secs": 180,
            },
            # Client timeout is longer than the server-side one so the forge
            # gets the chance to report its own timeout first.
            timeout=210,
        )
    except requests.RequestException as e:
        return None, f"transport err: {e}", time.monotonic() - t0

    dur = time.monotonic() - t0

    if resp.status_code >= 400:
        return None, f"HTTP {resp.status_code} ({dur:.1f}s) body={resp.text[:200]}", dur

    # A 2xx reply is not guaranteed to be JSON (proxy error pages, truncated
    # bodies). Every JSON decode error raised by requests is a ValueError.
    try:
        body = resp.json()
    except ValueError:
        return None, f"non-JSON response ({dur:.1f}s) body={resp.text[:200]}", dur
    if not isinstance(body, dict):
        return None, f"unexpected response shape ({dur:.1f}s)", dur
    if not body.get("ok"):
        return None, f"forge !ok ({dur:.1f}s) {body.get('error', '')}", dur

    # The model output may arrive as a JSON string or as an already-decoded
    # object; normalise to the decoded form.
    result = body.get("result")
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            return None, f"non-JSON result ({dur:.1f}s)", dur

    foods = result.get("foods") if isinstance(result, dict) else None
    return (foods if isinstance(foods, list) else []), None, dur


def _merge_foods(foods, cat, seen_canonical, all_foods):
    """Append the usable, not-yet-seen entries of *foods* to *all_foods*.

    Entries that are not dicts, lack a string ``canonical_name`` or repeat a
    name already in *seen_canonical* are skipped. Kept entries get their name
    normalised (stripped, lowercased) and their category forced to *cat*.
    Both *seen_canonical* and *all_foods* are mutated in place.

    Returns:
        The number of entries kept.
    """
    kept = 0
    for food in foods:
        if not isinstance(food, dict):
            continue
        name = food.get("canonical_name")
        cn = name.strip().lower() if isinstance(name, str) else ""
        if not cn or cn in seen_canonical:
            continue
        food["canonical_name"] = cn
        food["category"] = cat   # enforce category from outer batch
        seen_canonical.add(cn)
        all_foods.append(food)
        kept += 1
    return kept


def main():
    """Curate the raw seed one category at a time and write the cleaned seed.

    Reads RAW_PATH, buckets the rows by their ``category`` (missing/empty →
    "other"), sends each bucket to clawdforge, de-duplicates the returned
    foods by canonical name across categories (first occurrence wins) and
    writes the sorted result to OUT_PATH. Progress goes to stderr. A failed
    batch is logged and skipped rather than aborting the run.

    Raises:
        SystemExit: with status 1 when at least one batch failed and no food
            was curated at all; OUT_PATH is left untouched in that case.
    """
    raw = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw)} raw foods", file=sys.stderr)

    # Bucket by category
    by_cat: dict[str, list[dict]] = defaultdict(list)
    for r in raw:
        by_cat[r.get("category") or "other"].append(r)

    print(f"Categories: {[(c, len(items)) for c, items in sorted(by_cat.items(), key=lambda x: -len(x[1]))]}", file=sys.stderr)
    print(file=sys.stderr)

    all_foods: list[dict] = []
    seen_canonical: set[str] = set()
    total_dropped = 0
    failed: list[str] = []

    # 'other' is too noisy + low-priority: process it last so that, when a
    # name shows up in several categories, dedup keeps the non-'other' entry.
    # A plain alphabetical sort would run it before produce-* and spice.
    for cat in sorted(by_cat, key=lambda c: (c == "other", c)):
        items = by_cat[cat]
        # Cap the batch to keep the prompt size sane; 'other' gets a lower cap.
        cap = 80 if cat == "other" else 120
        slice_items = items[:cap]
        not_sent = len(items) - len(slice_items)

        prompt = USER_PROMPT_TEMPLATE.format(
            category=cat,
            n=len(slice_items),
            hints=COUNT_FOODS_HINTS.get(cat, "(none)"),
            json=json.dumps(slice_items, ensure_ascii=False),
        )

        label = f"[{cat}] {len(slice_items)} entries"
        if not_sent:
            # Make the truncation visible instead of silently losing rows.
            label += f" ({not_sent} over the {cap}-entry cap not sent)"
        print(f"{label} → ", end="", file=sys.stderr, flush=True)

        foods, error, dur = _request_foods(prompt)
        if error is not None:
            print(error, file=sys.stderr)
            failed.append(cat)
            continue

        kept = _merge_foods(foods, cat, seen_canonical, all_foods)
        # Sonnet may add foods, so kept can exceed the input size; clamp at 0.
        dropped = max(0, len(slice_items) - kept)
        total_dropped += dropped
        print(f"{kept} kept, ~{dropped} dropped ({dur:.1f}s)", file=sys.stderr)

    all_foods.sort(key=lambda x: (x["category"], x["canonical_name"]))

    print(file=sys.stderr)
    print(f"Total cleaned: {len(all_foods)} foods", file=sys.stderr)
    print(f"Total dropped: ~{total_dropped} entries", file=sys.stderr)
    if failed:
        print(f"Failed categories (not curated): {failed}", file=sys.stderr)
        if not all_foods:
            # Every request failed: do not clobber an existing curated seed
            # with an empty list.
            print(f"Nothing curated; leaving {OUT_PATH} untouched", file=sys.stderr)
            sys.exit(1)

    # Output contains non-ASCII text (ensure_ascii=False), so pin the encoding
    # instead of relying on the locale default.
    OUT_PATH.write_text(
        json.dumps(all_foods, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUT_PATH}", file=sys.stderr)


# Run the curation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()