cauldron/scripts/clean_foods_seed.py
Cobb Hayes 592b4f1161 Public-flip audit: env-driven paths, scrub audit-ticket prefixes, terser README
Lucy bind paths + LAN host pins replaced with env defaults. Repository URLs
→ git.sulkta.com. Audit-changelog scaffolding stripped from inline comments
(technical reasoning preserved). README sheds marketing scaffolding. AI-speak
in load-bearing prompts/SOULs left alone — that IS the product.
2026-05-27 11:42:56 -07:00

209 lines
7.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Clean the USDA-derived foods seed via clawdforge → Sonnet, in batches.
Input: cauldron/data/foods_seed_usda.json (~2462 noisy rows)
Output: cauldron/data/foods_seed.json (curated, ~500-800 rows)
Why batched: 2462 entries × 200 chars = 577KB prompt; Sonnet hit timeout
on the single-shot curation. Splitting by category keeps each batch ~30-80
entries → ~50KB prompt → ~30-60s round-trip.
Run with:
CLAWDFORGE_TOKEN=cf_... python3 scripts/clean_foods_seed.py
"""
import json
import os
import sys
import time
from collections import defaultdict
from pathlib import Path
import requests
# Repository root: this script lives one directory below it (scripts/).
HERE = Path(__file__).parent.parent

# Noisy USDA-derived input and the curated output this script (re)writes.
RAW_PATH = HERE / "cauldron" / "data" / "foods_seed_usda.json"
OUT_PATH = HERE / "cauldron" / "data" / "foods_seed.json"

# clawdforge endpoint; the URL has a default, the token is mandatory
# (a missing CLAWDFORGE_TOKEN raises KeyError at import time).
CLAWDFORGE_URL = os.getenv("CLAWDFORGE_URL", "http://clawdforge:8800")
CLAWDFORGE_TOKEN = os.environ["CLAWDFORGE_TOKEN"]
# Per-category suggestions of count-based foods (with typical unit weights).
# They are offered to Sonnet as seeds; it is free to add others.
COUNT_FOODS_HINTS = {
    "produce-vegetable": (
        "onion (~150g), garlic clove (~5g), tomato (~120g), "
        "potato (~170g), bell pepper (~120g), jalapeno (~14g), "
        "shallot (~25g), carrot (~60g), leek (~90g)"
    ),
    "produce-fruit": (
        "apple (~180g), banana (~118g), orange (~130g), "
        "lemon (~60g), lime (~65g), avocado (~200g), peach (~150g)"
    ),
    "dairy": "egg (~50g large, count not mass), slice cheese (~28g)",
    "grain": "slice bread (~28g), tortilla (~50g)",
    "meat": "(no count hints — meat is sold by weight)",
    "legume": "can (~425g drained tomato/bean/etc when relevant)",
    "condiment": "can (~400g for canned tomato/coconut milk)",
    "oil-fat": "(none — sold by volume or weight)",
    "spice": "(none — pinch/dash for to-taste)",
    "baking": "(none unless slice-of-X applies)",
    "beverage": "(none — bought in bottles, treat as volume)",
    "nut-seed": "(none — sold by weight)",
    "other": "(skip count hints)",
}
# System prompt sent with every per-category curation request. It pins the
# output JSON schema and the drop/normalize/unit-class rules the model must
# follow. The normalization examples use "<raw name>" → "<canonical name>";
# keep the arrow so the raw and curated names do not run together.
SYSTEM_PROMPT = """You are a culinary database curator. You receive a small batch
of raw USDA-derived food entries (one category at a time) and produce a clean,
useful subset for a family meal-planning app.
OUTPUT: ONE valid JSON object. No markdown fences, no prose, JSON only:
{
"foods": [
{
"canonical_name": "<short singular noun, lowercase>",
"category": "<input_category>",
"density_g_per_ml": <number or null>,
"default_unit_class": "<one of: mass, volume, count, mixed>",
"common_size_g": <number or null>,
"usda_fdc_id": <int or null>,
"notes": <string or null>
}
]
}
RULES:
1. **Drop ruthlessly** — composite/prepared meals, brand-laden entries
(PILLSBURY ..., GERBER ...), babyfood, alcoholic beverages, fast food,
ready-to-eat junk, sulfured/preserved derivatives.
2. **Normalize canonical_name** to the simplest cooking form:
- "Apples, raw" → "apple"
- "Rice, white, long-grain, regular, raw" → "white rice" (or "rice")
- "Pepper, black, ground" → "black pepper"
- "Mayonnaise, reduced fat, with olive oil" → DROP (variant)
- Keep meaningful distinctions ("brown rice" vs "white rice", "salted butter" vs "unsalted butter")
3. **Preserve density_g_per_ml** from the input — don't re-derive.
4. **default_unit_class**:
- mass: dry goods sold by weight (rice, flour, sugar, meat, beans, butter)
- volume: liquids (milk, oil, juice, syrup, vinegar)
- count: discrete items (egg, onion, garlic clove, lemon, slice bread)
- mixed: bought in different forms (cheese — block vs shredded; salt — pinch vs grams)
5. **For count foods, set common_size_g** so the aggregator can convert
"2 onions + 1 cup chopped onion" sensibly.
6. **ADD common count-based foods USDA doesn't track for this category**
if they're missing. Suggested hints will be supplied per category.
7. Cap output at **80 foods per category**. Quality over quantity. Drop
variants — pick the canonical form and skip the rest.
8. JSON only. No markdown fences. No preamble."""
# Per-batch user prompt, filled in with str.format(). Placeholders:
#   {category} - the category being curated
#   {n}        - number of raw entries included in this batch
#   {hints}    - suggested count-foods text for the category
#   {json}     - the batch entries serialized as JSON
USER_PROMPT_TEMPLATE = """Curate the **{category}** entries below.
Input: {n} raw entries.
Suggested count-foods to add for this category if missing:
{hints}
Entries:
{json}"""
def _parse_foods(resp, dur):
    """Pull the curated ``foods`` list out of one clawdforge ``/run`` response.

    Args:
        resp: The ``requests`` response object returned for the batch.
        dur: Round-trip time in seconds; only used in failure messages.

    Returns:
        A ``(foods, error)`` tuple. On success ``error`` is ``None`` and
        ``foods`` is a list (possibly empty). On failure ``foods`` is ``None``
        and ``error`` is the message to report for the batch.
    """
    if resp.status_code >= 400:
        return None, f"HTTP {resp.status_code} ({dur:.1f}s) body={resp.text[:200]}"
    try:
        body = resp.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass
        # (e.g. an HTML error page); fail this batch, not the whole run.
        return None, f"non-JSON response ({dur:.1f}s) body={resp.text[:200]}"
    if not isinstance(body, dict) or not body.get("ok"):
        err = body.get("error", "") if isinstance(body, dict) else ""
        return None, f"forge !ok ({dur:.1f}s) {err}"
    result = body.get("result")
    if isinstance(result, str):
        # The model's answer may arrive as a JSON string rather than an object.
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            return None, f"non-JSON result ({dur:.1f}s)"
    foods = result.get("foods") if isinstance(result, dict) else None
    # Anything that is not a list (missing key, null, wrong type) means
    # "nothing curated" rather than a crash while iterating it.
    return (foods if isinstance(foods, list) else []), None


def _merge_foods(foods, cat, seen, out):
    """Append new, well-formed foods to ``out`` and return how many were kept.

    Args:
        foods: Curated entries returned for one category.
        cat: Category of the batch; overwrites whatever the model returned.
        seen: Canonical names already accepted (updated in place).
        out: Accumulated output list (updated in place).

    Returns:
        Number of entries from ``foods`` that were appended to ``out``.
    """
    kept = 0
    for food in foods:
        # Model output is untrusted: skip malformed entries instead of crashing.
        if not isinstance(food, dict):
            continue
        name = food.get("canonical_name")
        if not isinstance(name, str):
            continue
        cn = name.strip().lower()
        if not cn or cn in seen:
            continue
        food["canonical_name"] = cn
        food["category"] = cat  # enforce category from outer batch
        seen.add(cn)
        out.append(food)
        kept += 1
    return kept


def main():
    """Curate the raw USDA seed one category at a time and write the result.

    Reads ``RAW_PATH``, groups rows by ``category`` (missing/empty -> "other"),
    sends each group to clawdforge as its own request, de-duplicates the
    returned foods by canonical name (first category to claim a name wins) and
    writes the sorted list to ``OUT_PATH``. Progress goes to stderr. A failed
    batch is reported and skipped; the output is still written with whatever
    was curated, and the failed categories are listed in the summary.
    """
    # Explicit UTF-8: the seed is written with ensure_ascii=False, so relying
    # on the locale's default encoding can fail or garble non-ASCII names.
    raw = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw)} raw foods", file=sys.stderr)

    # Bucket by category
    by_cat = defaultdict(list)
    for row in raw:
        by_cat[row.get("category") or "other"].append(row)
    print(f"Categories: {[(c, len(items)) for c, items in sorted(by_cat.items(), key=lambda x: -len(x[1]))]}", file=sys.stderr)
    print(file=sys.stderr)

    all_foods = []
    seen_canonical = set()
    total_dropped = 0
    failed = []

    # 'other' is noisy and low-priority: process it last so it cannot claim a
    # canonical name ahead of a real category during de-duplication.
    for cat in sorted(by_cat, key=lambda c: (c == "other", c)):
        items = by_cat[cat]
        # Cap the batch to keep the prompt size sane (tighter for 'other').
        cap = 80 if cat == "other" else 120
        slice_items = items[:cap]
        if len(items) > cap:
            print(
                f"[{cat}] {len(items)} entries exceed the batch cap; "
                f"sending the first {cap}",
                file=sys.stderr,
            )
        prompt = USER_PROMPT_TEMPLATE.format(
            category=cat,
            n=len(slice_items),
            hints=COUNT_FOODS_HINTS.get(cat, "(none)"),
            json=json.dumps(slice_items, ensure_ascii=False),
        )
        print(f"[{cat}] {len(slice_items)} entries → ", end="", file=sys.stderr, flush=True)
        t0 = time.monotonic()
        try:
            r = requests.post(
                f"{CLAWDFORGE_URL.rstrip('/')}/run",
                headers={"Authorization": f"Bearer {CLAWDFORGE_TOKEN}"},
                json={
                    "prompt": prompt,
                    "system": SYSTEM_PROMPT,
                    "model": "sonnet",
                    "timeout_secs": 180,
                },
                # Client timeout sits above the server-side timeout_secs.
                timeout=210,
            )
        except requests.RequestException as e:
            print(f"transport err: {e}", file=sys.stderr)
            failed.append(cat)
            continue
        dur = time.monotonic() - t0

        foods, error = _parse_foods(r, dur)
        if error is not None:
            print(error, file=sys.stderr)
            failed.append(cat)
            continue

        kept = _merge_foods(foods, cat, seen_canonical, all_foods)
        # Approximate: the model may also add foods that were not in the input.
        dropped = max(0, len(slice_items) - kept)
        total_dropped += dropped
        print(f"{kept} kept, ~{dropped} dropped ({dur:.1f}s)", file=sys.stderr)

    all_foods.sort(key=lambda x: (x["category"] or "", x["canonical_name"]))
    print(file=sys.stderr)
    print(f"Total cleaned: {len(all_foods)} foods", file=sys.stderr)
    print(f"Total dropped: ~{total_dropped}", file=sys.stderr)
    if failed:
        # Make partial output obvious before the seed file is overwritten.
        print(f"Failed categories (missing from output): {failed}", file=sys.stderr)
    OUT_PATH.write_text(
        json.dumps(all_foods, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUT_PATH}", file=sys.stderr)


if __name__ == "__main__":
    main()