Lucy bind paths + LAN host pins replaced with env defaults. Repository URLs → git.sulkta.com. Audit-changelog scaffolding stripped from inline comments (technical reasoning preserved). README sheds marketing scaffolding. AI-speak in load-bearing prompts/SOULs left alone — that IS the product.
209 lines
7.5 KiB
Python
209 lines
7.5 KiB
Python
#!/usr/bin/env python3
|
||
"""Clean the USDA-derived foods seed via clawdforge → Sonnet, in batches.
|
||
|
||
Input: cauldron/data/foods_seed_usda.json (~2462 noisy rows)
|
||
Output: cauldron/data/foods_seed.json (curated, ~500-800 rows)
|
||
|
||
Why batched: 2462 entries × ~200 chars ≈ 577KB prompt; Sonnet hit timeout
on the single-shot curation. Splitting by category keeps each batch ~30-80
entries → ~50KB prompt → ~30-60s round-trip. Batches are capped at 120 raw
entries (80 for "other"); rows past the cap are not sent.
|
||
|
||
Run with:
|
||
CLAWDFORGE_TOKEN=cf_... python3 scripts/clean_foods_seed.py
|
||
"""
|
||
import json
|
||
import os
|
||
import sys
|
||
import time
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
import requests
|
||
|
||
|
||
# Two levels up from this file: the script is run as scripts/clean_foods_seed.py,
# so this is the directory that contains both scripts/ and cauldron/.
HERE = Path(__file__).parent.parent

# Noisy USDA-derived input and the curated output this script (re)generates.
RAW_PATH = HERE / "cauldron" / "data" / "foods_seed_usda.json"
OUT_PATH = HERE / "cauldron" / "data" / "foods_seed.json"

# The forge base URL is optional and falls back to the default host below.
CLAWDFORGE_URL = os.getenv("CLAWDFORGE_URL", "http://clawdforge:8800")
# The token is mandatory: a missing variable raises KeyError at import time.
CLAWDFORGE_TOKEN = os.environ["CLAWDFORGE_TOKEN"]
|
||
|
||
|
||
# Per-category seed list of count-based foods (with a typical weight each).
# The text is pasted verbatim into the user prompt as suggestions; Sonnet is
# free to add more. Categories without an entry fall back to "(none)" at the
# call site.
COUNT_FOODS_HINTS = {
    "produce-vegetable": ", ".join(
        (
            "onion (~150g)",
            "garlic clove (~5g)",
            "tomato (~120g)",
            "potato (~170g)",
            "bell pepper (~120g)",
            "jalapeno (~14g)",
            "shallot (~25g)",
            "carrot (~60g)",
            "leek (~90g)",
        )
    ),
    "produce-fruit": ", ".join(
        (
            "apple (~180g)",
            "banana (~118g)",
            "orange (~130g)",
            "lemon (~60g)",
            "lime (~65g)",
            "avocado (~200g)",
            "peach (~150g)",
        )
    ),
    "dairy": "egg (~50g large, count not mass), slice cheese (~28g)",
    "grain": "slice bread (~28g), tortilla (~50g)",
    "meat": "(no count hints — meat is sold by weight)",
    "legume": "can (~425g drained tomato/bean/etc when relevant)",
    "condiment": "can (~400g for canned tomato/coconut milk)",
    "oil-fat": "(none — sold by volume or weight)",
    "spice": "(none — pinch/dash for to-taste)",
    "baking": "(none unless slice-of-X applies)",
    "beverage": "(none — bought in bottles, treat as volume)",
    "nut-seed": "(none — sold by weight)",
    "other": "(skip count hints)",
}
|
||
|
||
|
||
SYSTEM_PROMPT = """You are a culinary database curator. You receive a small batch
|
||
of raw USDA-derived food entries (one category at a time) and produce a clean,
|
||
useful subset for a family meal-planning app.
|
||
|
||
OUTPUT: ONE valid JSON object. No markdown fences, no prose, JSON only:
|
||
|
||
{
|
||
"foods": [
|
||
{
|
||
"canonical_name": "<short singular noun, lowercase>",
|
||
"category": "<input_category>",
|
||
"density_g_per_ml": <number or null>,
|
||
"default_unit_class": "<one of: mass, volume, count, mixed>",
|
||
"common_size_g": <number or null>,
|
||
"usda_fdc_id": <int or null>,
|
||
"notes": <string or null>
|
||
}
|
||
]
|
||
}
|
||
|
||
RULES:
|
||
|
||
1. **Drop ruthlessly** — composite/prepared meals, brand-laden entries
|
||
(PILLSBURY ..., GERBER ...), babyfood, alcoholic beverages, fast food,
|
||
ready-to-eat junk, sulfured/preserved derivatives.
|
||
|
||
2. **Normalize canonical_name** to the simplest cooking form:
|
||
- "Apples, raw" → "apple"
|
||
- "Rice, white, long-grain, regular, raw" → "white rice" (or "rice")
|
||
- "Pepper, black, ground" → "black pepper"
|
||
- "Mayonnaise, reduced fat, with olive oil" → DROP (variant)
|
||
- Keep meaningful distinctions ("brown rice" vs "white rice", "salted butter" vs "unsalted butter")
|
||
|
||
3. **Preserve density_g_per_ml** from the input — don't re-derive.
|
||
|
||
4. **default_unit_class**:
|
||
- mass: dry goods sold by weight (rice, flour, sugar, meat, beans, butter)
|
||
- volume: liquids (milk, oil, juice, syrup, vinegar)
|
||
- count: discrete items (egg, onion, garlic clove, lemon, slice bread)
|
||
- mixed: bought in different forms (cheese — block vs shredded; salt — pinch vs grams)
|
||
|
||
5. **For count foods, set common_size_g** so the aggregator can convert
|
||
"2 onions + 1 cup chopped onion" sensibly.
|
||
|
||
6. **ADD common count-based foods USDA doesn't track for this category**
|
||
if they're missing. Suggested hints will be supplied per category.
|
||
|
||
7. Cap output at **80 foods per category**. Quality over quantity. Drop
|
||
variants — pick the canonical form and skip the rest.
|
||
|
||
8. JSON only. No markdown fences. No preamble."""
|
||
|
||
|
||
USER_PROMPT_TEMPLATE = """Curate the **{category}** entries below.
|
||
|
||
Input: {n} raw entries.
|
||
|
||
Suggested count-foods to add for this category if missing:
|
||
{hints}
|
||
|
||
Entries:
|
||
{json}"""
|
||
|
||
|
||
# Upper bounds on raw rows sent per prompt; rows beyond the cap are never sent.
_BATCH_CAP = 120
_BATCH_CAP_OTHER = 80  # "other" is noisy and low-priority, so it gets less room


def _category_order(categories):
    """Return category names sorted alphabetically, with "other" forced last.

    De-duplication in main() is first-wins across categories, so "other" must
    run after every specific category or it would claim (and mislabel) names
    that belong to e.g. produce or spice.
    """
    return sorted(categories, key=lambda name: (name == "other", name))


def _curate_batch(category, entries):
    """Send one category batch to clawdforge and return the curated rows.

    Args:
        category: Category name used in the prompt and for the hints lookup.
        entries: Raw food dicts to include in this batch.

    Returns:
        A ``(foods, took, problem)`` tuple. On success ``foods`` is a list of
        dicts (possibly empty) and ``problem`` is ``""``. On failure ``foods``
        is ``None`` and ``problem`` is a one-line description. ``took`` is the
        round-trip time formatted as ``"(12.3s)"``, or ``""`` when the request
        never completed.
    """
    prompt = USER_PROMPT_TEMPLATE.format(
        category=category,
        n=len(entries),
        hints=COUNT_FOODS_HINTS.get(category, "(none)"),
        json=json.dumps(entries, ensure_ascii=False),
    )

    started = time.monotonic()
    try:
        resp = requests.post(
            f"{CLAWDFORGE_URL.rstrip('/')}/run",
            headers={"Authorization": f"Bearer {CLAWDFORGE_TOKEN}"},
            json={
                "prompt": prompt,
                "system": SYSTEM_PROMPT,
                "model": "sonnet",
                "timeout_secs": 180,
            },
            # Client timeout is longer than the forge-side timeout_secs so the
            # forge gets the chance to report its own timeout first.
            timeout=210,
        )
    except requests.RequestException as exc:
        return None, "", f"transport err: {exc}"

    took = f"({time.monotonic() - started:.1f}s)"

    if resp.status_code >= 400:
        return None, took, f"HTTP {resp.status_code} {took} body={resp.text[:200]}"

    # A 2xx response with a non-JSON body (proxy error page, truncated reply)
    # must only fail this batch, not abort the run and lose earlier batches.
    try:
        body = resp.json()
    except ValueError:
        return None, took, f"non-JSON response body {took}"

    if not isinstance(body, dict) or not body.get("ok"):
        detail = body.get("error", "") if isinstance(body, dict) else ""
        return None, took, f"forge !ok {took} {detail}"

    result = body.get("result")
    if isinstance(result, str):
        # The model's answer may arrive as a JSON string rather than an object.
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            return None, took, f"non-JSON result {took}"

    foods = result.get("foods") if isinstance(result, dict) else None
    if not isinstance(foods, list):
        foods = []
    # Model output is untrusted: keep only rows that are JSON objects.
    return [food for food in foods if isinstance(food, dict)], took, ""


def main():
    """Curate the raw USDA seed category by category and write the result.

    Reads RAW_PATH, groups rows by their "category" (missing/empty becomes
    "other"), sends each group to clawdforge as one capped batch, de-duplicates
    the returned rows by lower-cased ``canonical_name`` (first occurrence
    wins), forces each row's category to the batch it came from, and writes
    the sorted list to OUT_PATH. Progress and a summary go to stderr.

    A failed batch is skipped and reported in the summary; the remaining
    categories are still processed and written.

    Raises:
        SystemExit: if batches failed and nothing at all was curated, so an
            existing OUT_PATH is not overwritten with an empty list.
    """
    # ensure_ascii=False is used on output, so both files are read and written
    # as UTF-8 explicitly instead of relying on the locale default.
    raw = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw)} raw foods", file=sys.stderr)

    # Bucket by category
    by_cat: dict[str, list[dict]] = defaultdict(list)
    for row in raw:
        by_cat[row.get("category") or "other"].append(row)

    print(f"Categories: {[(c, len(items)) for c, items in sorted(by_cat.items(), key=lambda x: -len(x[1]))]}", file=sys.stderr)
    print(file=sys.stderr)

    all_foods: list[dict] = []
    seen_canonical: set[str] = set()
    total_dropped = 0
    failed: list[str] = []

    for cat in _category_order(by_cat):
        items = by_cat[cat]
        cap = _BATCH_CAP_OTHER if cat == "other" else _BATCH_CAP
        batch = items[:cap]
        not_sent = len(items) - len(batch)

        label = f"[{cat}] {len(batch)} entries"
        if not_sent:
            # Make the truncation visible instead of silently ignoring rows.
            label += f" ({not_sent} over cap, not sent)"
        print(f"{label} → ", end="", file=sys.stderr, flush=True)

        foods, took, problem = _curate_batch(cat, batch)
        if foods is None:
            print(problem, file=sys.stderr)
            failed.append(cat)
            continue

        kept = 0
        for food in foods:
            name = food.get("canonical_name")
            if not isinstance(name, str):
                continue
            name = name.strip().lower()
            if not name or name in seen_canonical:
                continue
            food["canonical_name"] = name
            food["category"] = cat  # enforce category from outer batch
            seen_canonical.add(name)
            all_foods.append(food)
            kept += 1

        # Approximate: the model may also add rows that were not in the input.
        dropped = max(0, len(batch) - kept)
        total_dropped += dropped
        print(f"{kept} kept, ~{dropped} dropped {took}", file=sys.stderr)

    all_foods.sort(key=lambda food: (food["category"] or "", food["canonical_name"]))

    print(file=sys.stderr)
    print(f"Total cleaned: {len(all_foods)} foods (~{total_dropped} dropped)", file=sys.stderr)
    if failed:
        print(
            f"WARNING: {len(failed)} categories failed and are missing from the output: "
            f"{', '.join(failed)}",
            file=sys.stderr,
        )
        if not all_foods:
            # Every batch failed (e.g. forge unreachable): keep the old file.
            sys.exit(f"No batch succeeded; leaving {OUT_PATH} untouched")

    OUT_PATH.write_text(
        json.dumps(all_foods, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUT_PATH}", file=sys.stderr)
|
||
|
||
|
||
# Run the curation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|