# cauldron/scripts/build_foods_seed.py

#!/usr/bin/env python3
"""Build cauldron's foods_seed.json from USDA SR Legacy.

Usage:
  python scripts/build_foods_seed.py <usda-sr-legacy.json> > cauldron/data/foods_seed.json

Steps:
  1. Load SR Legacy JSON dump
  2. For each food, extract foodPortions and derive density g/ml from
     volume measurements (cup/tbsp/tsp/fl oz/ml/etc)
  3. Average densities across multiple portions of the same food
  4. Filter out non-cooking junk (branded items, weird stuff)
  5. Normalize description into a canonical_name (strip ", raw" suffixes,
     parenthetical brand names, etc.)
  6. Categorize using simple keyword heuristics
  7. Emit JSON ready for the cauldron_foods loader
"""
import json
import re
import sys
from collections import defaultdict


# Millilitres in one of each volume unit, keyed by the lowercase spelling
# we expect to see in portion data. Aliases share the value of their
# long form.
VOL_TO_ML = {
    'cup': 236.588,
    'tablespoon': 14.787,
    'tbsp': 14.787,
    'teaspoon': 4.929,
    'tsp': 4.929,
    'fl oz': 29.574,
    'fluid ounce': 29.574,
    'fluid ounces': 29.574,
    'ml': 1.0,
    'milliliter': 1.0,
    'liter': 1000.0,
    'l': 1000.0,
    'pint': 473.176,
    'quart': 946.353,
    'gallon': 3785.41,
}

# A food is discarded outright when its lowercased description begins
# with any of these strings. Kept as a tuple so it can be handed
# straight to str.startswith().
DROP_PREFIXES = (
    # baby / infant products
    "babyfood", "infant formula",
    # drinks and snack foods
    "alcoholic beverage", "snacks,",
    # food-service entries
    "fast food", "restaurant", "school lunch",
    # sweets
    "puddings,", "frostings,", "candies,",
    # technically baking ingredients, but the USDA names are too odd to use
    "leavening agents,",
)

# A food is discarded when any of these appears anywhere in its
# description. Entries are written in mixed case; the consumer is
# expected to lowercase them before comparing.
DROP_KEYWORDS = [
    # food-service wording
    "fast food",
    "restaurant",
    "school lunch",
    # chain names
    "MCDONALDS",
    "BURGER KING",
    "KFC",
    "PIZZA HUT",
    "STARBUCKS",
    "SUBWAY",
    "TACO BELL",
    "WENDY'S",
    "DOMINOS",
    "PAPA JOHN",
    "CHICK-FIL-A",
    "POPEYES",
    "CHIPOTLE",
    "DENNY'S",
    # supplements and ready-made products
    "supplement",
    "weight control",
    "ready-to-drink",
    "ready to drink",
    "ready-to-eat",
    "muscle milk",
    "ENSURE",
    "BOOST",
    "nutrition bar",
    "meal replacement",
    "fortified",
    "sulfured",
    "dry mix",
    "frozen meal",
    "frozen dinner",
    # packaged-food brands
    "baby formula",
    "GERBER",
    "PILLSBURY",
    "KELLOGG",
    "QUAKER",
    "GENERAL MILLS",
    "POST",
    "BETTY CROCKER",
    # instant and baby-food wording
    "instant",
    "junior",
    "strained",
    "toddler",
    # yield / derived-from-raw rows
    "(yield from",
    "from raw",
]

# Brand-like ALL-CAPS tokens to strip from description.
# Matches a run of 3+ capital letters, optionally followed by further
# whitespace-separated all-caps words of 2+ letters, so a multi-word name
# such as "BURGER KING" is removed as one span. The pattern is compiled
# without re.I, so lowercase and Capitalized words never match.
BRAND_PATTERN = re.compile(r'\b[A-Z]{3,}(\s+[A-Z]{3,}|\s+[A-Z]{2,})*\b')

CATEGORY_MAP = [
    # (keyword, cauldron category)
    #
    # categorize() lowercases the name and tests each keyword as a plain
    # substring; the LONGEST matching keyword wins, and list order only
    # breaks ties between keywords of equal length. To stop a short keyword
    # from hijacking an unrelated word ("egg" in "eggplant", "nut" in
    # "nutmeg"), add the longer word with its correct category.
    ("oil", "oil-fat"),
    ("butter", "oil-fat"),
    ("lard", "oil-fat"),
    ("shortening", "oil-fat"),
    ("flour", "baking"),
    ("sugar", "baking"),
    ("yeast", "baking"),
    ("baking powder", "baking"),
    ("baking soda", "baking"),
    ("vanilla", "baking"),
    ("cocoa", "baking"),
    ("chocolate", "baking"),
    ("salt", "spice"),
    ("pepper", "spice"),
    ("cinnamon", "spice"),
    ("paprika", "spice"),
    ("oregano", "spice"),
    ("basil", "spice"),
    ("thyme", "spice"),
    ("rosemary", "spice"),
    ("garlic powder", "spice"),
    ("onion powder", "spice"),
    ("cumin", "spice"),
    ("turmeric", "spice"),
    ("ginger", "spice"),
    ("nutmeg", "spice"),  # outranks the shorter "nut"
    ("milk", "dairy"),
    ("cream", "dairy"),
    ("yogurt", "dairy"),
    ("cheese", "dairy"),
    ("rice", "grain"),
    ("pasta", "grain"),
    ("noodle", "grain"),
    ("bread", "grain"),
    ("oats", "grain"),
    ("oatmeal", "grain"),
    ("quinoa", "grain"),
    ("barley", "grain"),
    ("couscous", "grain"),
    ("beans", "legume"),
    ("lentil", "legume"),
    ("chickpea", "legume"),
    ("garbanzo", "legume"),
    ("tofu", "legume"),
    ("tempeh", "legume"),
    ("almond", "nut-seed"),
    ("walnut", "nut-seed"),
    ("pecan", "nut-seed"),
    ("cashew", "nut-seed"),
    ("peanut", "nut-seed"),
    ("pistachio", "nut-seed"),
    ("hazelnut", "nut-seed"),
    ("seed", "nut-seed"),
    ("nut", "nut-seed"),
    ("beef", "meat"),
    ("pork", "meat"),
    ("chicken", "meat"),
    ("turkey", "meat"),
    ("lamb", "meat"),
    ("ham", "meat"),
    ("bacon", "meat"),
    ("sausage", "meat"),
    ("fish", "meat"),
    ("salmon", "meat"),
    ("tuna", "meat"),
    ("shrimp", "meat"),
    ("egg", "dairy"),  # close enough
    ("juice", "beverage"),
    ("water", "beverage"),
    ("tea", "beverage"),
    ("coffee", "beverage"),
    ("beer", "beverage"),
    ("wine", "beverage"),
    ("alcoholic", "beverage"),
    ("soda", "beverage"),
    ("vinegar", "condiment"),
    ("sauce", "condiment"),
    ("ketchup", "condiment"),
    ("mustard", "condiment"),
    ("mayonnaise", "condiment"),
    ("soy sauce", "condiment"),
    ("dressing", "condiment"),
    ("syrup", "condiment"),
    ("honey", "condiment"),
    ("jam", "condiment"),
    ("jelly", "condiment"),
    ("apple", "produce-fruit"),
    ("banana", "produce-fruit"),
    ("orange", "produce-fruit"),
    ("strawberry", "produce-fruit"),
    ("blueberry", "produce-fruit"),
    ("raspberry", "produce-fruit"),
    ("grape", "produce-fruit"),
    ("lemon", "produce-fruit"),
    ("lime", "produce-fruit"),
    ("pineapple", "produce-fruit"),
    ("mango", "produce-fruit"),
    ("watermelon", "produce-fruit"),
    ("cherry", "produce-fruit"),
    ("peach", "produce-fruit"),
    ("pear", "produce-fruit"),
    ("avocado", "produce-fruit"),
    ("tomato", "produce-vegetable"),  # we know
    ("onion", "produce-vegetable"),
    ("garlic", "produce-vegetable"),
    ("carrot", "produce-vegetable"),
    ("potato", "produce-vegetable"),
    ("spinach", "produce-vegetable"),
    ("lettuce", "produce-vegetable"),
    ("kale", "produce-vegetable"),
    ("broccoli", "produce-vegetable"),
    ("cauliflower", "produce-vegetable"),
    ("celery", "produce-vegetable"),
    ("cucumber", "produce-vegetable"),
    ("zucchini", "produce-vegetable"),
    ("eggplant", "produce-vegetable"),  # outranks the shorter "egg"
    ("pepper, sweet", "produce-vegetable"),
    ("pepper, bell", "produce-vegetable"),
    # The two singular forms above never match a plural "Peppers, sweet, ..."
    # description or a reordered "sweet peppers" name, so spell those out.
    ("peppers, sweet", "produce-vegetable"),
    ("peppers, bell", "produce-vegetable"),
    ("sweet pepper", "produce-vegetable"),
    ("bell pepper", "produce-vegetable"),
    ("mushroom", "produce-vegetable"),
    ("squash", "produce-vegetable"),
    ("pumpkin", "produce-vegetable"),
    ("cabbage", "produce-vegetable"),
]


def categorize(name: str) -> str:
    """Return the cauldron category for a food name.

    Every CATEGORY_MAP keyword is tested as a case-insensitive substring of
    *name*. The longest matching keyword wins, so 'soy sauce' beats 'sauce'
    and 'sweet pepper' beats 'pepper'; among keywords of equal length the
    one listed first wins.

    Args:
        name: Food name, typically the normalized canonical name. ``None``
            or an empty string is treated as "no match".

    Returns:
        The matched category string, or ``"other"`` when nothing matches.
    """
    n = (name or "").lower()
    best_cat, best_len = None, 0
    for kw, cat in CATEGORY_MAP:
        # Check the cheap length test first; strict '>' keeps the earliest
        # entry on ties.
        if len(kw) > best_len and kw in n:
            best_cat, best_len = cat, len(kw)
    return best_cat or "other"


# Preparation descriptors that do not matter for shopping. The lookahead
# requires the descriptor to be a complete comma-separated segment but does
# NOT consume the following comma, so (a) several adjacent descriptors are
# all removed ("Egg, whole, raw, fresh" -> "Egg") and (b) the segment after
# a removed descriptor stays comma-separated instead of being glued on.
_PREP_SUFFIX_RE = re.compile(
    r',\s*(?:raw|cooked, boiled|cooked, drained|prepared|whole|ground|fresh'
    r'|dried|granulated|all)(?=\s*(?:,|$))',
    re.I,
)

# Names starting with one of these keep the USDA segment order.
_NO_REORDER_PREFIXES = ('alcoholic', 'beverage', 'soup', 'sauce')


def normalize_name(desc: str) -> str:
    """Pull a canonical, lowercase name out of a verbose USDA description.

    Steps, in order:
      1. Drop preparation descriptors (", raw", ", ground", ...) wherever
         they form a whole comma-separated segment.
      2. Drop brand-like ALL-CAPS tokens (BRAND_PATTERN) and parentheticals.
      3. Collapse whitespace and discard empty comma segments left behind
         by step 2.
      4. If exactly two segments remain, the second is at most 25
         characters, and the name does not start with an excluded prefix,
         swap them: "Pepper, black" -> "black pepper".

    Args:
        desc: The USDA ``description`` string.

    Returns:
        The normalized name in lowercase; an empty string when nothing is
        left after stripping.
    """
    s = _PREP_SUFFIX_RE.sub('', desc)
    # Drop branded all-caps tokens
    s = BRAND_PATTERN.sub('', s)
    # Drop parentheticals
    s = re.sub(r'\([^)]*\)', '', s)
    # Tidy whitespace and stray leading/trailing separators
    s = re.sub(r'\s+', ' ', s).strip(', ').strip()

    # Rebuild from non-empty segments so a removed brand or parenthetical
    # cannot leave ", ," or " ," artifacts in the result.
    parts = [p.strip() for p in s.split(',')]
    parts = [p for p in parts if p]
    keeps_order = s.lower().startswith(_NO_REORDER_PREFIXES)
    if len(parts) == 2 and len(parts[1]) <= 25 and not keeps_order:
        # "X, Y" -> "Y X" for spices/seasonings
        s = f"{parts[1]} {parts[0]}"
    else:
        s = ', '.join(parts)
    return s.lower().strip()


_MODIFIER_VOL = re.compile(
    r'^(?:[\d./\s]*\s*)?(cup|tablespoon|tbsp|teaspoon|tsp|fl oz|fluid ounce|fluid ounces|ml|milliliter|liter|pint|quart|gallon)\b',
    re.I,
)
_MODIFIER_NORMALIZE = {
    'tbsp': 'tablespoon',
    'tsp': 'teaspoon',
    'fluid ounce': 'fl oz',
    'fluid ounces': 'fl oz',
    'milliliter': 'ml',
    'liter': 'liter',
}


def _modifier_to_unit(modifier: str) -> str | None:
    """Pull a known volume unit out of a USDA modifier string. Handles
    'cup', 'cup (8 fl oz)', 'cup, chopped', 'tablespoon', etc."""
    m = _MODIFIER_VOL.match((modifier or '').strip().lower())
    if not m:
        return None
    raw = m.group(1).lower()
    return _MODIFIER_NORMALIZE.get(raw, raw)


def derive_densities(food: dict) -> list[float]:
    """Collect one g/ml density per usable volume portion of *food*.

    For each entry in ``food['foodPortions']`` the unit is taken from
    ``measureUnit.name`` when that is a known volume unit, otherwise it is
    parsed out of ``modifier`` (which tolerates trailing text such as
    'cup, chopped' or 'cup (8 fl oz)'). Portions with no positive
    ``gramWeight`` or no recognizable volume unit are skipped, as are
    densities outside the open interval (0.1, 3.0). A missing or zero
    ``amount`` counts as 1.
    """
    densities = []
    for portion in food.get('foodPortions') or []:
        grams = portion.get('gramWeight')
        if not grams or grams <= 0:
            continue

        measure = portion.get('measureUnit') or {}
        unit = (measure.get('name') or '').lower().strip()
        if unit not in VOL_TO_ML:
            # Fall back to the free-text modifier.
            unit = _modifier_to_unit(portion.get('modifier') or '')
        if unit not in VOL_TO_ML:
            continue

        quantity = portion.get('amount') or 1
        millilitres = VOL_TO_ML[unit] * quantity
        if millilitres > 0:
            ratio = grams / millilitres
            # Discard values that cannot plausibly be a food density.
            if 0.1 < ratio < 3.0:
                densities.append(ratio)
    return densities


def main():
    """Build the seed list from the USDA dump named in ``sys.argv[1]``.

    Reads the JSON file, keeps foods that survive the DROP_PREFIXES /
    DROP_KEYWORDS filters and yield at least one density, normalizes each
    description to a canonical name, and merges foods that share a
    canonical name by averaging their per-food densities. The resulting
    list, sorted by canonical name, is written as JSON to stdout; a
    one-line food count goes to stderr.

    Exits with status 2 and a usage message when no input path is given.
    """
    if len(sys.argv) < 2:
        print(
            'usage: build_foods_seed.py <usda-sr-legacy.json> > foods_seed.json',
            file=sys.stderr,
        )
        sys.exit(2)
    src = sys.argv[1]
    # Read as UTF-8 explicitly rather than relying on the platform default
    # encoding.
    with open(src, encoding='utf-8') as fh:
        data = json.load(fh)
    foods = data.get('SRLegacyFoods') or data.get('FoundationFoods') or []

    # Loop-invariant: prepare the filters once, not once per food.
    drop_prefixes = tuple(DROP_PREFIXES)
    drop_keywords = [kw.lower() for kw in DROP_KEYWORDS]

    seen_canonical = {}
    for food in foods:
        desc = food.get('description') or ''
        if not desc:
            continue
        d_low = desc.lower()
        # Drop junk by prefix
        if d_low.startswith(drop_prefixes):
            continue
        # Drop junk by substring
        if any(kw in d_low for kw in drop_keywords):
            continue
        densities = derive_densities(food)
        if not densities:
            continue
        avg = round(sum(densities) / len(densities), 3)

        canonical = normalize_name(desc)
        if not canonical or len(canonical) > 80:
            continue

        existing = seen_canonical.get(canonical)
        if existing is not None:
            # Same canonical name seen before: keep the first food's
            # metadata and just record this food's density for averaging.
            existing['density_samples'].append(avg)
            continue
        seen_canonical[canonical] = {
            'canonical_name': canonical,
            'category': categorize(canonical),
            'density_g_per_ml': avg,
            'default_unit_class': 'volume' if avg < 1.05 else 'mass',
            'usda_fdc_id': food.get('fdcId'),
            'usda_description': desc,
            'density_samples': [avg],
        }

    # Finalize: average the samples, derive the unit class from the FINAL
    # density (so it cannot go stale after a merge), and drop the working
    # sample list before serializing. Assigning to existing keys keeps the
    # output key order unchanged.
    final = []
    for entry in seen_canonical.values():
        samples = entry.pop('density_samples')
        density = round(sum(samples) / len(samples), 3)
        entry['density_g_per_ml'] = density
        entry['default_unit_class'] = 'volume' if density < 1.05 else 'mass'
        final.append(entry)

    final.sort(key=lambda x: x['canonical_name'])
    json.dump(final, sys.stdout, indent=2, ensure_ascii=False)
    print(f'\n# {len(final)} foods', file=sys.stderr)


# Run the build only when executed as a script, not when imported.
if __name__ == '__main__':
    main()