discover: realistic Chrome UA on fallback fetch — most sites 403 bot UAs

This commit is contained in:
Kayos 2026-05-01 07:41:58 -07:00
parent 7773b2785c
commit 09d716116a

View file

@@ -157,7 +157,19 @@ def _scrape_one(url: str) -> tuple[dict, str | None] | None:
resp = _rq.get( resp = _rq.get(
url, url,
timeout=15, timeout=15,
headers={"User-Agent": "Mozilla/5.0 (cauldron-discover)"}, headers={
# Realistic desktop UA — many recipe sites 403 anything
# that smells like a bot. We're identifying as a normal
# browser; per-site robots.txt we still respect via
# recipe_scrapers' built-in wild_mode safety nets.
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
},
) )
if resp.status_code != 200: if resp.status_code != 200:
log.warning("[discover] fetch %s -> %s", url, resp.status_code) log.warning("[discover] fetch %s -> %s", url, resp.status_code)