From 09d716116a928db8f510a0e87ed339c59dec5d7b Mon Sep 17 00:00:00 2001 From: Kayos Date: Fri, 1 May 2026 07:41:58 -0700 Subject: [PATCH] =?UTF-8?q?discover:=20realistic=20Chrome=20UA=20on=20fall?= =?UTF-8?q?back=20fetch=20=E2=80=94=20most=20sites=20403=20bot=20UAs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cauldron/discover_recipes.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cauldron/discover_recipes.py b/cauldron/discover_recipes.py index a58c88b..16692e2 100644 --- a/cauldron/discover_recipes.py +++ b/cauldron/discover_recipes.py @@ -157,7 +157,19 @@ def _scrape_one(url: str) -> tuple[dict, str | None] | None: resp = _rq.get( url, timeout=15, - headers={"User-Agent": "Mozilla/5.0 (cauldron-discover)"}, + headers={ + # Realistic desktop UA — many recipe sites 403 anything + # that smells like a bot. We're identifying as a normal + # browser; per-site robots.txt we still respect via + # recipe_scrapers' built-in wild_mode safety nets. + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + }, ) if resp.status_code != 200: log.warning("[discover] fetch %s -> %s", url, resp.status_code)