discover: realistic Chrome UA on fallback fetch — most sites 403 bot UAs
This commit is contained in:
parent
7773b2785c
commit
09d716116a
1 changed file with 13 additions and 1 deletion
|
|
@@ -157,7 +157,19 @@ def _scrape_one(url: str) -> tuple[dict, str | None] | None:
|
||||||
resp = _rq.get(
|
resp = _rq.get(
|
||||||
url,
|
url,
|
||||||
timeout=15,
|
timeout=15,
|
||||||
headers={"User-Agent": "Mozilla/5.0 (cauldron-discover)"},
|
headers={
|
||||||
|
# Realistic desktop UA — many recipe sites 403 anything
|
||||||
|
# that smells like a bot. We're identifying as a normal
|
||||||
|
# browser; per-site robots.txt we still respect via
|
||||||
|
# recipe_scrapers' built-in wild_mode safety nets.
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/120.0 Safari/537.36"
|
||||||
|
),
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.5",
|
||||||
|
},
|
||||||
)
|
)
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
log.warning("[discover] fetch %s -> %s", url, resp.status_code)
|
log.warning("[discover] fetch %s -> %s", url, resp.status_code)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue