From ed0894ddca1a7b86343c60043f68cecc5387c089 Mon Sep 17 00:00:00 2001 From: Kayos Date: Sat, 2 May 2026 14:12:36 -0700 Subject: [PATCH] discover: normalize source_url trailing slash before insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same recipe URL with vs without trailing slash was producing duplicate discover corpus rows because UNIQUE(source_url) is byte-exact: https://www.tasteofhome.com/recipes/falafel → id 7 https://www.tasteofhome.com/recipes/falafel/ → id 3 (manually pasted) Caught 2026-05-02 when Cobb pasted his first 4 with trailing slashes, then a follow-up listing-page extractor stripped them, producing 1:1 dupes. rstrip('/') in insert_discovered_recipe normalizes at the persistence layer so all callers get the dedup for free. Existing data manually fixed: deleted dupes 7,8,5; stripped trailing slashes off rows 3,4,6 to canonical form. Corpus now clean (4 rows). --- cauldron/db.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cauldron/db.py b/cauldron/db.py index eec29fe..8621eeb 100644 --- a/cauldron/db.py +++ b/cauldron/db.py @@ -2251,7 +2251,18 @@ class DB: ) -> int | None: """INSERT a freshly-scraped recipe in 'raw' state. Returns the new row id, or None if the source_url was already present (UNIQUE - violation = duplicate scrape, treat as skip).""" + violation = duplicate scrape, treat as skip). + + Normalizes source_url by stripping trailing slashes so that + `.../recipes/falafel` and `.../recipes/falafel/` map to the same + UNIQUE key. 2026-05-02: caught when manual `/discover` paste + included trailing slash but listing-page extractor stripped it, + producing 1:1 duplicates.""" + # URL canonicalization — single rstrip is safe for recipe paths + # (they always have a non-slash terminal segment; `https://host/` + # alone wouldn't be a valid recipe URL anyway). + if source_url.endswith("/"): + source_url = source_url.rstrip("/") with self.conn() as c, c.cursor() as cur: cur.execute( """INSERT IGNORE INTO cauldron_discovered_recipes