From 6ea25896fb6122aa1e85fc5df450595ba9a88644 Mon Sep 17 00:00:00 2001 From: Edward Betts Date: Mon, 27 Apr 2026 11:07:35 +0100 Subject: [PATCH] Fix geo-restriction: remove Newegg seller filter and handle 0-result pages Newegg now returns 0 results for the 'Sold by Newegg' (8000) filter when accessed from a UK IP. Remove the filter from all categories so marketplace and Newegg-sold items both appear. Also accept GDPR cookie banners, handle the 0-items case gracefully instead of crashing, and fix get_page_count to return 0 instead of asserting when no pagination is found. Co-Authored-By: Claude Sonnet 4.6 --- crawl.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/crawl.py b/crawl.py index 57e0de6..e78c188 100755 --- a/crawl.py +++ b/crawl.py @@ -75,7 +75,11 @@ def get_product_list(n: str, page: Optional[int] = None) -> str: ) pg = context.new_page() pg.goto(url, wait_until="domcontentloaded", timeout=60000) - pg.wait_for_selector("div.item-container", timeout=60000) + + try: + pg.locator("button:has-text('Accept All')").click(timeout=2000) + except Exception: + pass if "areyouahuman" in pg.url: logger.info("bot detection triggered, simulating mouse movement...") @@ -91,6 +95,16 @@ def get_product_list(n: str, page: Optional[int] = None) -> str: except Exception: logger.warning("CAPTCHA bypass failed") + try: + pg.wait_for_selector("div.item-container", timeout=30000) + except Exception: + if pg.locator("text=We have found 0 items that match").count(): + logger.warning("geo-restricted: 0 items returned", url=url) + html = pg.content() + browser.close() + return html + raise + html = pg.content() browser.close() return html @@ -99,7 +113,7 @@ def get_product_list(n: str, page: Optional[int] = None) -> str: # RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE # ^ can include order=RELEASE -# seller = newegg: 8000 +# seller = newegg: 8000 (removed: geo-blocked for UK IPs) # condition = new: 4814 # form factor = 2.5": 600003490 # form factor = 3.5": 600003489 @@ -115,19 +129,19 @@ def get_product_list(n: str, page: Optional[int] = None) -> str: filter_params = [ - ("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"), - ("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"), - # ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'), - ("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"), + ("internal_35", '3.5" Internal HDD', "100167523 4814 600003489"), + ("portable_25", '2.5" Portable HDD', "100167526 4818 600003490"), + # ('portable_35', '3.5" portable drives', '100167526 4818 600003489'), + ("external_35", '3.5" External HDD', "100167525 4818 600003489"), ( "ssd_sata", "SATA SSD", - "100011693 8000 4814 600038506 600038510 600038519", + "100011693 4814 600038506 600038510 600038519", ), ( "ssd_pcie", "NVMe SSD", - "100011693 8000 4814 600640786 601296941 601301243", + "100011693 4814 600640786 601296941 601301243", ), ] @@ -178,7 +192,8 @@ def get_pages() -> None: def get_page_count(html: str) -> int: """Get page count.""" m = re_page.search(html) - assert m + if not m: + return 0 return int(m.group(1))