Fix geo-restriction: remove Newegg seller filter and handle 0-result pages

Newegg now returns 0 results for the 'Sold by Newegg' (8000) filter when
accessed from a UK IP. Remove the filter from all categories so marketplace
and Newegg-sold items both appear. Also accept the GDPR cookie banner (click
'Accept All' when it is shown), handle the 0-items case gracefully (log a
warning and return the page HTML) instead of crashing, and fix get_page_count
to return 0 instead of asserting when no pagination is found.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-27 11:07:35 +01:00
parent 47aaa52320
commit 6ea25896fb

View file

@ -75,7 +75,11 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
) )
pg = context.new_page() pg = context.new_page()
pg.goto(url, wait_until="domcontentloaded", timeout=60000) pg.goto(url, wait_until="domcontentloaded", timeout=60000)
pg.wait_for_selector("div.item-container", timeout=60000)
try:
pg.locator("button:has-text('Accept All')").click(timeout=2000)
except Exception:
pass
if "areyouahuman" in pg.url: if "areyouahuman" in pg.url:
logger.info("bot detection triggered, simulating mouse movement...") logger.info("bot detection triggered, simulating mouse movement...")
@ -91,6 +95,16 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
except Exception: except Exception:
logger.warning("CAPTCHA bypass failed") logger.warning("CAPTCHA bypass failed")
try:
pg.wait_for_selector("div.item-container", timeout=30000)
except Exception:
if pg.locator("text=We have found 0 items that match").count():
logger.warning("geo-restricted: 0 items returned", url=url)
html = pg.content()
browser.close()
return html
raise
html = pg.content() html = pg.content()
browser.close() browser.close()
return html return html
@ -99,7 +113,7 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE # RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
# ^ can include order=RELEASE # ^ can include order=RELEASE
# seller = newegg: 8000 # seller = newegg: 8000 (removed: geo-blocked for UK IPs)
# condition = new: 4814 # condition = new: 4814
# form factor = 2.5": 600003490 # form factor = 2.5": 600003490
# form factor = 3.5": 600003489 # form factor = 3.5": 600003489
@ -115,19 +129,19 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
filter_params = [ filter_params = [
("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"), ("internal_35", '3.5" Internal HDD', "100167523 4814 600003489"),
("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"), ("portable_25", '2.5" Portable HDD', "100167526 4818 600003490"),
# ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'), # ('portable_35', '3.5" portable drives', '100167526 4818 600003489'),
("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"), ("external_35", '3.5" External HDD', "100167525 4818 600003489"),
( (
"ssd_sata", "ssd_sata",
"SATA SSD", "SATA SSD",
"100011693 8000 4814 600038506 600038510 600038519", "100011693 4814 600038506 600038510 600038519",
), ),
( (
"ssd_pcie", "ssd_pcie",
"NVMe SSD", "NVMe SSD",
"100011693 8000 4814 600640786 601296941 601301243", "100011693 4814 600640786 601296941 601301243",
), ),
] ]
@ -178,7 +192,8 @@ def get_pages() -> None:
def get_page_count(html: str) -> int: def get_page_count(html: str) -> int:
"""Get page count.""" """Get page count."""
m = re_page.search(html) m = re_page.search(html)
assert m if not m:
return 0
return int(m.group(1)) return int(m.group(1))