Fix geo-restriction: remove Newegg seller filter and handle 0-result pages
Newegg now returns 0 results for the 'Sold by Newegg' (8000) filter when accessed from a UK IP. Remove the filter from all categories so marketplace and Newegg-sold items both appear.

Also accept GDPR cookie banners, handle the 0-items case gracefully instead of crashing, and fix get_page_count to return 0 instead of asserting when no pagination is found.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
47aaa52320
commit
6ea25896fb
1 changed file with 24 additions and 9 deletions
33
crawl.py
33
crawl.py
|
|
@ -75,7 +75,11 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
|||
)
|
||||
pg = context.new_page()
|
||||
pg.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||
pg.wait_for_selector("div.item-container", timeout=60000)
|
||||
|
||||
try:
|
||||
pg.locator("button:has-text('Accept All')").click(timeout=2000)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if "areyouahuman" in pg.url:
|
||||
logger.info("bot detection triggered, simulating mouse movement...")
|
||||
|
|
@ -91,6 +95,16 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
|||
except Exception:
|
||||
logger.warning("CAPTCHA bypass failed")
|
||||
|
||||
try:
|
||||
pg.wait_for_selector("div.item-container", timeout=30000)
|
||||
except Exception:
|
||||
if pg.locator("text=We have found 0 items that match").count():
|
||||
logger.warning("geo-restricted: 0 items returned", url=url)
|
||||
html = pg.content()
|
||||
browser.close()
|
||||
return html
|
||||
raise
|
||||
|
||||
html = pg.content()
|
||||
browser.close()
|
||||
return html
|
||||
|
|
@ -99,7 +113,7 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
|||
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
|
||||
# ^ can include order=RELEASE
|
||||
|
||||
# seller = newegg: 8000
|
||||
# seller = newegg: 8000 (removed: geo-blocked for UK IPs)
|
||||
# condition = new: 4814
|
||||
# form factor = 2.5": 600003490
|
||||
# form factor = 3.5": 600003489
|
||||
|
|
@ -115,19 +129,19 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
|||
|
||||
|
||||
filter_params = [
|
||||
("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"),
|
||||
("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"),
|
||||
# ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'),
|
||||
("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"),
|
||||
("internal_35", '3.5" Internal HDD', "100167523 4814 600003489"),
|
||||
("portable_25", '2.5" Portable HDD', "100167526 4818 600003490"),
|
||||
# ('portable_35', '3.5" portable drives', '100167526 4818 600003489'),
|
||||
("external_35", '3.5" External HDD', "100167525 4818 600003489"),
|
||||
(
|
||||
"ssd_sata",
|
||||
"SATA SSD",
|
||||
"100011693 8000 4814 600038506 600038510 600038519",
|
||||
"100011693 4814 600038506 600038510 600038519",
|
||||
),
|
||||
(
|
||||
"ssd_pcie",
|
||||
"NVMe SSD",
|
||||
"100011693 8000 4814 600640786 601296941 601301243",
|
||||
"100011693 4814 600640786 601296941 601301243",
|
||||
),
|
||||
]
|
||||
|
||||
|
|
@ -178,7 +192,8 @@ def get_pages() -> None:
|
|||
def get_page_count(html: str) -> int:
|
||||
"""Get page count."""
|
||||
m = re_page.search(html)
|
||||
assert m
|
||||
if not m:
|
||||
return 0
|
||||
return int(m.group(1))
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue