Fix geo-restriction: remove Newegg seller filter and handle 0-result pages
Newegg now returns 0 results for the 'Sold by Newegg' (8000) filter when accessed from a UK IP. Remove that filter from all categories so that both marketplace and Newegg-sold items appear. Also dismiss the GDPR cookie banner (by clicking 'Accept All') when it is shown, handle the 0-items page gracefully by logging a warning and returning the page HTML instead of crashing, and make get_page_count return 0 instead of asserting when no pagination is found.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
47aaa52320
commit
6ea25896fb
1 changed file with 24 additions and 9 deletions
33
crawl.py
33
crawl.py
|
|
@ -75,7 +75,11 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
||||||
)
|
)
|
||||||
pg = context.new_page()
|
pg = context.new_page()
|
||||||
pg.goto(url, wait_until="domcontentloaded", timeout=60000)
|
pg.goto(url, wait_until="domcontentloaded", timeout=60000)
|
||||||
pg.wait_for_selector("div.item-container", timeout=60000)
|
|
||||||
|
try:
|
||||||
|
pg.locator("button:has-text('Accept All')").click(timeout=2000)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if "areyouahuman" in pg.url:
|
if "areyouahuman" in pg.url:
|
||||||
logger.info("bot detection triggered, simulating mouse movement...")
|
logger.info("bot detection triggered, simulating mouse movement...")
|
||||||
|
|
@ -91,6 +95,16 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("CAPTCHA bypass failed")
|
logger.warning("CAPTCHA bypass failed")
|
||||||
|
|
||||||
|
try:
|
||||||
|
pg.wait_for_selector("div.item-container", timeout=30000)
|
||||||
|
except Exception:
|
||||||
|
if pg.locator("text=We have found 0 items that match").count():
|
||||||
|
logger.warning("geo-restricted: 0 items returned", url=url)
|
||||||
|
html = pg.content()
|
||||||
|
browser.close()
|
||||||
|
return html
|
||||||
|
raise
|
||||||
|
|
||||||
html = pg.content()
|
html = pg.content()
|
||||||
browser.close()
|
browser.close()
|
||||||
return html
|
return html
|
||||||
|
|
@ -99,7 +113,7 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
||||||
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
|
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
|
||||||
# ^ can include order=RELEASE
|
# ^ can include order=RELEASE
|
||||||
|
|
||||||
# seller = newegg: 8000
|
# seller = newegg: 8000 (removed: geo-blocked for UK IPs)
|
||||||
# condition = new: 4814
|
# condition = new: 4814
|
||||||
# form factor = 2.5": 600003490
|
# form factor = 2.5": 600003490
|
||||||
# form factor = 3.5": 600003489
|
# form factor = 3.5": 600003489
|
||||||
|
|
@ -115,19 +129,19 @@ def get_product_list(n: str, page: Optional[int] = None) -> str:
|
||||||
|
|
||||||
|
|
||||||
filter_params = [
|
filter_params = [
|
||||||
("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"),
|
("internal_35", '3.5" Internal HDD', "100167523 4814 600003489"),
|
||||||
("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"),
|
("portable_25", '2.5" Portable HDD', "100167526 4818 600003490"),
|
||||||
# ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'),
|
# ('portable_35', '3.5" portable drives', '100167526 4818 600003489'),
|
||||||
("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"),
|
("external_35", '3.5" External HDD', "100167525 4818 600003489"),
|
||||||
(
|
(
|
||||||
"ssd_sata",
|
"ssd_sata",
|
||||||
"SATA SSD",
|
"SATA SSD",
|
||||||
"100011693 8000 4814 600038506 600038510 600038519",
|
"100011693 4814 600038506 600038510 600038519",
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"ssd_pcie",
|
"ssd_pcie",
|
||||||
"NVMe SSD",
|
"NVMe SSD",
|
||||||
"100011693 8000 4814 600640786 601296941 601301243",
|
"100011693 4814 600640786 601296941 601301243",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -178,7 +192,8 @@ def get_pages() -> None:
|
||||||
def get_page_count(html: str) -> int:
|
def get_page_count(html: str) -> int:
|
||||||
"""Get page count."""
|
"""Get page count."""
|
||||||
m = re_page.search(html)
|
m = re_page.search(html)
|
||||||
assert m
|
if not m:
|
||||||
|
return 0
|
||||||
return int(m.group(1))
|
return int(m.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue