Switch to Playwright to bypass Newegg bot detection, closes #2

Newegg now blocks requests-based scraping; replace with Playwright
using headless Chromium with mouse simulation to pass bot detection.
Also fix hardcoded build output path, use os.makedirs for nested dirs,
update category labels (HDD/SATA SSD/NVMe SSD), drop near-empty 2.5"
internal and laptop HDD categories, and fix invalid HTML in index
template (h2 inside table cells).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-03 15:06:49 +01:00
parent 55bb3697b6
commit 2dc799ecaa
3 changed files with 62 additions and 42 deletions

View file

@ -10,6 +10,7 @@ import re
import sys
import time
import typing
import urllib.parse
from collections import defaultdict
from datetime import date
from decimal import Decimal
@ -17,15 +18,13 @@ from typing import Optional
import daiquiri
import lxml.html
import requests
from jinja2 import Environment, FileSystemLoader
from playwright.sync_api import sync_playwright
daiquiri.setup(level=logging.INFO)
logger = daiquiri.getLogger(__name__)
# user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0'
user_agent = "UniversalFeedParser/5.2.0 +http://feedparser.org/"
product_list_url = "https://www.newegg.com/Product/ProductList.aspx"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
re_page = re.compile(r"Page <strong>\d+/(\d+)</strong>")
re_size1 = re.compile(r"\b([0-9.]+) ?([TGtg])[Bb]\b(?!/s)")
@ -39,7 +38,7 @@ data_root = os.path.join(root_dir, "data")
def exists_or_create_dir(d: str) -> None:
    """Create directory *d*, including missing parents, if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        d: Path of the directory to create.

    Raises:
        FileExistsError: If *d* exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, so there
    # is no window between "check" and "create" for another process to win.
    os.makedirs(d, exist_ok=True)
def random_sleep() -> None:
@ -47,27 +46,51 @@ def random_sleep() -> None:
time.sleep(random.randint(20, 90))
def _solve_bot_challenge(pg) -> None:
    """Make a best-effort attempt to get past the "are you a human" page.

    Moves the mouse across the page in small, jittered steps, then clicks
    the reCAPTCHA checkbox and waits for the browser to leave the challenge
    URL. Failures are logged, not raised, so the caller can carry on.

    Args:
        pg: Playwright page that is currently showing the challenge.
    """
    for step in range(30):
        x = 100 + step * 30 + random.randint(-5, 5)
        y = 300 + random.randint(-20, 20)
        pg.mouse.move(x, y)
        # Short random pause so the pointer does not move at a fixed rate.
        time.sleep(0.04 + random.random() * 0.04)

    try:
        frame = pg.frame_locator('iframe[title*="reCAPTCHA"]').first
        frame.locator("#recaptcha-anchor").click(timeout=5000)
        pg.wait_for_url(lambda u: "areyouahuman" not in u, timeout=30000)
    except Exception:
        # Deliberately broad: this step is best-effort. Keep the traceback
        # in the log so a failure can be diagnosed afterwards.
        logger.warning("CAPTCHA bypass failed", exc_info=True)


def get_product_list(n: str, page: Optional[int] = None) -> str:
    """Get product list using Playwright to handle bot detection.

    Args:
        n: Value for the ``N`` query parameter (the category filter).
        page: Result page to request; omitted from the query when ``None``.

    Returns:
        The HTML of the page loaded in the browser. If the bot-detection
        challenge could not be passed, this is whatever page the browser
        ended up on.
    """
    params: dict[str, str] = {
        "N": n,
        "ActiveSearchResult": "True",
        "IsNodeId": "1",
        "Order": "RELEASE",
        "PageSize": "96",
    }
    if page is not None:
        params["page"] = str(page)

    url = "https://www.newegg.com/p/pl?" + urllib.parse.urlencode(params)
    logger.info("fetching", url=url)

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                user_agent=user_agent,
                viewport={"width": 1280, "height": 800},
            )
            # Hide the navigator.webdriver flag before any page script runs.
            context.add_init_script(
                "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
            )
            pg = context.new_page()
            pg.goto(url, wait_until="networkidle", timeout=60000)

            if "areyouahuman" in pg.url:
                logger.info("bot detection triggered, simulating mouse movement...")
                _solve_bot_challenge(pg)

            return pg.content()
        finally:
            # Close the browser even when navigation times out or raises.
            browser.close()
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
@ -89,20 +112,18 @@ def get_product_list(n: str, page: Optional[int] = None) -> requests.Response:
# Categories to scrape. Each entry is (name, label, N filter value):
# the name and label are used in log messages and the name also goes into
# saved page filenames; the third item is sent as the "N" query parameter.
# Names must be unique.
filter_params: list[tuple[str, str, str]] = [
    ("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"),
    ("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"),
    # ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'),
    ("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"),
    (
        "ssd_sata",
        "SATA SSD",
        "100011693 8000 4814 600038506 600038510 600038519",
    ),
    (
        "ssd_pcie",
        "NVMe SSD",
        "100011693 8000 4814 600640786 601296941 601301243",
    ),
]
@ -113,9 +134,9 @@ def page_filename(d: str, name: str, page: int) -> str:
return os.path.join(d, f"{name}_page{page:02d}.html")
def save_page(html: str, d: str, name: str, page: int) -> None:
    """Write one downloaded listing page to disk.

    Args:
        html: Page markup to store.
        d: Directory that receives the file.
        name: Category name, passed on to ``page_filename``.
        page: Page number, passed on to ``page_filename``.
    """
    # A context manager flushes and closes the handle deterministically
    # instead of leaving that to garbage collection.
    with open(page_filename(d, name, page), "w") as out:
        out.write(html)
def get_pages() -> None:
@ -133,10 +154,9 @@ def get_pages() -> None:
logger.info(f"get {name}", label=label, page=1)
if download:
random_sleep()
page1 = get_product_list(filter_param)
page_content = get_product_list(filter_param)
download = True
page_content = page1.text
save_page(page1, today_dir, name, 1)
save_page(page_content, today_dir, name, 1)
page_content = page_content.replace("<!-- -->", "")
page_count = get_page_count(page_content)
logger.info(f"{name} page count: {page_count}")
@ -147,9 +167,9 @@ def get_pages() -> None:
logger.info(f"get {name}", label=label, page=page_num)
if download:
random_sleep()
r = get_product_list(filter_param, page=page_num)
html = get_product_list(filter_param, page=page_num)
download = True
save_page(r, today_dir, name, page_num)
save_page(html, today_dir, name, page_num)
def get_page_count(html: str) -> int:
@ -314,7 +334,7 @@ def group_items(
def build() -> None:
"""Build."""
build_root = "/var/www/edward/docs/price_per_tb"
build_root = os.path.join(root_dir, "output")
today = date.today()
templates_dir = os.path.join(root_dir, "templates")

View file

@ -1,4 +1,4 @@
requests
playwright
lxml
jinja2
daiquiri

View file

@ -13,9 +13,9 @@ Comments welcome: edward@4angle.com
<p>Last updated: {{ today.strftime('%d %B %Y') }}.</p>
<table>
{% for cat in best %}
<tr><td colspan="4"><h2>{{ cat.label }}</h2></td></tr>
<h2>{{ cat.label }}</h2>
<table>
<tr>
<th align="right">Price<br>per TB</th>
<th align="right">Price</th>
@ -30,7 +30,7 @@ Comments welcome: edward@4angle.com
<td><a href="https://www.newegg.com/Product/Product.aspx?Item={{ hdd.number }}">{{ hdd.title }}</a></td>
</tr>
{% endfor %}
<tr><td colspan="4"><a href="{{ cat.name }}/index.html">more</a></td></tr>
{% endfor %}
</table>
<p><a href="{{ cat.name }}/index.html">more</a></p>
{% endfor %}
{% endblock %}