Switch to Playwright to bypass Newegg bot detection, closes #2

Newegg now blocks requests-based scraping; replace with Playwright
using headless Chromium with mouse simulation to pass bot detection.
Also fix hardcoded build output path, use os.makedirs for nested dirs,
update category labels (HDD/SATA SSD/NVMe SSD), drop near-empty 2.5"
internal and laptop HDD categories, and fix invalid HTML in index
template (h2 inside table cells).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Edward Betts 2026-04-03 15:06:49 +01:00
parent 55bb3697b6
commit 2dc799ecaa
3 changed files with 62 additions and 42 deletions

View file

@ -10,6 +10,7 @@ import re
import sys import sys
import time import time
import typing import typing
import urllib.parse
from collections import defaultdict from collections import defaultdict
from datetime import date from datetime import date
from decimal import Decimal from decimal import Decimal
@ -17,15 +18,13 @@ from typing import Optional
import daiquiri import daiquiri
import lxml.html import lxml.html
import requests
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from playwright.sync_api import sync_playwright
daiquiri.setup(level=logging.INFO) daiquiri.setup(level=logging.INFO)
logger = daiquiri.getLogger(__name__) logger = daiquiri.getLogger(__name__)
# user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0' user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
user_agent = "UniversalFeedParser/5.2.0 +http://feedparser.org/"
product_list_url = "https://www.newegg.com/Product/ProductList.aspx"
re_page = re.compile(r"Page <strong>\d+/(\d+)</strong>") re_page = re.compile(r"Page <strong>\d+/(\d+)</strong>")
re_size1 = re.compile(r"\b([0-9.]+) ?([TGtg])[Bb]\b(?!/s)") re_size1 = re.compile(r"\b([0-9.]+) ?([TGtg])[Bb]\b(?!/s)")
@ -39,7 +38,7 @@ data_root = os.path.join(root_dir, "data")
def exists_or_create_dir(d: str) -> None:
    """Create directory *d*, including missing parents, if it is not already there.

    The directory is created directly and an "already exists" error is
    ignored, rather than testing for existence first; this avoids a race
    where another process creates the path between the check and the create.

    Args:
        d: Path of the directory to ensure.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            the path already existing (e.g. permissions, or a parent
            component that is not a directory).
    """
    try:
        os.makedirs(d)
    except FileExistsError:
        # Path is already present; as before, this is silently accepted.
        pass
def random_sleep() -> None: def random_sleep() -> None:
@ -47,27 +46,51 @@ def random_sleep() -> None:
time.sleep(random.randint(20, 90)) time.sleep(random.randint(20, 90))
def get_product_list(n: str, page: Optional[int] = None) -> str:
    """Fetch one Newegg product-list page with Playwright and return its HTML.

    A headless Chromium instance is launched for every call. If the browser
    ends up on a URL containing "areyouahuman", mouse movement is simulated
    and the reCAPTCHA checkbox is clicked in an attempt to get through.

    Args:
        n: Value for the ``N`` query parameter (the category/filter ids).
        page: Page number to request; omitted from the query when ``None``.

    Returns:
        The HTML content of the page that is loaded when the function
        finishes. If the CAPTCHA attempt fails, this is whatever page the
        browser is still showing; the failure is logged, not raised.

    Raises:
        Exception: Errors from launching the browser or from the initial
            navigation (for example a timeout after 60 seconds) propagate
            to the caller.
    """
    params: dict[str, str] = {
        "N": n,
        "IsNodeId": "1",
        "Order": "RELEASE",
        "PageSize": "96",
    }
    if page is not None:
        params["page"] = str(page)

    url = "https://www.newegg.com/p/pl?" + urllib.parse.urlencode(params)
    logger.info("fetching", url=url)

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            # Turn off the Blink feature that marks the browser as automated.
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                user_agent=user_agent,
                viewport={"width": 1280, "height": 800},
            )
            # Make navigator.webdriver read as undefined on every page.
            context.add_init_script(
                "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
            )
            pg = context.new_page()
            pg.goto(url, wait_until="networkidle", timeout=60000)

            if "areyouahuman" in pg.url:
                logger.info("bot detection triggered, simulating mouse movement...")
                # Sweep the pointer left to right with a little jitter.
                for step in range(30):
                    x = 100 + step * 30 + random.randint(-5, 5)
                    y = 300 + random.randint(-20, 20)
                    pg.mouse.move(x, y)
                    # 40-80 ms pause; wait_for_timeout (milliseconds) is used
                    # instead of time.sleep so Playwright keeps processing
                    # browser events while waiting.
                    pg.wait_for_timeout(40 + random.random() * 40)
                try:
                    frame = pg.frame_locator('iframe[title*="reCAPTCHA"]').first
                    frame.locator("#recaptcha-anchor").click(timeout=5000)
                    pg.wait_for_url(lambda u: "areyouahuman" not in u, timeout=30000)
                except Exception:
                    # Best effort: keep going with the current page, but
                    # record why the attempt failed.
                    logger.warning("CAPTCHA bypass failed", exc_info=True)

            return pg.content()
        finally:
            # Close the browser even when navigation or scraping raised.
            browser.close()
# RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE # RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE
@ -89,20 +112,18 @@ def get_product_list(n: str, page: Optional[int] = None) -> requests.Response:
# Newegg listings to scrape, one 3-tuple per category:
#   (name, label, N-filter)
# NOTE(review): presumably unpacked as (name, label, filter_param) by
# get_pages(), where name is used in the saved page filenames, label is
# logged, and the filter string is passed to get_product_list() as the
# ``N`` query parameter -- confirm against the caller.
filter_params = [
    ("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"),
    ("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"),
    # Disabled category, kept for reference:
    # ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'),
    ("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"),
    (
        "ssd_sata",
        "SATA SSD",
        "100011693 8000 4814 600038506 600038510 600038519",
    ),
    (
        "ssd_pcie",
        "NVMe SSD",
        "100011693 8000 4814 600640786 601296941 601301243",
    ),
]
@ -113,9 +134,9 @@ def page_filename(d: str, name: str, page: int) -> str:
return os.path.join(d, f"{name}_page{page:02d}.html") return os.path.join(d, f"{name}_page{page:02d}.html")
def save_page(html: str, d: str, name: str, page: int) -> None:
    """Write the HTML of one downloaded page to its file in directory *d*.

    The destination path comes from ``page_filename(d, name, page)``. The
    file is opened in text mode with the platform default encoding, as
    before, and is closed as soon as the write completes.

    Args:
        html: Page content to store.
        d: Directory the file is written into.
        name: Category name used in the filename.
        page: Page number used in the filename.
    """
    # The with-block guarantees the handle is flushed and closed instead of
    # relying on garbage collection of an anonymous file object.
    with open(page_filename(d, name, page), "w") as out:
        out.write(html)
def get_pages() -> None: def get_pages() -> None:
@ -133,10 +154,9 @@ def get_pages() -> None:
logger.info(f"get {name}", label=label, page=1) logger.info(f"get {name}", label=label, page=1)
if download: if download:
random_sleep() random_sleep()
page1 = get_product_list(filter_param) page_content = get_product_list(filter_param)
download = True download = True
page_content = page1.text save_page(page_content, today_dir, name, 1)
save_page(page1, today_dir, name, 1)
page_content = page_content.replace("<!-- -->", "") page_content = page_content.replace("<!-- -->", "")
page_count = get_page_count(page_content) page_count = get_page_count(page_content)
logger.info(f"{name} page count: {page_count}") logger.info(f"{name} page count: {page_count}")
@ -147,9 +167,9 @@ def get_pages() -> None:
logger.info(f"get {name}", label=label, page=page_num) logger.info(f"get {name}", label=label, page=page_num)
if download: if download:
random_sleep() random_sleep()
r = get_product_list(filter_param, page=page_num) html = get_product_list(filter_param, page=page_num)
download = True download = True
save_page(r, today_dir, name, page_num) save_page(html, today_dir, name, page_num)
def get_page_count(html: str) -> int: def get_page_count(html: str) -> int:
@ -314,7 +334,7 @@ def group_items(
def build() -> None: def build() -> None:
"""Build.""" """Build."""
build_root = "/var/www/edward/docs/price_per_tb" build_root = os.path.join(root_dir, "output")
today = date.today() today = date.today()
templates_dir = os.path.join(root_dir, "templates") templates_dir = os.path.join(root_dir, "templates")

View file

@ -1,4 +1,4 @@
requests playwright
lxml lxml
jinja2 jinja2
daiquiri daiquiri

View file

@ -13,9 +13,9 @@ Comments welcome: edward@4angle.com
<p>Last updated: {{ today.strftime('%d %B %Y') }}.<p> <p>Last updated: {{ today.strftime('%d %B %Y') }}.<p>
<table>
{% for cat in best %} {% for cat in best %}
<tr><td colspan="4"><h2>{{ cat.label }}</h2></td></tr> <h2>{{ cat.label }}</h2>
<table>
<tr> <tr>
<th align="right">Price<br>per TB</th> <th align="right">Price<br>per TB</th>
<th align="right">Price</th> <th align="right">Price</th>
@ -30,7 +30,7 @@ Comments welcome: edward@4angle.com
<td><a href="https://www.newegg.com/Product/Product.aspx?Item={{ hdd.number }}">{{ hdd.title }}</a></td> <td><a href="https://www.newegg.com/Product/Product.aspx?Item={{ hdd.number }}">{{ hdd.title }}</a></td>
</tr> </tr>
{% endfor %} {% endfor %}
<tr><td colspan="4"><a href="{{ cat.name }}/index.html">more</a></td></tr>
{% endfor %}
</table> </table>
<p><a href="{{ cat.name }}/index.html">more</a></p>
{% endfor %}
{% endblock %} {% endblock %}