diff --git a/crawl.py b/crawl.py index c057eff..236537f 100755 --- a/crawl.py +++ b/crawl.py @@ -10,6 +10,7 @@ import re import sys import time import typing +import urllib.parse from collections import defaultdict from datetime import date from decimal import Decimal @@ -17,15 +18,13 @@ from typing import Optional import daiquiri import lxml.html -import requests from jinja2 import Environment, FileSystemLoader +from playwright.sync_api import sync_playwright daiquiri.setup(level=logging.INFO) logger = daiquiri.getLogger(__name__) -# user_agent = 'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0' -user_agent = "UniversalFeedParser/5.2.0 +http://feedparser.org/" -product_list_url = "https://www.newegg.com/Product/ProductList.aspx" +user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" re_page = re.compile(r"Page \d+/(\d+)") re_size1 = re.compile(r"\b([0-9.]+) ?([TGtg])[Bb]\b(?!/s)") @@ -39,7 +38,7 @@ data_root = os.path.join(root_dir, "data") def exists_or_create_dir(d: str) -> None: """Create a directory if it doesn't already exist.""" if not os.path.exists(d): - os.mkdir(d) + os.makedirs(d) def random_sleep() -> None: @@ -47,27 +46,51 @@ def random_sleep() -> None: time.sleep(random.randint(20, 90)) -def get_product_list(n: str, page: Optional[int] = None) -> requests.Response: - """Get product list.""" - params: dict[str, str | int] = { - "Submit": "ENE", +def get_product_list(n: str, page: Optional[int] = None) -> str: + """Get product list using Playwright to handle bot detection.""" + params: dict[str, str] = { "N": n, - "IsNodeId": 1, - "ActiveSearchResult": "True", + "IsNodeId": "1", "Order": "RELEASE", - "PageSize": 96, + "PageSize": "96", } if page is not None: - params["page"] = page - r = requests.get( - product_list_url, - # allow_redirects=False, - params=params, - headers={"User-Agent": user_agent}, - ) - print(r.url) - logger.debug("request", url=r.url) - return r + params["page"] = str(page) + url = "https://www.newegg.com/p/pl?" + urllib.parse.urlencode(params) + logger.info("fetching", url=url) + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled"], + ) + context = browser.new_context( + user_agent=user_agent, + viewport={"width": 1280, "height": 800}, + ) + context.add_init_script( + "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})" + ) + pg = context.new_page() + pg.goto(url, wait_until="networkidle", timeout=60000) + + if "areyouahuman" in pg.url: + logger.info("bot detection triggered, simulating mouse movement...") + for i in range(30): + x = 100 + i * 30 + random.randint(-5, 5) + y = 300 + random.randint(-20, 20) + pg.mouse.move(x, y) + time.sleep(0.04 + random.random() * 0.04) + try: + frame = pg.frame_locator('iframe[title*="reCAPTCHA"]').first + frame.locator("#recaptcha-anchor").click(timeout=5000) + pg.wait_for_url(lambda u: "areyouahuman" not in u, timeout=30000) + except Exception: + logger.warning("CAPTCHA bypass failed") + + html = pg.content() + browser.close() + return html # RSS URL: https://www.newegg.com/Product/RSS.aspx?Submit=ENE&N=8000%204814%20600003489&IsNodeId=1&ShowDeactivatedMark=False&Order=RELEASE @@ -89,20 +112,18 @@ def get_product_list(n: str, page: Optional[int] = None) -> requests.Response: filter_params = [ - ("internal_35", '3.5" internal drives', "100167523 8000 4814 600003489"), - ("internal_25", '2.5" internal drives', "100167523 8000 4814 600003490"), - ("laptop_25", '2.5" laptop drives', "100167524 8000 4814 600003490"), - ("portable_25", '2.5" portable drives', "100167526 8000 4818 600003490"), + ("internal_35", '3.5" Internal HDD', "100167523 8000 4814 600003489"), + ("portable_25", '2.5" Portable HDD', "100167526 8000 4818 600003490"), # ('portable_35', '3.5" portable drives', '100167526 8000 4818 600003489'), - ("external_35", '3.5" external drives', "100167525 8000 4818 600003489"), + ("external_35", '3.5" External HDD', "100167525 8000 4818 600003489"), ( "ssd_sata", - "SSD with SATA interface", + "SATA SSD", "100011693 8000 4814 600038506 600038510 600038519", ), ( "ssd_pcie", - "SSD with PCIe interface", + "NVMe SSD", "100011693 8000 4814 600640786 601296941 601301243", ), ] @@ -113,9 +134,9 @@ def page_filename(d: str, name: str, page: int) -> str: return os.path.join(d, f"{name}_page{page:02d}.html") -def save_page(r: requests.models.Response, d: str, name: str, page: int) -> None: +def save_page(html: str, d: str, name: str, page: int) -> None: """Save page.""" - open(page_filename(d, name, page), "w").write(r.text) + open(page_filename(d, name, page), "w").write(html) def get_pages() -> None: @@ -133,10 +154,9 @@ def get_pages() -> None: logger.info(f"get {name}", label=label, page=1) if download: random_sleep() - page1 = get_product_list(filter_param) + page_content = get_product_list(filter_param) download = True - page_content = page1.text - save_page(page1, today_dir, name, 1) + save_page(page_content, today_dir, name, 1) page_content = page_content.replace("", "") page_count = get_page_count(page_content) logger.info(f"{name} page count: {page_count}") @@ -147,9 +167,9 @@ def get_pages() -> None: logger.info(f"get {name}", label=label, page=page_num) if download: random_sleep() - r = get_product_list(filter_param, page=page_num) + html = get_product_list(filter_param, page=page_num) download = True - save_page(r, today_dir, name, page_num) + save_page(html, today_dir, name, page_num) def get_page_count(html: str) -> int: @@ -314,7 +334,7 @@ def group_items( def build() -> None: """Build.""" - build_root = "/var/www/edward/docs/price_per_tb" + build_root = os.path.join(root_dir, "output") today = date.today() templates_dir = os.path.join(root_dir, "templates") diff --git a/requirements.txt b/requirements.txt index 2aeecde..d22ca9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -requests +playwright lxml jinja2 daiquiri diff --git a/templates/index.html b/templates/index.html index 398de8f..cff89f9 100644 --- a/templates/index.html +++ b/templates/index.html @@ -13,9 +13,9 @@ Comments welcome: edward@4angle.com
Last updated: {{ today.strftime('%d %B %Y') }}.
-
{{ cat.label }} | |||
| Price per TB |
Price | @@ -30,7 +30,7 @@ Comments welcome: edward@4angle.com{{ hdd.title }} | |
|---|---|---|---|
| more | |||