Use cloudscraper
This commit is contained in:
parent 8c0acb1453
commit d845ec805f

check.py | 38
@@ -2,6 +2,7 @@
 """Check if conference websites are live."""

+import html
 import os
 import re
 import smtplib
@@ -11,6 +12,7 @@ from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
 from urllib.parse import urlparse, urlunparse

+import cloudscraper
 import requests
 import yaml
 from requests.adapters import HTTPAdapter
@@ -53,8 +55,14 @@ re_title = re.compile("<title>(.*?)</title>", re.DOTALL)
 AGENT = config["browser"]["User-Agent"]
 headers = {"User-Agent": AGENT, "Accept": "text/html"}

-s = requests.Session()
-s.headers.update(headers)
+s = cloudscraper.CloudScraper()
+s_no_dot = cloudscraper.CloudScraper()
+
+# s = requests.Session()
+# s.headers.update(headers)
+
+# s_no_dot = requests.Session()
+# s_no_dot.headers.update(headers)

 # Create a session and mount the custom adapter for both HTTP and HTTPS requests.
 adapter = AbsoluteDNSAdapter()
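For context: cloudscraper.CloudScraper subclasses requests.Session, which is why the two new scraper objects can stand in for the old session without changing any call sites. A minimal sketch of using it on its own (the URL is a placeholder, not from this repo):

    import cloudscraper

    # create_scraper() returns a CloudScraper instance; it behaves like a
    # requests.Session but tries to pass Cloudflare's anti-bot challenge first.
    scraper = cloudscraper.create_scraper()
    r = scraper.get("https://example.com/")  # placeholder URL
    print(r.status_code, len(r.text))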
@@ -82,7 +90,7 @@ not_here_list = [
     "Ooops! Could Not Find It",
     "OpenStreetMap Authentication Proxy",
     "Error 404",
-    "Under Construction",
+    # "Under Construction",
     "Page not found",
     "Error 404: Page not found",
     "Barcamptools",
@@ -95,18 +103,21 @@ not_here_list = [
     "This page doesn't currently exist",
     "ERROR 503 - Service Unavailable",
     "ERROR 503",
     "401 Authorization Required",
     "Authorization Required",
+    "used Cloudflare to restrict access",
 ]


-def find_not_here_message(html: str) -> str | None:
+def find_not_here_message(page_html: str) -> str | None:
     """Find not here message in web page."""
-    return next((not_here for not_here in not_here_list if not_here in html), None)
+    return next((not_here for not_here in not_here_list if not_here in page_html), None)


-def get_title(html: str) -> str:
+def get_title(page_html: str) -> str:
     """Title from web page."""
-    m = re_title.search(html)
-    return m.group(1).strip() if m and m.group(1) else "no title"
+    m = re_title.search(page_html)
+    return html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"


 def normalize_url(url: str) -> str:
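For context: renaming the parameter to page_html frees up the name html for the new standard-library import, so get_title can now decode HTML entities in the extracted title. A quick sketch of the effect, using a made-up page string:

    import html
    import re

    re_title = re.compile("<title>(.*?)</title>", re.DOTALL)

    page_html = "<title>State of the Map &amp; Friends</title>"  # made-up example
    m = re_title.search(page_html)
    title = html.unescape(m.group(1).strip()) if m and m.group(1) else "no title"
    print(title)  # -> State of the Map & Friends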
@@ -183,18 +194,17 @@ class Conference:

     def check(self) -> tuple[bool, str, str | None]:
         """Check if conference is live."""
+        no_dot = {"bsideskbh.dk", "pif.camp"}
+        session = (
+            s if all(hostname not in self.url for hostname in no_dot) else s_no_dot
+        )
         try:
-            # SotM Baltics has an invalid TLS certificate, but we don't care
-            r = s.get(self.url, verify=False)
+            r = session.get(self.url)
         except requests.exceptions.ConnectionError:
             return (False, "connection refused", None)

         self.response = r

         filename = url_to_filename(self.url)
         with open(os.path.join("sites", filename + ".html"), "w") as out:
             out.write(r.text)

         not_here = find_not_here_message(r.text)
         if (
             len(r.text) < 2048
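For context: the new no_dot set routes those two hosts to the second scraper, presumably because they don't cope with the absolute-DNS (trailing-dot) adapter mounted on s; every other URL keeps using s. A sketch of the same selection logic as a standalone helper (pick_session and the sample URLs are illustrative; only the two hostnames come from this commit):

    no_dot = {"bsideskbh.dk", "pif.camp"}

    def pick_session(url: str, s, s_no_dot):
        # Fall back to s_no_dot only when the URL contains one of the listed hosts.
        return s if all(hostname not in url for hostname in no_dot) else s_no_dot

    # pick_session("https://pif.camp/", s, s_no_dot) would return s_no_dot,
    # pick_session("https://example.org/", s, s_no_dot) would return s.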