Compare commits
2 commits
main
...
cloudscrap
| Author | SHA1 | Date | |
|---|---|---|---|
| fee54a37e7 | |||
| 35c213110d |
3 changed files with 47 additions and 182 deletions
199
check.py
199
check.py
|
|
@ -6,15 +6,13 @@ import html
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import smtplib
|
import smtplib
|
||||||
import sys
|
|
||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
|
||||||
from datetime import date
|
from datetime import date
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
from email.utils import formatdate, make_msgid
|
from email.utils import formatdate, make_msgid
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
import cloudscraper # type: ignore
|
import cloudscraper
|
||||||
import requests
|
import requests
|
||||||
import yaml
|
import yaml
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
|
|
@ -23,8 +21,6 @@ from urllib3.util.url import parse_url
|
||||||
|
|
||||||
from conference import LiveConference, config, load_yaml
|
from conference import LiveConference, config, load_yaml
|
||||||
|
|
||||||
IS_TTY = sys.stdout.isatty()
|
|
||||||
|
|
||||||
|
|
||||||
class AbsoluteDNSAdapter(HTTPAdapter):
|
class AbsoluteDNSAdapter(HTTPAdapter):
|
||||||
"""A custom adapter for requests to ensure hostnames are treated as absolute."""
|
"""A custom adapter for requests to ensure hostnames are treated as absolute."""
|
||||||
|
|
@ -80,7 +76,6 @@ not_here_list = [
|
||||||
"This page does not exist yet",
|
"This page does not exist yet",
|
||||||
"404 Not Found",
|
"404 Not Found",
|
||||||
"500 Internal Server Error",
|
"500 Internal Server Error",
|
||||||
"500: Internal Server Error",
|
|
||||||
"Test Page for the Apache HTTP Server",
|
"Test Page for the Apache HTTP Server",
|
||||||
"Site not found · GitHub Pages",
|
"Site not found · GitHub Pages",
|
||||||
"504: Gateway time-out",
|
"504: Gateway time-out",
|
||||||
|
|
@ -108,17 +103,9 @@ not_here_list = [
|
||||||
"This page doesn't currently exist",
|
"This page doesn't currently exist",
|
||||||
"ERROR 503 - Service Unavailable",
|
"ERROR 503 - Service Unavailable",
|
||||||
"ERROR 503",
|
"ERROR 503",
|
||||||
"503 Service Unavailable",
|
|
||||||
"401 Authorization Required",
|
"401 Authorization Required",
|
||||||
"Authorization Required",
|
"Authorization Required",
|
||||||
"used Cloudflare to restrict access",
|
"used Cloudflare to restrict access",
|
||||||
"Error 1014",
|
|
||||||
"CNAME Cross-User Banned",
|
|
||||||
"looks like there's no page here",
|
|
||||||
"404 page can’t be found either",
|
|
||||||
"503 self-signed certificate",
|
|
||||||
"504 Gateway Timeout",
|
|
||||||
"<h2>Pages</h2>",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -148,133 +135,31 @@ def normalize_url(url: str) -> str:
|
||||||
return urlunparse(parsed_url._replace(netloc=normalized_netloc))
|
return urlunparse(parsed_url._replace(netloc=normalized_netloc))
|
||||||
|
|
||||||
|
|
||||||
def url_to_filename(url: str) -> str:
|
def check_conference(
|
||||||
"""
|
name: str, src_url: str, year: int
|
||||||
Convert a URL to a valid filename by replacing invalid characters with underscores.
|
) -> tuple[bool, str, str | None]:
|
||||||
|
|
||||||
Args:
|
|
||||||
url (str): The URL to be converted.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: A valid filename.
|
|
||||||
"""
|
|
||||||
# Replace invalid characters with underscores
|
|
||||||
return re.sub(r'[\/:*?"<>|]', "_", url)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_opengraph_tags(html: str) -> dict[str, str]:
|
|
||||||
"""
|
|
||||||
Locate Open Graph meta tags, and return them as a dictionary.
|
|
||||||
|
|
||||||
Keys will match the name following 'og:', e.g. 'title', 'image', 'url', 'site_name'.
|
|
||||||
"""
|
|
||||||
pattern = re.compile(
|
|
||||||
r'<meta\s+property="og:([^"]+)"\s+content="([^"]+)"\s*/?>', flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
matches = pattern.findall(html)
|
|
||||||
|
|
||||||
og_tags = {}
|
|
||||||
for prop, content in matches:
|
|
||||||
og_tags[prop] = content
|
|
||||||
|
|
||||||
return og_tags
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Conference:
|
|
||||||
"""Conference."""
|
|
||||||
|
|
||||||
name: str
|
|
||||||
src_url: str
|
|
||||||
year: int
|
|
||||||
response: requests.models.Response | None = None
|
|
||||||
redirect_to_url: str | None = None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def url(self) -> str:
|
|
||||||
"""Conference URL."""
|
|
||||||
return self.build_url(year=self.year)
|
|
||||||
|
|
||||||
def build_url(self, year: int) -> str:
|
|
||||||
"""Build conference URL."""
|
|
||||||
return self.src_url.format(year=year, two_digit_year=year % 2000)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def past_url(self) -> str:
|
|
||||||
"""URL for previous year."""
|
|
||||||
return self.build_url(year=self.year - 1)
|
|
||||||
|
|
||||||
def check(self) -> tuple[bool, str, str | None]:
|
|
||||||
"""Check if conference is live."""
|
"""Check if conference is live."""
|
||||||
|
url = src_url.format(year=year)
|
||||||
|
past_url = src_url.format(year=year - 1)
|
||||||
no_dot = {"bsideskbh.dk", "pif.camp"}
|
no_dot = {"bsideskbh.dk", "pif.camp"}
|
||||||
url = self.url
|
|
||||||
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
|
session = s if all(hostname not in url for hostname in no_dot) else s_no_dot
|
||||||
try:
|
try:
|
||||||
r = session.get(url)
|
r = session.get(url)
|
||||||
except requests.exceptions.ConnectionError:
|
except requests.exceptions.ConnectionError:
|
||||||
return (False, "connection refused", None)
|
return (False, "connection refused", None)
|
||||||
|
|
||||||
self.response = r
|
|
||||||
|
|
||||||
if r.url.endswith("404.html") or r.url.endswith("404.htm"):
|
|
||||||
return (False, "URL ends with 404.html/404.htm", r.url)
|
|
||||||
|
|
||||||
if not r.text:
|
|
||||||
return (False, "empty response", r.url)
|
|
||||||
|
|
||||||
not_here = find_not_here_message(r.text)
|
not_here = find_not_here_message(r.text)
|
||||||
if (
|
if (
|
||||||
len(r.text) < 2048
|
len(r.text) < 2048
|
||||||
and 'http-equiv="refresh"' in r.text
|
and 'http-equiv="refresh"' in r.text
|
||||||
and str(self.year) not in r.text
|
and str(year) not in r.text
|
||||||
):
|
):
|
||||||
return (False, "redirect to URL without year", r.url)
|
return (False, "redirect to URL without year", r.url)
|
||||||
|
|
||||||
if normalize_url(r.url) == normalize_url(self.past_url):
|
if normalize_url(r.url) == normalize_url(past_url):
|
||||||
return (False, "redirect to previous year", r.url)
|
return (False, "redirect to previous year", r.url)
|
||||||
|
|
||||||
if not_here:
|
return (False, not_here, r.url) if not_here else (True, get_title(r.text), r.url)
|
||||||
return (False, not_here, r.url)
|
|
||||||
|
|
||||||
return (True, get_title(r.text), r.url)
|
|
||||||
|
|
||||||
def og_tags(self) -> dict[str, str]:
|
|
||||||
"""Open Graph tags."""
|
|
||||||
return parse_opengraph_tags(self.response.text) if self.response else {}
|
|
||||||
|
|
||||||
def check_web_site(self) -> bool:
|
|
||||||
"""Check if an individual web site is live."""
|
|
||||||
assert "{year}" in self.src_url or "{two_digit_year}" in self.src_url
|
|
||||||
if IS_TTY:
|
|
||||||
print(f"Checking {self.name} {self.year}: {self.url}")
|
|
||||||
live, msg, redirect_to_url = self.check()
|
|
||||||
if not live:
|
|
||||||
if IS_TTY:
|
|
||||||
print(f" Not live: {msg}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if IS_TTY:
|
|
||||||
print(f" Live! Title: {msg}")
|
|
||||||
|
|
||||||
og = "".join(f"\n{key}: {value}" for key, value in self.og_tags().items())
|
|
||||||
|
|
||||||
if og:
|
|
||||||
og = "\n\nOpen Graph\n\n" + og
|
|
||||||
|
|
||||||
if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(
|
|
||||||
self.url
|
|
||||||
):
|
|
||||||
body = f"{self.name}\n{self.url}\n"
|
|
||||||
else:
|
|
||||||
self.redirect_to_url = redirect_to_url
|
|
||||||
body = f"{self.name}\n{self.url} redirects to {redirect_to_url}\n"
|
|
||||||
|
|
||||||
body += f"Web page title: {msg}{og}" ""
|
|
||||||
send_mail(f"Conference site live: {self.name} - {self.year}", body)
|
|
||||||
if IS_TTY:
|
|
||||||
print(f" Email sent")
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def send_mail(subject: str, body: str) -> None:
|
def send_mail(subject: str, body: str) -> None:
|
||||||
|
|
@ -300,6 +185,23 @@ def send_mail(subject: str, body: str) -> None:
|
||||||
s.quit()
|
s.quit()
|
||||||
|
|
||||||
|
|
||||||
|
def check_conference_web_site(name: str, src_url: str, year: int) -> bool:
|
||||||
|
"""Check if an individual web site is live."""
|
||||||
|
assert "{year}" in src_url
|
||||||
|
live, msg, redirect_to_url = check_conference(name, src_url, year)
|
||||||
|
url = src_url.format(year=year)
|
||||||
|
if live:
|
||||||
|
if redirect_to_url and normalize_url(redirect_to_url) == normalize_url(url):
|
||||||
|
body = f"{name}\n{url}\nWeb page title: {msg}"
|
||||||
|
else:
|
||||||
|
body = f"""{name}
|
||||||
|
{url} redirects to {redirect_to_url}
|
||||||
|
Web page title: {msg}"""
|
||||||
|
send_mail(f"Conference site live: {name} - {year}", body)
|
||||||
|
|
||||||
|
return live
|
||||||
|
|
||||||
|
|
||||||
def find_new_conference_web_sites(
|
def find_new_conference_web_sites(
|
||||||
today: date, live: list[LiveConference]
|
today: date, live: list[LiveConference]
|
||||||
) -> list[LiveConference]:
|
) -> list[LiveConference]:
|
||||||
|
|
@ -310,59 +212,24 @@ def find_new_conference_web_sites(
|
||||||
|
|
||||||
live_set = {(c["conference"], c["year"]) for c in live}
|
live_set = {(c["conference"], c["year"]) for c in live}
|
||||||
for name, src_url in load_yaml("conferences").items():
|
for name, src_url in load_yaml("conferences").items():
|
||||||
for year in (this_year, this_year + 1):
|
new += [
|
||||||
if (name, year) in live_set:
|
{"conference": name, "year": year, "live": today}
|
||||||
continue
|
for year in (this_year, this_year + 1)
|
||||||
conf = Conference(name, src_url, year)
|
if (name, year) not in live_set
|
||||||
if not conf.check_web_site():
|
and check_conference_web_site(name, src_url, year)
|
||||||
continue
|
]
|
||||||
c: LiveConference = {
|
|
||||||
"conference": name,
|
|
||||||
"year": year,
|
|
||||||
"live": today,
|
|
||||||
"url": conf.url,
|
|
||||||
}
|
|
||||||
if conf.redirect_to_url:
|
|
||||||
c["redirect_to_url"] = conf.redirect_to_url
|
|
||||||
new.append(c)
|
|
||||||
return new
|
return new
|
||||||
|
|
||||||
|
|
||||||
class NoAliasDumper(yaml.SafeDumper):
|
|
||||||
"""Dumper that disables YAML anchors and aliases."""
|
|
||||||
|
|
||||||
def ignore_aliases(self, data):
|
|
||||||
"""Skip alias generation."""
|
|
||||||
return True # disables anchors and aliases
|
|
||||||
|
|
||||||
|
|
||||||
def main(show_not_live: bool = False) -> None:
|
def main(show_not_live: bool = False) -> None:
|
||||||
"""Check fow new conference web sites."""
|
"""Check fow new conference web sites."""
|
||||||
if IS_TTY:
|
|
||||||
print("Loading existing live conferences...")
|
|
||||||
live: list[LiveConference] = load_yaml("live")
|
live: list[LiveConference] = load_yaml("live")
|
||||||
if IS_TTY:
|
|
||||||
print(f"Found {len(live)} existing live conferences")
|
|
||||||
print("\nChecking for new conference websites...")
|
|
||||||
if not (new := find_new_conference_web_sites(date.today(), live)):
|
if not (new := find_new_conference_web_sites(date.today(), live)):
|
||||||
if IS_TTY:
|
|
||||||
print("\nNo new conference websites found")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if IS_TTY:
|
|
||||||
print(f"\n{len(new)} new conference(s) found! Updating live.yaml...")
|
|
||||||
live_filename = os.path.expanduser(config["data"]["live"])
|
live_filename = os.path.expanduser(config["data"]["live"])
|
||||||
with open(live_filename, "w") as out:
|
with open(live_filename, "w") as out:
|
||||||
yaml.dump(
|
yaml.dump(live + new, stream=out, sort_keys=False)
|
||||||
live + new,
|
|
||||||
stream=out,
|
|
||||||
sort_keys=False,
|
|
||||||
Dumper=NoAliasDumper,
|
|
||||||
allow_unicode=True,
|
|
||||||
default_flow_style=False,
|
|
||||||
)
|
|
||||||
if IS_TTY:
|
|
||||||
print(f"Updated {live_filename}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,6 @@ class LiveConference(typing.TypedDict, total=False):
|
||||||
year: int
|
year: int
|
||||||
live: date
|
live: date
|
||||||
url: str | None
|
url: str | None
|
||||||
redirect_to_url: str | None
|
|
||||||
|
|
||||||
|
|
||||||
def load_yaml(name: str) -> typing.Any:
|
def load_yaml(name: str) -> typing.Any:
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1 @@
|
||||||
requests
|
requests
|
||||||
cloudscraper
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue