diff --git a/backup.py b/backup.py index 45fea0f..c1f7d35 100644 --- a/backup.py +++ b/backup.py @@ -68,7 +68,7 @@ def run_backup(page: Page) -> None: def run(playwright: Playwright) -> None: """Download export.""" - browser = playwright.chromium.launch(headless=False) + browser = playwright.chromium.launch(headless=True) auth_json = os.path.join(script_dir, "auth.json") context = browser.new_context(storage_state=auth_json) page = context.new_page() diff --git a/get.py b/get.py new file mode 100755 index 0000000..e654472 --- /dev/null +++ b/get.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 + +import requests +from http.cookiejar import LWPCookieJar +import os +import re +import lxml.html +from random import shuffle +from time import sleep + +re_recommend = re.compile( + ' ' +) + +s = requests.Session() + +cookie_dir = "/home/edward/lib/cookies" +cookie_file = os.path.join(cookie_dir, "goodreads") + +cj = LWPCookieJar(cookie_file) +if os.path.exists(cookie_file): + cj.load() +s.cookies = cj + + +def login(): + sign_in_page = "https://www.goodreads.com/user/sign_in" + page = s.get(sign_in_page).text + open("sign_in.html", "w").write(page) + if '"name":"Edward Betts"' in page: + return # already signed in + + re_token = re.compile( + '' + ) + re_n = re.compile("") + + token = re_token.search(page).group(1) + + data = { + "utf8": "\u2713", + "authenticity_token": token, + "user[email]": "edward@4angle.com", + "user[password]": "8V8~9:3~U!Ly", + "remember_me": 1, + "next": "Sign in", + "n": re_n.search(page).group(1), + } + + print(token) + print(data["n"]) + + r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page}) + + open("signed_in.html", "w").write(r.text) + + root = lxml.html.fromstring(r.content) + flash = root.find_class("flash") + if flash: + print("flash:", flash[0].text) + + cj.save(ignore_discard=True) + + +def get_index(): + # url = 'https://www.goodreads.com/recommendations' + url = "https://www.goodreads.com/recommendations/?recs_current_view=list" + + r = s.get(url) + open("recommendations.html", "w").write(r.text) + + +def get_individual(): + for line in open("recommendations.html"): + if "actionLinkLite" not in line: + continue + m = re_recommend.match(line) + if m: + yield m.groups() + + +# art = 'https://www.goodreads.com/recommendations/genre/art' +login() +get_index() +recommend_list = list(get_individual()) +shuffle(recommend_list) + +headers = {"Accept": "text/html"} + +for a, b, c in recommend_list: + print((b, c)) + url = "https://www.goodreads.com" + a + + r = s.get(url, headers=headers) + filename = os.path.join(b, c + ".html") + open(filename, "w").write(r.text) + sleep(0.5) diff --git a/parse.py b/parse.py new file mode 100755 index 0000000..f0f6294 --- /dev/null +++ b/parse.py @@ -0,0 +1,138 @@ +#!/usr/bin/python3 +import os +import re +import lxml.html +import jinja2 +import json +import sys +import requests +from time import sleep +from pprint import pprint + +parser = lxml.html.HTMLParser(encoding="utf-8") + +env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) + +re_book = re.compile( + "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL +) +re_book = re.compile( + "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL +) + +re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') + +dirs = ["shelf", "genre"] +start = "https://www.goodreads.com/book/show/" + +existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} + + +def iter_books(): + for d in dirs: + for f in sorted(os.listdir(d)): + filename = os.path.join(d, f) + root = lxml.html.parse(filename, parser=parser).getroot() + for div in root.find_class("bookInformation"): + link = div.find(".//a") + rating = div.find('.//span[@class="minirating"]') + description = div.find('./div[@class="bookDescription"]') + r = rating[0].tail.strip() + + cover = div.getnext().find(".//img").get("src") + + book = { + "title": link.text, + "url": link.get("href"), + "rating": r, + "r": float(r[:3]), + "cover": cover, + "authors": [a.text for a in div.find_class("authorName")], + } + if description is not None: + index = 1 if len(description) == 3 else 0 + book["description"] = description[index].text + yield d, f, book + continue + + # print(filename) + for line in open(filename, "rb"): + if b"var newTip" not in line: + continue + print(line) + m = re_tip.search(line) + tip = m.group(1).decode("unicode_escape").replace("\/", "/") + # tip = m.group(1) # .replace('\/', '/') + # print(tip) + if '