#!/usr/bin/python3 import os import re import lxml.html import jinja2 import json import sys import requests from time import sleep from pprint import pprint parser = lxml.html.HTMLParser(encoding="utf-8") env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates")) re_book = re.compile( "function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}", re.DOTALL ) re_book = re.compile( "function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL ) re_tip = re.compile(rb'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') dirs = ["shelf", "genre"] start = "https://www.goodreads.com/book/show/" existing = {book["title"] for book in json.load(open("calibre_book_list.json"))} def iter_books(): for d in dirs: for f in sorted(os.listdir(d)): filename = os.path.join(d, f) root = lxml.html.parse(filename, parser=parser).getroot() for div in root.find_class("bookInformation"): link = div.find(".//a") rating = div.find('.//span[@class="minirating"]') description = div.find('./div[@class="bookDescription"]') r = rating[0].tail.strip() cover = div.getnext().find(".//img").get("src") book = { "title": link.text, "url": link.get("href"), "rating": r, "r": float(r[:3]), "cover": cover, "authors": [a.text for a in div.find_class("authorName")], } if description is not None: index = 1 if len(description) == 3 else 0 book["description"] = description[index].text yield d, f, book continue # print(filename) for line in open(filename, "rb"): if b"var newTip" not in line: continue print(line) m = re_tip.search(line) tip = m.group(1).decode("unicode_escape").replace("\/", "/") # tip = m.group(1) # .replace('\/', '/') # print(tip) if '