#!/usr/bin/python3
"""Parse shelves and output web page."""

import json
import os
import re
import sys
import typing
from time import sleep

import jinja2
import lxml.html
import requests

# HTML parser shared by every lxml.html.parse() call in this module.
parser = lxml.html.HTMLParser(encoding="utf-8")
# Jinja2 environment that loads templates from the local "templates" directory.
env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))

# Captures the body of the JavaScript function ``refreshGroupBox``.
re_book = re.compile(
    r"function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}", re.DOTALL
)
# Captures the double-quoted second argument of a ``new Tip(...)`` call.
re_tip = re.compile(r'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

# Directories whose files are scanned by iter_books().
dirs = ["shelf", "genre"]
# Goodreads book page URL prefix.
start = "https://www.goodreads.com/book/show/"

# Titles listed in calibre_book_list.json; the file is closed once loaded.
with open("calibre_book_list.json") as _calibre_file:
    existing = {book["title"] for book in json.load(_calibre_file)}


def parse_book_div(div: lxml.html.HtmlElement) -> dict[str, typing.Any]:
    """Parse book div.

    Args:
        div: Element holding the book details. It must contain a link, a
            ``span`` with class ``minirating`` that has at least one child
            followed by tail text, and its next sibling must contain an
            ``img``.

    Returns:
        A dict with the keys ``title``, ``url``, ``rating`` (the raw rating
        text), ``r`` (the first three characters of the rating text as a
        float), ``cover`` (the image ``src``) and ``authors`` (text of every
        element with class ``authorName``). ``description`` is added only
        when a ``bookDescription`` child div with at least one child element
        is present.

    Raises:
        AssertionError: If any of the required elements is missing.
        ValueError: If the start of the rating text is not a number.
    """
    link = div.find(".//a")
    assert link is not None

    rating = div.find('.//span[@class="minirating"]')
    # Check the child count up front: indexing an element that has no
    # children raises IndexError rather than returning None.
    assert rating is not None and len(rating) > 0
    r = rating[0].tail
    assert r

    # The cover image is looked up in the element that follows the div.
    div_next = div.getnext()
    assert div_next is not None
    cover_img = div_next.find(".//img")
    assert cover_img is not None
    cover = cover_img.get("src")

    book = {
        "title": link.text,
        "url": link.get("href"),
        "rating": r,
        "r": float(r[:3].strip()),
        "cover": cover,
        "authors": [a.text for a in div.find_class("authorName")],
    }

    description = div.find('./div[@class="bookDescription"]')
    # A description div without child elements has no text to extract, so
    # the optional key is left out instead of failing on the index below.
    if description is not None and len(description) > 0:
        # With exactly three children the text is taken from the second one,
        # otherwise from the first.
        index = 1 if len(description) == 3 else 0
        book["description"] = description[index].text

    return book


def iter_books():
    for d in dirs:
        for f in sorted(os.listdir(d)):
            filename = os.path.join(d, f)
            root = lxml.html.parse(filename, parser=parser).getroot()
            for div in root.find_class("bookInformation"):
                book = parse_book_div(div)
                yield d, f, book
            continue

            # print(filename)
            for line in open(filename, "rb"):
                if b"var newTip" not in line:
                    continue
                print(line)
                m = re_tip.search(line)
                tip = m.group(1).decode("unicode_escape").replace(r"\/", "/")
                # tip = m.group(1) # .replace('\/', '/')
                #
print(tip) if '