#!/usr/bin/python3 import os import re import lxml.html import jinja2 import json import sys import requests from time import sleep from pprint import pprint parser = lxml.html.HTMLParser(encoding='utf-8') env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates')) re_book = re.compile('function refreshGroupBox(group_id, book_id) \{(.*?)\n *\}', re.DOTALL) re_book = re.compile('function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}', re.DOTALL) re_tip = re.compile(br'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {') dirs = ['shelf', 'genre'] start = 'https://www.goodreads.com/book/show/' existing = {book['title'] for book in json.load(open('calibre_book_list.json'))} def iter_books(): for d in dirs: for f in sorted(os.listdir(d)): filename = os.path.join(d, f) root = lxml.html.parse(filename, parser=parser).getroot() for div in root.find_class('bookInformation'): link = div.find('.//a') rating = div.find('.//span[@class="minirating"]') description = div.find('./div[@class="bookDescription"]') r = rating[0].tail.strip() cover = div.getnext().find('.//img').get('src') book = { 'title': link.text, 'url': link.get('href'), 'rating': r, 'r': float(r[:3]), 'cover': cover, 'authors': [a.text for a in div.find_class('authorName')], } if description is not None: index = 1 if len(description) == 3 else 0 book['description'] = description[index].text yield d, f, book continue # print(filename) for line in open(filename, 'rb'): if b'var newTip' not in line: continue print(line) m = re_tip.search(line) tip = m.group(1).decode('unicode_escape').replace('\/', '/') # tip = m.group(1) # .replace('\/', '/') # print(tip) if '