#!/usr/bin/python3
"""Build ``book_list.html`` from saved Goodreads listing pages.

Every file in the ``shelf`` and ``genre`` directories is parsed as HTML and
each ``bookInformation`` element in it is turned into a book record.  The
records are sorted by rating (highest first), de-duplicated by URL, filtered
against the titles in ``calibre_book_list.json`` and rendered through the
``books.html`` template found in ``templates``.
"""

import json
import os
import re
import sys
from pprint import pprint
from time import sleep

import jinja2
import lxml.html
import requests

parser = lxml.html.HTMLParser(encoding='utf-8')
env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))

# Patterns for inline JavaScript found in the saved pages.  Nothing in this
# script references them any more; they are kept so the module-level names
# stay available.  The parentheses are escaped so that they match the literal
# JavaScript text, and raw strings avoid invalid-escape warnings.
re_book = re.compile(
    r'function refreshGroupBox\(group_id, book_id\) \{(.*?)\n *\}',
    re.DOTALL,
)
re_tip = re.compile(br'var newTip = new Tip\(\$\(\'[^\']+\'\), "(.*?)", {')

dirs = ['shelf', 'genre']
start = 'https://www.goodreads.com/book/show/'

# Debugging switches, both off by default.
FETCH_MISSING_PAGES = False  # download each book page into books/
LIST_FIRST_AUTHORS = False   # print every distinct first author at the end

# Titles already present in the Calibre export; matching books are skipped.
with open('calibre_book_list.json') as fh:
    existing = {book['title'] for book in json.load(fh)}


def iter_books():
    """Yield ``(directory, filename, book)`` for each book in the saved pages.

    Walks every directory in ``dirs``, parses each file in it (in sorted
    filename order) and yields one tuple per ``bookInformation`` element.

    ``book`` is a dict with the keys ``title``, ``url``, ``rating`` (the raw
    rating text), ``r`` (the rating as a float, used for sorting), ``cover``
    and ``authors``; ``description`` is added only when the element has a
    ``bookDescription`` child.
    """
    for d in dirs:
        for f in sorted(os.listdir(d)):
            filename = os.path.join(d, f)
            root = lxml.html.parse(filename, parser=parser).getroot()
            for div in root.find_class('bookInformation'):
                link = div.find('.//a')
                rating = div.find('.//span[@class="minirating"]')
                description = div.find('./div[@class="bookDescription"]')
                # The rating text is the tail of the span's first child.
                r = rating[0].tail.strip()
                # The cover image lives in the element following this one.
                cover = div.getnext().find('.//img').get('src')

                book = {
                    'title': link.text,
                    'url': link.get('href'),
                    'rating': r,
                    # Only the first three characters are parsed
                    # (presumably of the form "4.1" -- TODO confirm).
                    'r': float(r[:3]),
                    'cover': cover,
                    'authors': [a.text for a in div.find_class('authorName')],
                }
                if description is not None:
                    # With exactly three children the text is taken from the
                    # second one, otherwise from the first.
                    index = 1 if len(description) == 3 else 0
                    book['description'] = description[index].text
                yield d, f, book


template = env.get_template('books.html')

seen = set()
books = []
first_authors = set()

for d, f, book in sorted(iter_books(), key=lambda i: i[2]['r'], reverse=True):
    # Drop the query string.  partition() leaves a URL without '?' intact,
    # whereas slicing at find('?') == -1 would cut off its last character.
    url = book['url'].partition('?')[0]
    if url in seen:
        continue
    seen.add(url)

    title = book['title']
    authors = book['authors']
    if authors:  # a book without author links must not abort the run
        first_authors.add(authors[0])

    # The main title is everything before the first separator (tried in this
    # order) that occurs in the title.
    # NOTE(review): the result keeps any whitespace preceding the separator,
    # e.g. before '(' -- confirm whether it should be stripped before the
    # comparison with ``existing``.
    main_title = title
    for sep in ':', ' - ', '(', ',':
        if sep in title:
            main_title = title[:title.find(sep)]
            break

    if len(main_title) < 10:
        continue
    if main_title in existing:
        continue

    print(main_title)
    assert url.startswith(start)

    filename = 'books/' + url[len(start):] + '.html'
    if FETCH_MISSING_PAGES and not os.path.exists(filename):
        # The response body is bytes, so the file is opened in binary mode.
        with open(filename, 'wb') as out:
            out.write(requests.get(url).content)
        sleep(1)  # pause between requests

    books.append({
        'dir': d,
        # Drops the last five characters of the filename
        # (presumably the ".html" extension -- TODO confirm).
        'file': f[:-5],
        'title': title,
        'main_title': main_title,
        'authors': authors,
        'url': url,
        'rating': book['rating'],
        'cover': book['cover'],
        'description': book.get('description'),
    })

page = template.render(books=books)
with open('book_list.html', 'w') as out:
    out.write(page)

if LIST_FIRST_AUTHORS:
    for a in sorted(first_authors):
        print(a)

sys.exit(0)