goodreads-backup/get.py

#!/usr/bin/python3

"""Download shelves from goodreads."""

import os
import re
from http.cookiejar import LWPCookieJar
from random import shuffle
from time import sleep

import lxml.html
import requests

# Pattern for one recommendation link line in the saved index page.
# Group 1 is the site-relative href; groups 2 and 3 are the two path
# segments that follow "/recommendations/".  The two leading spaces are part
# of the pattern because it is applied with .match() from the start of a line.
re_recommend = re.compile(
    '  <a class="actionLinkLite " '
    'href="(/recommendations/([^/]*?)/([^/]*?))">'
)

# Single shared HTTP session used by every request in this script.
s = requests.Session()

cookie_dir = "/home/edward/lib/cookies"
cookie_file = os.path.join(cookie_dir, "goodreads")

# Cookie jar backed by a file so cookies persist between runs.
cj = LWPCookieJar(cookie_file)
if os.path.exists(cookie_file):
    # login() saves with ignore_discard=True; load with the same flag,
    # otherwise cookies marked "discard" (session cookies) are skipped here
    # and the saved sign-in is lost.
    cj.load(ignore_discard=True)
s.cookies = cj  # type: ignore


def login() -> None:
    """Sign in to Goodreads with the shared session unless already signed in.

    Fetches the sign-in page and writes it to ``sign_in.html``.  If the page
    already contains the account name, returns without posting anything.
    Otherwise extracts the hidden ``authenticity_token`` and ``n`` form
    fields, posts the sign-in form, writes the response to
    ``signed_in.html``, prints the first ``flash`` element's text if one is
    present, and saves the cookie jar to disk.

    Raises:
        RuntimeError: if either hidden form field cannot be found in the
            sign-in page.
    """
    sign_in_page = "https://www.goodreads.com/user/sign_in"
    page = s.get(sign_in_page).text
    # Keep a copy of the fetched page on disk.
    with open("sign_in.html", "w") as out:
        out.write(page)
    if '"name":"Edward Betts"' in page:
        return  # already signed in

    re_token = re.compile(
        '<input type="hidden" name="authenticity_token" value="([^"]*?)" />'
    )
    re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")
    m_n = re_n.search(page)
    m_token = re_token.search(page)

    # Explicit check rather than assert: asserts are stripped under -O, which
    # would turn a missing field into an AttributeError on None below.
    if m_token is None or m_n is None:
        raise RuntimeError("sign-in form fields not found in " + sign_in_page)
    token = m_token.group(1)

    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to the environment or a config file kept out of version control.
    data = {
        "utf8": "\u2713",
        "authenticity_token": token,
        "user[email]": "edward@4angle.com",
        "user[password]": "8V8~9:3~U!Ly",
        "remember_me": 1,
        "next": "Sign in",
        "n": m_n.group(1),
    }

    print(token)
    print(data["n"])

    r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page})

    with open("signed_in.html", "w") as out:
        out.write(r.text)

    # Surface any message the site put in a "flash" element of the response.
    root = lxml.html.fromstring(r.content)
    flash = root.find_class("flash")
    if flash:
        print("flash:", flash[0].text)

    # ignore_discard=True also writes cookies marked "discard" to the file.
    cj.save(ignore_discard=True)


def get_index() -> None:
    """Download the recommendations index page to ``recommendations.html``.

    Uses the shared session ``s`` and requests the list view of the page.
    """
    # url = 'https://www.goodreads.com/recommendations'
    url = "https://www.goodreads.com/recommendations/?recs_current_view=list"

    r = s.get(url)
    # Context manager so the file is flushed and closed deterministically.
    with open("recommendations.html", "w") as out:
        out.write(r.text)


def get_individual():
    """Yield the link details found in the saved recommendations index.

    Reads ``recommendations.html`` line by line.  For every line that
    ``re_recommend`` matches, yields the match's groups: the site-relative
    href followed by the two path segments captured from it.
    """
    # The with-block closes the file once the generator is exhausted or closed.
    with open("recommendations.html") as index:
        for line in index:
            # Cheap substring test first; only candidate lines hit the regex.
            if "actionLinkLite" not in line:
                continue
            m = re_recommend.match(line)
            if m:
                yield m.groups()


def main() -> None:
    """Sign in, fetch the recommendations index and save every linked page.

    Each link from the index is downloaded in random order and written to
    ``<b>/<c>.html`` relative to the current directory, where ``b`` and ``c``
    are the two path segments captured from the link.  There is a half-second
    pause after each download.
    """
    # art = 'https://www.goodreads.com/recommendations/genre/art'
    login()
    get_index()
    recommend_list = list(get_individual())
    shuffle(recommend_list)

    headers = {"Accept": "text/html"}

    for a, b, c in recommend_list:
        print((b, c))
        url = "https://www.goodreads.com" + a

        r = s.get(url, headers=headers)
        # Create the target directory on demand; without this the write below
        # fails with FileNotFoundError the first time a new segment appears.
        if b:
            os.makedirs(b, exist_ok=True)
        filename = os.path.join(b, c + ".html")
        with open(filename, "w") as out:
            out.write(r.text)
        sleep(0.5)


if __name__ == "__main__":
    main()
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`#!/usr/bin/python3`

Update code style 2024-04-17 10:02:26 +01:00			`"""Download shelves from goodreads."""`

Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`import os`
			`import re`
Update code style 2024-04-17 10:02:26 +01:00			`from http.cookiejar import LWPCookieJar`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`from random import shuffle`
			`from time import sleep`

Update code style 2024-04-17 10:02:26 +01:00			`import lxml.html`
			`import requests`

Reformat code with black 2024-04-17 09:12:11 +01:00			`re_recommend = re.compile(`
			`'  <a class="actionLinkLite " href="(/recommendations/([^/]*?)/([^/]*?))">'`
			`)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`s = requests.Session()`

Reformat code with black 2024-04-17 09:12:11 +01:00			`cookie_dir = "/home/edward/lib/cookies"`
			`cookie_file = os.path.join(cookie_dir, "goodreads")`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`cj = LWPCookieJar(cookie_file)`
			`if os.path.exists(cookie_file):`
			`cj.load()`
Update code style 2024-04-17 10:02:26 +01:00			`s.cookies = cj # type: ignore`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Reformat code with black 2024-04-17 09:12:11 +01:00
Update code style 2024-04-17 10:02:26 +01:00			`def login() -> None:`
			`"""Login."""`
Reformat code with black 2024-04-17 09:12:11 +01:00			`sign_in_page = "https://www.goodreads.com/user/sign_in"`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`page = s.get(sign_in_page).text`
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("sign_in.html", "w").write(page)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`if '"name":"Edward Betts"' in page:`
			`return # already signed in`

Reformat code with black 2024-04-17 09:12:11 +01:00			`re_token = re.compile(`
			`'<input type="hidden" name="authenticity_token" value="([^"]*?)" />'`
			`)`
Update code style 2024-04-17 10:02:26 +01:00			`re_n = re.compile(r"<input name='n' type='hidden' value='(\d+)'>")`
			`m_n = re_n.search(page)`
			`m_token = re_token.search(page)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Update code style 2024-04-17 10:02:26 +01:00			`assert m_token and m_n`
			`token = m_token.group(1)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`data = {`
Reformat code with black 2024-04-17 09:12:11 +01:00			`"utf8": "\u2713",`
			`"authenticity_token": token,`
			`"user[email]": "edward@4angle.com",`
			`"user[password]": "8V8~9:3~U!Ly",`
			`"remember_me": 1,`
			`"next": "Sign in",`
Update code style 2024-04-17 10:02:26 +01:00			`"n": m_n.group(1),`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`}`

			`print(token)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`print(data["n"])`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Reformat code with black 2024-04-17 09:12:11 +01:00			`r = s.post(sign_in_page, data=data, headers={"referer": sign_in_page})`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("signed_in.html", "w").write(r.text)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`root = lxml.html.fromstring(r.content)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`flash = root.find_class("flash")`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`if flash:`
Reformat code with black 2024-04-17 09:12:11 +01:00			`print("flash:", flash[0].text)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`cj.save(ignore_discard=True)`

Reformat code with black 2024-04-17 09:12:11 +01:00
Update code style 2024-04-17 10:02:26 +01:00			`def get_index() -> None:`
			`"""Get index."""`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`# url = 'https://www.goodreads.com/recommendations'`
Reformat code with black 2024-04-17 09:12:11 +01:00			`url = "https://www.goodreads.com/recommendations/?recs_current_view=list"`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`r = s.get(url)`
Reformat code with black 2024-04-17 09:12:11 +01:00			`open("recommendations.html", "w").write(r.text)`

Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
			`def get_individual():`
Update code style 2024-04-17 10:02:26 +01:00			`"""Get individual page."""`
Reformat code with black 2024-04-17 09:12:11 +01:00			`for line in open("recommendations.html"):`
			`if "actionLinkLite" not in line:`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00			`continue`
			`m = re_recommend.match(line)`
			`if m:`
			`yield m.groups()`


Update code style 2024-04-17 10:02:26 +01:00			`def main() -> None:`
			`"""Login and download shelves."""`
			`# art = 'https://www.goodreads.com/recommendations/genre/art'`
			`login()`
			`get_index()`
			`recommend_list = list(get_individual())`
			`shuffle(recommend_list)`

			`headers = {"Accept": "text/html"}`

			`for a, b, c in recommend_list:`
			`print((b, c))`
			`url = "https://www.goodreads.com" + a`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00
Update code style 2024-04-17 10:02:26 +01:00			`r = s.get(url, headers=headers)`
			`filename = os.path.join(b, c + ".html")`
			`open(filename, "w").write(r.text)`
			`sleep(0.5)`
Add old code for downloading recommendations 2024-04-17 09:11:41 +01:00

Update code style 2024-04-17 10:02:26 +01:00			`if __name__ == "__main__":`
			`main()`