Fix book importing

This commit is contained in:
2023-03-06 14:05:33 -05:00
parent 73c72ef465
commit 788e1ab9e9
11 changed files with 424 additions and 43 deletions

View File

@ -1,6 +1,6 @@
from django.contrib import admin
from books.models import Author, Book
from books.models import Author, Book, Page
from scrobbles.admin import ScrobbleInline
@ -8,11 +8,22 @@ from scrobbles.admin import ScrobbleInline
@admin.register(Author)
class AuthorAdmin(admin.ModelAdmin):
    """Admin configuration for Author.

    Lists name, OpenLibrary ID, bio and Wikipedia URL, newest first,
    with search by name and date drill-down on ``created``.
    """

    date_hierarchy = "created"
    # Single definition: an earlier two-column ``list_display`` assignment was
    # immediately overridden by this one and therefore had no effect.
    list_display = (
        "name",
        "openlibrary_id",
        "bio",
        "wikipedia_url",
    )
    ordering = ("-created",)
    search_fields = ("name",)
@admin.register(Page)
class PageAdmin(admin.ModelAdmin):
    """Admin configuration for Page: filter by book, drill down by ``created``."""

    list_filter = ("book",)
    date_hierarchy = "created"
@admin.register(Book)
class BookAdmin(admin.ModelAdmin):
date_hierarchy = "created"

View File

@ -5,9 +5,8 @@ from enum import Enum
import pytz
from books.models import Author, Book
from books.models import Author, Book, Page
from scrobbles.models import Scrobble
from django.utils import timezone
logger = logging.getLogger(__name__)
@ -35,6 +34,38 @@ class KoReaderPageStatColumn(Enum):
TOTAL_PAGES = 4
def process_pages_for_book(book_id, sqlite_file_path):
    """Create ``Page`` rows for one KoReader book from its page statistics.

    Looks up the ``Book`` whose ``koreader_id`` equals ``book_id``, reads the
    matching rows from the ``page_stat_data`` table of the SQLite file at
    ``sqlite_file_path`` and creates a ``Page`` for every page number the book
    does not have yet. Pages that already exist are left untouched.

    Args:
        book_id: KoReader's ID for the book (``id_book`` in ``page_stat_data``).
        sqlite_file_path: Path to the KoReader statistics SQLite file.

    Returns:
        The list of newly created ``Page`` objects, or ``None`` when no book
        with that KoReader ID exists.
    """
    # Resolve the book first so no connection is opened when there is
    # nothing to import.
    book = Book.objects.filter(koreader_id=book_id).first()
    if not book:
        logger.error(f"No book found with KoReader ID of {book_id}")
        return

    new_pages = []
    con = sqlite3.connect(sqlite_file_path)
    try:
        cur = con.cursor()
        # Bind book_id as a parameter rather than formatting it into the SQL.
        page_table = cur.execute(
            "SELECT * FROM page_stat_data where id_book=?", (book_id,)
        )
        for page_row in page_table:
            page_number = page_row[KoReaderPageStatColumn.PAGE.value]
            page, page_created = Page.objects.get_or_create(
                book=book, number=page_number
            )
            if not page_created:
                continue

            ts = page_row[KoReaderPageStatColumn.START_TIME.value]
            # The stored value is converted as a POSIX timestamp into an
            # aware UTC datetime.
            page.start_time = datetime.fromtimestamp(ts, tz=pytz.utc)
            page.duration_seconds = page_row[
                KoReaderPageStatColumn.DURATION.value
            ]
            page.save()
            new_pages.append(page)
    finally:
        # Always release the SQLite handle, even if a row fails to import.
        con.close()

    logger.info(f"Added {len(new_pages)} for book {book}")
    return new_pages
def process_koreader_sqlite_file(sqlite_file_path, user_id):
"""Given a sqlite file from KoReader, open the book table, iterate
over rows creating scrobbles from each book found"""
@ -65,21 +96,21 @@ def process_koreader_sqlite_file(sqlite_file_path, user_id):
)
if created:
book.title = book_row[KoReaderBookColumn.TITLE.value]
book.pages = book_row[KoReaderBookColumn.PAGES.value]
book.koreader_md5 = book_row[KoReaderBookColumn.MD5.value]
book.koreader_id = int(book_row[KoReaderBookColumn.ID.value])
book.koreader_authors = book_row[KoReaderBookColumn.AUTHORS.value]
book.run_time_ticks = int(book_row[KoReaderBookColumn.PAGES.value])
book.save(
update_fields=[
"pages",
"koreader_md5",
"koreader_id",
"koreader_authors",
]
)
pages = book_row[KoReaderBookColumn.PAGES.value]
run_time = pages * book.AVG_PAGE_READING_SECONDS
run_time_ticks = run_time * 1000
book_dict = {
"title": book_row[KoReaderBookColumn.TITLE.value],
"pages": book_row[KoReaderBookColumn.PAGES.value],
"koreader_md5": book_row[KoReaderBookColumn.MD5.value],
"koreader_id": int(book_row[KoReaderBookColumn.ID.value]),
"koreader_authors": book_row[KoReaderBookColumn.AUTHORS.value],
"run_time": run_time,
"run_time_ticks": run_time_ticks,
}
Book.objects.filter(pk=book.id).update(**book_dict)
book.fix_metadata()
if author_list:
book.authors.add(*[a.id for a in author_list])
@ -91,6 +122,7 @@ def process_koreader_sqlite_file(sqlite_file_path, user_id):
timestamp = datetime.utcfromtimestamp(
book_row[KoReaderBookColumn.LAST_OPEN.value]
).replace(tzinfo=pytz.utc)
process_pages_for_book(book.koreader_id, sqlite_file_path)
new_scrobble = Scrobble(
book_id=book.id,

View File

@ -0,0 +1,69 @@
# Generated by Django 4.1.5 on 2023-03-06 16:31
from django.db import migrations, models
import django.db.models.deletion
import django_extensions.db.fields
class Migration(migrations.Migration):
    """Drop Book's inline author columns, add Author.headshot, create Page.

    Removes ``author_name`` and ``author_openlibrary_id`` from ``book``, adds
    an optional ``headshot`` image to ``author`` and creates the ``Page``
    model, which is unique per (book, number).
    """

    dependencies = [
        ("books", "0003_book_cover"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="book",
            name="author_name",
        ),
        migrations.RemoveField(
            model_name="book",
            name="author_openlibrary_id",
        ),
        migrations.AddField(
            model_name="author",
            name="headshot",
            field=models.ImageField(
                blank=True, null=True, upload_to="books/authors/"
            ),
        ),
        migrations.CreateModel(
            name="Page",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "created",
                    django_extensions.db.fields.CreationDateTimeField(
                        auto_now_add=True, verbose_name="created"
                    ),
                ),
                (
                    "modified",
                    django_extensions.db.fields.ModificationDateTimeField(
                        auto_now=True, verbose_name="modified"
                    ),
                ),
                ("number", models.IntegerField()),
                ("start_time", models.DateTimeField()),
                ("duration_seconds", models.IntegerField()),
                (
                    "book",
                    # Deleting a book deletes its pages.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="books.book",
                    ),
                ),
            ],
            options={
                # A page number may appear only once per book.
                "unique_together": {("book", "number")},
            },
        ),
    ]

View File

@ -0,0 +1,21 @@
# Generated by Django 4.1.5 on 2023-03-06 16:34
from django.db import migrations, models
import uuid
class Migration(migrations.Migration):
    """Add an optional, non-editable ``uuid`` field to Author."""

    dependencies = [
        ("books", "0004_remove_book_author_name_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="author",
            name="uuid",
            # NOTE(review): with a callable default on AddField, existing
            # rows presumably all receive the same generated value - confirm
            # whether pre-existing authors need distinct UUIDs.
            field=models.UUIDField(
                blank=True, default=uuid.uuid4, editable=False, null=True
            ),
        ),
    ]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.1.5 on 2023-03-06 17:13
from django.db import migrations, models
class Migration(migrations.Migration):
    """Make Page.duration_seconds and Page.start_time optional (blank/null)."""

    dependencies = [
        ("books", "0005_author_uuid"),
    ]

    operations = [
        migrations.AlterField(
            model_name="page",
            name="duration_seconds",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name="page",
            name="start_time",
            field=models.DateTimeField(blank=True, null=True),
        ),
    ]

View File

@ -0,0 +1,48 @@
# Generated by Django 4.1.5 on 2023-03-06 17:44
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add optional bio and external-identifier fields to Author.

    Every added field allows blank and null; all are 255-character
    CharFields except ``bio``, which is a TextField.
    """

    dependencies = [
        ("books", "0006_alter_page_duration_seconds_alter_page_start_time"),
    ]

    operations = [
        migrations.AddField(
            model_name="author",
            name="amazon_id",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="bio",
            field=models.TextField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="goodreads_id",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="isni",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="librarything_id",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="wikidata_id",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name="author",
            name="wikipedia_url",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
    ]

View File

@ -0,0 +1,21 @@
# Generated by Django 4.1.5 on 2023-03-06 18:42
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add an optional ``first_sentence`` field (max 255 chars) to Book."""

    dependencies = [
        (
            "books",
            "0007_author_amazon_id_author_bio_author_goodreads_id_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="book",
            name="first_sentence",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
    ]

View File

@ -1,7 +1,11 @@
import logging
from uuid import uuid4
import requests
from books.openlibrary import lookup_book_from_openlibrary
from books.openlibrary import (
lookup_author_from_openlibrary,
lookup_book_from_openlibrary,
)
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
@ -18,17 +22,44 @@ BNULL = {"blank": True, "null": True}
class Author(TimeStampedModel):
    """A book author, with optional metadata and external identifiers."""

    name = models.CharField(max_length=255)
    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    headshot = models.ImageField(upload_to="books/authors/", **BNULL)
    bio = models.TextField(**BNULL)
    wikipedia_url = models.CharField(max_length=255, **BNULL)
    isni = models.CharField(max_length=255, **BNULL)
    wikidata_id = models.CharField(max_length=255, **BNULL)
    goodreads_id = models.CharField(max_length=255, **BNULL)
    librarything_id = models.CharField(max_length=255, **BNULL)
    amazon_id = models.CharField(max_length=255, **BNULL)

    def __str__(self):
        return f"{self.name}"

    def fix_metadata(self, data_dict: "dict | None" = None):
        """Update this author's fields and headshot from a metadata dict.

        Args:
            data_dict: Field values to write to this author, optionally with
                an ``author_headshot_url`` key. When empty or omitted and the
                author has an ``openlibrary_id``, the data is fetched with
                ``lookup_author_from_openlibrary``.

        Does nothing when no data is available or the data has no ``name``.
        The passed-in dict is never modified.
        """
        if not data_dict and self.openlibrary_id:
            data_dict = lookup_author_from_openlibrary(self.openlibrary_id)

        if not data_dict or not data_dict.get("name"):
            return

        # Work on a copy so popping the headshot URL does not alter the
        # caller's dict.
        field_values = dict(data_dict)
        headshot_url = field_values.pop("author_headshot_url", "")

        Author.objects.filter(pk=self.id).update(**field_values)
        self.refresh_from_db()

        if headshot_url:
            r = requests.get(headshot_url)
            if r.status_code == 200:
                fname = f"{self.name}_{self.uuid}.jpg"
                self.headshot.save(fname, ContentFile(r.content), save=True)
class Book(ScrobblableMixin):
COMPLETION_PERCENT = getattr(settings, "BOOK_COMPLETION_PERCENT", 95)
AVG_PAGE_READING_SECONDS = getattr(
settings, "AVERAGE_PAGE_READING_SECONDS", 60
)
title = models.CharField(max_length=255)
authors = models.ManyToManyField(Author)
@ -40,8 +71,7 @@ class Book(ScrobblableMixin):
pages = models.IntegerField(**BNULL)
language = models.CharField(max_length=4, **BNULL)
first_publish_year = models.IntegerField(**BNULL)
author_name = models.CharField(max_length=255, **BNULL)
author_openlibrary_id = models.CharField(max_length=255, **BNULL)
first_sentence = models.CharField(max_length=255, **BNULL)
openlibrary_id = models.CharField(max_length=255, **BNULL)
cover = models.ImageField(upload_to="books/covers/", **BNULL)
@ -52,13 +82,47 @@ class Book(ScrobblableMixin):
if not self.openlibrary_id or force_update:
book_dict = lookup_book_from_openlibrary(self.title, self.author)
cover_url = book_dict.pop("cover_url")
Book.objects.filter(pk=self.id).update(**book_dict)
cover_url = book_dict.pop("cover_url", "")
ol_author_id = book_dict.pop("ol_author_id", "")
ol_author_name = book_dict.pop("ol_author_name", "")
# Don't want to overwrite KoReader's pages if OL is ignorant
if not book_dict.get("pages"):
book_dict.pop("pages")
r = requests.get(cover_url)
if r.status_code == 200:
fname = f"{self.title}_{self.uuid}.jpg"
self.cover.save(fname, ContentFile(r.content), save=True)
ol_title = book_dict.get("title")
if ol_title.lower() != self.title.lower():
logger.warn(
f"OL and KoReader disagree on this book title {self.title} != {ol_title}"
)
return
Book.objects.filter(pk=self.id).update(**book_dict)
self.refresh_from_db()
# Process authors
author = None
author_created = False
if ol_author_id:
author, author_created = Author.objects.get_or_create(
openlibrary_id=ol_author_id
)
if author_created or force_update:
author.fix_metadata()
if not author and ol_author_name:
author, author_created = Author.objects.get_or_create(
name=ol_author_name
)
self.authors.add(author)
if cover_url:
r = requests.get(cover_url)
if r.status_code == 200:
fname = f"{self.title}_{self.uuid}.jpg"
self.cover.save(fname, ContentFile(r.content), save=True)
if self.pages:
self.run_time = self.pages * int(self.AVG_PAGE_READING_SECONDS)
self.run_time_ticks = int(self.run_time) * 1000
self.save()
@ -76,6 +140,11 @@ class Book(ScrobblableMixin):
return 0
return int(self.pages * (self.COMPLETION_PERCENT / 100))
def update_long_play_seconds(self):
"""Check page timestamps and duration and update"""
if self.page_set.all():
...
def progress_for_user(self, user_id: int) -> int:
"""Used to keep track of whether the book is complete or not"""
user = User.objects.get(id=user_id)
@ -86,6 +155,22 @@ class Book(ScrobblableMixin):
def find_or_create(cls, data_dict: dict) -> "Game":
from books.utils import get_or_create_book
return get_or_create_book(
return update_or_create_book(
data_dict.get("title"), data_dict.get("author")
)
class Page(TimeStampedModel):
    """One numbered page of a Book, with an optional start time and duration."""

    book = models.ForeignKey(Book, on_delete=models.CASCADE)
    number = models.IntegerField()
    start_time = models.DateTimeField(**BNULL)
    duration_seconds = models.IntegerField(**BNULL)

    class Meta:
        # A page number may appear only once per book.
        unique_together = ("book", "number")

    def __str__(self):
        parent = self.book
        return "Page {} of {} in {}".format(
            self.number, parent.pages, parent.title
        )

View File

@ -1,6 +1,8 @@
import json
import requests
import logging
import urllib
import requests
logger = logging.getLogger(__name__)
@ -8,6 +10,8 @@ SEARCH_URL = "https://openlibrary.org/search.json?title={title}"
ISBN_URL = "https://openlibrary.org/isbn/{isbn}.json"
SEARCH_URL = "https://openlibrary.org/search.json?title={title}"
COVER_URL = "https://covers.openlibrary.org/b/olid/{id}-L.jpg"
AUTHOR_URL = "https://openlibrary.org/authors/{id}.json"
AUTHOR_IMAGE_URL = "https://covers.openlibrary.org/a/olid/{id}-L.jpg"
def get_first(key: str, result: dict) -> str:
@ -17,8 +21,43 @@ def get_first(key: str, result: dict) -> str:
return obj
def lookup_author_from_openlibrary(olid: str) -> dict:
    """Fetch an author's metadata from OpenLibrary.

    Args:
        olid: The author's OpenLibrary ID, substituted into ``AUTHOR_URL``.

    Returns:
        A dict with the keys ``name``, ``wikipedia_url``, ``wikidata_id``,
        ``isni``, ``goodreads_id``, ``librarything_id``, ``amazon_id``,
        ``bio`` and ``author_headshot_url``. An empty dict is returned when
        the response status is not 200 or the response body is empty.
    """
    author_url = AUTHOR_URL.format(id=olid)
    response = requests.get(author_url)

    if response.status_code != 200:
        logger.warning(f"Bad response from OL: {response.status_code}")
        return {}

    results = json.loads(response.content)
    if not results:
        logger.warning(f"No author results found from OL for {olid}")
        return {}

    # Tolerate an explicit null as well as a missing key.
    remote_ids = results.get("remote_ids") or {}

    # "bio" is handled both as a plain value and as an object carrying the
    # text under "value".
    bio = results.get("bio") or ""
    if isinstance(bio, dict):
        bio = bio.get("value")

    return {
        "name": results.get("name"),
        "wikipedia_url": results.get("wikipedia"),
        "wikidata_id": remote_ids.get("wikidata"),
        "isni": remote_ids.get("isni"),
        "goodreads_id": remote_ids.get("goodreads"),
        "librarything_id": remote_ids.get("librarything"),
        "amazon_id": remote_ids.get("amazon"),
        "bio": bio,
        "author_headshot_url": AUTHOR_IMAGE_URL.format(id=olid),
    }
def lookup_book_from_openlibrary(title: str, author: str = None) -> dict:
search_url = SEARCH_URL.format(title=title)
title_quoted = urllib.parse.quote(title)
search_url = SEARCH_URL.format(title=title_quoted)
response = requests.get(search_url)
if response.status_code != 200:
@ -37,14 +76,22 @@ def lookup_book_from_openlibrary(title: str, author: str = None) -> dict:
f"Lookup for {title} found top result with mismatched author"
)
ol_id = top.get("cover_edition_key")
author_ol_id = get_first("author_key", top)
first_sentence = ""
if top.get("first_sentence"):
try:
first_sentence = top.get("first_sentence")[0].get("value")
except AttributeError:
first_sentence = top.get("first_sentence")[0]
print(top)
return {
"title": top.get("title"),
"isbn": top.get("isbn")[0],
"openlibrary_id": top.get("cover_edition_key"),
"author_name": get_first("author_name", top),
"author_openlibrary_id": get_first("author_key", top),
"openlibrary_id": ol_id,
"goodreads_id": get_first("id_goodreads", top),
"first_publish_year": top.get("first_publish_year"),
"pages": top.get("number_of_pages_median"),
"first_sentence": first_sentence,
"pages": top.get("number_of_pages_median", None),
"cover_url": COVER_URL.format(id=ol_id),
"ol_author_id": author_ol_id,
}

View File

@ -2,22 +2,44 @@ from django.core.files.base import ContentFile
import requests
from typing import Optional
from books.openlibrary import lookup_book_from_openlibrary
from books.models import Book
from books.models import Author, Book
def get_or_create_book(title: str, author: Optional[str] = None) -> Book:
def update_or_create_book(
title: str, author: Optional[str] = None, force_update=False
) -> Book:
book_dict = lookup_book_from_openlibrary(title, author)
book, book_created = Book.objects.get_or_create(
isbn=book_dict.get("isbn"),
)
if book_created:
if book_created or force_update:
cover_url = book_dict.pop("cover_url")
ol_author_id = book_dict.pop("ol_author_id")
ol_author_name = book_dict.pop("ol_author_name")
author_image_url = book_dict.pop("author_image_url")
Book.objects.filter(pk=book.id).update(**book_dict)
book.refresh_from_db()
# Process authors
if ol_author_name and ol_author_id:
author, author_created = Author.objects.get_or_create(
name=ol_author_name
)
if author_created or force_update:
author.openlibrary_id = ol_author_id
author.save()
r = requests.get(author_image_url)
if r.status_code == 200:
fname = f"{author.name}_{author.uuid}.jpg"
author.headshot.save(
fname, ContentFile(r.content), save=True
)
# Process cover URL
r = requests.get(cover_url)
if r.status_code == 200:
fname = f"{book.title}_{book.uuid}.jpg"

View File

@ -71,10 +71,12 @@ class Artist(TimeStampedModel):
self.theaudiodb_genre = tadb_info["genre"]
self.theaudiodb_mood = tadb_info["mood"]
r = requests.get(tadb_info.get("thumb_url", ""))
if r.status_code == 200:
fname = f"{self.name}_{self.uuid}.jpg"
self.thumbnail.save(fname, ContentFile(r.content), save=True)
thumb_url = tadb_info.get("thumb_url", "")
if thumb_url:
r = requests.get(thumb_url)
if r.status_code == 200:
fname = f"{self.name}_{self.uuid}.jpg"
self.thumbnail.save(fname, ContentFile(r.content), save=True)
@property
def rym_link(self):