[books] Add papers as a data model

This commit is contained in:
2025-02-20 23:07:56 -05:00
parent e2b0decd83
commit 9e3f714c61
11 changed files with 944 additions and 548 deletions

1028
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -12,6 +12,7 @@ python-dateutil = "^2.8.2"
python-dotenv = "^0.20.0"
python-json-logger = "^2.0.2"
colorlog = "^6.6.0"
httpx = "<=0.27.2"
djangorestframework = "^3.13.1"
Markdown = "^3.3.6"
django-filter = "^21.1"

View File

@ -0,0 +1,136 @@
# Generated by Django 4.2.19 on 2025-02-18 05:11
from django.db import migrations, models
import django_extensions.db.fields
import taggit.managers
import uuid
# Schema migration for the books app: drops three identifier columns from
# Author, adds Author.semantic_id, and creates the new Paper model.
class Migration(migrations.Migration):
# Migrations that must be applied first. NOTE(review): the scrobbles entry is
# presumably required because Paper.genre below points at scrobbles.Genre
# through scrobbles.ObjectWithGenres -- confirm.
dependencies = [
("scrobbles", "0067_scrobble_food_alter_scrobble_media_type"),
("books", "0024_book_publisher"),
]
operations = [
# Remove the amazon_id, librarything_id and locg_slug fields from Author.
migrations.RemoveField(
model_name="author",
name="amazon_id",
),
migrations.RemoveField(
model_name="author",
name="librarything_id",
),
migrations.RemoveField(
model_name="author",
name="locg_slug",
),
# Add an optional (blank/null) semantic_id to Author.
migrations.AddField(
model_name="author",
name="semantic_id",
field=models.CharField(blank=True, max_length=50, null=True),
),
# Create the Paper table.
migrations.CreateModel(
name="Paper",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
# Creation / modification timestamps maintained automatically
# (auto_now_add / auto_now).
(
"created",
django_extensions.db.fields.CreationDateTimeField(
auto_now_add=True, verbose_name="created"
),
),
(
"modified",
django_extensions.db.fields.ModificationDateTimeField(
auto_now=True, verbose_name="modified"
),
),
# Non-editable UUID defaulting to a fresh uuid4 per row.
(
"uuid",
models.UUIDField(
blank=True,
default=uuid.uuid4,
editable=False,
null=True,
),
),
# Run time defaults to 900 seconds.
("run_time_seconds", models.IntegerField(default=900)),
(
"run_time_ticks",
models.PositiveBigIntegerField(blank=True, null=True),
),
# title and semantic_title are the only required text columns here.
("title", models.CharField(max_length=255)),
("semantic_title", models.CharField(max_length=255)),
(
"koreader_data_by_hash",
models.JSONField(blank=True, null=True),
),
# Optional external identifiers, each capped at 50 characters.
(
"semantic_id",
models.CharField(blank=True, max_length=50, null=True),
),
(
"arxiv_id",
models.CharField(blank=True, max_length=50, null=True),
),
(
"corpus_id",
models.CharField(blank=True, max_length=50, null=True),
),
(
"doi_id",
models.CharField(blank=True, max_length=50, null=True),
),
# Optional bibliographic metadata.
("pages", models.IntegerField(blank=True, null=True)),
(
"language",
models.CharField(blank=True, max_length=4, null=True),
),
(
"first_publish_year",
models.IntegerField(blank=True, null=True),
),
("publish_date", models.DateField(blank=True, null=True)),
(
"journal",
models.CharField(blank=True, max_length=255, null=True),
),
(
"journal_volume",
models.CharField(blank=True, max_length=50, null=True),
),
("abstract", models.TextField(blank=True, null=True)),
("num_citations", models.IntegerField(blank=True, null=True)),
(
"openaccess_pdf_url",
models.CharField(blank=True, max_length=255, null=True),
),
# Many-to-many link to the existing books.Author model.
(
"authors",
models.ManyToManyField(blank=True, to="books.author"),
),
# Tagging via django-taggit, using the scrobbles app's Genre tag
# model and ObjectWithGenres through model.
(
"genre",
taggit.managers.TaggableManager(
help_text="A comma-separated list of tags.",
through="scrobbles.ObjectWithGenres",
to="scrobbles.Genre",
verbose_name="Tags",
),
),
],
# "abstract" here is the model Meta option (a concrete table is created),
# unrelated to the "abstract" text field above.
options={
"abstract": False,
},
),
]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.19 on 2025-02-18 05:28
from django.db import migrations, models
# Schema migration for the books app: makes Paper.semantic_title optional.
class Migration(migrations.Migration):
# Runs after the migration that created the Paper model.
dependencies = [
("books", "0025_remove_author_amazon_id_and_more"),
]
operations = [
# Redefine semantic_title as a 255-character CharField that allows
# blank and NULL values.
migrations.AlterField(
model_name="paper",
name="semantic_title",
field=models.CharField(blank=True, max_length=255, null=True),
),
]

View File

@ -0,0 +1,22 @@
# Generated by Django 4.2.19 on 2025-02-18 05:33
from django.db import migrations, models
# Schema migration for the books app: replaces Paper.num_citations with an
# optional Paper.tldr text column.
class Migration(migrations.Migration):
dependencies = [
("books", "0026_alter_paper_semantic_title"),
]
operations = [
# Drop the num_citations column from Paper.
migrations.RemoveField(
model_name="paper",
name="num_citations",
),
# Add tldr as an optional CharField limited to 255 characters.
migrations.AddField(
model_name="paper",
name="tldr",
field=models.CharField(blank=True, max_length=255, null=True),
),
]

View File

@ -36,6 +36,7 @@ from vrobbler.apps.books.locg import (
lookup_comic_writer_by_locg_slug,
)
from vrobbler.apps.books.sources.google import lookup_book_from_google
from vrobbler.apps.books.sources.semantic import lookup_paper_from_semantic
from vrobbler.apps.scrobbles.dataclasses import BookLogData
COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")
@ -64,22 +65,27 @@ class Author(TimeStampedModel):
)
bio = models.TextField(**BNULL)
wikipedia_url = models.CharField(max_length=255, **BNULL)
isni = models.CharField(max_length=255, **BNULL)
locg_slug = models.CharField(max_length=255, **BNULL)
wikidata_id = models.CharField(max_length=255, **BNULL)
isni = models.CharField(max_length=255, **BNULL)
goodreads_id = models.CharField(max_length=255, **BNULL)
librarything_id = models.CharField(max_length=255, **BNULL)
comicvine_data = models.JSONField(**BNULL)
amazon_id = models.CharField(max_length=255, **BNULL)
semantic_id = models.CharField(max_length=50, **BNULL)
def __str__(self):
return f"{self.name}"
def fix_metadata(self, data_dict: dict = {}):
if not data_dict and self.openlibrary_id:
data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
def enrich_from_semantic(self, overwrite=False):
...
def enrich_from_google_books(self, overwrite=False):
...
def enrich_from_openlibrary(self, overwrite=False):
data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
if not data_dict or not data_dict.get("name"):
logger.warning("Could not find author on openlibrary")
return
headshot_url = data_dict.pop("author_headshot_url", "")
@ -160,26 +166,32 @@ class Book(LongPlayScrobblableMixin):
if not created and not overwrite:
return book
bdict, authors, cover, genres = lookup_book_from_google(
title
).as_dict_with_authors_cover_and_genres()
book_dict = lookup_book_from_google(title)
if created or overwrite:
author_list = []
authors = book_dict.pop("authors")
cover_url = book_dict.pop("cover_url")
genres = book_dict.pop("generes")
if authors:
for author_str in authors:
if author_str:
author_list.append(
Author.objects.get_or_create(name=author_str)[0]
author, a_created = Author.objects.get_or_create(
name=author_str
)
author_list.append(author)
if a_created:
# TODO enrich author
...
for k, v in bdict.items():
for k, v in book_dict.items():
setattr(book, k, v)
book.save()
book.save()
book.save_image_from_url(cover)
book.genre.add(*genres)
book.authors.add(*author_list)
book.save_image_from_url(cover_url)
book.genre.add(*genres)
book.authors.add(*author_list)
return book
def save_image_from_url(self, url: str, force_update: bool = False):
@ -474,3 +486,68 @@ class Page(TimeStampedModel):
seconds=self.duration_seconds
)
self.save(update_fields=["end_time"])
class Paper(LongPlayScrobblableMixin):
    """Keeps track of Academic Papers"""

    COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    semantic_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    # External identifiers (arxiv_id was previously declared twice; the
    # duplicate declaration has been dropped, the field itself is unchanged).
    semantic_id = models.CharField(max_length=50, **BNULL)
    arxiv_id = models.CharField(max_length=50, **BNULL)
    corpus_id = models.CharField(max_length=50, **BNULL)
    doi_id = models.CharField(max_length=50, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    journal = models.CharField(max_length=255, **BNULL)
    journal_volume = models.CharField(max_length=50, **BNULL)
    abstract = models.TextField(**BNULL)
    tldr = models.CharField(max_length=255, **BNULL)
    openaccess_pdf_url = models.CharField(max_length=255, **BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    @classmethod
    def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
        """Return the Paper for ``title``, enriching it from Semantic Scholar.

        The paper is looked up (or created) by exact title. Metadata is
        fetched only when the paper was just created or ``overwrite`` is
        true; otherwise the existing row is returned untouched.

        Args:
            title: Title used both as the local lookup key and search query.
            overwrite: Re-fetch and overwrite metadata on an existing paper.

        Returns:
            The saved ``Paper`` instance.
        """
        paper, created = cls.objects.get_or_create(title=title)
        if not created and not overwrite:
            return paper

        paper_dict = lookup_paper_from_semantic(title)

        # Pull the non-field entries out before copying values onto the
        # model. Defaults guard against a failed lookup, which returns a
        # dict without these keys.
        author_dicts = paper_dict.pop("author_dicts", None) or []
        genres = paper_dict.pop("genres", None) or []

        author_list = []
        for author_dict in author_dicts:
            semantic_author_id = author_dict.get("authorId")
            if not semantic_author_id:
                continue
            # The name is only applied when the author row is created, so
            # existing authors keep whatever name they already have.
            author, _ = Author.objects.get_or_create(
                semantic_id=semantic_author_id,
                defaults={"name": author_dict.get("name")},
            )
            author_list.append(author)
            # TODO enrich author?

        for k, v in paper_dict.items():
            setattr(paper, k, v)
        paper.save()

        if author_list:
            paper.authors.add(*author_list)
        if genres:
            paper.genre.add(*genres)

        return paper

View File

@ -3,7 +3,6 @@ import logging
import pendulum
import requests
from books.metadata import BookMetadata
from django.conf import settings
API_KEY = settings.GOOGLE_API_KEY
@ -14,8 +13,8 @@ GOOGLE_BOOKS_URL = (
logger = logging.getLogger(__name__)
def lookup_book_from_google(title: str) -> BookMetadata:
book_metadata = BookMetadata(title=title)
def lookup_book_from_google(title: str) -> dict:
book_dict = {"title": title}
url = GOOGLE_BOOKS_URL.format(title=title, key=API_KEY)
headers = {"User-Agent": "Vrobbler 0.11.12"}
@ -25,7 +24,7 @@ def lookup_book_from_google(title: str) -> BookMetadata:
logger.warning(
"Bad response from Google", extra={"response": response}
)
return book_metadata
return book_dict
google_result = (
json.loads(response.content).get("items", [{}])[0].get("volumeInfo")
@ -39,30 +38,32 @@ def lookup_book_from_google(title: str) -> BookMetadata:
isbn_13 = ident.get("identifier")
if ident.get("type") == "ISBN_10":
isbn_10 = ident.get("identifier")
book_metadata.title = google_result.get("title")
if google_result.get("subtitle"):
book_metadata.title = ": ".join(
[google_result.get("title"), google_result.get("subtitle")]
)
book_metadata.authors = google_result.get("authors")
book_metadata.publisher = google_result.get("publisher")
book_metadata.first_publish_year = publish_date.year
book_metadata.pages = google_result.get("pageCount")
book_metadata.isbn_13 = isbn_13
book_metadata.isbn_10 = isbn_10
book_metadata.publish_date = google_result.get("publishedDate")
book_metadata.language = google_result.get("language")
book_metadata.summary = google_result.get("description")
book_metadata.genres = google_result.get("categories")
book_metadata.cover_url = (
# TODO this may lead to issues with the first get if Google changes our title
# book_metadata.title = google_result.get("title")
# if google_result.get("subtitle"):
# book_metadata["title"] = ": ".join(
# [google_result.get("title"), google_result.get("subtitle")]
# )
book_dict["subtitle"] = google_result.get("subtitle")
book_dict["authors"] = google_result.get("authors")
book_dict["publisher"] = google_result.get("publisher")
book_dict["first_publish_year"] = publish_date.year
book_dict["pages"] = google_result.get("pageCount")
book_dict["isbn_13"] = isbn_13
book_dict["isbn_10"] = isbn_10
book_dict["publish_date"] = google_result.get("publishedDate")
book_dict["language"] = google_result.get("language")
book_dict["summary"] = google_result.get("description")
book_dict["genres"] = google_result.get("categories")
book_dict["cover_url"] = (
google_result.get("imageLinks", {})
.get("thumbnail")
.replace("zoom=1", "zoom=15")
.replace("&edge=curl", "")
)
book_metadata.run_time_seconds = book_metadata.pages * getattr(
book_dict["run_time_seconds"] = book_dict.get("pages", 10) * getattr(
settings, "AVERAGE_PAGE_READING_SECONDS", 60
)
return book_metadata
return book_dict

View File

@ -0,0 +1,73 @@
import json
import logging
from datetime import datetime
import requests
from django.conf import settings
PAPER_SEARCH_URL = (
"https://api.semanticscholar.org/graph/v1/paper/search/match?query={}"
)
PAPER_DETAIL_URL = "https://api.semanticscholar.org/graph/v1/paper/{}?fields=title,authors,url,year,abstract,externalIds,citationCount,referenceCount,journal,fieldsOfStudy,publicationDate,openAccessPdf"
logger = logging.getLogger(__name__)
def get_api_result(url, timeout=10):
    """GET ``url`` and return the response only if it succeeded.

    Args:
        url: Fully formatted URL to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The ``requests.Response`` when the status code is 200, otherwise
        ``None`` (non-200 status, timeout, or connection failure).
    """
    headers = {"User-Agent": "Vrobbler 0.11.12"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat transport errors like any other failed lookup: callers
        # already handle a ``None`` result.
        logger.warning("Could not reach Semantic", exc_info=True)
        return None

    if response.status_code != 200:
        logger.warning(
            "Bad response from Semantic", extra={"response": response}
        )
        return None
    return response
def lookup_paper_from_semantic(title: str) -> dict:
    """Look up a paper by title and return its metadata as a dict.

    Runs a title-match search, then fetches the details of the first match.

    Args:
        title: Paper title to search for.

    Returns:
        A dict that always contains ``title``, ``author_dicts`` and
        ``genres``. When the lookup succeeds it also carries the identifier,
        publication and journal values read from the API response. When any
        request fails or nothing matches, only the three base keys are
        present (with empty lists for authors and genres).
    """
    from urllib.parse import quote

    paper_dict = {"title": title, "author_dicts": [], "genres": []}

    # Percent-encode the title so characters such as "&" or "#" do not
    # terminate the query string early.
    response = get_api_result(PAPER_SEARCH_URL.format(quote(title, safe="")))
    if not response:
        return paper_dict

    matches = json.loads(response.content).get("data") or []
    semantic_id = matches[0].get("paperId") if matches else None
    if not semantic_id:
        logger.warning(
            "No paper match found on Semantic", extra={"paper_title": title}
        )
        return paper_dict

    response = get_api_result(PAPER_DETAIL_URL.format(semantic_id))
    if not response:
        return paper_dict
    result = json.loads(response.content)

    # Nested objects may be present but null in the payload, so fall back
    # to an empty dict instead of relying on ``.get(key, {})``.
    journal = result.get("journal") or {}
    external_ids = result.get("externalIds") or {}
    open_access_pdf = result.get("openAccessPdf") or {}
    bib = result.get("bib") or {}

    page_str = journal.get("pages")
    if page_str:
        # Accept "12 - 34", "12-34" and en-dash ranges alike.
        start_str, _, end_str = (
            str(page_str).replace("\u2013", "-").partition("-")
        )
        try:
            # Page ranges are inclusive, hence the +1.
            page_count = int(end_str.strip()) - int(start_str.strip()) + 1
        except ValueError:
            page_count = 0
        if page_count > 0:
            paper_dict["pages"] = page_count

    paper_dict["semantic_id"] = result.get("paperId")
    paper_dict["doi_id"] = external_ids.get("DOI")
    paper_dict["arxiv_id"] = external_ids.get("ArXiv")
    paper_dict["pubmed_id"] = external_ids.get("PubMed")
    paper_dict["corpus_id"] = external_ids.get("CorpusId")
    paper_dict["semantic_title"] = result.get("title")
    paper_dict["first_publish_year"] = result.get("year")
    # A missing or null publication date keeps the historical placeholder.
    paper_dict["publish_date"] = datetime.strptime(
        result.get("publicationDate") or "1950-01-01", "%Y-%m-%d"
    )
    paper_dict["abstract"] = result.get("abstract")
    paper_dict["tldr"] = bib.get("abstract")
    paper_dict["journal"] = journal.get("name")
    paper_dict["journal_volume"] = journal.get("volume")
    paper_dict["openaccess_pdf_url"] = open_access_pdf.get("url")

    # Without a known page count, assume 10 pages for the reading estimate.
    paper_dict["run_time_seconds"] = paper_dict.get("pages", 10) * getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    paper_dict["author_dicts"] = result.get("authors") or []
    paper_dict["genres"] = result.get("fieldsOfStudy") or []

    return paper_dict

View File

@ -20,7 +20,9 @@ class ScrobbleInline(admin.TabularInline):
"track",
"video_game",
"book",
"paper",
"sport_event",
"food",
"board_game",
"geo_location",
"task",

View File

@ -0,0 +1,52 @@
# Generated by Django 4.2.19 on 2025-02-18 05:27
from django.db import migrations, models
import django.db.models.deletion
# Schema migration for the scrobbles app: links Scrobble to the Paper model
# and adds "Paper" to the allowed media types.
class Migration(migrations.Migration):
# books 0025 is the migration that creates the Paper model referenced by the
# new foreign key below.
dependencies = [
("books", "0025_remove_author_amazon_id_and_more"),
("scrobbles", "0067_scrobble_food_alter_scrobble_media_type"),
]
operations = [
# Optional foreign key from Scrobble to books.Paper; DO_NOTHING means
# Django takes no action on the scrobble when the paper is deleted.
migrations.AddField(
model_name="scrobble",
name="paper",
field=models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.DO_NOTHING,
to="books.paper",
),
),
# Re-declare media_type with the full choice list, now including "Paper".
migrations.AlterField(
model_name="scrobble",
name="media_type",
field=models.CharField(
choices=[
("Video", "Video"),
("Track", "Track"),
("PodcastEpisode", "Podcast episode"),
("SportEvent", "Sport event"),
("Book", "Book"),
("Paper", "Paper"),
("VideoGame", "Video game"),
("BoardGame", "Board game"),
("GeoLocation", "GeoLocation"),
("Trail", "Trail"),
("Beer", "Beer"),
("Food", "Food"),
("Task", "Task"),
("WebPage", "Web Page"),
("LifeEvent", "Life event"),
("Mood", "Mood"),
("BrickSet", "Brick set"),
],
default="Video",
# 14 matches the longest stored value above ("PodcastEpisode").
max_length=14,
),
),
]

View File

@ -10,7 +10,7 @@ import pytz
from beers.models import Beer
from boardgames.models import BoardGame
from books.koreader import process_koreader_sqlite_file
from books.models import Book
from books.models import Book, Paper
from bricksets.models import BrickSet
from django.conf import settings
from django.contrib.auth import get_user_model
@ -502,6 +502,7 @@ class Scrobble(TimeStampedModel):
PODCAST_EPISODE = "PodcastEpisode", "Podcast episode"
SPORT_EVENT = "SportEvent", "Sport event"
BOOK = "Book", "Book"
PAPER = "Paper", "Paper"
VIDEO_GAME = "VideoGame", "Video game"
BOARD_GAME = "BoardGame", "Board game"
GEO_LOCATION = "GeoLocation", "GeoLocation"
@ -524,6 +525,7 @@ class Scrobble(TimeStampedModel):
SportEvent, on_delete=models.DO_NOTHING, **BNULL
)
book = models.ForeignKey(Book, on_delete=models.DO_NOTHING, **BNULL)
paper = models.ForeignKey(Paper, on_delete=models.DO_NOTHING, **BNULL)
video_game = models.ForeignKey(
VideoGame, on_delete=models.DO_NOTHING, **BNULL
)