Compare commits

...

3 Commits
58.1 ... 58.2

Author SHA1 Message Date
5e0dffdc7a [release] Bump to version 58.2
Some checks failed
build / test (push) Successful in 2m5s
deploy / test (push) Successful in 2m3s
deploy / build-and-deploy (push) Failing after 25s
- Add more robust webpage scraping
- Time of Day Categories trend
2026-06-23 23:04:48 -04:00
2283a6c640 [webpages] Add more robust scraping 2026-06-23 23:04:30 -04:00
327ba94c63 [trends] Add new time of day trend
All checks were successful
build / test (push) Successful in 2m4s
2026-06-23 22:21:18 -04:00
10 changed files with 286 additions and 9 deletions

View File

@ -604,6 +604,23 @@ independent of the email flow it was originally created for
** TODO [#B] Is there a way to create unique slugs for media instances :media_types:
* Version 58.2 [2/2]
** DONE [#B] Add more robust webpage scraping :webpages:metadata:
:PROPERTIES:
:ID: 84d9bfa5-75c0-0718-764e-379f7456602a
:END:
** DONE [#B] Time of Day Categories trend :trends:
:PROPERTIES:
:ID: 6598074f-2290-46db-967b-29f45d30be29
:END:
*** Description
Added a "Time of Day Categories" trend that groups scrobbles for Books, Trails,
Birding Locations, and Board Games into Early Bird (5-10:59am), Day Jay (11am-6:59pm),
and Night Owl (7pm-4:59am) buckets. Shows both overall and per-media-type breakdowns.
* Version 58.1 [1/1]
** DONE [#B] Add auto genre tagging for papers :books:papers:metadata:
:PROPERTIES:

70
poetry.lock generated
View File

@ -654,7 +654,6 @@ description = "Foreign Function Interface for Python calling C code."
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\""
files = [
{file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"},
{file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"},
@ -967,6 +966,23 @@ prompt-toolkit = ">=3.0.36"
[package.extras]
testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"]
[[package]]
name = "cloudscraper"
version = "1.2.71"
description = "A Python module to bypass Cloudflare's anti-bot page."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
{file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
]
[package.dependencies]
pyparsing = ">=2.4.7"
requests = ">=2.9.2"
requests-toolbelt = ">=0.9.1"
[[package]]
name = "colorama"
version = "0.4.6"
@ -1221,6 +1237,48 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["certifi (>=2024)", "cryptography-vectors (==46.0.6)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "curl-cffi"
version = "0.15.0"
description = "libcurl ffi bindings for Python, with impersonation support."
optional = false
python-versions = ">=3.10"
groups = ["main"]
files = [
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28"},
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b"},
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13"},
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61"},
{file = "curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572"},
{file = "curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d"},
{file = "curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92"},
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d"},
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3"},
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5"},
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805"},
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce"},
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f"},
{file = "curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa"},
{file = "curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3"},
{file = "curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded"},
]
[package.dependencies]
certifi = ">=2024.2.2"
cffi = ">=2.0.0"
rich = "*"
[package.extras]
build = ["cibuildwheel", "wheel"]
dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"]
test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "litestar (>=2.19.0,<3.0)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
[[package]]
name = "dacite"
version = "1.9.2"
@ -2872,7 +2930,7 @@ version = "4.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
optional = false
python-versions = ">=3.10"
groups = ["test"]
groups = ["main", "test"]
files = [
{file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"},
{file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"},
@ -2968,7 +3026,7 @@ version = "0.1.2"
description = "Markdown URL utilities"
optional = false
python-versions = ">=3.7"
groups = ["test"]
groups = ["main", "test"]
files = [
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
@ -4170,7 +4228,7 @@ description = "C parser in Python"
optional = false
python-versions = ">=3.10"
groups = ["main"]
markers = "(platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\") and implementation_name != \"PyPy\""
markers = "implementation_name != \"PyPy\""
files = [
{file = "pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992"},
{file = "pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29"},
@ -5349,7 +5407,7 @@ version = "14.3.3"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.8.0"
groups = ["test"]
groups = ["main", "test"]
files = [
{file = "rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d"},
{file = "rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b"},
@ -6812,4 +6870,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
[metadata]
lock-version = "2.1"
python-versions = ">=3.11,<3.15"
content-hash = "3081efc33689e0cb1e2cc12709f2fbfd84a084ee4a083eebba80db2eb80fb0a6"
content-hash = "8b1c608beddabe6884d4c2614170ccd71986daccb7db3944cb36531937fdee4e"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "vrobbler"
version = "58.1"
version = "58.2"
description = ""
authors = ["Colin Powell <colin@unbl.ink>"]
@ -11,6 +11,8 @@ django-extensions = "^3.1.5"
python-dateutil = "^2.8.2"
python-dotenv = ">=0.20.0,<2"
python-json-logger = "^2.0.2"
cloudscraper = "^1.2.71"
curl-cffi = "^0.15.0"
colorlog = "^6.6.0"
httpx = "<=0.27.2"
djangorestframework = "^3.13.1"

View File

@ -0,0 +1,65 @@
<div class="row">
<div class="col-12">
{% if data.total and data.total > 0 %}
<h5>Overall</h5>
<div class="table-responsive mb-4">
<table class="table table-striped table-sm">
<thead>
<tr>
<th>Category</th>
<th class="text-end">Scrobbles</th>
<th class="text-end">%</th>
</tr>
</thead>
<tbody>
{% for slug, info in data.categories.items %}
<tr>
<td>{{ info.label }}</td>
<td class="text-end">{{ info.count }}</td>
<td class="text-end">{{ info.pct }}%</td>
</tr>
{% endfor %}
<tr class="table-secondary">
<td><strong>Total</strong></td>
<td class="text-end"><strong>{{ data.total }}</strong></td>
<td class="text-end"></td>
</tr>
</tbody>
</table>
</div>
<h5>By Media Type</h5>
{% for mt, mt_data in data.by_media_type.items %}
<h6 class="mt-3">{{ mt }}</h6>
<div class="table-responsive">
<table class="table table-striped table-sm">
<thead>
<tr>
<th>Category</th>
<th class="text-end">Scrobbles</th>
<th class="text-end">%</th>
</tr>
</thead>
<tbody>
{% for slug, info in mt_data.categories.items %}
<tr>
<td>{{ info.label }}</td>
<td class="text-end">{{ info.count }}</td>
<td class="text-end">{{ info.pct }}%</td>
</tr>
{% endfor %}
<tr class="table-secondary">
<td><strong>Total</strong></td>
<td class="text-end"><strong>{{ mt_data.total }}</strong></td>
<td class="text-end"></td>
</tr>
</tbody>
</table>
</div>
{% endfor %}
{% else %}
<p class="text-muted">No data found for Books, Trails, Birding Locations, or Board Games in this period.</p>
{% endif %}
</div>
</div>

View File

@ -55,6 +55,9 @@
{% elif trend.slug == "reading-pace-vs-activity" %}
{% include "trends/_reading_pace.html" %}
{% elif trend.slug == "time-of-day-categories" %}
{% include "trends/_time_of_day_categories.html" %}
{% elif trend.slug == "trending-up" %}
{% include "trends/_trending_up.html" %}

View File

@ -15,6 +15,7 @@ from trends.trends.mood import (
compute_mood_weather,
)
from trends.trends.reading import compute_reading_pace_vs_activity
from trends.trends.time_of_day import compute_time_of_day_categories
from trends.trends.trending import compute_trending_up
TREND_REGISTRY = {}
@ -44,5 +45,8 @@ compute_peak_hours = register("peak-hours")(compute_peak_hours)
compute_reading_pace_vs_activity = register("reading-pace-vs-activity")(
compute_reading_pace_vs_activity
)
compute_time_of_day_categories = register("time-of-day-categories")(
compute_time_of_day_categories
)
compute_trending_up = register("trending-up")(compute_trending_up)
compute_weekly_rhythm = register("weekly-rhythm")(compute_weekly_rhythm)

View File

@ -0,0 +1,89 @@
from collections import OrderedDict
from django.db.models import Count, Q
from django.db.models.functions import Extract
from scrobbles.models import Scrobble
# Scrobble media types that are included in the time-of-day breakdown.
TARGET_MEDIA_TYPES = ["Book", "Trail", "BirdingLocation", "BoardGame"]
# Ordered mapping of category slug -> display label and the set of clock hours
# (0-23) assigned to that category. The three hour sets are disjoint and
# together cover every hour of the day; the declaration order is the order in
# which categories are looked up and emitted in results.
CATEGORIES = OrderedDict(
[
("early_bird", {"label": "Early Bird", "hours": {5, 6, 7, 8, 9, 10}}),
("day_jay", {"label": "Day Jay", "hours": {11, 12, 13, 14, 15, 16, 17, 18}}),
("night_owl", {"label": "Night Owl", "hours": {19, 20, 21, 22, 23, 0, 1, 2, 3, 4}}),
]
)
def _categorize_hour(hour):
    """Return the slug of the category whose hour set contains ``hour``.

    Categories are examined in the order they are declared in ``CATEGORIES``
    and the first match wins. ``None`` is returned when no category lists the
    given hour.
    """
    matching_slugs = (
        slug for slug, spec in CATEGORIES.items() if hour in spec["hours"]
    )
    return next(matching_slugs, None)
def _build_category_summary(counts, total):
    """Build the ``slug -> {count, pct, label}`` mapping for one set of counts.

    ``counts`` maps every slug in ``CATEGORIES`` to its scrobble count and
    ``total`` is the denominator for the percentage. ``pct`` is rounded to one
    decimal place, or ``0`` when ``total`` is zero (avoids dividing by zero).
    """
    return {
        slug: {
            "count": counts[slug],
            "pct": round((counts[slug] / total * 100), 1) if total else 0,
            "label": spec["label"],
        }
        for slug, spec in CATEGORIES.items()
    }


def compute_time_of_day_categories(user, period="last_30"):
    """Bucket a user's scrobbles into time-of-day categories.

    Only scrobbles whose ``media_type`` is in ``TARGET_MEDIA_TYPES`` and whose
    ``timestamp`` is set are considered. Each scrobble is assigned to a
    category from ``CATEGORIES`` based on the hour extracted from its
    timestamp.

    Args:
        user: Value matched against the scrobble ``user`` field.
        period: Passed unchanged to ``trends.utils.get_date_range`` to obtain
            the ``(start, end)`` bounds. A falsy bound is not applied.

    Returns:
        A dict with three keys:

        * ``"categories"``: overall ``slug -> {"count", "pct", "label"}``.
        * ``"total"``: number of categorized scrobbles across all types.
        * ``"by_media_type"``: for every entry of ``TARGET_MEDIA_TYPES`` (even
          those with no data), ``{"total": ..., "categories": {...}}`` in the
          same shape as the overall breakdown.
    """
    # Kept as a function-level import, as in the original; presumably this
    # avoids a circular import with trends.utils -- TODO confirm.
    from trends.utils import get_date_range

    start, end = get_date_range(period)

    filters = Q(
        user=user,
        media_type__in=TARGET_MEDIA_TYPES,
        timestamp__isnull=False,
    )
    if start:
        filters &= Q(timestamp__gte=start)
    if end:
        filters &= Q(timestamp__lte=end)

    # One row per (media_type, hour) pair with the number of scrobbles in it.
    # NOTE(review): the hour is whatever the database Extract yields for the
    # stored timestamp; which timezone that reflects depends on project
    # settings -- confirm it matches the user's local time.
    rows = (
        Scrobble.objects.filter(filters)
        .annotate(hour=Extract("timestamp", "hour"))
        .values("media_type", "hour")
        .annotate(count=Count("id"))
        .order_by("media_type", "hour")
    )

    hourly_counts = {}
    for row in rows:
        hourly_counts.setdefault(row["media_type"], {})[row["hour"]] = row["count"]

    by_media_type = {}
    # Counters are derived from CATEGORIES rather than spelled out by hand so
    # that adding or renaming a category cannot cause a KeyError here.
    grand_counts = dict.fromkeys(CATEGORIES, 0)
    grand_total = 0

    for media_type in TARGET_MEDIA_TYPES:
        type_counts = dict.fromkeys(CATEGORIES, 0)
        for hour, count in hourly_counts.get(media_type, {}).items():
            slug = _categorize_hour(hour)
            if slug:
                type_counts[slug] += count
        # Only hours that mapped to a category contribute to the total.
        type_total = sum(type_counts.values())

        by_media_type[media_type] = {
            "total": type_total,
            "categories": _build_category_summary(type_counts, type_total),
        }

        for slug, count in type_counts.items():
            grand_counts[slug] += count
        grand_total += type_total

    return {
        "categories": _build_category_summary(grand_counts, grand_total),
        "total": grand_total,
        "by_media_type": by_media_type,
    }

View File

@ -25,6 +25,7 @@ TIME_BOUND_TRENDS = {
"mood-weather",
"peak-hours",
"reading-pace-vs-activity",
"time-of-day-categories",
"trending-up",
"weekly-rhythm",
}

View File

@ -55,6 +55,11 @@ TREND_METADATA = {
"description": "Compare how long you read per session with and without concurrent music.",
"icon": "📊",
},
"time-of-day-categories": {
"title": "Time of Day Categories",
"description": "Are you an early bird, day jay, or night owl? Categorized by Books, Trails, Birding Locations, and Board Games.",
"icon": "🦉",
},
"trending-up": {
"title": "Trending Media Types",
"description": "Which media types have you been consuming more or less of recently?",

View File

@ -2,6 +2,8 @@ import logging
from typing import Dict, Optional
from uuid import uuid4
import cloudscraper
from curl_cffi import requests as curl_requests
import pendulum
import requests
import trafilatura
@ -45,6 +47,37 @@ class Domain(TimeStampedModel):
)
def _fetch_url_raw(url: str) -> Optional[str]:
    """Return the raw HTML for ``url``, or ``None`` when every fetcher fails.

    Fetchers are attempted in order and the first usable result is returned:

    1. ``trafilatura.fetch_url`` -- the standard path; its result is used
       whenever it is truthy.
    2. ``cloudscraper`` -- intended for basic Cloudflare JS challenges.
    3. ``curl_cffi`` impersonating ``chrome124`` -- intended for more
       aggressive Cloudflare / bot detection that inspects the TLS
       fingerprint.

    Any exception raised by the second or third fetcher (including a non-2xx
    status surfaced by ``raise_for_status``) is logged and treated as a miss
    rather than propagated.
    """
    html = trafilatura.fetch_url(url)
    if html:
        return html
    logger.debug("trafilatura returned nothing for %s, trying cloudscraper", url)

    try:
        session = cloudscraper.create_scraper()
        response = session.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as scraper_error:
        logger.debug(
            "cloudscraper failed for %s: %s, trying curl_cffi", url, scraper_error
        )

    # Last resort: only reached when cloudscraper raised.
    try:
        response = curl_requests.get(url, impersonate="chrome124", timeout=30)
        response.raise_for_status()
        return response.text
    except Exception as curl_error:
        logger.warning("curl_cffi also failed for %s: %s", url, curl_error)

    return None
class WebPage(ScrobblableMixin):
COMPLETION_PERCENT = getattr(settings, "WEBSITE_COMPLETION_PERCENT", 100)
@ -80,7 +113,7 @@ class WebPage(ScrobblableMixin):
def _update_extract_from_web(self, raw_text: str = "", force=True):
if not raw_text:
raw_text = requests.get(self.url, headers=headers).text
raw_text = _fetch_url_raw(self.url)
if not self.extract or force:
self.extract = trafilatura.extract(raw_text)
self.save(update_fields=["extract"])
@ -259,7 +292,7 @@ class WebPage(ScrobblableMixin):
return False
def fetch_data_from_web(self, save=True, force=True):
raw_text = trafilatura.fetch_url(self.url)
raw_text = _fetch_url_raw(self.url)
if not self.extract or force:
self.extract = trafilatura.extract(
raw_text,