[webpages] Remove curl_cffi as it doesn't work on FreeBSD

This commit is contained in:
2026-06-23 23:17:08 -04:00
parent 5e0dffdc7a
commit 662ebe66b9
4 changed files with 30 additions and 70 deletions

View File

@ -88,7 +88,7 @@ fetching and simple saving.
*** Metadata sources
**** Scraper
* Backlog [0/22] :vrobbler:project:personal:
* Backlog [1/23] :vrobbler:project:personal:
** TODO [#C] Create small utility to clean up tracks scrobbled with wonky playback times :bug:music:scrobbles:
:PROPERTIES:
:ID: 702462cf-d54b-48c6-8a7c-78b8de751deb
@ -604,6 +604,10 @@ independent of the email flow it was originally creatdd for
** TODO [#B] Is there way to create unique slugs for media instances :media_types:
** DONE [#A] Remove curl-cffi as it doesn't work on FreeBSD :webpages:deps:
:PROPERTIES:
:ID: 6bc1b0dd-e449-3d32-a176-46451e793e5d
:END:
* Version 58.2 [2/2]
** DONE [#B] Add more robust webpage scraping :webpages:metadata:
:PROPERTIES:

53
poetry.lock generated
View File

@ -654,6 +654,7 @@ description = "Foreign Function Interface for Python calling C code."
optional = false
python-versions = ">=3.9"
groups = ["main"]
markers = "platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\""
files = [
{file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"},
{file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"},
@ -1237,48 +1238,6 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["certifi (>=2024)", "cryptography-vectors (==46.0.6)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "curl-cffi"
version = "0.15.0"
description = "libcurl ffi bindings for Python, with impersonation support."
optional = false
python-versions = ">=3.10"
groups = ["main"]
files = [
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28"},
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a"},
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b"},
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13"},
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61"},
{file = "curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572"},
{file = "curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d"},
{file = "curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92"},
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d"},
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3"},
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5"},
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805"},
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce"},
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f"},
{file = "curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa"},
{file = "curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3"},
{file = "curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded"},
]
[package.dependencies]
certifi = ">=2024.2.2"
cffi = ">=2.0.0"
rich = "*"
[package.extras]
build = ["cibuildwheel", "wheel"]
dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"]
test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "litestar (>=2.19.0,<3.0)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
[[package]]
name = "dacite"
version = "1.9.2"
@ -2930,7 +2889,7 @@ version = "4.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
optional = false
python-versions = ">=3.10"
groups = ["main", "test"]
groups = ["test"]
files = [
{file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"},
{file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"},
@ -3026,7 +2985,7 @@ version = "0.1.2"
description = "Markdown URL utilities"
optional = false
python-versions = ">=3.7"
groups = ["main", "test"]
groups = ["test"]
files = [
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
@ -4228,7 +4187,7 @@ description = "C parser in Python"
optional = false
python-versions = ">=3.10"
groups = ["main"]
markers = "implementation_name != \"PyPy\""
markers = "(platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\") and implementation_name != \"PyPy\""
files = [
{file = "pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992"},
{file = "pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29"},
@ -5407,7 +5366,7 @@ version = "14.3.3"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.8.0"
groups = ["main", "test"]
groups = ["test"]
files = [
{file = "rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d"},
{file = "rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b"},
@ -6870,4 +6829,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
[metadata]
lock-version = "2.1"
python-versions = ">=3.11,<3.15"
content-hash = "8b1c608beddabe6884d4c2614170ccd71986daccb7db3944cb36531937fdee4e"
content-hash = "beac677c269bb8618ca802e5f92f7558391d8d26f1b2150f3c8a6d3417848cb1"

View File

@ -12,7 +12,6 @@ python-dateutil = "^2.8.2"
python-dotenv = ">=0.20.0,<2"
python-json-logger = "^2.0.2"
cloudscraper = "^1.2.71"
curl-cffi = "^0.15.0"
colorlog = "^6.6.0"
httpx = "<=0.27.2"
djangorestframework = "^3.13.1"

View File

@ -2,8 +2,6 @@ import logging
from typing import Dict, Optional
from uuid import uuid4
import cloudscraper
from curl_cffi import requests as curl_requests
import pendulum
import requests
import trafilatura
@ -50,32 +48,32 @@ class Domain(TimeStampedModel):
def _fetch_url_raw(url: str) -> Optional[str]:
"""Fetch raw HTML for a URL.
Tries three strategies in order:
Tries two strategies in order:
1. trafilatura (standard, works for most sites)
2. cloudscraper (handles basic Cloudflare JS challenges)
3. curl_cffi (impersonates browser TLS fingerprint for aggressive
Cloudflare / bot detection)
2. cloudscraper (handles Cloudflare JS challenges)
cloudscraper is lazy-imported so deployments that cannot compile
its dependencies (e.g. FreeBSD) still work.
"""
raw = trafilatura.fetch_url(url)
if raw:
return raw
logger.debug("trafilatura returned nothing for %s, trying cloudscraper", url)
try:
scraper = cloudscraper.create_scraper()
resp = scraper.get(url, timeout=30)
resp.raise_for_status()
return resp.text
except Exception as exc:
logger.debug("cloudscraper failed for %s: %s, trying curl_cffi", url, exc)
try:
resp = curl_requests.get(
url, impersonate="chrome124", timeout=30
)
resp.raise_for_status()
return resp.text
except Exception as e:
logger.warning("curl_cffi also failed for %s: %s", url, e)
return None
import cloudscraper
except ImportError:
logger.debug("cloudscraper not available")
else:
try:
scraper = cloudscraper.create_scraper()
resp = scraper.get(url, timeout=30)
resp.raise_for_status()
return resp.text
except Exception as exc:
logger.debug("cloudscraper failed for %s: %s", url, exc)
return None
class WebPage(ScrobblableMixin):