From 662ebe66b9d804e7700c7271d094a907f4dcee80 Mon Sep 17 00:00:00 2001 From: Colin Powell Date: Tue, 23 Jun 2026 23:17:08 -0400 Subject: [PATCH] [webpages] Remove curl_cffi as it doesn't work on FreeBSD --- PROJECT.org | 6 +++- poetry.lock | 53 ++++---------------------------- pyproject.toml | 1 - vrobbler/apps/webpages/models.py | 40 ++++++++++++------------ 4 files changed, 30 insertions(+), 70 deletions(-) diff --git a/PROJECT.org b/PROJECT.org index d2c46f6..aae47f3 100644 --- a/PROJECT.org +++ b/PROJECT.org @@ -88,7 +88,7 @@ fetching and simple saving. *** Metadata sources **** Scraper -* Backlog [0/22] :vrobbler:project:personal: +* Backlog [1/23] :vrobbler:project:personal: ** TODO [#C] Create small utility to clean up tracks scrobbled with wonky playback times :bug:music:scrobbles: :PROPERTIES: :ID: 702462cf-d54b-48c6-8a7c-78b8de751deb @@ -604,6 +604,10 @@ independent of the email flow it was originally creatdd for ** TODO [#B] Is there way to create unique slugs for media instances :media_types: +** DONE [#A] Remove curl-cffi as it doesn't work on FreeBSD :webpages:deps: +:PROPERTIES: +:ID: 6bc1b0dd-e449-3d32-a176-46451e793e5d +:END: * Version 58.2 [2/2] ** DONE [#B] Add more robust webpage scraping :webpages:metadata: :PROPERTIES: diff --git a/poetry.lock b/poetry.lock index 53de7c3..46f5309 100644 --- a/poetry.lock +++ b/poetry.lock @@ -654,6 +654,7 @@ description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.9" groups = ["main"] +markers = "platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\"" files = [ {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, {file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"}, @@ -1237,48 +1238,6 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==46.0.6)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] -[[package]] -name = "curl-cffi" -version = "0.15.0" -description = "libcurl ffi bindings for Python, with impersonation support." -optional = false -python-versions = ">=3.10" -groups = ["main"] -files = [ - {file = "curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28"}, - {file = "curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b"}, - {file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527"}, - {file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3"}, - {file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc"}, - {file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a"}, - {file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b"}, - {file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13"}, - {file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61"}, - {file = "curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572"}, - {file = "curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d"}, - {file = "curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa"}, - {file = "curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3"}, - {file = "curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded"}, -] - -[package.dependencies] -certifi = ">=2024.2.2" -cffi = ">=2.0.0" -rich = "*" - -[package.extras] -build = ["cibuildwheel", "wheel"] -dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"] -extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"] -test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "litestar (>=2.19.0,<3.0)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"] - [[package]] name = "dacite" version = "1.9.2" @@ -2930,7 +2889,7 @@ version = "4.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.10" -groups = ["main", "test"] +groups = ["test"] files = [ {file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"}, {file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"}, @@ -3026,7 +2985,7 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" -groups = ["main", "test"] +groups = ["test"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -4228,7 +4187,7 @@ description = "C parser in Python" optional = false python-versions = ">=3.10" groups = ["main"] -markers = "implementation_name != \"PyPy\"" +markers = "(platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\") and implementation_name != \"PyPy\"" files = [ {file = "pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992"}, {file = "pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29"}, @@ -5407,7 +5366,7 @@ version = "14.3.3" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.8.0" -groups = ["main", "test"] +groups = ["test"] files = [ {file = "rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d"}, {file = "rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b"}, @@ -6870,4 +6829,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.15" -content-hash = "8b1c608beddabe6884d4c2614170ccd71986daccb7db3944cb36531937fdee4e" +content-hash = "beac677c269bb8618ca802e5f92f7558391d8d26f1b2150f3c8a6d3417848cb1" diff --git a/pyproject.toml b/pyproject.toml index b87d518..0b0b9e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ python-dateutil = "^2.8.2" python-dotenv = ">=0.20.0,<2" python-json-logger = "^2.0.2" cloudscraper = "^1.2.71" -curl-cffi = "^0.15.0" colorlog = "^6.6.0" httpx = "<=0.27.2" djangorestframework = "^3.13.1" diff --git a/vrobbler/apps/webpages/models.py b/vrobbler/apps/webpages/models.py index f979ba0..1c6427a 100644 --- a/vrobbler/apps/webpages/models.py +++ b/vrobbler/apps/webpages/models.py @@ -2,8 +2,6 @@ import logging from typing import Dict, Optional from uuid import uuid4 -import cloudscraper -from curl_cffi import requests as curl_requests import pendulum import requests import trafilatura @@ -50,32 +48,32 @@ class Domain(TimeStampedModel): def _fetch_url_raw(url: str) -> Optional[str]: """Fetch raw HTML for a URL. - Tries three strategies in order: + Tries two strategies in order: 1. trafilatura (standard, works for most sites) - 2. cloudscraper (handles basic Cloudflare JS challenges) - 3. curl_cffi (impersonates browser TLS fingerprint for aggressive - Cloudflare / bot detection) + 2. cloudscraper (handles Cloudflare JS challenges) + + cloudscraper is lazy-imported so deployments that cannot compile + its dependencies (e.g. FreeBSD) still work. """ raw = trafilatura.fetch_url(url) if raw: return raw + logger.debug("trafilatura returned nothing for %s, trying cloudscraper", url) try: - scraper = cloudscraper.create_scraper() - resp = scraper.get(url, timeout=30) - resp.raise_for_status() - return resp.text - except Exception as exc: - logger.debug("cloudscraper failed for %s: %s, trying curl_cffi", url, exc) - try: - resp = curl_requests.get( - url, impersonate="chrome124", timeout=30 - ) - resp.raise_for_status() - return resp.text - except Exception as e: - logger.warning("curl_cffi also failed for %s: %s", url, e) - return None + import cloudscraper + except ImportError: + logger.debug("cloudscraper not available") + else: + try: + scraper = cloudscraper.create_scraper() + resp = scraper.get(url, timeout=30) + resp.raise_for_status() + return resp.text + except Exception as exc: + logger.debug("cloudscraper failed for %s: %s", url, exc) + + return None class WebPage(ScrobblableMixin):