[webpages] Remove curl_cffi as it doesn't work on FreeBSD
This commit is contained in:
@ -88,7 +88,7 @@ fetching and simple saving.
|
||||
*** Metadata sources
|
||||
**** Scraper
|
||||
|
||||
* Backlog [0/22] :vrobbler:project:personal:
|
||||
* Backlog [1/23] :vrobbler:project:personal:
|
||||
** TODO [#C] Create small utility to clean up tracks scrobbled with wonky playback times :bug:music:scrobbles:
|
||||
:PROPERTIES:
|
||||
:ID: 702462cf-d54b-48c6-8a7c-78b8de751deb
|
||||
@ -604,6 +604,10 @@ independent of the email flow it was originally creatdd for
|
||||
|
||||
** TODO [#B] Is there way to create unique slugs for media instances :media_types:
|
||||
|
||||
** DONE [#A] Remove curl-cffi as it doesn't work on FreeBSD :webpages:deps:
|
||||
:PROPERTIES:
|
||||
:ID: 6bc1b0dd-e449-3d32-a176-46451e793e5d
|
||||
:END:
|
||||
* Version 58.2 [2/2]
|
||||
** DONE [#B] Add more robust webpage scraping :webpages:metadata:
|
||||
:PROPERTIES:
|
||||
|
||||
53
poetry.lock
generated
53
poetry.lock
generated
@ -654,6 +654,7 @@ description = "Foreign Function Interface for Python calling C code."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
markers = "platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\""
|
||||
files = [
|
||||
{file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"},
|
||||
{file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"},
|
||||
@ -1237,48 +1238,6 @@ ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["certifi (>=2024)", "cryptography-vectors (==46.0.6)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
name = "curl-cffi"
|
||||
version = "0.15.0"
|
||||
description = "libcurl ffi bindings for Python, with impersonation support."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:bda66404010e9ed743b1b83c20c86f24fe21a9a6873e17479d6e67e29d8ded28"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:a25620d9bf989c9c029a7d1642999c4c265abb0bad811deb2f77b0b5b2b12e5b"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:582e570aa2586b96ed47cf4a17586b9a3c462cbe43f780487c3dc245c6ef1527"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:838e48212447d9c81364b04707a5c861daf08f8320f9ecb3406a8919d1d5c3b3"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b6c847d86283b07ae69bb72c82eb8a59242277142aa35b89850f89e792a02fc"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9e5e69eee735f659287e2c84444319d68a1fa68dd37abf228943a4074864283a"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa1323950224db24f4c510d010b3affa02196ca853fb424191fa917a513d3f4b"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:41f80170ba844009273b2660da1964ec31e99e5719d16b3422ada87177e32e13"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1977e1e12cfb5c11352cbb74acef1bed24eb7d226dab61ca57c168c21acd4d61"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-win_amd64.whl", hash = "sha256:5a0c1896a0d5a5ac1eb89cd24b008d2b718dd1df6fd2f75451b59ca66e49e572"},
|
||||
{file = "curl_cffi-0.15.0-cp310-abi3-win_arm64.whl", hash = "sha256:a6d57f8389273a3a1f94370473c74897467bcc36af0a17336989780c507fa43d"},
|
||||
{file = "curl_cffi-0.15.0-cp313-abi3-android_24_arm64_v8a.whl", hash = "sha256:4682dc38d4336e0eb0b185374db90a760efde63cbea994b4e63f3521d44c4c92"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:967ad7355bd8e9586f8c2d02eaa99953747549e7ea4a9b25cd53353e6b67fe6d"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7e63539d0d839d0a8c5eacf86229bc68c57803547f35e0db7ee0986328b478c3"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08c799b89740b9bc49c09fbc3d5907f13ac1f845ca52620507ef9466d4639dd5"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b7a92767a888ee90147e18964b396d8435ff42737030d6fb00824ffd6094805"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:829cc357061ecb99cc2d406301f609a039e05665322f5c025ec67c38b0dc49ce"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:408d6f14e346841cd889c2e0962832bb235ba3b6749ebf609f347f747da5e60f"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b624c7ce087bfda967a013ed0a64702a525444e5b6e97d23534d567ccc6525aa"},
|
||||
{file = "curl_cffi-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0b6c0543b993996670e9e4b78e305a2d60809d5681903ffb5568e21a387434d3"},
|
||||
{file = "curl_cffi-0.15.0.tar.gz", hash = "sha256:ea0c67652bf6893d34ee0f82c944f37e488f6147e9421bef1771cc6545b02ded"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
certifi = ">=2024.2.2"
|
||||
cffi = ">=2.0.0"
|
||||
rich = "*"
|
||||
|
||||
[package.extras]
|
||||
build = ["cibuildwheel", "wheel"]
|
||||
dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
|
||||
extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"]
|
||||
test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=46.0.4,<47.0)", "httpx (==0.23.1)", "litestar (>=2.19.0,<3.0)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "dacite"
|
||||
version = "1.9.2"
|
||||
@ -2930,7 +2889,7 @@ version = "4.0.0"
|
||||
description = "Python port of markdown-it. Markdown parsing, done right!"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main", "test"]
|
||||
groups = ["test"]
|
||||
files = [
|
||||
{file = "markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147"},
|
||||
{file = "markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3"},
|
||||
@ -3026,7 +2985,7 @@ version = "0.1.2"
|
||||
description = "Markdown URL utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
groups = ["main", "test"]
|
||||
groups = ["test"]
|
||||
files = [
|
||||
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
|
||||
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
|
||||
@ -4228,7 +4187,7 @@ description = "C parser in Python"
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
groups = ["main"]
|
||||
markers = "implementation_name != \"PyPy\""
|
||||
markers = "(platform_python_implementation != \"PyPy\" or sys_platform == \"darwin\") and implementation_name != \"PyPy\""
|
||||
files = [
|
||||
{file = "pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992"},
|
||||
{file = "pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29"},
|
||||
@ -5407,7 +5366,7 @@ version = "14.3.3"
|
||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||
optional = false
|
||||
python-versions = ">=3.8.0"
|
||||
groups = ["main", "test"]
|
||||
groups = ["test"]
|
||||
files = [
|
||||
{file = "rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d"},
|
||||
{file = "rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b"},
|
||||
@ -6870,4 +6829,4 @@ cffi = ["cffi (>=1.17,<2.0) ; platform_python_implementation != \"PyPy\" and pyt
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.11,<3.15"
|
||||
content-hash = "8b1c608beddabe6884d4c2614170ccd71986daccb7db3944cb36531937fdee4e"
|
||||
content-hash = "beac677c269bb8618ca802e5f92f7558391d8d26f1b2150f3c8a6d3417848cb1"
|
||||
|
||||
@ -12,7 +12,6 @@ python-dateutil = "^2.8.2"
|
||||
python-dotenv = ">=0.20.0,<2"
|
||||
python-json-logger = "^2.0.2"
|
||||
cloudscraper = "^1.2.71"
|
||||
curl-cffi = "^0.15.0"
|
||||
colorlog = "^6.6.0"
|
||||
httpx = "<=0.27.2"
|
||||
djangorestframework = "^3.13.1"
|
||||
|
||||
@ -2,8 +2,6 @@ import logging
|
||||
from typing import Dict, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
import cloudscraper
|
||||
from curl_cffi import requests as curl_requests
|
||||
import pendulum
|
||||
import requests
|
||||
import trafilatura
|
||||
@ -50,32 +48,32 @@ class Domain(TimeStampedModel):
|
||||
def _fetch_url_raw(url: str) -> Optional[str]:
|
||||
"""Fetch raw HTML for a URL.
|
||||
|
||||
Tries three strategies in order:
|
||||
Tries two strategies in order:
|
||||
1. trafilatura (standard, works for most sites)
|
||||
2. cloudscraper (handles basic Cloudflare JS challenges)
|
||||
3. curl_cffi (impersonates browser TLS fingerprint for aggressive
|
||||
Cloudflare / bot detection)
|
||||
2. cloudscraper (handles Cloudflare JS challenges)
|
||||
|
||||
cloudscraper is lazy-imported so deployments that cannot compile
|
||||
its dependencies (e.g. FreeBSD) still work.
|
||||
"""
|
||||
raw = trafilatura.fetch_url(url)
|
||||
if raw:
|
||||
return raw
|
||||
|
||||
logger.debug("trafilatura returned nothing for %s, trying cloudscraper", url)
|
||||
try:
|
||||
scraper = cloudscraper.create_scraper()
|
||||
resp = scraper.get(url, timeout=30)
|
||||
resp.raise_for_status()
|
||||
return resp.text
|
||||
except Exception as exc:
|
||||
logger.debug("cloudscraper failed for %s: %s, trying curl_cffi", url, exc)
|
||||
try:
|
||||
resp = curl_requests.get(
|
||||
url, impersonate="chrome124", timeout=30
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.text
|
||||
except Exception as e:
|
||||
logger.warning("curl_cffi also failed for %s: %s", url, e)
|
||||
return None
|
||||
import cloudscraper
|
||||
except ImportError:
|
||||
logger.debug("cloudscraper not available")
|
||||
else:
|
||||
try:
|
||||
scraper = cloudscraper.create_scraper()
|
||||
resp = scraper.get(url, timeout=30)
|
||||
resp.raise_for_status()
|
||||
return resp.text
|
||||
except Exception as exc:
|
||||
logger.debug("cloudscraper failed for %s: %s", url, exc)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class WebPage(ScrobblableMixin):
|
||||
|
||||
Reference in New Issue
Block a user