[foods] Add recipe website parsing
Some checks failed
build & deploy / test (push) Failing after 1m10s
build & deploy / deploy (push) Has been skipped

This commit is contained in:
2026-03-06 10:46:34 -05:00
parent 210ff0a4aa
commit 770a51b9c0
12 changed files with 1493 additions and 29 deletions

View File

@ -92,7 +92,7 @@ fetching and simple saving.
:LOGBOOK:
CLOCK: [2025-07-09 Wed 09:55]--[2025-07-09 Wed 10:15] => 0:20
:END:
* Backlog [4/27]
* Backlog [6/30]
** TODO [#C] Create small utility to clean up tracks scrobbled with wonky playback times :vrobbler:personal:bug:music:scrobbles:
** TODO [#C] Move to using more robust mopidy-webhooks package from pypi :utility:improvement:
:PROPERTIES:
@ -458,7 +458,17 @@ Turns out we're not looking up music tracks properly, again.
:PROPERTIES:
:ID: e16228b2-b062-bd00-32e6-b2353e6406e9
:END:
** TODO [#A] Videos are scrobbling duplicates again :vrobbler:bug:videos:scrobbles:
** TODO Add sentiment parsing for Scrobbles with notes :vrobbler:project:scrobbles:sentiment:
** TODO Check opencode about a way to present stats like movies per month :vrobbler:scrobbles:stats:personal:project:
** DONE Add recipe parsing for food lookups :vrobbler:foods:project:feature:personal:
:PROPERTIES:
:ID: 86456c78-247b-fb63-7ae8-d6d17e7666b1
:END:
** DONE [#A] Videos are scrobbling duplicates again :vrobbler:bug:videos:scrobbles:
:PROPERTIES:
:ID: a46884fe-7ef1-410a-316b-7ac6d7599331
:END:
<2026-03-06 Fri>
** DONE Fix board games not saving BGG id on lookup :vrobbler:bug:boardgames:
:PROPERTIES:
:ID: 506c2965-51d6-6cb9-fc4f-4f0468d2d62f

17
scripts/README.org Normal file
View File

@ -0,0 +1,17 @@
#+title: Readme
Scripts are a collection of helpful utility scripts, or simple gut-check tests for various functional pieces.
* test_recipe_scraper.py
Checks recipe detection against real URLs by making live HTTP requests, whereas the regular test suite uses mocked return values.
#+begin_src sh
python ../manage.py shell < ../scripts/test_recipe_scraper.py
#+end_src
#+RESULTS:
| Eagerly | running | all | tasks |
| Connected | to | sqlite@db.sqlite3 | |
| Checking: | https://cookingwithmike.com/quinoa-meatloaf/ | | |
| Checking: | https://www.kingarthurbaking.com/recipes/overnight-sourdough-waffles-recipe | | |
| Checking: | https://dirt.fyi/article/2026/02/25-years-of-ipod-brain?src=longreads | | |

View File

@ -0,0 +1,21 @@
import requests
from foods.sources.rscraper import (
RecipeScraperService,
)
# Live URLs mapped to whether the scraper is expected to detect a recipe there.
test_urls = {
    "https://cookingwithmike.com/quinoa-meatloaf/": True,
    "https://www.kingarthurbaking.com/recipes/overnight-sourdough-waffles-recipe": True,
    "https://dirt.fyi/article/2026/02/25-years-of-ipod-brain?src=longreads": False,
    "https://tastesbetterfromscratch.com/belgian-waffles/": True,
}

# Without a timeout a single unresponsive site would hang the script forever.
REQUEST_TIMEOUT_SECONDS = 15

for k, v in test_urls.items():
    # Print before fetching so a slow or failing URL is identifiable.
    print("Checking: ", k)
    html = requests.get(k, timeout=REQUEST_TIMEOUT_SECONDS).text
    detected = bool(RecipeScraperService().is_recipe(html, k))
    assert detected == v, f"{k}: expected is_recipe={v}, got {detected}"

View File

View File

@ -0,0 +1,198 @@
import pytest
from foods.sources.rscraper import (
RecipeScraperService,
)
# Fixture: page embedding a schema.org Recipe as JSON-LD, with a name, an
# author, three ingredients, one instruction step, a total time and a yield.
RECIPE_HTML_WITH_SCHEMA = """
<!DOCTYPE html>
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org/",
"@type": "Recipe",
"name": "Test Recipe",
"author": {
"@type": "Person",
"name": "Test Author"
},
"recipeIngredient": ["1 cup flour", "2 eggs", "1/2 cup sugar"],
"recipeInstructions": [
{
"@type": "HowToStep",
"text": "Mix ingredients together"
}
],
"totalTime": "PT30M",
"recipeYield": "4 servings"
}
</script>
</head>
<body>
<h1>Test Recipe</h1>
</body>
</html>
"""
# Fixture: ordinary blog page with no structured recipe markup at all.
RECIPE_HTML_WITHOUT_SCHEMA = """
<!DOCTYPE html>
<html>
<head>
<title>Not a Recipe Page</title>
</head>
<body>
<h1>Welcome to My Blog</h1>
<p>This is just a regular blog post about cooking.</p>
</body>
</html>
"""
# Fixture: recipe described with schema.org microdata attributes
# (itemscope/itemtype/itemprop) instead of a JSON-LD script block.
RECIPE_HTML_WITH_MICRODATA = """
<!DOCTYPE html>
<html>
<head>
<title>Test Recipe</title>
</head>
<body itemscope itemtype="http://schema.org/Recipe">
<h1 itemprop="name">Microdata Recipe</h1>
<div itemprop="author" itemscope itemtype="http://schema.org/Person">
<span itemprop="name">Test Author</span>
</div>
<div itemprop="recipeIngredient">1 cup flour</div>
<div itemprop="recipeIngredient">2 eggs</div>
<div itemprop="recipeInstructions">
<div itemprop="text">Mix all ingredients</div>
</div>
</body>
</html>
"""
class TestRecipeScraperService:
    """Unit tests for ``RecipeScraperService`` driven by inline HTML fixtures."""

    @pytest.fixture
    def scraper(self):
        """Provide a fresh service instance for every test."""
        return RecipeScraperService()

    # --- is_recipe -------------------------------------------------------

    def test_is_recipe_with_valid_schema(self, scraper):
        url = "https://example.com/recipe"
        assert scraper.is_recipe(RECIPE_HTML_WITH_SCHEMA, url) is True

    def test_is_recipe_without_schema(self, scraper):
        url = "https://example.com/blog"
        assert scraper.is_recipe(RECIPE_HTML_WITHOUT_SCHEMA, url) is False

    def test_is_recipe_with_microdata(self, scraper):
        url = "https://example.com/recipe"
        assert scraper.is_recipe(RECIPE_HTML_WITH_MICRODATA, url) is True

    # --- scrape ----------------------------------------------------------

    def test_scrape_returns_title(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        assert scraped["title"] == "Test Recipe"

    def test_scrape_returns_ingredients(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        ingredients = scraped["ingredients"]
        assert len(ingredients) == 3
        assert "1 cup flour" in ingredients

    def test_scrape_returns_instructions(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        instructions = scraped["instructions"]
        assert len(instructions) > 0
        assert "Mix ingredients together" in instructions

    def test_scrape_returns_yields(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        assert scraped["yields"] == "4 servings"

    def test_scrape_returns_total_time(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        assert scraped["total_time"] == 30

    def test_scrape_returns_url(self, scraper):
        scraped = scraper.scrape(
            RECIPE_HTML_WITH_SCHEMA, "https://example.com/recipe"
        )
        assert scraped["url"] == "https://example.com/recipe"

    def test_scrape_raises_on_invalid_html(self, scraper):
        with pytest.raises(ValueError):
            scraper.scrape("", "https://example.com/recipe")

    def test_scrape_handles_missing_optional_fields(self, scraper):
        # Only the recipe name is present; list-valued fields must come
        # back as empty lists.
        minimal_html = """
<!DOCTYPE html>
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org/",
"@type": "Recipe",
"name": "Minimal Recipe"
}
</script>
</head>
<body></body>
</html>
"""
        scraped = scraper.scrape(minimal_html, "https://example.com/minimal")
        assert scraped["title"] == "Minimal Recipe"
        assert scraped["ingredients"] == []
        assert scraped["instructions"] == []

    # --- parse_servings --------------------------------------------------

    def test_parse_servings(self, scraper):
        numeric_cases = (
            ("4 servings", 4),
            ("6 people", 6),
            ("2", 2),
            ("serves 8", 8),
        )
        for text, expected in numeric_cases:
            assert scraper.parse_servings(text) == expected
        for empty in (None, ""):
            assert scraper.parse_servings(empty) is None

    # --- extract_tags ----------------------------------------------------

    def test_extract_tags_from_cuisine(self, scraper):
        found = scraper.extract_tags({"cuisine": "Italian"})
        assert "Italian" in found

    def test_extract_tags_from_cuisine_list(self, scraper):
        found = scraper.extract_tags({"cuisine": ["Italian", "Mexican"]})
        assert "Italian" in found
        assert "Mexican" in found

    def test_extract_tags_from_dietary(self, scraper):
        found = scraper.extract_tags({"dietary": "Gluten-Free"})
        assert "Gluten-Free" in found

    def test_extract_tags_from_course(self, scraper):
        found = scraper.extract_tags({"course": "Dessert"})
        assert "Dessert" in found

    def test_extract_tags_from_keywords(self, scraper):
        found = scraper.extract_tags({"keywords": "easy, quick, healthy"})
        assert "easy" in found
        assert "quick" in found
        assert "healthy" in found

    def test_extract_tags_from_keywords_list(self, scraper):
        found = scraper.extract_tags({"keywords": ["comfort food", "winter"]})
        assert "comfort food" in found
        assert "winter" in found

View File

@ -0,0 +1,135 @@
import pytest
from unittest.mock import patch
from foods.sources.usda import (
USDAFoodAPI,
NutritionCalculator,
)
class TestUSDAFoodAPI:
    """Unit tests for ``USDAFoodAPI.extract_nutrients`` (no network access)."""

    @pytest.fixture
    def usda_api(self):
        """Build a client with an explicit key.

        Passing ``api_key`` means the constructor never consults Django
        settings, so no settings patch is needed. The previous patch also
        targeted ``vrobbler.apps.foods.sources.usda``, which is not the
        module path this test imports the class from.
        """
        return USDAFoodAPI(api_key="test_api_key")

    def test_extract_nutrients_with_nutrient_number(self, usda_api):
        """Entries keyed by ``nutrientNumber`` map onto the named nutrients."""
        food_data = {
            "description": "Test Food",
            "foodNutrients": [
                {
                    "nutrientNumber": "203",
                    "nutrientName": "Protein",
                    "value": 10.0,
                },
                {
                    "nutrientNumber": "204",
                    "nutrientName": "Total lipid (fat)",
                    "value": 5.0,
                },
                {
                    "nutrientNumber": "205",
                    "nutrientName": "Carbohydrate, by difference",
                    "value": 20.0,
                },
                {
                    "nutrientNumber": "208",
                    "nutrientName": "Energy",
                    "value": 150.0,
                },
                {
                    "nutrientNumber": "269",
                    "nutrientName": "Sugars, total",
                    "value": 5.0,
                },
            ],
        }
        result = usda_api.extract_nutrients(food_data)
        assert result["protein"] == 10.0
        assert result["fat"] == 5.0
        assert result["carbohydrates"] == 20.0
        assert result["calories"] == 150.0
        assert result["sugar"] == 5.0

    def test_extract_nutrients_with_nested_nutrient(self, usda_api):
        """Entries carrying a nested ``nutrient`` dict are understood too."""
        food_data = {
            "description": "Test Food",
            "foodNutrients": [
                {
                    "nutrient": {"id": 203, "name": "Protein"},
                    "value": 10.0,
                },
            ],
        }
        result = usda_api.extract_nutrients(food_data)
        assert result["protein"] == 10.0

    def test_extract_nutrients_with_empty_nutrients(self, usda_api):
        """An empty nutrient list yields zeroed values."""
        food_data = {"description": "Test Food", "foodNutrients": []}
        result = usda_api.extract_nutrients(food_data)
        assert result["protein"] == 0
        assert result["calories"] == 0

    def test_extract_nutrients_with_no_nutrients_key(self, usda_api):
        """A payload without ``foodNutrients`` yields zeroed values."""
        food_data = {"description": "Test Food"}
        result = usda_api.extract_nutrients(food_data)
        assert result["protein"] == 0
class TestNutritionCalculator:
    """Unit tests for the pure parsing/conversion helpers of ``NutritionCalculator``."""

    @pytest.fixture
    def calculator(self):
        """Build a calculator whose USDA client is replaced by a mock.

        The patch target must be the namespace ``NutritionCalculator`` is
        actually imported from (``foods.sources.usda``). Patching a different
        dotted path leaves the real ``USDAFoodAPI`` in place, and that class
        raises when no API key is configured.
        """
        with patch("foods.sources.usda.USDAFoodAPI"):
            return NutritionCalculator()

    def test_parse_ingredient_with_fraction(self, calculator):
        result = calculator.parse_ingredient("1/2 cup flour")
        assert result["quantity"] == 0.5
        assert result["unit"] == "cup"
        assert result["ingredient"] == "flour"

    def test_parse_ingredient_with_mixed_number(self, calculator):
        result = calculator.parse_ingredient("1 1/2 cups sugar")
        assert result["quantity"] == 1.5
        assert result["unit"] == "cups"
        assert result["ingredient"] == "sugar"

    def test_parse_ingredient_with_decimal(self, calculator):
        result = calculator.parse_ingredient("0.5 tsp salt")
        assert result["quantity"] == 0.5
        assert result["unit"] == "tsp"
        assert result["ingredient"] == "salt"

    def test_parse_ingredient_with_whole_number(self, calculator):
        result = calculator.parse_ingredient("3 eggs")
        assert result["quantity"] == 3
        assert result["unit"] is None
        assert result["ingredient"] == "eggs"

    def test_parse_ingredient_with_no_quantity(self, calculator):
        # A missing quantity is expected to default to 1.
        result = calculator.parse_ingredient("salt to taste")
        assert result["quantity"] == 1

    def test_clean_ingredient_name_removes_modifiers(self, calculator):
        result = calculator._clean_ingredient_name("fresh chopped onions")
        assert "fresh" not in result.lower()
        assert "chopped" not in result.lower()

    def test_clean_ingredient_name_removes_parentheses(self, calculator):
        result = calculator._clean_ingredient_name("flour (sifted)")
        assert "(" not in result
        assert ")" not in result

    def test_convert_to_grams_cup(self, calculator):
        result = calculator._convert_to_grams(2, "cups", "flour")
        assert result == 480

    def test_convert_to_grams_tablespoon(self, calculator):
        result = calculator._convert_to_grams(3, "tbsp", "olive oil")
        assert result == 45

    def test_convert_to_grams_unknown_unit(self, calculator):
        # Unknown units are expected to fall back to 100 per unit.
        result = calculator._convert_to_grams(1, "unknown", "something")
        assert result == 100

View File

@ -0,0 +1,131 @@
# Generated by Django 4.2.29 on 2026-03-05 17:01
from django.db import migrations, models
class Migration(migrations.Migration):
    # Schema migration for the ``foods`` app: adds recipe and nutrition
    # columns to ``food``, adds three indexes on ``food``, renames the food
    # image field and swaps the category image field.

    # Runs after the previous ``foods`` migration.
    dependencies = [
        ("foods", "0004_remove_food_run_time_seconds_and_more"),
    ]

    operations = [
        # --- New nullable columns on ``food`` (alphabetical, as generated) ---
        migrations.AddField(
            model_name="food",
            name="carbohydrates",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="cook_time_minutes",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="fat",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="fiber",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="ingredients",
            field=models.TextField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="instructions",
            field=models.TextField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="prep_time_minutes",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="protein",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="servings",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="sodium",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="source_site",
            field=models.CharField(blank=True, max_length=100, null=True),
        ),
        # ``source_url`` is declared unique here and additionally receives an
        # explicit index in the AddIndex operation below.
        migrations.AddField(
            model_name="food",
            name="source_url",
            field=models.URLField(blank=True, null=True, unique=True),
        ),
        migrations.AddField(
            model_name="food",
            name="sugar",
            field=models.DecimalField(
                blank=True, decimal_places=2, max_digits=10, null=True
            ),
        ),
        migrations.AddField(
            model_name="food",
            name="total_time_minutes",
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name="food",
            name="yield_text",
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        # --- Indexes on ``food`` ---
        migrations.AddIndex(
            model_name="food",
            index=models.Index(
                fields=["source_url"], name="foods_food_source__f42f6f_idx"
            ),
        ),
        migrations.AddIndex(
            model_name="food",
            index=models.Index(
                fields=["allrecipe_id"], name="foods_food_allreci_3e7a6c_idx"
            ),
        ),
        migrations.AddIndex(
            model_name="food",
            index=models.Index(
                fields=["description"], name="foods_food_descrip_fccd4d_idx"
            ),
        ),
        # --- Image field changes ---
        # The food image is renamed in place (RenameField), whereas the
        # category image is removed and a new field is added; no operation in
        # this migration copies values from the removed category column.
        migrations.RenameField(
            model_name="food",
            old_name="allrecipe_image",
            new_name="recipe_image",
        ),
        migrations.RemoveField(
            model_name="foodcategory",
            name="allrecipe_image",
        ),
        migrations.AddField(
            model_name="foodcategory",
            name="category_image",
            field=models.ImageField(blank=True, null=True, upload_to="food/category/"),
        ),
    ]

View File

@ -1,5 +1,6 @@
import json
from dataclasses import dataclass
from typing import Optional
from typing import Optional, Tuple
from uuid import uuid4
from django.apps import apps
@ -10,6 +11,8 @@ from imagekit.models import ImageSpecField
from imagekit.processors import ResizeToFit
from scrobbles.dataclasses import BaseLogData, WithPeopleLogData
from scrobbles.mixins import ScrobblableConstants, ScrobblableMixin
from foods.sources.rscraper import RecipeScraperService
from foods.sources.usda import NutritionCalculator
BNULL = {"blank": True, "null": True}
@ -23,15 +26,15 @@ class FoodLogData(BaseLogData, WithPeopleLogData):
class FoodCategory(TimeStampedModel):
uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
name = models.CharField(max_length=255)
allrecipe_image = models.ImageField(upload_to="food/recipe/", **BNULL)
allrecipe_image_small = ImageSpecField(
source="recipe_image",
category_image = models.ImageField(upload_to="food/category/", **BNULL)
category_image_small = ImageSpecField(
source="category_image",
processors=[ResizeToFit(100, 100)],
format="JPEG",
options={"quality": 60},
)
allrecipe_image_medium = ImageSpecField(
source="recipe_image",
category_image_medium = ImageSpecField(
source="category_image",
processors=[ResizeToFit(300, 300)],
format="JPEG",
options={"quality": 75},
@ -39,6 +42,7 @@ class FoodCategory(TimeStampedModel):
allrecipe_id = models.CharField(max_length=255, **BNULL)
description = models.TextField(**BNULL)
@classmethod
def find_or_create(cls, title: str) -> "FoodCategory":
    """Return the category named ``title``, creating it when none exists.

    The model stores its label in ``name``; it has no ``title`` column, so
    the lookup must filter on ``name``. ``filter().first()`` is used rather
    than ``get_or_create`` because ``name`` is not declared unique and
    duplicate rows must not raise.

    Args:
        title: Category label to look up or create.

    Returns:
        The existing or newly created ``FoodCategory``.
    """
    category = cls.objects.filter(name=title).first()
    if category is None:
        category = cls.objects.create(name=title)
    return category
@ -47,17 +51,42 @@ class FoodCategory(TimeStampedModel):
class Food(ScrobblableMixin):
description = models.TextField(**BNULL)
# Recipe source tracking
source_url = models.URLField(null=True, blank=True, unique=True)
source_site = models.CharField(max_length=100, null=True, blank=True)
# Recipe data
description = models.TextField(**BNULL) # Recipe title
ingredients = models.TextField(**BNULL) # JSON or newline-separated
instructions = models.TextField(**BNULL) # JSON or newline-separated
prep_time_minutes = models.IntegerField(**BNULL)
cook_time_minutes = models.IntegerField(**BNULL)
total_time_minutes = models.IntegerField(**BNULL)
servings = models.IntegerField(**BNULL)
yield_text = models.CharField(
max_length=255, **BNULL
) # e.g., "8 calzones"
# Nutrition (per serving)
calories = models.IntegerField(**BNULL)
allrecipe_image = models.ImageField(upload_to="food/recipe/", **BNULL)
allrecipe_image_small = ImageSpecField(
source="allrecipe_image",
protein = models.DecimalField(max_digits=10, decimal_places=2, **BNULL)
fat = models.DecimalField(max_digits=10, decimal_places=2, **BNULL)
carbohydrates = models.DecimalField(
max_digits=10, decimal_places=2, **BNULL
)
fiber = models.DecimalField(max_digits=10, decimal_places=2, **BNULL)
sugar = models.DecimalField(max_digits=10, decimal_places=2, **BNULL)
sodium = models.DecimalField(max_digits=10, decimal_places=2, **BNULL)
recipe_image = models.ImageField(upload_to="food/recipe/", **BNULL)
recipe_image_small = ImageSpecField(
source="recipe_image",
processors=[ResizeToFit(100, 100)],
format="JPEG",
options={"quality": 60},
)
allrecipe_image_medium = ImageSpecField(
source="allrecipe_image",
recipe_image_medium = ImageSpecField(
source="recipe_image",
processors=[ResizeToFit(300, 300)],
format="JPEG",
options={"quality": 75},
@ -68,6 +97,13 @@ class Food(ScrobblableMixin):
FoodCategory, on_delete=models.DO_NOTHING, **BNULL
)
class Meta:
indexes = [
models.Index(fields=["source_url"]),
models.Index(fields=["allrecipe_id"]),
models.Index(fields=["description"]),
]
def get_absolute_url(self) -> str:
    """Return the URL of this food's detail page, addressed by its UUID."""
    url_kwargs = {"slug": self.uuid}
    return reverse("foods:food_detail", kwargs=url_kwargs)
@ -91,8 +127,8 @@ class Food(ScrobblableMixin):
@property
def primary_image_url(self) -> str:
    """Return the URL of the recipe image, or ``""`` when no image is stored.

    Reads ``recipe_image`` only; the pre-rename ``allrecipe_image`` attribute
    is no longer referenced.
    """
    if self.recipe_image:
        return self.recipe_image.url
    return ""
@property
@ -100,21 +136,155 @@ class Food(ScrobblableMixin):
return FoodLogData
@classmethod
def find_or_create_from_recipe(
    cls, url: str, category=None
) -> Tuple["Food", bool]:
    """
    Return the Food for a recipe URL, scraping and creating it when needed.

    Uses django-taggit for tags and the existing FoodCategory for category.

    Args:
        url: Address of the recipe page; stored as ``source_url``.
        category: Optional FoodCategory to assign. When omitted and the
            scraped data names a category, one is fetched or created by name.

    Returns:
        ``(food, created)``. ``created`` is False when a Food with this
        ``source_url`` already exists; nothing is scraped in that case.

    Raises:
        Whatever ``RecipeScraperService.scrape_url`` raises when the page
        cannot be fetched or parsed.
    """
    existing = cls.objects.filter(source_url=url).first()
    if existing:
        return existing, False

    scraper = RecipeScraperService()
    recipe_data = scraper.scrape_url(url)

    recipe_image = None
    if recipe_data.get("image"):
        recipe_image = scraper.download_image(recipe_data["image"])

    tags = scraper.extract_tags(recipe_data)

    # Servings are needed whether nutrition was scraped or is calculated,
    # so compute them once up front.
    servings = scraper.parse_servings(recipe_data.get("yields", "1")) or 1

    nutrition = recipe_data.get("nutrition")
    if not nutrition:
        nutrition = NutritionCalculator().calculate_nutrition(
            recipe_data.get("ingredients", []), servings
        )

    if not category and recipe_data.get("category"):
        category, _ = FoodCategory.objects.get_or_create(
            name=recipe_data["category"]
        )

    # A missing or non-numeric calorie value (e.g. "250 kcal") must not
    # abort the whole import.
    calories = None
    if nutrition:
        try:
            calories = int(float(nutrition.get("calories", 0)))
        except (TypeError, ValueError):
            calories = None
    nutrition = nutrition or {}

    food = cls.objects.create(
        title=recipe_data.get("title"),
        source_url=url,
        source_site=recipe_data.get("site"),
        ingredients=json.dumps(recipe_data.get("ingredients", [])),
        instructions=json.dumps(recipe_data.get("instructions", [])),
        prep_time_minutes=recipe_data.get("prep_time"),
        cook_time_minutes=recipe_data.get("cook_time"),
        total_time_minutes=recipe_data.get("total_time"),
        servings=servings,
        yield_text=recipe_data.get("yields"),
        calories=calories,
        protein=nutrition.get("protein"),
        fat=nutrition.get("fat"),
        carbohydrates=nutrition.get("carbohydrates"),
        fiber=nutrition.get("fiber"),
        # Stored when the nutrition source provides them; None otherwise.
        sugar=nutrition.get("sugar"),
        sodium=nutrition.get("sodium"),
        category=category,
    )

    # Tags are attached through the ``genre`` manager.
    if tags:
        food.genre.add(*tags)

    if recipe_image:
        food.recipe_image.save(recipe_image.name, recipe_image, save=True)

    # Only derive an id when the URL has the expected "/recipe/<id>/" shape;
    # otherwise the split would yield an unrelated URL fragment.
    if "allrecipes.com" in url and "/recipe/" in url:
        food.allrecipe_id = url.split("/recipe/")[-1].split("/")[0]
        food.save()

    return food, True
@classmethod
def find_or_create_from_ingredients(
    cls,
    ingredients: list[str],
    title: str,
    servings: int = 1,
    category=None,
) -> Tuple["Food", bool]:
    """
    Create a Food from an ingredient list (no URL).

    Nutrition is computed by ``NutritionCalculator``.

    Args:
        ingredients: Ingredient lines, e.g. ``"1 cup flour"``.
        title: Recipe title; stored in ``description`` and used for the
            duplicate lookup.
        servings: Number of servings passed to the nutrition calculation.
        category: Optional FoodCategory to assign.

    Returns:
        ``(food, created)``. ``created`` is False when an existing Food whose
        description contains ``title`` was found.
    """
    # An empty title would make ``icontains`` match every row and return an
    # arbitrary Food, so the duplicate lookup only runs for a real title.
    # NOTE(review): substring matching can still pair short titles with
    # unrelated recipes; confirm whether an exact match is intended.
    if title and title.strip():
        existing = cls.objects.filter(description__icontains=title).first()
        if existing:
            return existing, False

    nutrition = NutritionCalculator().calculate_nutrition(
        ingredients, servings
    )

    # A missing or non-numeric calorie value must not abort creation.
    calories = None
    if nutrition:
        try:
            calories = int(float(nutrition.get("calories", 0)))
        except (TypeError, ValueError):
            calories = None
    nutrition = nutrition or {}

    food = cls.objects.create(
        description=title,
        ingredients=json.dumps(ingredients),
        servings=servings,
        calories=calories,
        protein=nutrition.get("protein"),
        fat=nutrition.get("fat"),
        carbohydrates=nutrition.get("carbohydrates"),
        fiber=nutrition.get("fiber"),
        # Stored when the calculator provides them; None otherwise.
        sugar=nutrition.get("sugar"),
        sodium=nutrition.get("sodium"),
        category=category,
    )
    return food, True
def refresh_nutrition(self) -> bool:
    """Recalculate nutrition from the stored ingredients.

    ``ingredients`` is read as JSON first; when it is not valid JSON it is
    treated as newline-separated text, one ingredient per line.

    Returns:
        True when the nutrition fields were recalculated and saved, False
        when there are no ingredients or the calculation or save failed.
    """
    import logging

    if not self.ingredients:
        return False

    try:
        ingredients = json.loads(self.ingredients)
    except ValueError:
        # Not JSON: fall back to the newline-separated representation.
        ingredients = [
            line.strip()
            for line in self.ingredients.splitlines()
            if line.strip()
        ]

    try:
        nutrition = NutritionCalculator().calculate_nutrition(
            ingredients, self.servings or 1
        )
        self.calories = int(nutrition.get("calories", 0))
        self.protein = nutrition.get("protein")
        self.fat = nutrition.get("fat")
        self.carbohydrates = nutrition.get("carbohydrates")
        self.fiber = nutrition.get("fiber")
        self.save(
            update_fields=[
                "calories",
                "protein",
                "fat",
                "carbohydrates",
                "fiber",
            ]
        )
    except Exception:
        # Best effort by design: report failure to the caller, but leave a
        # trace instead of discarding the error silently.
        logging.getLogger(__name__).exception(
            "Could not refresh nutrition for food %s", self.pk
        )
        return False
    return True
def scrobbles(self, user_id):
Scrobble = apps.get_model("scrobbles", "Scrobble")

View File

@ -0,0 +1,182 @@
# services/recipe_scraper.py
from recipe_scrapers import scrape_html
from recipe_scrapers._exceptions import (
NoSchemaFoundInWildMode,
SchemaOrgException,
)
import requests
from typing import Dict, Optional, List
from django.core.files.base import ContentFile
class RecipeScraperService:
    """Fetch recipe pages and turn them into plain dictionaries.

    Parsing is delegated to ``recipe_scrapers.scrape_html`` with
    ``supported_only=False``, so sites without a dedicated scraper are still
    attempted.
    """

    USER_AGENT = "Mozilla/5.0 (compatible; Vrobbler/0.3)"
    # Seconds to wait on any outbound HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": self.USER_AGENT})

    def scrape(self, html: str, org_url: str) -> Dict:
        """Scrape recipe data from HTML.

        Args:
            html: The HTML content of the page
            org_url: The original URL the HTML was fetched from

        Returns:
            A dict with the keys ``title``, ``ingredients``, ``instructions``,
            ``yields``, ``nutrition``, ``course``, ``dietary``, ``prep_time``,
            ``total_time``, ``image``, ``cook_time``, ``cuisine``, ``site``,
            ``url``, ``category`` and ``keywords``. Fields the scraper cannot
            provide hold their default (``None`` or an empty list).

        Raises:
            ValueError: If the HTML cannot be parsed as a recipe.
        """
        try:
            scraper = scrape_html(html, org_url=org_url, supported_only=False)
        except Exception as e:
            raise ValueError(f"Failed to scrape {org_url}: {str(e)}") from e
        data_dict = {
            "title": scraper.title(),
            "ingredients": self._safe_call(scraper, "ingredients", []),
            "instructions": self._safe_call(scraper, "instructions_list", []),
            "yields": self._safe_call(scraper, "yields"),
            "nutrition": self._safe_call(scraper, "nutrition"),
            "course": self._safe_call(scraper, "course"),
            "dietary": self._safe_call(scraper, "dietary", []),
            "prep_time": self._safe_call(scraper, "prep_time"),
            "total_time": self._safe_call(scraper, "total_time"),
            "image": self._safe_call(scraper, "image"),
            "cook_time": self._safe_call(scraper, "cook_time"),
            "cuisine": self._safe_call(scraper, "cuisine"),
            "site": self._safe_call(scraper, "host"),
            "url": org_url,
            "category": self._safe_call(scraper, "category"),
            "keywords": self._safe_call(scraper, "keywords"),
        }
        return data_dict

    def scrape_url(self, url: str):
        """Fetch ``url`` and scrape the returned HTML.

        Raises:
            ValueError: If the site does not answer with HTTP 200 or the
                page cannot be parsed as a recipe.
        """
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise ValueError(
                "Recipe website returned non 200 response "
                f"({response.status_code}) for {url}"
            )
        return self.scrape(response.text, org_url=url)

    @classmethod
    def is_recipe(cls, html: str, org_url: str) -> bool:
        """Check if HTML contains a recipe.

        Args:
            html: The HTML content to check
            org_url: The original URL the HTML was fetched from

        Returns:
            True when a schema could be extracted; False otherwise, including
            on any parsing error.
        """
        try:
            scraper = scrape_html(html, org_url=org_url, supported_only=False)
            return scraper.schema is not None
        except Exception:
            # Deliberately broad: arbitrary web pages may fail in many ways
            # and every failure simply means "not a recipe".
            return False

    @classmethod
    def is_recipe_url(cls, url: str) -> bool:
        """Check if a URL points to a recipe by fetching and checking the HTML."""
        import logging

        log = logging.getLogger(__name__)
        try:
            response = requests.get(
                url,
                timeout=cls.REQUEST_TIMEOUT,
                headers={"User-Agent": cls.USER_AGENT},
            )
        except Exception as e:
            log.debug("Could not fetch %s: %s", url, e)
            return False
        if response.status_code != 200:
            log.debug("Recipe website returned non 200 response")
            return False
        return cls.is_recipe(response.text, url)

    def _safe_call(self, scraper, method, default=None):
        """
        Safely call a scraper method, returning default on error.

        Handles missing methods, methods that raise, and methods that return
        a falsy value (None, empty string, empty list).
        """
        try:
            if not hasattr(scraper, method):
                return default
            result = getattr(scraper, method)()
            return result if result else default
        except (
            AttributeError,
            TypeError,
            KeyError,
            ValueError,
            SchemaOrgException,
        ):
            # AttributeError: method doesn't exist on this scraper class
            # TypeError: method exists but can't be called
            # KeyError: method exists but data structure is unexpected
            return default

    def download_image(self, image_url: str) -> "Optional[ContentFile]":
        """Download recipe image and return as Django ContentFile.

        Returns None when ``image_url`` is empty or the download fails. The
        file name is taken from the URL path when it has a known image
        extension (compared case-insensitively), otherwise
        ``recipe_image.jpg`` is used.
        """
        if not image_url:
            return None
        try:
            response = self.session.get(
                image_url, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except Exception:
            # Best effort: a missing image must not block the recipe import.
            return None
        filename = image_url.split("/")[-1].split("?")[0]
        if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
            filename = "recipe_image.jpg"
        return ContentFile(response.content, name=filename)

    def parse_servings(self, yield_text: str) -> Optional[int]:
        """Extract number of servings from yield text.

        Returns the first integer found in ``yield_text`` (e.g. ``"serves 8"``
        gives 8), or None when the text is empty or contains no digits.
        """
        import re

        if not yield_text:
            return None
        match = re.search(r"(\d+)", str(yield_text))
        return int(match.group(1)) if match else None

    @staticmethod
    def _normalize_tags(value, split_commas: bool = False) -> List[str]:
        """Turn a string or list of strings into stripped, non-empty tags."""
        if not value:
            return []
        if isinstance(value, str):
            items = value.split(",") if split_commas else [value]
        elif isinstance(value, (list, tuple, set)):
            items = value
        else:
            return []
        # Non-string entries are ignored rather than crashing on ``.strip()``.
        stripped = (item.strip() for item in items if isinstance(item, str))
        return [tag for tag in stripped if tag]

    def extract_tags(self, recipe_data: Dict) -> List[str]:
        """Extract all tags from recipe metadata.

        Collects ``cuisine``, ``dietary`` and ``course`` (string or list) and
        ``keywords`` (comma-separated string or list). Duplicates and blank
        entries are removed; the result is sorted so it is deterministic.
        """
        tags = set()
        for field in ("cuisine", "dietary", "course"):
            tags.update(self._normalize_tags(recipe_data.get(field)))
        tags.update(
            self._normalize_tags(
                recipe_data.get("keywords"), split_commas=True
            )
        )
        return sorted(tags)

View File

@ -0,0 +1,547 @@
from django.conf import settings
import requests
from typing import Dict, List, Optional
from functools import lru_cache
from typing import List, Dict
import re
from fractions import Fraction
import logging
logger = logging.getLogger(__name__)
class USDAFoodAPI:
    """Small client for the USDA FoodData Central REST API."""

    BASE_URL = "https://api.nal.usda.gov/fdc/v1"
    # Seconds to wait on any API request before giving up.
    REQUEST_TIMEOUT = 10
    # Maximum number of distinct searches remembered per client instance.
    SEARCH_CACHE_SIZE = 100

    # USDA nutrient numbers mapped to the keys returned by extract_nutrients.
    # Each number appears once: a repeated key in a dict literal silently
    # keeps only the last entry.
    NUTRIENT_MAP = {
        "203": "protein",
        "204": "fat",
        "205": "carbohydrates",
        "208": "calories",
        "269": "sugar",
        "291": "fiber",
        "307": "sodium",
    }

    def __init__(self, api_key: Optional[str] = None):
        """Create a client.

        Args:
            api_key: API key; falls back to ``settings.USDA_API_KEY``.

        Raises:
            ValueError: If no key is given and none is configured.
        """
        self.api_key = api_key or getattr(settings, "USDA_API_KEY", None)
        if not self.api_key:
            raise ValueError(
                "USDA API key not found. Set USDA_API_KEY in your Django settings."
            )
        self.session = requests.Session()
        self._search_cache = {}

    def _get_base_params(self) -> Dict:
        """Return base parameters including the API key."""
        return {"api_key": self.api_key}

    def search_foods(
        self, query: str, page_size: int = 50, page_number: int = 1
    ) -> List[Dict]:
        """Search for food items by keyword.

        Results are cached per instance, keyed by the three arguments; the
        oldest entry is dropped once ``SEARCH_CACHE_SIZE`` is reached. A
        per-instance cache is used instead of ``lru_cache`` on the method,
        which would key on ``self`` and keep every client alive.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        cache_key = (query, page_size, page_number)
        # setdefault keeps this working for instances built without __init__.
        cache = self.__dict__.setdefault("_search_cache", {})
        if cache_key in cache:
            return cache[cache_key]

        url = f"{self.BASE_URL}/foods/search"
        params = {
            **self._get_base_params(),
            "query": query,
            "pageSize": page_size,
            "pageNumber": page_number,
            "includeNutrientCodes": ",".join(self.NUTRIENT_MAP),
        }
        response = self.session.get(
            url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        foods = response.json().get("foods", [])

        if len(cache) >= self.SEARCH_CACHE_SIZE:
            # Dicts preserve insertion order, so the first key is the oldest.
            cache.pop(next(iter(cache)))
        cache[cache_key] = foods
        return foods

    def get_food_details(self, fdc_id: int) -> Dict:
        """Retrieve detailed nutrient data for a specific food item.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.BASE_URL}/food/{fdc_id}"
        params = {
            **self._get_base_params(),
            # Protein, fat, carbs, calories, sugar, fiber, sodium
            "nutrients": [int(number) for number in self.NUTRIENT_MAP],
        }
        response = self.session.get(
            url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_multiple_foods(self, fdc_ids: List[int]) -> List[Dict]:
        """Retrieve details for multiple food items.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.BASE_URL}/foods"
        params = {
            **self._get_base_params(),
            "fdcIds": fdc_ids,
            "nutrients": list(self.NUTRIENT_MAP),
        }
        response = self.session.get(
            url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json().get("foods", [])

    def extract_nutrients(self, food_data: Dict) -> Dict:
        """Extract key nutrients into a clean dictionary.

        Three entry shapes are understood inside ``foodNutrients``:

        * ``{"nutrientNumber": "203", "value": ...}``
        * ``{"nutrient": {"number": "203", ...}, "amount": ...}``; when the
          nested dict has no ``number`` its ``id`` is used instead
        * ``{"number": "203", "amount": ...}``

        The quantity is read from ``value`` and, when that is absent, from
        ``amount``.

        Returns:
            A dict with the keys ``protein``, ``fat``, ``carbohydrates``,
            ``calories``, ``fiber``, ``sugar`` and ``sodium``; nutrients not
            present in ``food_data`` stay at 0.
        """
        nutrients = {
            "protein": 0,
            "fat": 0,
            "carbohydrates": 0,
            "calories": 0,
            "fiber": 0,
            "sugar": 0,
            "sodium": 0,
        }
        food_nutrients = food_data.get("foodNutrients", [])
        if not food_nutrients:
            logger.warning(
                f"No nutrients found for food: {food_data.get('description', 'unknown')}"
            )
            return nutrients

        for entry in food_nutrients:
            if not isinstance(entry, dict):
                continue
            nested = entry.get("nutrient")
            if "nutrientNumber" in entry:
                number = entry["nutrientNumber"]
            elif isinstance(nested, dict):
                number = nested.get("number") or nested.get("id")
            else:
                number = entry.get("number")

            key = self.NUTRIENT_MAP.get(str(number))
            if key is None:
                continue

            value = entry.get("value")
            if value is None:
                value = entry.get("amount")
            try:
                nutrients[key] = float(value)
            except (TypeError, ValueError):
                # Missing or non-numeric quantity: keep the default of 0.
                continue
        return nutrients

    def get_food_summary(self, fdc_id: int) -> Dict:
        """Return a simplified summary of a food item."""
        details = self.get_food_details(fdc_id)
        return {
            "fdcId": details.get("fdcId"),
            "description": details.get("description"),
            "dataType": details.get("dataType"),
            "brandOwner": details.get("brandOwner"),
            "servingSize": details.get("servingSize"),
            "servingSizeUnit": details.get("servingSizeUnit"),
            "nutrients": self.extract_nutrients(details),
        }
class NutritionCalculator:
    """Estimate recipe nutrition from free-text ingredient lines via the USDA API.

    Each ingredient line is split into a quantity, a unit and a cleaned
    ingredient name. The name is searched through ``USDAFoodAPI``, and the
    nutrients of the first hit are scaled by an approximate gram weight.
    All weights and densities used here are rough kitchen approximations.
    """

    # Units recognised directly after the quantity (matched case-insensitively).
    UNITS = [
        "cup", "cups", "c",
        "tablespoon", "tablespoons", "tbsp", "tb", "t",
        "teaspoon", "teaspoons", "tsp", "ts",
        "ounce", "ounces", "oz",
        "fluid ounce", "fluid ounces", "fl oz",
        "pound", "pounds", "lb", "lbs",
        "gram", "grams", "g",
        "kilogram", "kilograms", "kg",
        "milliliter", "milliliters", "ml",
        "liter", "liters", "l",
        "piece", "pieces",
        "whole", "large", "medium", "small",
        "can", "cans",
        "bottle", "bottles",
        "jar", "jars",
        "package", "packages", "pkg",
        "box", "boxes",
        "slice", "slices",
        "clove", "cloves",
        "stick", "sticks",
        "pinch", "dash", "handful", "bunch",
        "sprig", "sprigs",
    ]

    # Descriptive words stripped from the ingredient name before searching
    # (they don't affect nutrition much).
    MODIFIERS = [
        "all-purpose", "all purpose", "ap",
        "bread", "cake",
        "self-rising", "self rising",
        "granulated", "powdered", "confectioners",
        "brown", "white", "raw",
        "extra virgin", "virgin",
        "light", "dark", "heavy", "whipping",
        "fresh", "dried", "frozen", "canned", "jarred", "bottled",
        "chopped", "diced", "minced", "sliced", "grated", "shredded",
        "crushed", "melted", "softened",
        "room temperature", "cold", "warm", "hot",
        "packed", "sifted", "sieved", "ground",
        "whole", "halved", "quartered",
        "peeled", "unpeeled", "seeded", "pitted", "cored", "trimmed",
        "optional", "to taste", "as needed", "for garnish", "for serving",
    ]

    # Nutrient keys accumulated by calculate_nutrition, in output order.
    _NUTRIENT_KEYS = (
        "calories",
        "protein",
        "fat",
        "carbohydrates",
        "fiber",
        "sugar",
        "sodium",
    )

    # Grams represented by one of each weight unit (approximate).
    _WEIGHT_GRAMS = {
        "ounce": 28, "ounces": 28, "oz": 28,
        "pound": 454, "pounds": 454, "lb": 454, "lbs": 454,
        "gram": 1, "grams": 1, "g": 1,
        "kilogram": 1000, "kilograms": 1000, "kg": 1000,
    }

    # Millilitres represented by one of each volume unit (approximate).
    _VOLUME_ML = {
        "cup": 240, "cups": 240, "c": 240,
        "tablespoon": 15, "tablespoons": 15, "tbsp": 15, "tb": 15,
        "teaspoon": 5, "teaspoons": 5, "tsp": 5, "ts": 5, "t": 5,
        "fluid ounce": 30, "fluid ounces": 30, "fl oz": 30,
        "milliliter": 1, "milliliters": 1, "ml": 1,
        "liter": 1000, "liters": 1000, "l": 1000,
        # Roughly 1/16 and 1/8 of a teaspoon.
        "pinch": 0.3125, "dash": 0.625,
    }
    _ML_PER_CUP = 240

    # (keywords, grams per cup). The first group with a keyword contained in
    # the ingredient name wins. The same figure doubles as the per-unit
    # fallback weight when the unit itself is not convertible.
    _CUP_GRAMS_BY_KEYWORD = (
        (("flour", "sugar", "rice", "grain"), 125),  # dry ingredients
        (("oil", "butter", "fat"), 220),  # dense liquids
        (("milk", "water", "broth"), 240),  # liquids
    )
    _DEFAULT_UNIT_GRAMS = 100

    # Single-character vulgar fractions commonly found in scraped recipes.
    _VULGAR_FRACTIONS = {
        "\u00bc": 1 / 4,  # one quarter
        "\u00bd": 1 / 2,  # one half
        "\u00be": 3 / 4,  # three quarters
        "\u2153": 1 / 3,  # one third
        "\u2154": 2 / 3,  # two thirds
        "\u2155": 1 / 5,  # one fifth
        "\u2159": 1 / 6,  # one sixth
        "\u215b": 1 / 8,  # one eighth
        "\u215c": 3 / 8,  # three eighths
        "\u215d": 5 / 8,  # five eighths
        "\u215e": 7 / 8,  # seven eighths
    }

    _MIXED_RE = re.compile(r"^(\d+)\s+(\d+)/(\d+)")
    _VULGAR_RE = re.compile(r"^(\d+)?\s*([" + "".join(_VULGAR_FRACTIONS) + r"])")
    _FRACTION_RE = re.compile(r"^(\d+)/(\d+)")
    _DECIMAL_RE = re.compile(r"^(\d+(?:\.\d*)?|\.\d+)")
    _TRAILING_DESCRIPTOR_RE = re.compile(
        r"\s+(fresh|dried|ground|chopped|sliced)$", re.IGNORECASE
    )

    def __init__(self):
        self.usda = USDAFoodAPI()
        self._unit_pattern = self._build_unit_pattern()

    def _build_unit_pattern(self) -> re.Pattern:
        """Build a case-insensitive regex matching any entry of ``UNITS``.

        Longer units come first in the alternation so that, for example,
        "cups" is preferred over "cup" and "c".
        """
        sorted_units = sorted(self.UNITS, key=len, reverse=True)
        unit_str = "|".join(re.escape(u) for u in sorted_units)
        return re.compile(rf"\b({unit_str})\b", re.IGNORECASE)

    def parse_ingredient(self, ingredient_text: str) -> Dict:
        """Parse an ingredient line into quantity, unit and clean name.

        Examples:
            "2 cups all-purpose flour" -> quantity 2.0, unit "cups", ingredient "flour"
            "1 1/2 tsp salt"           -> quantity 1.5, unit "tsp", ingredient "salt"
            "1/4 cup olive oil"        -> quantity 0.25, unit "cup", ingredient "olive oil"
            "3 large eggs"             -> quantity 3.0, unit "large", ingredient "eggs"

        Returns:
            A dict with ``quantity`` (number, ``1`` when none was found),
            ``unit`` (lower-cased string or ``None``), ``ingredient``
            (cleaned name) and ``original`` (the stripped input line).
        """
        text = ingredient_text.strip()
        quantity, remaining = self._parse_quantity(text)
        unit, remaining = self._parse_unit(remaining)
        ingredient_name = self._clean_ingredient_name(remaining)
        return {
            "quantity": quantity,
            "unit": unit,
            "ingredient": ingredient_name,
            "original": text,
        }

    def _parse_quantity(self, text: str) -> tuple:
        """Extract a leading quantity and return ``(quantity, remaining_text)``.

        Handles "2", "0.5", ".5", "1/2", "1 1/2" and single-character
        vulgar fractions, optionally preceded by a whole number. A fraction
        with a zero denominator is consumed but contributes nothing. When no
        quantity is present, ``(1, text)`` is returned.
        """
        text = text.strip()

        mixed = self._MIXED_RE.match(text)
        if mixed:
            whole = int(mixed.group(1))
            denominator = int(mixed.group(3))
            fraction = int(mixed.group(2)) / denominator if denominator else 0
            return whole + fraction, text[mixed.end():].strip()

        vulgar = self._VULGAR_RE.match(text)
        if vulgar:
            whole = int(vulgar.group(1)) if vulgar.group(1) else 0
            quantity = whole + self._VULGAR_FRACTIONS[vulgar.group(2)]
            return quantity, text[vulgar.end():].strip()

        simple = self._FRACTION_RE.match(text)
        if simple:
            remaining = text[simple.end():].strip()
            denominator = int(simple.group(2))
            if not denominator:
                # "1/0" is not a usable amount; fall back to the default.
                return 1, remaining
            return int(simple.group(1)) / denominator, remaining

        decimal = self._DECIMAL_RE.match(text)
        if decimal:
            return float(decimal.group(1)), text[decimal.end():].strip()

        return 1, text

    def _parse_unit(self, text: str) -> tuple:
        """Extract a leading unit and return ``(unit, remaining_text)``.

        The unit is lower-cased; ``(None, text)`` is returned when the text
        does not start with a known unit.
        """
        text = text.strip()
        match = self._unit_pattern.match(text)
        if match:
            return match.group(1).lower(), text[match.end():].strip()
        return None, text

    @staticmethod
    def _tidy_name(text: str) -> str:
        """Collapse whitespace, trim punctuation and drop a leading "of"."""
        text = re.sub(r"\s+", " ", text)
        text = text.strip(" ,;:.")
        text = re.sub(r"^of\s+", "", text, flags=re.IGNORECASE)
        return text.strip()

    def _clean_ingredient_name(self, text: str) -> str:
        """Clean an ingredient name for better USDA matching.

        Parenthetical notes and the words in ``MODIFIERS`` are removed, then
        whitespace and punctuation are normalised. If stripping modifiers
        would leave nothing (for example the ingredient is just "bread"),
        the name is kept with its modifiers instead of returning "".
        """
        # Remove parenthetical notes: "flour (sifted)" -> "flour"
        base = re.sub(r"\s*\([^)]*\)", "", text.strip())

        stripped = base
        for modifier in sorted(self.MODIFIERS, key=len, reverse=True):
            # Word boundaries avoid removing parts of longer words.
            stripped = re.sub(
                rf"\b{re.escape(modifier)}\b", "", stripped, flags=re.IGNORECASE
            )

        cleaned = self._tidy_name(stripped)
        if not cleaned:
            cleaned = self._tidy_name(base)
        return cleaned

    def _find_usda_match(self, ingredient_name: str) -> Optional[Dict]:
        """Find the best USDA match for an ingredient name.

        Candidate queries are tried in order and the first search hit wins:
        the name itself, a naive singular (one trailing "s" removed, skipped
        for words ending in "ss"), the name without a trailing descriptor,
        and finally just the last word. Duplicate and one-character queries
        are skipped; a failing search is logged and the next one is tried.

        Returns:
            The first result of the first successful search, or ``None``.
        """
        name = (ingredient_name or "").strip()

        candidates = [name]
        if name.endswith("s") and not name.endswith("ss"):
            candidates.append(name[:-1])
        candidates.append(self._TRAILING_DESCRIPTOR_RE.sub("", name))
        words = name.split()
        if len(words) > 1:
            candidates.append(words[-1])

        tried = set()
        for query in candidates:
            fingerprint = query.lower()
            if len(query) < 2 or fingerprint in tried:
                continue
            tried.add(fingerprint)
            try:
                results = self.usda.search_foods(query, page_size=5)
            except Exception as e:
                # Best effort: a failed lookup must not abort the recipe.
                logger.warning(f"USDA search failed for '{query}': {e}")
                continue
            if results:
                logger.debug(f"Found {query}: {results[0].get('description')}")
                return results[0]
        return None

    def _convert_to_grams(
        self, quantity: float, unit: Optional[str], ingredient_name: str
    ) -> float:
        """Convert a quantity to an approximate gram weight.

        * Weight units convert directly.
        * Volume units convert through millilitres. When the ingredient name
          contains a known keyword, the per-cup weight for that group is
          applied; otherwise 1 ml is treated as 1 g.
        * Any other unit (``None``, "large", "can", ...) counts as the
          group's per-cup weight, or 100 g, per unit of quantity.

        This is simplified; production code should use a proper table.
        """
        name = (ingredient_name or "").lower()
        cup_grams = None
        for keywords, grams in self._CUP_GRAMS_BY_KEYWORD:
            if any(word in name for word in keywords):
                cup_grams = grams
                break

        unit_key = unit.lower() if isinstance(unit, str) else unit

        if unit_key in self._WEIGHT_GRAMS:
            return quantity * self._WEIGHT_GRAMS[unit_key]

        if unit_key in self._VOLUME_ML:
            millilitres = self._VOLUME_ML[unit_key]
            if cup_grams is None:
                return quantity * millilitres
            return quantity * millilitres * cup_grams / self._ML_PER_CUP

        per_unit = self._DEFAULT_UNIT_GRAMS if cup_grams is None else cup_grams
        return quantity * per_unit

    def calculate_nutrition(self, ingredients: List[str], servings: int = 1) -> Dict:
        """Calculate per-serving nutrition for a recipe.

        Args:
            ingredients: Free-text ingredient lines.
            servings: Number of servings to divide the totals by. Values
                below 1 are treated as a single serving.

        Returns:
            A dict with the rounded per-serving nutrient values plus
            ``_matched`` (ingredients counted), ``_total`` (ingredients
            given) and ``_unmatched`` (original lines that had no USDA match
            or failed while being processed).
        """
        totals = dict.fromkeys(self._NUTRIENT_KEYS, 0)
        matched_count = 0
        unmatched = []

        for ingredient in ingredients:
            parsed = self.parse_ingredient(ingredient)

            # Try the fuller description first, then the bare name. Missing
            # parts are left out so the query never contains "None".
            query_parts = [
                f"{parsed['quantity']:g}",
                parsed["unit"],
                parsed["ingredient"],
            ]
            match = self._find_usda_match(
                " ".join(part for part in query_parts if part)
            )
            if not match:
                match = self._find_usda_match(parsed["ingredient"])
            if not match:
                unmatched.append(parsed["original"])
                continue

            try:
                nutrients = self.usda.extract_nutrients(match)
                grams = self._convert_to_grams(
                    parsed["quantity"],
                    parsed["unit"],
                    parsed["ingredient"],
                )
                # USDA data is typically per 100g, so scale accordingly.
                multiplier = grams / 100
                contribution = {
                    key: (nutrients.get(key) or 0) * multiplier
                    for key in self._NUTRIENT_KEYS
                }
            except Exception as e:
                logger.warning(
                    f"Could not compute nutrition for '{parsed['original']}': {e}"
                )
                unmatched.append(parsed["original"])
                continue

            # Totals are only touched once the whole ingredient computed
            # cleanly, so a failure never leaves a partial contribution.
            for key, amount in contribution.items():
                totals[key] += amount
            matched_count += 1

        divisor = servings if servings and servings > 0 else 1
        for key in self._NUTRIENT_KEYS:
            totals[key] = round(totals[key] / divisor, 2)

        totals["_matched"] = matched_count
        totals["_total"] = len(ingredients)
        totals["_unmatched"] = unmatched
        return totals

View File

@ -5,6 +5,7 @@ from typing import Any, Optional
import pendulum
import pytz
import requests
from beers.models import Beer
from boardgames.models import BoardGame, BoardGameDesigner, BoardGameLocation
from books.constants import READCOMICSONLINE_URL
@ -13,6 +14,8 @@ from books.utils import parse_readcomicsonline_uri
from bricksets.models import BrickSet
from dateutil.parser import parse
from django.utils import timezone
from foods.models import Food
from foods.sources.rscraper import RecipeScraperService
from locations.constants import LOCATION_PROVIDERS
from locations.models import GeoLocation
from music.constants import JELLYFIN_POST_KEYS, MOPIDY_POST_KEYS
@ -81,7 +84,10 @@ def mopidy_scrobble_media(post_data: dict, user_id: int) -> Scrobble:
log = {}
try:
log = {"mopidy_source": post_data.get("mopidy_uri", "").split(":")[0]}
log = {
"mopidy_source": post_data.get("mopidy_uri", "").split(":")[0],
"raw_data": post_data,
}
except IndexError:
pass
@ -544,6 +550,40 @@ def email_scrobble_board_game(
return scrobbles_created
def scrobble_from_recipe_website(
    url: str,
    user_id: int,
    source: str = "Vrobbler",
    action: Optional[str] = None,
) -> Scrobble:
    """Create or update a food scrobble for the recipe found at ``url``.

    The recipe is resolved to a ``Food`` through
    ``Food.find_or_create_from_recipe`` and handed to
    ``Scrobble.create_or_update`` together with the current time, a zero
    playback position and ``source``. When ``action`` is ``"stop"`` the
    scrobble is additionally stopped with ``force_finish=True``.

    Returns:
        The scrobble returned by ``Scrobble.create_or_update``.
    """
    food, created = Food.find_or_create_from_recipe(url)

    scrobble_dict = dict(
        user_id=user_id,
        timestamp=timezone.now(),
        playback_position_seconds=0,
        source=source,
    )

    log_context = dict(
        food_id=food.id,
        user_id=user_id,
        scrobble_dict=scrobble_dict,
        media_type=Scrobble.MediaType.FOOD,
        created_bool=created,
    )
    logger.info(
        "[vrobbler-scrobble] recipe scrobble request received",
        extra=log_context,
    )

    scrobble = Scrobble.create_or_update(food, user_id, scrobble_dict)
    if action != "stop":
        return scrobble

    scrobble.stop(force_finish=True)
    return scrobble
def manual_scrobble_from_url(
url: str,
user_id: int,
@ -554,6 +594,12 @@ def manual_scrobble_from_url(
we want to scrobble as a media type in and of itself. This checks whether
we know about the content type, and routes it to the appropriate media
scrobbler. Otherwise, return nothing."""
if RecipeScraperService.is_recipe_url(url):
return scrobble_from_recipe_website(
url, user_id, source=source, action=action
)
content_key = ""
domain = extract_domain(url)
@ -906,6 +952,12 @@ def manual_scrobble_webpage(
source: str = "Bookmarklet",
action: Optional[str] = None,
):
if RecipeScraperService.is_recipe_url(url):
return scrobble_from_recipe_website(
url, user_id, source=source, action=action
)
webpage = WebPage.find_or_create({"url": url})
scrobble_dict = {

View File

@ -58,6 +58,7 @@ DUMP_REQUEST_DATA = (
os.getenv("VROBBLER_DUMP_REQUEST_DATA", "false").lower() in TRUTHY
)
# API keys for external metadata services, read from the environment.
# USDA_API_KEY has no fallback, so it is None when VROBBLER_USDA_API_KEY is unset.
USDA_API_KEY = os.getenv("VROBBLER_USDA_API_KEY")
# NOTE(review): the "2" fallback looks like a shared public/test key for these
# two services - confirm before relying on it in production.
THESPORTSDB_API_KEY = os.getenv("VROBBLER_THESPORTSDB_API_KEY", "2")
THEAUDIODB_API_KEY = os.getenv("VROBBLER_THEAUDIODB_API_KEY", "2")
# Falls back to an empty string when the variable is unset.
PODCASTINDEX_API_KEY = os.getenv("VROBBLER_PODCASTINDEX_API_KEY", "")