[importers] Add flag to reimport already processed files

2026-05-28 17:19:08 -04:00
parent bea2b2d187
commit 62d8ffd794
3 changed files with 98 additions and 5 deletions
--- a/PROJECT.org
+++ b/PROJECT.org
@ -93,7 +93,7 @@ fetching and simple saving.
 :LOGBOOK:
 CLOCK: [2025-07-09 Wed 09:55]--[2025-07-09 Wed 10:15] =>  0:20
 :END:
-* Backlog [26/42] :vrobbler:project:personal:
+* Backlog [28/44] :vrobbler:project:personal:
 ** TODO [#C] Add sentiment parsing for Scrobbles with notes :vrobbler:project:scrobbles:sentiment:
 :PROPERTIES:
 :ID:       37781d6a-f3b0-48b2-bf98-33c2c791cf85
@ -496,7 +496,45 @@ or at least make it optional, and then require a M2M between Track and Artist.
 Then this one would be a Track by both `Matt Sweeney` and `Bonnie "Prince"
 Billy`

-** TODO [#A] Allow special parameter to re-import already processed GPX files :imports:gpx:
+** TODO [#A] Move imported eBird CSV files to processed/ directory on WebDAV :webdav:ebird:importers:
+:PROPERTIES:
+:ID:       445e1253-d353-4b55-b1d8-39d0a0dcdd34
+:END:
+    - File: ~vrobbler/apps/scrobbles/importers/webdav.py~ (line 439)
+    - Same pattern as the GPX importer: after importing a =.csv= file from
+      WebDAV, move it to =var/ebird/processed/= with a timestamp appended.
+
+** TODO [#A] Move imported Scale CSV files to processed/ directory on WebDAV :webdav:scale:importers:
+:PROPERTIES:
+:ID:       1a0de363-d1ea-466e-9966-e24941a6180b
+:END:
+    - File: ~vrobbler/apps/scrobbles/importers/webdav.py~ (line 496)
+    - Same pattern as the GPX importer: after importing a =.csv= file from
+      WebDAV, move it to =var/scale/processed/= with a timestamp appended.
+
+** DONE [#A] Allow special parameter to re-import already processed GPX files :imports:gpx:
+:PROPERTIES:
+:ID:       166c0809-c11a-4d02-8071-7a69dcb36e64
+:END:
+
+*** Description
+
+Now that we stash imported GPX files in the processed/ subdirectory on import,
+it would be nice to have a flag like --include-processed on the webdav importer
+that would import files both in the root of gpx/ and also in the processed
+directory. This would aid testing imports in staging quickly without constantly
+moving files back and forth.
+
+** DONE [#A] Move imported GPX files to processed/ directory on WebDAV :webdav:gpx:importers:
+:PROPERTIES:
+:ID:       db2b02fc-817c-4c39-bd51-13f9b77c7888
+:END:
+    - File: ~vrobbler/apps/scrobbles/importers/webdav.py~ (line 198)
+    - After importing a GPX/FIT file from WebDAV, move it to a =processed/=
+      subdirectory with a timestamp appended. This eliminates the DB lookup for
+      already-imported filenames — any file present in the top-level directory
+      is new. Also makes manual re-imports easy (just move a file back).
+
 ** DONE [#A] Add CSS Grid calendar view for scrobbles :vrobbler:personal:project:templates:feature:
 :PROPERTIES:
 :ID:       be915acf-d803-466a-8770-823819ebf2a9
--- a/vrobbler/apps/scrobbles/importers/webdav.py
+++ b/vrobbler/apps/scrobbles/importers/webdav.py
@ -22,7 +22,10 @@ DEFAULT_SCALE_PATH = "var/scale/"


 def import_from_webdav_for_all_users(
-    restart=False, update_retroarch_hash=False, update_koreader_etag=False
+    restart=False,
+    update_retroarch_hash=False,
+    update_koreader_etag=False,
+    include_processed=False,
 ):
    """Iterate all WebDAV-enabled users, scanning each media-type directory."""
    webdav_enabled_user_ids = UserProfile.objects.filter(
@ -31,6 +34,12 @@ def import_from_webdav_for_all_users(
        webdav_pass__isnull=False,
        webdav_auto_import=True,
    ).values_list("user_id", flat=True)
+    if include_processed:
+        logger.warning(
+            "Re-importing previously-processed files from processed/ subdirectories "
+            "— this may create duplicate scrobbles."
+        )
+
    logger.info(f"Start import of {webdav_enabled_user_ids.count()} webdav accounts")

    ko_count = 0
@ -49,7 +58,9 @@ def import_from_webdav_for_all_users(
            client, user_id, restart, update_etag_only=update_koreader_etag
        )
        logger.info("Scanning WebDAV gpx for user %s", user_id)
-        gpx_count += scan_webdav_for_gpx(client, user_id)
+        gpx_count += scan_webdav_for_gpx(
+            client, user_id, include_processed=include_processed
+        )
        logger.info("Scanning WebDAV retroarch for user %s", user_id)
        retro_count += scan_webdav_for_retroarch(
            client, user_id, update_hash_only=update_retroarch_hash
@ -200,7 +211,7 @@ def scan_webdav_for_koreader(
    return 1


-def scan_webdav_for_gpx(webdav_client, user_id):
+def scan_webdav_for_gpx(webdav_client, user_id, include_processed=False):
    gpx_path = DEFAULT_GPX_PATH  # TODO allow this to be configured in user settings
    try:
        webdav_client.info(DEFAULT_GPX_PATH)
@ -259,6 +270,41 @@ def scan_webdav_for_gpx(webdav_client, user_id):
        finally:
            os.unlink(tmp.name)

+    if include_processed:
+        try:
+            processed_files = webdav_client.list(processed_dir)
+        except Exception as e:
+            logger.warning(
+                "Could not list var/gpx/processed/",
+                extra={"user_id": user_id, "error": str(e)},
+            )
+            return new_imports
+
+        for fname in processed_files:
+            fname = os.path.basename(fname)
+            if not fname.lower().endswith(gpx_extensions):
+                continue
+
+            remote_path = f"{processed_dir}{fname}"
+            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=fname)
+            try:
+                webdav_client.download_sync(
+                    remote_path=remote_path, local_path=tmp.name
+                )
+                imp = TrailGPXImport.objects.create(
+                    user_id=user_id,
+                    original_filename=fname,
+                )
+                with open(tmp.name, "rb") as f:
+                    imp.gpx_file.save(fname, f, save=True)
+
+                process_trail_gpx_import.delay(imp.id)
+                new_imports += 1
+            except Exception as e:
+                logger.error(f"Failed to import processed GPX file {fname}: {e}")
+            finally:
+                os.unlink(tmp.name)
+
    return new_imports


--- a/vrobbler/apps/scrobbles/management/commands/import_from_webdav.py
+++ b/vrobbler/apps/scrobbles/management/commands/import_from_webdav.py
@ -1,4 +1,5 @@
 from django.core.management.base import BaseCommand
+
 from vrobbler.apps.scrobbles.importers import webdav


@ -21,6 +22,12 @@ class Command(BaseCommand):
            help="Store current WebDAV ETag on last KoReader import "
            "without re-importing (migration helper)",
        )
+        parser.add_argument(
+            "--include-processed",
+            action="store_true",
+            help="Also import files already moved to processed/ subdirectories "
+            "(may produce duplicate scrobbles; use with care on production)",
+        )

    def handle(self, *args, **options):
        restart = False
@ -28,8 +35,10 @@ class Command(BaseCommand):
            restart = True
        update_hash = options.get("update_retroarch_hash", False)
        update_etag = options.get("update_koreader_etag", False)
+        include_processed = options.get("include_processed", False)
        webdav.import_from_webdav_for_all_users(
            restart=restart,
            update_retroarch_hash=update_hash,
            update_koreader_etag=update_etag,
+            include_processed=include_processed,
        )