From 4e201533463a7989923164e55bb3d2986b5b0a6b Mon Sep 17 00:00:00 2001
From: yokarion <yokarion@pm.me>
Date: Sat, 17 May 2025 23:52:12 +0200
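Subject: [PATCH] e621_tagger new features (#558)

- Fetch images from Stash in pages of 100 instead of one unbounded query.
- Tag images whose e621 lookup fails with "e621_tag_failed" and skip them
  on later runs.
- Mark successfully tagged images as organized.
- Wait 0.5s instead of 2s before each e621 request.
- Fix the "Image updated" log line, which printed a literal "${image_id}".
- Bump the plugin version to 0.2.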

---
 plugins/e621_tagger/e621_tagger.py  | 145 +++++++++++++---------------
 plugins/e621_tagger/e621_tagger.yml |   2 +-
 2 files changed, 68 insertions(+), 79 deletions(-)

diff --git a/plugins/e621_tagger/e621_tagger.py b/plugins/e621_tagger/e621_tagger.py
index 4b66bdb..f5bd37b 100644
--- a/plugins/e621_tagger/e621_tagger.py
+++ b/plugins/e621_tagger/e621_tagger.py
@@ -8,54 +8,61 @@ import stashapi.log as log
 from stashapi.stashapp import StashInterface
 
 
-
 def get_all_images(
-    client: StashInterface, 
-    skip_tags: list[str], 
-    exclude_organized: bool
+    client: StashInterface,
+    skip_tags: list[int],
+    exclude_organized: bool,
+    per_page: int = 100,
 ) -> list[dict]:
     """
-    Get all images with proper tag exclusion and organization filter
+    Generator to fetch images in pages from the stash API.
     """
-    image_filter = {}
-    pagination = {
-        "page": 1,
-        "per_page": -1,  # -1 gets all results at once
-        "sort": "created_at",
-        "direction": "ASC",
-    }
-
-    # Convert tag names to IDs
-    tag_ids = []
-    for tag_name in skip_tags:
-        tag = get_or_create_tag(client, tag_name)
-        if tag:
-            tag_ids.append(tag["id"])
-
-    if tag_ids:
-        image_filter["tags"] = {
-            "value": [],
-            "excludes": tag_ids,
-            "modifier": "INCLUDES_ALL",
-            "depth": -1,
+    page = 1
+    while True:
+        image_filter = {}
+        pagination = {
+            "page": page,
+            "per_page": per_page,
+            "sort": "created_at",
+            "direction": "ASC",
         }
 
-    if exclude_organized:
-        image_filter["organized"] = False  # Correct field name
+        if skip_tags:
+            image_filter["tags"] = {
+                "value": [],
+                "excludes": skip_tags,
+                "modifier": "INCLUDES_ALL",
+                "depth": -1,
+            }
 
-    # Maintain original parameter structure
-    return client.find_images(f=image_filter, filter=pagination)
+        if exclude_organized:
+            image_filter["organized"] = False
+
+        images = client.find_images(f=image_filter, filter=pagination)
+        if not images:
+            # no more pages
+            break
+
+        log.info(f"Fetched page {page} with {len(images)} images")
+        for img in images:
+            yield img
+
+        # move to next page
+        page += 1
 
 
 def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
     """Process e621 metadata and update Stash records"""
-    # Skip already processed images
+    # same as before...
     image = stash.find_image(image_id)
-    if any(tag["name"] == "e621_tagged" for tag in image.get("tags", [])):
+    if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
+        return
+
+    if any(t["name"] == "e621_tag_failed" for t in image.get("tags", [])):
         return
 
     try:
-        time.sleep(2)  # Rate limiting
+        time.sleep(0.5)
         response = requests.get(
             f"https://e621.net/posts.json?md5={image_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
@@ -64,53 +71,49 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> N
         response.raise_for_status()
         post_data = response.json().get("post", {})
     except Exception as e:
-        log.error(f"e621 API error: {str(e)}")
+        log.error(f"Marking as failed. e621 API error: {str(e)}")
+        e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
+        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in image.get("tags", [])]
+        stash.update_image({"id": image_id, "tag_ids": list(set(fail_ids))})
         return
 
     if not post_data:
         return
 
-    # Create essential entities
     e621_tag = get_or_create_tag(stash, "e621_tagged")
     post_url = f"https://e621.net/posts/{post_data['id']}"
 
-    # Process tags
     tag_ids = [e621_tag["id"]]
-    for category in ["general", "species", "character", "artist", "copyright"]:
-        for tag in post_data.get("tags", {}).get(category, []):
-            # Clean and validate tag
+    for cat in ["general", "species", "character", "artist", "copyright"]:
+        for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:
                 continue
-            
             stash_tag = get_or_create_tag(stash, clean_tag)
             if stash_tag:
                 tag_ids.append(stash_tag["id"])
 
-    # Process studio
     studio_id = None
     if artists := post_data.get("tags", {}).get("artist"):
         studio = get_or_create_studio(stash, artists[0])
         studio_id = studio["id"]
 
-    # Process performers
     performer_ids = []
-    for char_tag in post_data.get("tags", {}).get("character", []):
-        performer_name = char_tag.split('_(')[0]
-        performer = get_or_create_performer(stash, performer_name)
-        performer_ids.append(performer["id"])
+    for char in post_data.get("tags", {}).get("character", []):
+        name = char.split('_(')[0]
+        perf = get_or_create_performer(stash, name)
+        performer_ids.append(perf["id"])
 
-    # Update image
     try:
         stash.update_image({
             "id": image_id,
+            "organized": True,
             "urls": [post_url],
             "tag_ids": list(set(tag_ids)),
             "studio_id": studio_id,
             "performer_ids": performer_ids
         })
-
-        log.info("Image updated: ${image_id}")
+        log.info(f"Image updated: {image_id}")
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
 
@@ -166,6 +169,7 @@ def get_or_create_performer(stash: StashInterface, name: str) -> dict:
 
 def scrape_image(client: StashInterface, image_id: str) -> None:
     """Main scraping handler"""
+    # same logic as before for MD5 extraction and process_e621_post call
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
         return
@@ -173,65 +177,62 @@ def scrape_image(client: StashInterface, image_id: str) -> None:
     file_data = image["visual_files"][0]
     filename = file_data["basename"]
     filename_md5 = filename.split('.')[0]
-    final_md5 = None
 
-    # First try filename-based MD5
     if re.match(r"^[a-f0-9]{32}$", filename_md5):
         final_md5 = filename_md5
         log.info(f"Using filename MD5: {final_md5}")
     else:
-        # Fallback to content-based MD5
         try:
-            file_path = file_data["path"]
-            log.info(f"Generating MD5 from file content: {file_path}")
-            
             md5_hash = hashlib.md5()
-            with open(file_path, "rb") as f:
-                # Read file in 64kb chunks for memory efficiency
+            with open(file_data["path"], "rb") as f:
                 for chunk in iter(lambda: f.read(65536), b""):
                     md5_hash.update(chunk)
-            
             final_md5 = md5_hash.hexdigest()
             log.info(f"Generated content MD5: {final_md5}")
         except Exception as e:
             log.error(f"Failed to generate MD5: {str(e)}")
             return
 
-    if final_md5:
-        process_e621_post(client, image_id, final_md5)
-    else:
-        log.warning("No valid MD5 available for processing")
+    process_e621_post(client, image_id, final_md5)
+
 
-# Plugin setup and execution
-# In the main execution block:
 if __name__ == "__main__":
+    log.info("Starting tagger with pagination...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
     config = stash.get_configuration().get("plugins", {})
     settings = {
-        "SkipTags": "e621_tagged",  # Add automatic filtering
+        "SkipTags": "e621_tagged, e621_tag_failed",
         "ExcludeOrganized": False
     }
     settings.update(config.get("e621_tagger", {}))
 
-    log.info(settings)
+    e621_tagged = get_or_create_tag(stash, "e621_tagged")
+    e621_failed = get_or_create_tag(stash, "e621_tag_failed")
 
-    # Get e621_tagged ID for filtering
-    e621_tag = get_or_create_tag(stash, "e621_tagged")
-
-    # Existing tags + automatic e621_tagged exclusion
     skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags.append(e621_tag["id"])  # Filter by ID instead of name
+    # The image filter excludes by tag ID, so resolve the configured names
+    # to IDs; skip_tags itself stays name-only for the per-image check below.
+    skip_tag_ids = [e621_tagged["id"], e621_failed["id"]]
+    for skip_name in skip_tags:
+        skip_tag = get_or_create_tag(stash, skip_name)
+        if skip_tag and skip_tag["id"] not in skip_tag_ids:
+            skip_tag_ids.append(skip_tag["id"])
 
-    images = get_all_images(stash, skip_tags, settings["ExcludeOrganized"])
-
-    # Rest of the loop remains the same
-    for i, image in enumerate(images, 1):
-        image_tag_names = [tag["name"] for tag in image.get("tags", [])]
-        if any(tag in image_tag_names for tag in skip_tags):
+    log.info("Fetching images in pages...")
+    # Fetch every page before processing anything: scrape_image() tags the
+    # images it handles, which removes them from the filtered result set, so
+    # paging while processing would shift later pages and skip images.
+    images = list(
+        get_all_images(stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=100)
+    )
+    for idx, image in enumerate(images, start=1):
+        current_tags = [t["name"] for t in image.get("tags", [])]
+        if any(t in current_tags for t in skip_tags):
             log.info(f"Skipping image {image['id']} - contains skip tag")
             continue
 
-        log.progress(i/len(images))
+        # log.progress takes a completed fraction, not a running count.
+        log.progress(idx / len(images))
         scrape_image(stash, image["id"])
diff --git a/plugins/e621_tagger/e621_tagger.yml b/plugins/e621_tagger/e621_tagger.yml
index 3ad0186..9b8ecc2 100644
--- a/plugins/e621_tagger/e621_tagger.yml
+++ b/plugins/e621_tagger/e621_tagger.yml
@@ -1,6 +1,6 @@
 name: e621_tagger
 description: Finding images and videos on e621 and tagging them.
-version: 0.1
+version: 0.2
 url: https://github.com/stashapp/CommunityScripts/
 exec:
   - python