Improve e621_tagger (#615)

yokarion 2025-09-11 22:19:32 +02:00 committed by GitHub
parent c1151f670b
commit 233d90c689
2 changed files with 225 additions and 85 deletions
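For context, the core of this change is a shared filter builder plus paged count/stream queries against the Stash API, which the diff below then applies to both images and scenes. A rough sketch of the filter and pagination shapes those helpers build (drawn from the diff; the tag ids and the stdin connection payload are placeholders, and Stash supplies the real server_connection when it runs the plugin):

import json
import sys

from stashapi.stashapp import StashInterface

# Placeholder connection: Stash writes the real server_connection to stdin
# when it invokes the plugin.
stash = StashInterface(json.loads(sys.stdin.read())["server_connection"])

# Hypothetical ids for the skip tags (e621_tagged / e621_tag_failed).
skip_tag_ids = [12, 34]

# Shape produced by _build_filter() in the diff below.
item_filter = {
    "tags": {
        "value": [],
        "excludes": skip_tag_ids,
        "modifier": "INCLUDES_ALL",
        "depth": -1,
    },
    "organized": False,
}

# count_images() in the diff uses per_page=0 with get_count=True, which makes
# find_images return (total, results)...
pagination = {"page": 1, "per_page": 0, "sort": "created_at", "direction": "ASC"}
total, _ = stash.find_images(f=item_filter, filter=pagination, get_count=True)

# ...while stream_images() walks the same filter page by page.
page_one = stash.find_images(f=item_filter, filter={**pagination, "per_page": 50})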


@@ -4,77 +4,131 @@ import sys
 import json
 import time
 import requests
+import itertools
 import stashapi.log as log
 from stashapi.stashapp import StashInterface
+from typing import List
+MD5_RE = re.compile(r"^[a-f0-9]{32}$")
-def get_all_images(
+def _build_filter(skip_tag_ids, exclude_organized):
+    f = {}
+    if skip_tag_ids:
+        f["tags"] = {
+            "value": [],
+            "excludes": skip_tag_ids,
+            "modifier": "INCLUDES_ALL",
+            "depth": -1,
+        }
+    if exclude_organized:
+        f["organized"] = False
+    return f
+def count_images(
+    client: StashInterface, skip_tag_ids: list, exclude_organized: bool
+) -> int:
+    image_filter = _build_filter(skip_tag_ids, exclude_organized)
+    pagination = {"page": 1, "per_page": 0, "sort": "created_at", "direction": "ASC"}
+    total, _ = client.find_images(f=image_filter, filter=pagination, get_count=True)
+    return total
+def count_scenes(
+    client: StashInterface, skip_tag_ids: list, exclude_organized: bool
+) -> int:
+    scene_filter = _build_filter(skip_tag_ids, exclude_organized)
+    pagination = {"page": 1, "per_page": 0, "sort": "created_at", "direction": "ASC"}
+    total, _ = client.find_scenes(f=scene_filter, filter=pagination, get_count=True)
+    return total
+def stream_images(
     client: StashInterface,
-    skip_tags: list[int],
+    skip_tag_ids: List[int],
     exclude_organized: bool,
     per_page: int = 100,
-) -> list[dict]:
-    """
-    Generator to fetch images in pages from the stash API.
-    """
+):
     page = 1
+    base_filter = _build_filter(skip_tag_ids, exclude_organized)
     while True:
-        image_filter = {}
         pagination = {
             "page": page,
             "per_page": per_page,
             "sort": "created_at",
             "direction": "ASC",
         }
-        if skip_tags:
-            image_filter["tags"] = {
-                "value": [],
-                "excludes": skip_tags,
-                "modifier": "INCLUDES_ALL",
-                "depth": -1,
-            }
-        if exclude_organized:
-            image_filter["organized"] = False
-        images = client.find_images(f=image_filter, filter=pagination)
+        images = client.find_images(f=base_filter, filter=pagination)
         if not images:
-            # no more pages
            break
-        log.info(f"Fetched page {page} with {len(images)} images")
+        log.info(f"Fetched image page {page} with {len(images)} images")
         for img in images:
-            yield img
-        # move to next page
+            yield ("image", img)
         page += 1
-def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
-    """Process e621 metadata and update Stash records"""
-    # same as before...
-    image = stash.find_image(image_id)
-    if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
-        return
+def stream_scenes(
+    client: StashInterface,
+    skip_tag_ids: List[int],
+    exclude_organized: bool,
+    per_page: int = 100,
+):
+    page = 1
+    base_filter = _build_filter(skip_tag_ids, exclude_organized)
+    while True:
+        pagination = {
+            "page": page,
+            "per_page": per_page,
+            "sort": "created_at",
+            "direction": "ASC",
+        }
+        scenes = client.find_scenes(f=base_filter, filter=pagination)
+        if not scenes:
+            break
+        log.info(f"Fetched scene page {page} with {len(scenes)} scenes")
+        for sc in scenes:
+            yield ("scene", sc)
+        page += 1
-    if any(t["name"] == "e621_tag_failed" for t in image.get("tags", [])):
+def process_e621_post_for_item(
+    stash: StashInterface, item_type: str, item_id: str, item_md5: str
+) -> None:
+    # Fetch latest object to check tags
+    if item_type == "image":
+        obj = stash.find_image(item_id)
+        already_tagged = any(t["name"] == "e621_tagged" for t in obj.get("tags", []))
+        already_failed = any(
+            t["name"] == "e621_tag_failed" for t in obj.get("tags", [])
+        )
+    else:
+        obj = stash.find_scene(item_id)
+        already_tagged = any(t["name"] == "e621_tagged" for t in obj.get("tags", []))
+        already_failed = any(
+            t["name"] == "e621_tag_failed" for t in obj.get("tags", [])
+        )
+    if already_tagged or already_failed:
         return
     try:
         time.sleep(0.5)
         response = requests.get(
-            f"https://e621.net/posts.json?md5={image_md5}",
+            f"https://e621.net/posts.json?md5={item_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
-            timeout=10
+            timeout=10,
         )
         response.raise_for_status()
         post_data = response.json().get("post", {})
     except Exception as e:
         log.error(f"Marking as failed. e621 API error: {str(e)}")
         e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
-        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in image.get("tags", [])]
-        stash.update_image({"id": image_id, "tag_ids": list(set(fail_ids))})
+        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in obj.get("tags", [])]
+        if item_type == "image":
+            stash.update_image({"id": item_id, "tag_ids": list(set(fail_ids))})
+        else:
+            stash.update_scene({"id": item_id, "tag_ids": list(set(fail_ids))})
         return
     if not post_data:
@@ -84,7 +138,7 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
     post_url = f"https://e621.net/posts/{post_data['id']}"
     tag_ids = [e621_tag["id"]]
-    for cat in ["general", "species", "character", "artist", "copyright"]:
+    for cat in ["general", "species", "artist", "copyright", "meta"]:
         for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:
@@ -100,27 +154,30 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
     performer_ids = []
     for char in post_data.get("tags", {}).get("character", []):
-        name = char.split('_(')[0]
+        name = char.split("_(")[0]
         perf = get_or_create_performer(stash, name)
         performer_ids.append(perf["id"])
     try:
-        stash.update_image({
-            "id": image_id,
+        update_payload = {
+            "id": item_id,
             "organized": True,
             "urls": [post_url],
             "tag_ids": list(set(tag_ids)),
             "studio_id": studio_id,
-            "performer_ids": performer_ids
-        })
-        log.info(f"Image updated: {image_id}")
+            "performer_ids": performer_ids,
+        }
+        if item_type == "image":
+            stash.update_image(update_payload)
+            log.info(f"Image updated: {item_id}")
+        else:
+            stash.update_scene(update_payload)
+            log.info(f"Scene updated: {item_id}")
     except Exception as e:
         log.error(f"Update failed: {str(e)}")
 def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
     """Find or create tag with hierarchy handling"""
     # Validate tag name
     tag_name = tag_name.strip()
     if not tag_name:
         log.error("Attempted to create tag with empty name")
@@ -129,15 +186,17 @@ def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
     existing = stash.find_tags(f={"name": {"value": tag_name, "modifier": "EQUALS"}})
     if existing:
         return existing[0]
     parts = tag_name.split(":")
     parent_id = None
     for i in range(len(parts)):
-        current_name = ":".join(parts[:i+1]).strip()
+        current_name = ":".join(parts[: i + 1]).strip()
         if not current_name:
             continue
-        existing = stash.find_tags(f={"name": {"value": current_name, "modifier": "EQUALS"}})
+        existing = stash.find_tags(
+            f={"name": {"value": current_name, "modifier": "EQUALS"}}
+        )
         if not existing:
             create_data = {"name": current_name}
             if parent_id:
@@ -155,72 +214,153 @@ def get_or_create_tag(stash: StashInterface, tag_name: str) -> dict:
         parent_id = existing[0]["id"]
     return {"id": parent_id}
 def get_or_create_studio(stash: StashInterface, name: str) -> dict:
     """Find or create studio"""
     studios = stash.find_studios(f={"name": {"value": name, "modifier": "EQUALS"}})
     return studios[0] if studios else stash.create_studio({"name": name})
 def get_or_create_performer(stash: StashInterface, name: str) -> dict:
     """Find or create performer"""
-    performers = stash.find_performers(f={"name": {"value": name, "modifier": "EQUALS"}})
+    performers = stash.find_performers(
+        f={"name": {"value": name, "modifier": "EQUALS"}}
+    )
     return performers[0] if performers else stash.create_performer({"name": name})
 def scrape_image(client: StashInterface, image_id: str) -> None:
-    """Main scraping handler"""
-    # same logic as before for MD5 extraction and process_e621_post call
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
         return
     file_data = image["visual_files"][0]
-    filename = file_data["basename"]
-    filename_md5 = filename.split('.')[0]
+    filename = file_data.get("basename", "")
+    filename_md5 = filename.split(".")[0] if filename else ""
-    if re.match(r"^[a-f0-9]{32}$", filename_md5):
+    if MD5_RE.match(filename_md5):
         final_md5 = filename_md5
-        log.info(f"Using filename MD5: {final_md5}")
+        log.info(f"Using filename MD5 for image: {final_md5}")
     else:
-        try:
-            md5_hash = hashlib.md5()
-            with open(file_data["path"], "rb") as f:
-                for chunk in iter(lambda: f.read(65536), b""):
-                    md5_hash.update(chunk)
-            final_md5 = md5_hash.hexdigest()
-            log.info(f"Generated content MD5: {final_md5}")
-        except Exception as e:
-            log.error(f"Failed to generate MD5: {str(e)}")
+        if image.get("checksum"):
+            final_md5 = image["checksum"]
+            log.info(f"Using image checksum: {final_md5}")
+        elif image.get("md5"):
+            final_md5 = image["md5"]
+            log.info(f"Using image md5: {final_md5}")
+        else:
+            try:
+                md5_hash = hashlib.md5()
+                with open(file_data["path"], "rb") as f:
+                    for chunk in iter(lambda: f.read(65536), b""):
+                        md5_hash.update(chunk)
+                final_md5 = md5_hash.hexdigest()
+                log.info(f"Generated content MD5 for image: {final_md5}")
+            except Exception as e:
+                log.error(f"Failed to generate MD5 for image: {str(e)}")
+                return
+    process_e621_post_for_item(client, "image", image_id, final_md5)
+def scrape_scene(client: StashInterface, scene_id: str) -> None:
+    scene = client.find_scene(scene_id)
+    if not scene:
+        return
+    final_md5 = None
+    if scene.get("checksum") and MD5_RE.match(scene.get("checksum")):
+        final_md5 = scene.get("checksum")
+        log.info(f"Using scene checksum: {final_md5}")
+    elif scene.get("md5") and MD5_RE.match(scene.get("md5")):
+        final_md5 = scene.get("md5")
+        log.info(f"Using scene md5: {final_md5}")
+    else:
+        files = scene.get("files") or scene.get("scene_files") or []
+        if files:
+            file_data = files[0]
+            if file_data.get("checksum") and MD5_RE.match(file_data.get("checksum")):
+                final_md5 = file_data.get("checksum")
+                log.info(f"Using file checksum for scene: {final_md5}")
+            else:
+                basename = file_data.get("basename", "")
+                filename_md5 = basename.split(".")[0] if basename else ""
+                if MD5_RE.match(filename_md5):
+                    final_md5 = filename_md5
+                    log.info(f"Using filename MD5 for scene: {final_md5}")
+                else:
+                    try:
+                        md5_hash = hashlib.md5()
+                        with open(file_data["path"], "rb") as f:
+                            for chunk in iter(lambda: f.read(65536), b""):
+                                md5_hash.update(chunk)
+                        final_md5 = md5_hash.hexdigest()
+                        log.info(f"Generated content MD5 for scene: {final_md5}")
+                    except Exception as e:
+                        log.error(f"Failed to generate MD5 for scene: {str(e)}")
+                        return
+        else:
+            log.error(f"No files found for scene {scene_id}; cannot compute md5")
+            return
-    process_e621_post(client, image_id, final_md5)
+    if final_md5:
+        process_e621_post_for_item(client, "scene", scene_id, final_md5)
 if __name__ == "__main__":
-    log.info("Starting tagger with pagination...")
+    log.info("Starting tagger with stable pagination snapshot (streamed)...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
     config = stash.get_configuration().get("plugins", {})
-    settings = {
-        "SkipTags": "e621_tagged, e621_tag_failed",
-        "ExcludeOrganized": False
-    }
+    settings = {"SkipTags": "e621_tagged, e621_tag_failed", "ExcludeOrganized": False}
     settings.update(config.get("e621_tagger", {}))
     e621_tagged = get_or_create_tag(stash, "e621_tagged")
     e621_failed = get_or_create_tag(stash, "e621_tag_failed")
-    skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags = [st for st in skip_tags]
-    skip_tags.extend([e621_tagged["id"], e621_failed["id"]])
+    skip_tag_names = [n.strip() for n in settings["SkipTags"].split(",") if n.strip()]
+    skip_tag_ids: List[int] = []
+    for name in skip_tag_names:
+        found = stash.find_tags(f={"name": {"value": name, "modifier": "EQUALS"}})
+        if found:
+            skip_tag_ids.append(found[0]["id"])
+    skip_tag_ids.extend([e621_tagged["id"], e621_failed["id"]])
-    log.info("Fetching images in pages...")
-    for idx, image in enumerate(get_all_images(stash, skip_tags, settings["ExcludeOrganized"], per_page=100), start=1):
-        current_tags = [t["name"] for t in image.get("tags", [])]
-        if any(t in current_tags for t in skip_tags):
-            log.info(f"Skipping image {image['id']} - contains skip tag")
+    per_page = 50
+    log.info("Counting images (no storage)...")
+    num_images = count_images(stash, skip_tag_ids, settings["ExcludeOrganized"])
+    log.info("Counting scenes (no storage)...")
+    num_scenes = count_scenes(stash, skip_tag_ids, settings["ExcludeOrganized"])
+    total = (num_images + num_scenes) or 1
+    log.info(f"Total items (images + scenes): {total}")
+    stream = itertools.chain(
+        stream_images(
+            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
+        ),
+        stream_scenes(
+            stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=per_page
+        ),
+    )
+    for idx, (item_type, item) in enumerate(stream, start=1):
+        log.progress(float(idx - 1) / float(total))
+        item_id = item["id"]
+        current_tag_ids = [t["id"] for t in item.get("tags", [])]
+        if any(tid in current_tag_ids for tid in skip_tag_ids):
+            log.info(f"Skipping {item_type} {item_id} - contains skip tag")
+            log.progress(float(idx) / float(total))
             continue
-        log.progress(idx)
-        scrape_image(stash, image["id"])
+        if item_type == "image":
+            scrape_image(stash, item_id)
+        else:
+            scrape_scene(stash, item_id)
+        log.progress(float(idx) / float(total))
+    log.progress(1.0)


@@ -1,6 +1,6 @@
 name: e621_tagger
 description: Finding images and videos on e621 and tagging them.
-version: 0.2
+version: 0.3
 url: https://github.com/stashapp/CommunityScripts/
 exec:
   - python