Files
Stash-KennyG 2ff8501bbf Add GroupAutoScraper plugin (#682)
* Add GroupAutoScraper plugin: Automatically re-scrape groups with Adult Empire URLs to update tags and studio information. Includes main script, configuration files, and README documentation.

* Add SceneBetterDetails plugin: Enhance scene details rendering by preserving line breaks. Includes JavaScript and YAML configuration files.

* Removing new plugin to resolve old PR.

* Removing static manifest file.

Per DogmaDragon, it is created automatically.

* Update GroupAutoScraper README and add requirements.txt

- Added instructions for installing Python dependencies in the README.
- Included a new requirements.txt file listing necessary packages: requests and stashapp-tools.
- Updated README to clarify that the plugin works across environments with installed dependencies.

* Update GroupAutoScraper README to correct plugin directory path

---------

Co-authored-by: KennyG <kennyg@kennyg.com>
Co-authored-by: DogmaDragon <103123951+DogmaDragon@users.noreply.github.com>
2026-03-17 19:45:32 +02:00

388 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
autoScraper.py
External raw plugin for Stash that:
- Triggers on group hooks (e.g. Group.Create.Post).
- If the group has at least one URL, calls ScrapeGroupURL on the first URL.
- Merges scraped data back into the group via GroupUpdate:
* Uses scraped values when present, otherwise keeps existing ones.
* For studio/tags, only uses scraped entries where stored_id is not null.
* Tag ids from scraped data are merged with existing tag ids (unique).
This script is designed to be run by Stash as a raw external plugin and
expects its input JSON on stdin (the standard Stash plugin FRAGMENT format).
Requires:
- Python 3.7+
- requests (pip install requests)
"""
import sys
import json
import time
from typing import Any, Dict, List, Optional
import requests
import stashapi.log as log
from stashapi.stashapp import StashInterface
# Wall-clock timestamp taken at import; used by exit_plugin to report
# the plugin's total execution time.
START_TIME = time.time()
def exit_plugin(msg: Optional[str] = None, err: Optional[str] = None) -> None:
    """Print the plugin result JSON to stdout and terminate the process.

    Exit status is 0 when ``err`` is None, otherwise 1. When called with
    no arguments at all, a default "plugin ended" message is reported.
    """
    if msg is None and err is None:
        msg = "plugin ended"
    log.debug(f"Execution time: {round(time.time() - START_TIME, 5)}s")
    # Stash reads this single JSON object from the plugin's stdout.
    print(json.dumps({"output": msg, "error": err}))
    sys.exit(0 if err is None else 1)
def load_fragment() -> Dict[str, Any]:
    """Read the Stash plugin FRAGMENT (JSON) from stdin and decode it."""
    try:
        fragment = json.loads(sys.stdin.read())
    except Exception as exc:
        # exit_plugin() raises SystemExit, so control never reaches the
        # return statement below on failure.
        log.error(f"Failed to read/parse plugin input: {exc}")
        exit_plugin(err="invalid plugin input")
    return fragment
def build_graphql_client(server: Dict[str, Any]) -> Dict[str, Any]:
    """Derive GraphQL connection details from the fragment's server block.

    Returns a dict with the endpoint ``url``, request ``headers``, and the
    session ``cookies`` (populated only when a session cookie is present).
    """
    host = server.get("Host", "localhost")
    # A wildcard bind address is not routable from a client; use loopback.
    if host == "0.0.0.0":
        host = "localhost"
    scheme = server.get("Scheme", "http")
    port = str(server.get("Port", "9999"))
    endpoint = f"{scheme}://{host}:{port}/graphql"

    cookies: Dict[str, str] = {}
    session_cookie = server.get("SessionCookie") or {}
    cookie_value = session_cookie.get("Value")
    if cookie_value:
        cookies["session"] = cookie_value

    headers = {
        "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Connection": "keep-alive",
        "DNT": "1",
    }
    return {"url": endpoint, "headers": headers, "cookies": cookies}
def graphql_request(
    client: Dict[str, Any], query: str, variables: Dict[str, Any]
) -> Dict[str, Any]:
    """POST a GraphQL query/mutation to Stash and return its ``data`` payload.

    Any failure (transport error, non-200 status, unparseable body, or
    GraphQL-level errors) is logged and terminates the plugin via
    exit_plugin() rather than raising to the caller.
    """
    payload = {"query": query, "variables": variables}
    try:
        resp = requests.post(
            client["url"],
            json=payload,
            headers=client["headers"],
            cookies=client["cookies"],
            timeout=20,
        )
    except Exception as exc:
        log.error(f"Error calling GraphQL: {exc}")
        exit_plugin(err="graphql request failed")
    if resp.status_code != 200:
        log.error(
            f"GraphQL HTTP {resp.status_code}: {resp.content!r}"
        )
        exit_plugin(err="graphql http error")
    try:
        data = resp.json()
    except ValueError as exc:
        # Fix: a 200 response with a non-JSON body (e.g. a proxy error page)
        # previously raised an unhandled exception here instead of exiting
        # cleanly like every other failure path.
        log.error(f"GraphQL response was not valid JSON: {exc}")
        exit_plugin(err="graphql invalid response")
    if "errors" in data and data["errors"]:
        log.error(f"GraphQL errors: {data['errors']}")
        exit_plugin(err="graphql errors")
    return data.get("data", {})
def seconds_from_duration(duration: Optional[str]) -> Optional[int]:
    """
    Convert a duration string like "3:16:00" or "16:00" into seconds.
    Returns None if duration is falsy or cannot be parsed.
    """
    if not duration:
        return None
    parts = duration.split(":")
    # At most H:MM:SS, and every component must be numeric.
    if len(parts) > 3 or not all(p.isdigit() for p in parts):
        return None
    try:
        values = [int(p) for p in parts]
    except ValueError:
        # str.isdigit() accepts non-ASCII digit characters (e.g. "²")
        # that int() rejects, so this guard is still reachable.
        return None
    # Horner-style accumulation: each colon step scales by 60.
    total = 0
    for value in values:
        total = total * 60 + value
    return total
def coalesce(new_val: Any, old_val: Any) -> Any:
    """Null-coalescing helper: keep new_val unless it is None.

    Note that falsy-but-not-None values (0, "", []) are preserved.
    """
    if new_val is None:
        return old_val
    return new_val
def build_group_update_input(
    group_id: int,
    existing: Dict[str, Any],
    scraped: Dict[str, Any],
) -> Dict[str, Any]:
    """Build a GroupUpdateInput dict by merging scraped data over existing.

    Scraped values win when present; otherwise existing values are kept.
    Studio/tags use only scraped entries whose stored_id resolved, and
    scraped tag ids are unioned with the existing ones. Image fields are
    sent only when the scrape produced them, so stored art is never nulled.
    """
    payload: Dict[str, Any] = {"id": str(group_id)}

    payload["name"] = coalesce(scraped.get("name"), existing.get("name"))

    # Aliases may arrive as a list; GroupUpdateInput expects one string.
    raw_aliases = scraped.get("aliases")
    if isinstance(raw_aliases, list):
        raw_aliases = ", ".join(a for a in raw_aliases if a)
    payload["aliases"] = coalesce(raw_aliases, existing.get("aliases") or "")

    # Duration arrives as "H:MM:SS"/"MM:SS"; store seconds, fall back to
    # the existing stored value, and omit the field entirely when neither
    # side has one.
    duration = seconds_from_duration(scraped.get("duration"))
    if duration is None:
        duration = existing.get("duration")
    if duration is not None:
        payload["duration"] = duration

    payload["date"] = coalesce(scraped.get("date"), existing.get("date"))
    payload["director"] = coalesce(scraped.get("director"), existing.get("director"))

    # URLs: a non-empty scraped list replaces the stored one wholesale.
    urls = scraped.get("urls") or existing.get("urls") or []
    if urls:
        payload["urls"] = urls

    payload["synopsis"] = coalesce(scraped.get("synopsis"), existing.get("synopsis"))

    # Studio: a resolved scraped stored_id wins, else keep the linked studio.
    studio_id = (scraped.get("studio") or {}).get("stored_id")
    if studio_id is None:
        studio_id = (existing.get("studio") or {}).get("id")
    if studio_id is not None:
        payload["studio_id"] = str(studio_id)

    # Tags: existing ids first, then resolved scraped ids, de-duplicated
    # while preserving order; only send the field when non-empty.
    tag_ids: List[str] = []
    for tag in existing.get("tags") or []:
        if tag.get("id") is not None and str(tag["id"]) not in tag_ids:
            tag_ids.append(str(tag["id"]))
    for tag in scraped.get("tags") or []:
        if tag.get("stored_id") is not None and str(tag["stored_id"]) not in tag_ids:
            tag_ids.append(str(tag["stored_id"]))
    if tag_ids:
        payload["tag_ids"] = tag_ids

    # Images: only send when we actually have scraped data URIs; otherwise
    # omit so we don't overwrite existing images with null.
    for image_key in ("front_image", "back_image"):
        if scraped.get(image_key):
            payload[image_key] = scraped[image_key]

    return payload
def main() -> None:
    """Plugin entry point.

    Reads the hook fragment from stdin and, for Group.Create.Post /
    Group.Update.Post hooks: fetches the group, scrapes its first URL
    (Adult DVD Empire URLs only), and merges the scraped data back into
    the group via the groupUpdate mutation. Every outcome — including
    the skip paths — terminates through exit_plugin().
    """
    fragment = load_fragment()
    server = fragment.get("server_connection") or {}
    client = build_graphql_client(server)
    # Create StashInterface instance for consistency with other plugins,
    # even though this plugin currently uses direct GraphQL requests.
    # NOTE(review): _stash is otherwise unused.
    _stash = StashInterface(server)
    args = fragment.get("args") or {}
    # When triggered by a hook, we get hookContext with type/id.
    hook_ctx = args.get("hookContext") or {}
    hook_type = hook_ctx.get("type")
    hook_id = hook_ctx.get("id")
    if not hook_type or not hook_id:
        # Not a hook invocation; nothing to do.
        exit_plugin("No hook context; skipping.")
    if hook_type not in ("Group.Create.Post", "Group.Update.Post"):
        # Only act on group create/update hooks.
        exit_plugin(f"Ignoring hook type {hook_type}")
    try:
        group_id = int(hook_id)
    except (TypeError, ValueError):
        log.error(f"Invalid group id in hookContext: {hook_id!r}")
        exit_plugin(err="invalid group id")
    log.debug(f"Running GroupAutoScraper for group id {group_id} ({hook_type})")
    # 1. Fetch existing group (fields mirror what build_group_update_input reads).
    find_group_query = """
    query FindGroup($id: ID!) {
        findGroup(id: $id) {
            id
            name
            aliases
            duration
            date
            director
            urls
            synopsis
            front_image_path
            back_image_path
            studio {
                id
            }
            tags {
                id
            }
            containing_groups {
                group {
                    id
                }
                description
            }
        }
    }
    """
    data = graphql_request(client, find_group_query, {"id": str(group_id)})
    group = data.get("findGroup")
    if not group:
        log.error(f"No group found with id {group_id}")
        exit_plugin(err="group not found")
    urls = group.get("urls") or []
    if not urls:
        # Nothing to scrape, but not an error.
        log.info(f"Group {group_id} has no URLs; nothing to do.")
        exit_plugin("group has no URLs; skipped")
    # Only the first stored URL is scraped.
    target_url = urls[0]
    # Only handle AdultDVD Empire URLs.
    if "adultdvdempire.com/" not in target_url:
        log.info("AutoGroup only uses AdultDVDEmpire URLS. Exiting.")
        exit_plugin("non-AdultDVDEmpire URL; skipped")
    # 2. Scrape group URL via Stash's configured group scraper.
    scrape_query = """
    query ScrapeGroupURL($url: String!) {
        scrapeGroupURL(url: $url) {
            name
            aliases
            duration
            date
            rating
            director
            urls
            synopsis
            front_image
            back_image
            studio {
                stored_id
                name
                urls
            }
            tags {
                stored_id
                name
                remote_site_id
            }
        }
    }
    """
    scrape_data = graphql_request(client, scrape_query, {"url": target_url})
    scraped = scrape_data.get("scrapeGroupURL")
    if not scraped:
        log.error(f"ScrapeGroupURL returned no data for URL {target_url}")
        exit_plugin(err="scrapeGroupURL returned no data")
    # 3. Build GroupUpdate input.
    # Compute tag additions and studio status for logging (mirrors the
    # merge done inside build_group_update_input).
    existing_tags = group.get("tags") or []
    existing_tag_ids = {str(t.get("id")) for t in existing_tags if t.get("id") is not None}
    scraped_tags = scraped.get("tags") or []
    scraped_tag_ids = [
        str(t.get("stored_id"))
        for t in scraped_tags
        if t.get("stored_id") is not None
    ]
    tags_added_count = sum(1 for tid in scraped_tag_ids if tid not in existing_tag_ids)
    scraped_studio = scraped.get("studio") or {}
    scraped_studio_name = scraped_studio.get("name")
    scraped_studio_id = scraped_studio.get("stored_id")
    if scraped_studio_id is not None:
        studio_msg = "set studio"
    elif scraped_studio_name:
        # The scraper found a studio name, but it did not resolve to a
        # studio stored in Stash (stored_id is null).
        studio_msg = f"could not set studio '{scraped_studio_name}', not found in studios"
    else:
        studio_msg = "no studio in scrape"
    update_input = build_group_update_input(group_id, group, scraped)
    # 4. Perform GroupUpdate.
    update_query = """
    mutation GroupUpdate($input: GroupUpdateInput!) {
        groupUpdate(input: $input) {
            id
            name
        }
    }
    """
    result = graphql_request(client, update_query, {"input": update_input})
    updated = result.get("groupUpdate")
    if not updated:
        log.error("GroupUpdate did not return a group")
        exit_plugin(err="groupUpdate failed")
    log.info(
        f"Group {updated.get('id')} '{updated.get('name')}' updated. "
        f"Added {tags_added_count} tag(s), {studio_msg}."
    )
    exit_plugin(
        msg=f"Updated group {updated.get('id')} '{updated.get('name')}' from {target_url}"
    )
# Standard script entry guard: run only when executed by Stash, not on import.
if __name__ == "__main__":
    main()