# CommunityScrapers/scrapers/py_common/util.py

from argparse import ArgumentParser
from functools import reduce
from typing import Any, Callable, TypeVar
from urllib.error import URLError
from urllib.request import Request, urlopen
import json
import sys


def dig(c: dict | list, *keys: str | int | tuple[str | int, ...], default=None) -> Any:
    """
    Helper function to get a value from a nested dict or list

    If a key is a tuple the items will be tried in order until a value is found

    :param c: dict or list to search
    :param keys: keys to search for
    :param default: default value to return if not found
    :return: value if found, None otherwise

    >>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
    >>> dig(obj, "a", "b", 1)
    'd'
    >>> dig(obj, "a", ("e", "f"), "g")
    'h'
    """

    def inner(d: dict | list, key: str | int | tuple):
        if isinstance(d, dict):
            if isinstance(key, tuple):
                for k in key:
                    if k in d:
                        return d[k]
            return d.get(key)
        elif isinstance(d, list) and isinstance(key, int) and key < len(d):
            return d[key]
        else:
            return default

    return reduce(inner, keys, c)  # type: ignore


# Type variable used by replace_all and replace_at to annotate the replacement
# callback as taking and returning the same type
T = TypeVar("T")


def replace_all(obj: dict, key: str, replacement: Callable[[T], T]) -> dict:
    """
    Recursively rewrites every value stored under ``key`` in a nested dict

    The input is left untouched and a new dict is returned. When the matching
    value is a list, ``replacement`` is applied to each item of that list;
    otherwise it is applied to the value itself. Values stored under other keys
    are searched when they are dicts or lists of dicts.

    Anything that is not a dict is returned unchanged.

    :param obj: dict to search
    :param key: key whose values should be replaced
    :param replacement: function called on the value to replace it
    :return: new dict

    >>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
    >>> replace_all(obj, "g", lambda x: x.upper()) # Replace a single item
    {'a': {'b': ['c', 'd'], 'f': {'g': 'H'}}}
    >>> replace_all(obj, "b", lambda x: x.upper()) # Replace all items in a list
    {'a': {'b': ['C', 'D'], 'f': {'g': 'h'}}}
    >>> replace_all(obj, "z", lambda x: x.upper()) # Do nothing if the key is not found
    {'a': {'b': ['c', 'd'], 'f': {'g': 'h'}}}
    """
    if not isinstance(obj, dict):
        return obj

    def rewritten(name, value):
        # A matching key is replaced and not searched any further
        if name == key:
            if isinstance(value, list):
                return [replacement(item) for item in value]
            return replacement(value)
        # Everything else is searched for deeper matches
        if isinstance(value, dict):
            return replace_all(value, key, replacement)
        if isinstance(value, list):
            return [replace_all(item, key, replacement) for item in value]
        return value

    return {name: rewritten(name, value) for name, value in obj.items()}


def replace_at(obj: dict, *path: str, replacement: Callable[[T], T]) -> dict:
    """
    Rewrites the value found at an exact path inside a nested dict

    Every dict along the path is copied, so the input is not modified. When the
    value at the end of the path is a list, ``replacement`` is applied to each
    item of that list; otherwise it is applied to the value itself.

    If the path cannot be followed (a key is missing, a non-dict value is
    reached before the path ends, or the path is empty) ``replacement`` is never
    called and the structure is returned as-is.

    :param obj: dict to search
    :param path: keys leading to the value to replace
    :param replacement: function called on the value to replace it
    :return: new dict

    >>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
    >>> replace_at(obj, "a", "f", "g", replacement=lambda x: x.upper()) # Replace a single item
    {'a': {'b': ['c', 'd'], 'f': {'g': 'H'}}}
    >>> replace_at(obj, "a", "b", replacement=lambda x: x.upper()) # Replace all items in a list
    {'a': {'b': ['C', 'D'], 'f': {'g': 'h'}}}
    >>> replace_at(obj, "a", "z", "g", replacement=lambda x: x.upper()) # Broken path, do nothing
    {'a': {'b': ['c', 'd'], 'f': {'g': 'h'}}}
    """

    def descend(node, remaining):
        # Nothing left to follow, or nothing that can be followed
        if not remaining or not isinstance(node, dict):
            return node

        head, tail = remaining[0], remaining[1:]
        if head not in node:
            return node

        target = node[head]
        if tail:
            updated = descend(target, tail)
        elif isinstance(target, list):
            updated = [replacement(item) for item in target]
        else:
            updated = replacement(target)
        # Shallow copy of this level with only the followed key swapped out
        return {**node, head: updated}

    return descend(obj, path)  # type: ignore


def is_valid_url(url):
    """
    Checks if an URL is valid by making a GET request and ensuring the response code is 2xx

    Every failure to obtain a response is reported as False instead of being
    raised: malformed URLs, unknown schemes, connection problems, HTTP error
    statuses and broken HTTP responses

    :param url: URL to check
    :return: True if the request succeeded with a 2xx status code, False otherwise
    """
    # Imported here because the module does not otherwise need http.client
    from http.client import HTTPException

    try:
        # Request() raises ValueError for strings it cannot parse as a URL
        req = Request(url)
        with urlopen(req) as response:
            return 200 <= response.getcode() < 300
    except (OSError, ValueError, HTTPException):
        # URLError (and HTTPError) are subclasses of OSError, so they are still
        # covered, along with socket-level errors raised after the request was sent
        return False


def __default_parser(**kwargs):
    """
    Builds the command line parser shared by scrapers

    The parser accepts any number of leading positional ``extra`` arguments
    followed by a required operation, stored as ``operation``. Each operation
    is a subcommand with its own options mirroring the fields of the
    corresponding fragment.

    :param kwargs: passed on unchanged to ArgumentParser
    :return: the configured ArgumentParser
    """
    parser = ArgumentParser(**kwargs)
    # Some scrapers can take extra arguments so we can
    # do rudimentary configuration in the YAML file
    parser.add_argument("extra", nargs="*")
    subparsers = parser.add_subparsers(dest="operation", required=True)

    # "Scrape with..." and the subsequent search box
    subparsers.add_parser(
        "performer-by-name", help="Search for performers"
    ).add_argument("--name", help="Performer name to search for")

    # The results of performer-by-name will be passed to this
    pbf = subparsers.add_parser("performer-by-fragment", help="Scrape a performer")
    # Technically there's more information in this fragment,
    # but in 99.9% of cases we only need the URL or the name
    pbf.add_argument("--url", help="Performer URL")
    pbf.add_argument("--name", help="Performer name to search for")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "performer-by-url", help="Scrape a performer by their URL"
    ).add_argument("--url")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "movie-by-url", help="Scrape a movie by its URL"
    ).add_argument("--url")

    # The looking glass search icon
    # name field is guaranteed to be filled by Stash
    subparsers.add_parser("scene-by-name", help="Scrape a scene by name").add_argument(
        "--name", help="Name to search for"
    )

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "scene-by-url", help="Scrape a scene by its URL"
    ).add_argument("--url")

    # "Scrape with..."
    sbf = subparsers.add_parser("scene-by-fragment", help="Scrape a scene")
    sbf.add_argument("-u", "--url")
    sbf.add_argument("--id")
    sbf.add_argument("--title")  # Title will be filename if not set in Stash
    sbf.add_argument("--date")
    sbf.add_argument("--details")
    sbf.add_argument("--urls", nargs="+")

    # Tagger view or search box
    sbqf = subparsers.add_parser("scene-by-query-fragment", help="Scrape a scene")
    sbqf.add_argument("-u", "--url")
    sbqf.add_argument("--id")
    sbqf.add_argument("--title")  # Title will be filename if not set in Stash
    sbqf.add_argument("--code")
    sbqf.add_argument("--details")
    sbqf.add_argument("--director")
    sbqf.add_argument("--date")
    sbqf.add_argument("--urls", nargs="+")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "gallery-by-url", help="Scrape a gallery by its URL"
    ).add_argument("--url", help="Gallery URL")

    # "Scrape with..."
    gbf = subparsers.add_parser("gallery-by-fragment", help="Scrape a gallery")
    gbf.add_argument("-u", "--url")
    gbf.add_argument("--id")
    gbf.add_argument("--title")
    gbf.add_argument("--date")
    gbf.add_argument("--details")
    gbf.add_argument("--urls", nargs="+")

    return parser


def scraper_args(**kwargs):
    """
    Parses the arguments a scraper was started with

    Arguments are read from the command line, which lets a scraper be run by
    hand without piping JSON to stdin. When stdin is not a TTY, a JSON object is
    additionally read from stdin and merged over the command line values.
    If that JSON cannot be decoded the process exits with status 69.

    Returns a tuple of the operation and the remaining arguments as a dict:
    operation is one of
    - performer-by-name
    - performer-by-fragment
    - performer-by-url
    - movie-by-url
    - scene-by-name
    - scene-by-url
    - scene-by-fragment
    - scene-by-query-fragment
    - gallery-by-url
    - gallery-by-fragment

    Extra arguments can be handed to a scraper by listing them before the
    operation in the YAML file:
    ```yaml
    sceneByName:
      action: script
      script:
        - python
        - my-scraper.py
        - extra
        - args
        - scene-by-name
    ```

    Called from Stash through the configuration above, this function would return:
    ```python
    ("scene-by-name", {"extra": ["extra", "args"], "name": "scene name"})
    ```

    :param kwargs: passed on to the underlying ArgumentParser
    """

    parsed = vars(__default_parser(**kwargs).parse_args())

    # A stdin that is not a TTY means the script is being executed by Stash,
    # which supplies the fragment as JSON
    piped_input = not sys.stdin.isatty()
    if piped_input:
        try:
            parsed.update(json.load(sys.stdin))
        except json.decoder.JSONDecodeError:
            # This would only happen if Stash passed invalid JSON
            sys.exit(69)

    operation = parsed.pop("operation")
    return operation, parsed


def guess_nationality(country: str) -> str:
    """
    Tries to turn a demonym (or a comma separated list of them) into a country

    Each comma separated part is stripped and lowercased before being looked up
    in ``demonyms``; the first part that matches wins

    Returns the original string if no match is found
    """
    normalized = (part.strip().lower() for part in country.split(","))
    matches = (demonyms[name] for name in normalized if name in demonyms)
    return next(matches, country)


US_states = [
    # Two-letter codes: the fifty states plus DC
    *(
        "AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT "
        "NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY"
    ).split(),
    # Full state names
    "Alabama", "Alaska", "Arizona", "Arkansas", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
    "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

# Lookup table used by guess_nationality: maps a lowercase demonym or adjective
# (and a handful of place names) to the country name that should be returned.
# Keys must stay lowercase because guess_nationality lowercases its input
# before looking it up.
demonyms = {
    # https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations
    "abkhaz": "Abkhazia",
    "abkhazian": "Abkhazia",
    "afghan": "Afghanistan",
    "african american": "USA",
    "albanian": "Albania",
    "algerian": "Algeria",
    "american samoan": "American Samoa",
    "american": "USA",
    "andorran": "Andorra",
    "angolan": "Angola",
    "anguillan": "Anguilla",
    "antarctic": "Antarctica",
    "antiguan": "Antigua and Barbuda",
    "argentine": "Argentina",
    "argentinian": "Argentina",
    "armenian": "Armenia",
    "aruban": "Aruba",
    "australian": "Australia",
    "austrian": "Austria",
    "azerbaijani": "Azerbaijan",
    "azeri": "Azerbaijan",
    "bahamian": "Bahamas",
    "bahraini": "Bahrain",
    "bangladeshi": "Bangladesh",
    "barbadian": "Barbados",
    "barbudan": "Antigua and Barbuda",
    "basotho": "Lesotho",
    "belarusian": "Belarus",
    "belgian": "Belgium",
    "belizean": "Belize",
    "beninese": "Benin",
    "beninois": "Benin",
    "bermudan": "Bermuda",
    "bermudian": "Bermuda",
    "bhutanese": "Bhutan",
    "biot": "British Indian Ocean Territory",
    "bissau-guinean": "Guinea-Bissau",
    "bolivian": "Bolivia",
    "bonaire": "Bonaire",
    "bonairean": "Bonaire",
    "bosnian": "Bosnia and Herzegovina",
    "botswanan": "Botswana",
    "bouvet island": "Bouvet Island",
    "brazilian": "Brazil",
    "british virgin island": "Virgin Islands, British",
    "british": "United Kingdom",
    "bruneian": "Brunei",
    "bulgarian": "Bulgaria",
    "burkinabé": "Burkina Faso",
    "burmese": "Burma",
    "burundian": "Burundi",
    "cabo verdean": "Cabo Verde",
    "cambodian": "Cambodia",
    "cameroonian": "Cameroon",
    "canadian": "Canada",
    "cantonese": "Hong Kong",
    "caymanian": "Cayman Islands",
    "central african": "Central African Republic",
    "chadian": "Chad",
    "channel island": "Guernsey",
    "chilean": "Chile",
    "chinese": "China",
    "christmas island": "Christmas Island",
    "cocos island": "Cocos (Keeling) Islands",
    "colombian": "Colombia",
    "comoran": "Comoros",
    "comorian": "Comoros",
    "congolese": "Congo",
    "cook island": "Cook Islands",
    "costa rican": "Costa Rica",
    "croatian": "Croatia",
    "cuban": "Cuba",
    "curaçaoan": "Curaçao",
    "cypriot": "Cyprus",
    "czech": "Czech Republic",
    "danish": "Denmark",
    "djiboutian": "Djibouti",
    "dominican": "Dominica",
    "dutch": "Netherlands",
    "ecuadorian": "Ecuador",
    "egyptian": "Egypt",
    "emirati": "United Arab Emirates",
    "emiri": "United Arab Emirates",
    "emirian": "United Arab Emirates",
    "english people": "England",
    "english": "England",
    "equatoguinean": "Equatorial Guinea",
    "equatorial guinean": "Equatorial Guinea",
    "eritrean": "Eritrea",
    "estonian": "Estonia",
    "ethiopian": "Ethiopia",
    "european": "European Union",
    "falkland island": "Falkland Islands",
    "faroese": "Faroe Islands",
    "fijian": "Fiji",
    "filipino": "Philippines",
    "finnish": "Finland",
    "formosan": "Taiwan",
    "french guianese": "French Guiana",
    "french polynesian": "French Polynesia",
    "french southern territories": "French Southern Territories",
    "french": "France",
    "futunan": "Wallis and Futuna",
    "gabonese": "Gabon",
    "gambian": "Gambia",
    "georgian": "Georgia",
    "german": "Germany",
    "ghanaian": "Ghana",
    "gibraltar": "Gibraltar",
    "greek": "Greece",
    "greenlandic": "Greenland",
    "grenadian": "Grenada",
    "guadeloupe": "Guadeloupe",
    "guamanian": "Guam",
    "guatemalan": "Guatemala",
    "guinean": "Guinea",
    "guyanese": "Guyana",
    "haitian": "Haiti",
    "heard island": "Heard Island and McDonald Islands",
    "hellenic": "Greece",
    "herzegovinian": "Bosnia and Herzegovina",
    "honduran": "Honduras",
    "hong kong": "Hong Kong",
    "hong konger": "Hong Kong",
    "hungarian": "Hungary",
    "icelandic": "Iceland",
    "indian": "India",
    "indonesian": "Indonesia",
    "iranian": "Iran",
    "iraqi": "Iraq",
    "irish": "Ireland",
    "israeli": "Israel",
    "israelite": "Israel",
    "italian": "Italy",
    "ivorian": "Ivory Coast",
    "jamaican": "Jamaica",
    "jan mayen": "Jan Mayen",
    "japanese": "Japan",
    "jordanian": "Jordan",
    "kazakh": "Kazakhstan",
    "kazakhstani": "Kazakhstan",
    "kenyan": "Kenya",
    "kirghiz": "Kyrgyzstan",
    "kirgiz": "Kyrgyzstan",
    "kiribati": "Kiribati",
    "korean": "South Korea",
    "kosovan": "Kosovo",
    "kosovar": "Kosovo",
    "kuwaiti": "Kuwait",
    "kyrgyz": "Kyrgyzstan",
    "kyrgyzstani": "Kyrgyzstan",
    "lao": "Lao People's Democratic Republic",
    "laotian": "Lao People's Democratic Republic",
    "latvian": "Latvia",
    "lebanese": "Lebanon",
    "lettish": "Latvia",
    "liberian": "Liberia",
    "libyan": "Libya",
    "liechtensteiner": "Liechtenstein",
    "lithuanian": "Lithuania",
    "luxembourg": "Luxembourg",
    "luxembourgish": "Luxembourg",
    "macanese": "Macau",
    "macedonian": "North Macedonia",
    "magyar": "Hungary",
    "mahoran": "Mayotte",
    "malagasy": "Madagascar",
    "malawian": "Malawi",
    "malaysian": "Malaysia",
    "maldivian": "Maldives",
    "malian": "Mali",
    "malinese": "Mali",
    "maltese": "Malta",
    "manx": "Isle of Man",
    "marshallese": "Marshall Islands",
    "martinican": "Martinique",
    "martiniquais": "Martinique",
    "mauritanian": "Mauritania",
    "mauritian": "Mauritius",
    "mcdonald islands": "Heard Island and McDonald Islands",
    "mexican": "Mexico",
    "moldovan": "Moldova",
    "monacan": "Monaco",
    "mongolian": "Mongolia",
    "montenegrin": "Montenegro",
    "montserratian": "Montserrat",
    "monégasque": "Monaco",
    "moroccan": "Morocco",
    "motswana": "Botswana",
    "mozambican": "Mozambique",
    "myanma": "Myanmar",
    "namibian": "Namibia",
    "nauruan": "Nauru",
    "nepalese": "Nepal",
    "nepali": "Nepal",
    "netherlandic": "Netherlands",
    "new caledonian": "New Caledonia",
    "new zealand": "New Zealand",
    "ni-vanuatu": "Vanuatu",
    "nicaraguan": "Nicaragua",
    "nigerian": "Nigeria",
    "nigerien": "Niger",
    "niuean": "Niue",
    "norfolk island": "Norfolk Island",
    "northern irish": "Northern Ireland",
    "northern marianan": "Northern Mariana Islands",
    "norwegian": "Norway",
    "omani": "Oman",
    "pakistani": "Pakistan",
    "palauan": "Palau",
    "palestinian": "Palestine",
    "panamanian": "Panama",
    "papua new guinean": "Papua New Guinea",
    "papuan": "Papua New Guinea",
    "paraguayan": "Paraguay",
    "persian": "Iran",
    "peruvian": "Peru",
    "philippine": "Philippines",
    "pitcairn island": "Pitcairn Islands",
    "polish": "Poland",
    "portuguese": "Portugal",
    "puerto rican": "Puerto Rico",
    "qatari": "Qatar",
    "romanian": "Romania",
    "russian": "Russia",
    "rwandan": "Rwanda",
    "saba": "Saba",
    "saban": "Saba",
    "sahraouian": "Western Sahara",
    "sahrawi": "Western Sahara",
    "sahrawian": "Western Sahara",
    "salvadoran": "El Salvador",
    "sammarinese": "San Marino",
    "samoan": "Samoa",
    "saudi arabian": "Saudi Arabia",
    "saudi": "Saudi Arabia",
    "scottish": "Scotland",
    "senegalese": "Senegal",
    "serbian": "Serbia",
    "seychellois": "Seychelles",
    "sierra leonean": "Sierra Leone",
    "singapore": "Singapore",
    "singaporean": "Singapore",
    "slovak": "Slovakia",
    "slovene": "Slovenia",
    "slovenian": "Slovenia",
    "solomon island": "Solomon Islands",
    "somali": "Somalia",
    "somalilander": "Somaliland",
    "south african": "South Africa",
    "south georgia island": "South Georgia and the South Sandwich Islands",
    "south ossetian": "South Ossetia",
    "south sandwich island": "South Georgia and the South Sandwich Islands",
    "south sudanese": "South Sudan",
    "spanish": "Spain",
    "sri lankan": "Sri Lanka",
    "sudanese": "Sudan",
    "surinamese": "Suriname",
    "svalbard resident": "Svalbard",
    "swati": "Eswatini",
    "swazi": "Eswatini",
    "swedish": "Sweden",
    "swiss": "Switzerland",
    "syrian": "Syrian Arab Republic",
    "taiwanese": "Taiwan",
    "tajikistani": "Tajikistan",
    "tanzanian": "Tanzania",
    "thai": "Thailand",
    "timorese": "Timor-Leste",
    "tobagonian": "Trinidad and Tobago",
    "togolese": "Togo",
    "tokelauan": "Tokelau",
    "tongan": "Tonga",
    "trinidadian": "Trinidad and Tobago",
    "tunisian": "Tunisia",
    "turkish": "Turkey",
    "turkmen": "Turkmenistan",
    "turks and caicos island": "Turks and Caicos Islands",
    "tuvaluan": "Tuvalu",
    "ugandan": "Uganda",
    "ukrainian": "Ukraine",
    "uruguayan": "Uruguay",
    "uzbek": "Uzbekistan",
    "uzbekistani": "Uzbekistan",
    "vanuatuan": "Vanuatu",
    "vatican": "Vatican City State",
    "venezuelan": "Venezuela",
    "vietnamese": "Vietnam",
    "wallis and futuna": "Wallis and Futuna",
    "wallisian": "Wallis and Futuna",
    "welsh": "Wales",
    "yemeni": "Yemen",
    "zambian": "Zambia",
    "zimbabwean": "Zimbabwe",
    "åland island": "Åland Islands",
    # Every entry of US_states (codes and full names alike), lowercased, maps to
    # "USA". Being unpacked last, these would override an identical key above.
    **{s.lower(): "USA" for s in US_states},
}