2024-07-12 08:11:41 +02:00

677 lines
19 KiB
Python

from argparse import ArgumentParser
from functools import reduce
from typing import Any, Callable, TypeVar
from urllib.error import URLError
from urllib.request import Request, urlopen
import json
import sys
def dig(c: dict | list, *keys: str | int | tuple[str | int, ...], default=None) -> Any:
"""
Helper function to get a value from a nested dict or list
If a key is a tuple the items will be tried in order until a value is found
:param c: dict or list to search
:param keys: keys to search for
:param default: default value to return if not found
:return: value if found, None otherwise
>>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
>>> dig(obj, "a", "b", 1)
'd'
>>> dig(obj, "a", ("e", "f"), "g")
'h'
"""
def inner(d: dict | list, key: str | int | tuple):
if isinstance(d, dict):
if isinstance(key, tuple):
for k in key:
if k in d:
return d[k]
return d.get(key)
elif isinstance(d, list) and isinstance(key, int) and key < len(d):
return d[key]
else:
return default
return reduce(inner, keys, c) # type: ignore
T = TypeVar("T")


def replace_all(obj: dict, key: str, replacement: Callable[[T], T]) -> dict:
    """
    Helper function to recursively replace values in a nested dict, returning a new dict

    If the key refers to a list the replacement function will be called for each item
    Dicts are searched wherever they are nested, including inside lists of lists

    :param obj: dict to search; anything that is not a dict is returned unchanged
    :param key: key to search for
    :param replacement: function called on the value to replace it
    :return: new dict, the input is never modified

    >>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
    >>> replace_all(obj, "g", lambda x: x.upper())  # Replace a single item
    {'a': {'b': ['c', 'd'], 'f': {'g': 'H'}}}
    >>> replace_all(obj, "b", lambda x: x.upper())  # Replace all items in a list
    {'a': {'b': ['C', 'D'], 'f': {'g': 'h'}}}
    >>> replace_all(obj, "z", lambda x: x.upper())  # Do nothing if the key is not found
    {'a': {'b': ['c', 'd'], 'f': {'g': 'h'}}}
    """

    def walk(value):
        # Descend through any mix of dicts and lists; leave other values untouched
        if isinstance(value, dict):
            return replace_all(value, key, replacement)
        if isinstance(value, list):
            return [walk(item) for item in value]
        return value

    if not isinstance(obj, dict):
        return obj

    new = {}
    for k, v in obj.items():
        if k != key:
            new[k] = walk(v)
        elif isinstance(v, list):
            # A list under the matching key is replaced item by item
            new[k] = [replacement(x) for x in v]
        else:
            new[k] = replacement(v)
    return new
def replace_at(obj: dict, *path: str, replacement: Callable[[T], T]) -> dict:
    """
    Helper function to replace the value found at a given path in a nested dict

    A new dict is returned for every level along the path, the input is left alone
    When the value at the end of the path is a list, every item is passed to the
    replacement function individually
    When the path cannot be followed to the end, the replacement function is never
    called and the data is returned as-is

    :param obj: dict to search
    :param path: keys leading to the value, outermost first
    :param replacement: function called on the value to replace it
    :return: new dict

    >>> obj = {"a": {"b": ["c", "d"], "f": {"g": "h"}}}
    >>> replace_at(obj, "a", "f", "g", replacement=lambda x: x.upper())  # Replace a single item
    {'a': {'b': ['c', 'd'], 'f': {'g': 'H'}}}
    >>> replace_at(obj, "a", "b", replacement=lambda x: x.upper())  # Replace all items in a list
    {'a': {'b': ['C', 'D'], 'f': {'g': 'h'}}}
    >>> replace_at(obj, "a", "z", "g", replacement=lambda x: x.upper())  # Broken path, do nothing
    {'a': {'b': ['c', 'd'], 'f': {'g': 'h'}}}
    """

    def rebuild(node, remaining):
        # Nothing left to follow, or the path leads into something that is not a dict
        if not remaining or not isinstance(node, dict):
            return node

        head, tail = remaining[0], remaining[1:]
        if head not in node:
            return node

        target = node[head]
        if tail:
            updated = rebuild(target, tail)
        elif isinstance(target, list):
            updated = [replacement(item) for item in target]
        else:
            updated = replacement(target)
        return {**node, head: updated}

    return rebuild(obj, path)  # type: ignore
def is_valid_url(url, *, timeout=None):
    """
    Checks if an URL is valid by making a GET request and ensuring the response code is 2xx

    Malformed URLs, connection failures, HTTP error responses and responses that
    carry no status code at all (e.g. ``data:`` URLs) count as invalid instead of
    raising

    :param url: URL to check
    :param timeout: optional timeout in seconds for the request; when None the
        ``urlopen`` default is left in place
    :return: True if the request succeeded with a 2xx status, False otherwise
    """
    # Imported locally so the function does not rely on a new file-level import
    from http.client import HTTPException

    # Only forward the timeout when one was requested, so that a process-wide
    # socket default timeout keeps applying otherwise
    options = {} if timeout is None else {"timeout": timeout}
    try:
        with urlopen(Request(url), **options) as response:
            status = response.getcode()
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError and socket timeouts, ValueError is
        # raised for URLs without a usable scheme and HTTPException for URLs or
        # responses the HTTP client refuses to handle
        return False
    return status is not None and 200 <= status < 300
def __default_parser(**kwargs):
    """
    Builds the argument parser shared by all scrapers

    The parser accepts any number of leading positional ``extra`` arguments followed
    by exactly one required subcommand, stored as ``operation``, that selects the
    scrape operation and defines the options available for it

    :param kwargs: forwarded unchanged to ``ArgumentParser``
    :return: the configured ``ArgumentParser``
    """
    parser = ArgumentParser(**kwargs)

    # Some scrapers can take extra arguments so we can
    # do rudimentary configuration in the YAML file
    parser.add_argument("extra", nargs="*")

    subparsers = parser.add_subparsers(dest="operation", required=True)

    # "Scrape with..." and the subsequent search box
    subparsers.add_parser(
        "performer-by-name", help="Search for performers"
    ).add_argument("--name", help="Performer name to search for")

    # The results of performer-by-name will be passed to this
    pbf = subparsers.add_parser("performer-by-fragment", help="Scrape a performer")
    # Technically there's more information in this fragment,
    # but in 99.9% of cases we only need the URL or the name
    pbf.add_argument("--url", help="Performer URL")
    pbf.add_argument("--name", help="Performer name to search for")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "performer-by-url", help="Scrape a performer by their URL"
    ).add_argument("--url")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "movie-by-url", help="Scrape a movie by its URL"
    ).add_argument("--url")

    # The looking glass search icon
    # name field is guaranteed to be filled by Stash
    subparsers.add_parser("scene-by-name", help="Scrape a scene by name").add_argument(
        "--name", help="Name to search for"
    )

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "scene-by-url", help="Scrape a scene by its URL"
    ).add_argument("--url")

    # "Scrape with..."
    sbf = subparsers.add_parser("scene-by-fragment", help="Scrape a scene")
    sbf.add_argument("-u", "--url")
    sbf.add_argument("--id")
    sbf.add_argument("--title")  # Title will be filename if not set in Stash
    sbf.add_argument("--date")
    sbf.add_argument("--details")
    sbf.add_argument("--urls", nargs="+")

    # Tagger view or search box
    sbqf = subparsers.add_parser("scene-by-query-fragment", help="Scrape a scene")
    sbqf.add_argument("-u", "--url")
    sbqf.add_argument("--id")
    sbqf.add_argument("--title")  # Title will be filename if not set in Stash
    sbqf.add_argument("--code")
    sbqf.add_argument("--details")
    sbqf.add_argument("--director")
    sbqf.add_argument("--date")
    sbqf.add_argument("--urls", nargs="+")

    # Filling in an URL and hitting the "Scrape" icon
    subparsers.add_parser(
        "gallery-by-url", help="Scrape a gallery by its URL"
    ).add_argument("--url", help="Gallery URL")

    # "Scrape with..."
    gbf = subparsers.add_parser("gallery-by-fragment", help="Scrape a gallery")
    gbf.add_argument("-u", "--url")
    gbf.add_argument("--id")
    gbf.add_argument("--title")
    gbf.add_argument("--date")
    gbf.add_argument("--details")
    gbf.add_argument("--urls", nargs="+")

    return parser
def scraper_args(**kwargs):
    """
    Helper function to parse arguments for a scraper

    This allows scrapers to be called from the command line without
    piping JSON to stdin but also from Stash

    Returns a tuple of the operation and the parsed arguments: operation is one of
    - performer-by-name
    - performer-by-fragment
    - performer-by-url
    - movie-by-url
    - scene-by-name
    - scene-by-url
    - scene-by-fragment
    - scene-by-query-fragment
    - gallery-by-url
    - gallery-by-fragment

    A scraper can be configured to take extra arguments by adding them to the YAML file:
    ```yaml
    sceneByName:
      action: script
      script:
        - python
        - my-scraper.py
        - extra
        - args
        - scene-by-name
    ```

    When called from Stash through the above configuration this function would return:
    ```python
    ("scene-by-name", {"extra": ["extra", "args"], "name": "scene name"})
    ```

    When stdin is not a TTY it must contain a JSON object; its keys are merged over
    the command line arguments. Anything else on stdin terminates the process with
    exit code 69 after printing a message to stderr

    :param kwargs: forwarded to the ``ArgumentParser`` constructor
    :return: tuple of (operation, dict of remaining arguments)
    """
    parser = __default_parser(**kwargs)
    args = vars(parser.parse_args())

    # If stdin is not connected to a TTY the script is being executed by Stash
    if not sys.stdin.isatty():
        try:
            stash_fragment = json.load(sys.stdin)
        except json.JSONDecodeError:
            # This would only happen if Stash passed invalid JSON
            print("Could not parse the JSON fragment on stdin", file=sys.stderr)
            sys.exit(69)

        # Valid JSON that is not an object (list, string, null, ...) cannot be
        # merged into the arguments, so treat it like invalid input
        if not isinstance(stash_fragment, dict):
            print("The JSON fragment on stdin is not an object", file=sys.stderr)
            sys.exit(69)

        args.update(stash_fragment)

    return args.pop("operation"), args
def guess_nationality(country: str) -> str:
    """
    Tries to map a string onto a country by looking it up in the demonyms table

    The string is split on commas and each piece is stripped and lowercased, the
    first piece that is a known key decides the result

    :param country: text to interpret, possibly a comma separated list
    :return: the matching country, or the original string if no piece matches
    """
    pieces = (piece.strip().lower() for piece in country.split(","))
    known = next((piece for piece in pieces if piece in demonyms), None)
    return country if known is None else demonyms[known]
# US state identifiers in both spellings; the order is abbreviations first,
# then full names, each group sorted as listed below
US_states = [
    # Two-letter abbreviations (includes DC)
    "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL",
    "GA", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA",
    "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE",
    "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV",
    "WY",
    # Full state names
    "Alabama", "Alaska", "Arizona", "Arkansas", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
    "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
]
# Lookup table consulted by guess_nationality: maps a lowercase demonym (or, for a
# few entries, a lowercase place name) to the country name to report.
# Keys have to be lowercase because guess_nationality lowercases each candidate
# before looking it up here.
demonyms = {
    # https://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations
    "abkhaz": "Abkhazia",
    "abkhazian": "Abkhazia",
    "afghan": "Afghanistan",
    "african american": "USA",
    "albanian": "Albania",
    "algerian": "Algeria",
    "american samoan": "American Samoa",
    "american": "USA",
    "andorran": "Andorra",
    "angolan": "Angola",
    "anguillan": "Anguilla",
    "antarctic": "Antarctica",
    "antiguan": "Antigua and Barbuda",
    "argentine": "Argentina",
    "argentinian": "Argentina",
    "armenian": "Armenia",
    "aruban": "Aruba",
    "australian": "Australia",
    "austrian": "Austria",
    "azerbaijani": "Azerbaijan",
    "azeri": "Azerbaijan",
    "bahamian": "Bahamas",
    "bahraini": "Bahrain",
    "bangladeshi": "Bangladesh",
    "barbadian": "Barbados",
    "barbudan": "Antigua and Barbuda",
    "basotho": "Lesotho",
    "belarusian": "Belarus",
    "belgian": "Belgium",
    "belizean": "Belize",
    "beninese": "Benin",
    "beninois": "Benin",
    "bermudan": "Bermuda",
    "bermudian": "Bermuda",
    "bhutanese": "Bhutan",
    "biot": "British Indian Ocean Territory",
    "bissau-guinean": "Guinea-Bissau",
    "bolivian": "Bolivia",
    "bonaire": "Bonaire",
    "bonairean": "Bonaire",
    "bosnian": "Bosnia and Herzegovina",
    "botswanan": "Botswana",
    "bouvet island": "Bouvet Island",
    "brazilian": "Brazil",
    "british virgin island": "Virgin Islands, British",
    "british": "United Kingdom",
    "bruneian": "Brunei",
    "bulgarian": "Bulgaria",
    "burkinabé": "Burkina Faso",
    "burmese": "Burma",
    "burundian": "Burundi",
    "cabo verdean": "Cabo Verde",
    "cambodian": "Cambodia",
    "cameroonian": "Cameroon",
    "canadian": "Canada",
    "cantonese": "Hong Kong",
    "caymanian": "Cayman Islands",
    "central african": "Central African Republic",
    "chadian": "Chad",
    "channel island": "Guernsey",
    "chilean": "Chile",
    "chinese": "China",
    "christmas island": "Christmas Island",
    "cocos island": "Cocos (Keeling) Islands",
    "colombian": "Colombia",
    "comoran": "Comoros",
    "comorian": "Comoros",
    "congolese": "Congo",
    "cook island": "Cook Islands",
    "costa rican": "Costa Rica",
    "croatian": "Croatia",
    "cuban": "Cuba",
    "curaçaoan": "Curaçao",
    "cypriot": "Cyprus",
    "czech": "Czech Republic",
    "danish": "Denmark",
    "djiboutian": "Djibouti",
    "dominican": "Dominica",
    "dutch": "Netherlands",
    "ecuadorian": "Ecuador",
    "egyptian": "Egypt",
    "emirati": "United Arab Emirates",
    "emiri": "United Arab Emirates",
    "emirian": "United Arab Emirates",
    "english people": "England",
    "english": "England",
    "equatoguinean": "Equatorial Guinea",
    "equatorial guinean": "Equatorial Guinea",
    "eritrean": "Eritrea",
    "estonian": "Estonia",
    "ethiopian": "Ethiopia",
    "european": "European Union",
    "falkland island": "Falkland Islands",
    "faroese": "Faroe Islands",
    "fijian": "Fiji",
    "filipino": "Philippines",
    "finnish": "Finland",
    "formosan": "Taiwan",
    "french guianese": "French Guiana",
    "french polynesian": "French Polynesia",
    "french southern territories": "French Southern Territories",
    "french": "France",
    "futunan": "Wallis and Futuna",
    "gabonese": "Gabon",
    "gambian": "Gambia",
    "georgian": "Georgia",
    "german": "Germany",
    "ghanaian": "Ghana",
    "gibraltar": "Gibraltar",
    "greek": "Greece",
    "greenlandic": "Greenland",
    "grenadian": "Grenada",
    "guadeloupe": "Guadeloupe",
    "guamanian": "Guam",
    "guatemalan": "Guatemala",
    "guinean": "Guinea",
    "guyanese": "Guyana",
    "haitian": "Haiti",
    "heard island": "Heard Island and McDonald Islands",
    "hellenic": "Greece",
    "herzegovinian": "Bosnia and Herzegovina",
    "honduran": "Honduras",
    "hong kong": "Hong Kong",
    "hong konger": "Hong Kong",
    "hungarian": "Hungary",
    "icelandic": "Iceland",
    "indian": "India",
    "indonesian": "Indonesia",
    "iranian": "Iran",
    "iraqi": "Iraq",
    "irish": "Ireland",
    "israeli": "Israel",
    "israelite": "Israel",
    "italian": "Italy",
    "ivorian": "Ivory Coast",
    "jamaican": "Jamaica",
    "jan mayen": "Jan Mayen",
    "japanese": "Japan",
    "jordanian": "Jordan",
    "kazakh": "Kazakhstan",
    "kazakhstani": "Kazakhstan",
    "kenyan": "Kenya",
    "kirghiz": "Kyrgyzstan",
    "kirgiz": "Kyrgyzstan",
    "kiribati": "Kiribati",
    "korean": "South Korea",
    "kosovan": "Kosovo",
    "kosovar": "Kosovo",
    "kuwaiti": "Kuwait",
    "kyrgyz": "Kyrgyzstan",
    "kyrgyzstani": "Kyrgyzstan",
    "lao": "Lao People's Democratic Republic",
    "laotian": "Lao People's Democratic Republic",
    "latvian": "Latvia",
    "lebanese": "Lebanon",
    "lettish": "Latvia",
    "liberian": "Liberia",
    "libyan": "Libya",
    "liechtensteiner": "Liechtenstein",
    "lithuanian": "Lithuania",
    "luxembourg": "Luxembourg",
    "luxembourgish": "Luxembourg",
    "macanese": "Macau",
    "macedonian": "North Macedonia",
    "magyar": "Hungary",
    "mahoran": "Mayotte",
    "malagasy": "Madagascar",
    "malawian": "Malawi",
    "malaysian": "Malaysia",
    "maldivian": "Maldives",
    "malian": "Mali",
    "malinese": "Mali",
    "maltese": "Malta",
    "manx": "Isle of Man",
    "marshallese": "Marshall Islands",
    "martinican": "Martinique",
    "martiniquais": "Martinique",
    "mauritanian": "Mauritania",
    "mauritian": "Mauritius",
    "mcdonald islands": "Heard Island and McDonald Islands",
    "mexican": "Mexico",
    "moldovan": "Moldova",
    "monacan": "Monaco",
    "mongolian": "Mongolia",
    "montenegrin": "Montenegro",
    "montserratian": "Montserrat",
    "monégasque": "Monaco",
    "moroccan": "Morocco",
    "motswana": "Botswana",
    "mozambican": "Mozambique",
    "myanma": "Myanmar",
    "namibian": "Namibia",
    "nauruan": "Nauru",
    "nepalese": "Nepal",
    "nepali": "Nepal",
    "netherlandic": "Netherlands",
    "new caledonian": "New Caledonia",
    "new zealand": "New Zealand",
    "ni-vanuatu": "Vanuatu",
    "nicaraguan": "Nicaragua",
    "nigerian": "Nigeria",
    "nigerien": "Niger",
    "niuean": "Niue",
    "norfolk island": "Norfolk Island",
    "northern irish": "Northern Ireland",
    "northern marianan": "Northern Mariana Islands",
    "norwegian": "Norway",
    "omani": "Oman",
    "pakistani": "Pakistan",
    "palauan": "Palau",
    "palestinian": "Palestine",
    "panamanian": "Panama",
    "papua new guinean": "Papua New Guinea",
    "papuan": "Papua New Guinea",
    "paraguayan": "Paraguay",
    "persian": "Iran",
    "peruvian": "Peru",
    "philippine": "Philippines",
    "pitcairn island": "Pitcairn Islands",
    "polish": "Poland",
    "portuguese": "Portugal",
    "puerto rican": "Puerto Rico",
    "qatari": "Qatar",
    "romanian": "Romania",
    "russian": "Russia",
    "rwandan": "Rwanda",
    "saba": "Saba",
    "saban": "Saba",
    "sahraouian": "Western Sahara",
    "sahrawi": "Western Sahara",
    "sahrawian": "Western Sahara",
    "salvadoran": "El Salvador",
    "sammarinese": "San Marino",
    "samoan": "Samoa",
    "saudi arabian": "Saudi Arabia",
    "saudi": "Saudi Arabia",
    "scottish": "Scotland",
    "senegalese": "Senegal",
    "serbian": "Serbia",
    "seychellois": "Seychelles",
    "sierra leonean": "Sierra Leone",
    "singapore": "Singapore",
    "singaporean": "Singapore",
    "slovak": "Slovakia",
    "slovene": "Slovenia",
    "slovenian": "Slovenia",
    "solomon island": "Solomon Islands",
    "somali": "Somalia",
    "somalilander": "Somaliland",
    "south african": "South Africa",
    "south georgia island": "South Georgia and the South Sandwich Islands",
    "south ossetian": "South Ossetia",
    "south sandwich island": "South Georgia and the South Sandwich Islands",
    "south sudanese": "South Sudan",
    "spanish": "Spain",
    "sri lankan": "Sri Lanka",
    "sudanese": "Sudan",
    "surinamese": "Suriname",
    "svalbard resident": "Svalbard",
    "swati": "Eswatini",
    "swazi": "Eswatini",
    "swedish": "Sweden",
    "swiss": "Switzerland",
    "syrian": "Syrian Arab Republic",
    "taiwanese": "Taiwan",
    "tajikistani": "Tajikistan",
    "tanzanian": "Tanzania",
    "thai": "Thailand",
    "timorese": "Timor-Leste",
    "tobagonian": "Trinidad and Tobago",
    "togolese": "Togo",
    "tokelauan": "Tokelau",
    "tongan": "Tonga",
    "trinidadian": "Trinidad and Tobago",
    "tunisian": "Tunisia",
    "turkish": "Turkey",
    "turkmen": "Turkmenistan",
    "turks and caicos island": "Turks and Caicos Islands",
    "tuvaluan": "Tuvalu",
    "ugandan": "Uganda",
    "ukrainian": "Ukraine",
    "uruguayan": "Uruguay",
    "uzbek": "Uzbekistan",
    "uzbekistani": "Uzbekistan",
    "vanuatuan": "Vanuatu",
    "vatican": "Vatican City State",
    "venezuelan": "Venezuela",
    "vietnamese": "Vietnam",
    "wallis and futuna": "Wallis and Futuna",
    "wallisian": "Wallis and Futuna",
    "welsh": "Wales",
    "yemeni": "Yemen",
    "zambian": "Zambia",
    "zimbabwean": "Zimbabwe",
    "åland island": "Åland Islands",
    # Every entry of US_states (abbreviations and full names), lowercased, maps to "USA"
    # NOTE(review): this makes short keys such as "in", "de" or "or" and the state
    # name "georgia" resolve to "USA" - confirm that this is intended
    **{s.lower(): "USA" for s in US_states},
}