# Mirror of https://github.com/stashapp/CommunityScrapers.git
# Synced 2025-12-13 00:07:59 -06:00
# 268 lines, 10 KiB, Python
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
# Importing from a parent directory requires that directory to be on the
# module search path, so it is added here before "py_common" is imported.
csd = os.path.dirname(os.path.realpath(__file__))  # directory of the current script
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(parent)  # makes "py_common" importable from the parent directory
|
|
|
|
# Third-party and project-local dependencies. Every import is guarded so that
# a missing module yields an actionable message on stderr instead of a bare
# traceback. Each guard exits with status 1 so the failure is visible to the
# calling process (a plain sys.exit() would report success).
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit(1)

try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit(1)

try:
    import py_common.log as log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr)
    sys.exit(1)
|
|
|
|
# --------------------------------------
# Scraper for: animecharactersdatabase.com
#
# AnimeCharactersDatabase lists characters from:
# Anime, Hentai, (Mobile) Games, Eroge, Virtual Idols/YouTubers, Vocaloid
#
# Fields filled in when the site provides them:
# Name, Gender, Birthdate, Country, Hair Color, Eye Color, Height, Measurements, URL, Details, Tags, Image
#
# Several additional tags can be switched on or off below.
# --------------------------------------

# ---------------------------------------
# ---------- Tag Configuration ----------
# ---------------------------------------

# Maximum number of search results (between 1 and 30).
# A search by name adds the franchise to every result so the right one is
# easier to pick. For some names (non ascii, very short) the franchise has to
# be fetched with one API request per result, which might get you banned
# because the API is rate limited.
# See: http://wiki.animecharactersdatabase.com/index.php?title=API_Access
limit = 15

# Prefix for performer tags.
prefix = "performer:"

# List of additional tags.
additional_tags = [{"name": "fictional"}]  # []

# Tags are mostly appearance indicators such as: ahoge, dress, hat, twintails, etc.
include_tag = True
tag_prefix = prefix

# Source material (name of the anime/game) as tag: Kantai Collection, Idolmaster: Cinderella Girls, etc.
include_parody = True
parody_prefix = "parody:"

# Zodiac sign as tag: Libra ♎, Sagittarius ♐, etc.
include_sign = True
sign_prefix = f"{prefix}sign:"

# Race of non-human characters as tag: Orc, Elf, etc.
include_race = True
race_prefix = f"{prefix}race:"

# Ship class of ship girls as tag (kancolle, etc.): Destroyer, etc.
include_ship_class = True
ship_class_prefix = f"{prefix}ship:"

# Blood type as tag: A, B, etc.
include_blood_type = True
blood_type_prefix = f"{prefix}Blood Type "

# Apparent age as tag: Adult, Teen, etc.
# It might differ from the canonical age. The canonical age is ignored because
# it would result in too many tags. A birthdate is sometimes available, but an
# age calculated from it represents neither the canonical nor the apparent age.
include_apparent_age = True
apparent_age_prefix = f"{prefix}Apparent "

# Hair length as tag: To Shoulders, To Neck, Past Waist, etc.
include_hair_length = True
hair_length_prefix = f"{prefix}Hair "

# ---------------------------------------
# ---------------------------------------
# ---------------------------------------
|
|
|
|
def readJSONInput():
    """Read all of standard input and return it decoded from JSON.

    Raises ``json.JSONDecodeError`` when stdin does not hold valid JSON.
    """
    # Named "raw_input_text" so the builtin input() is not shadowed.
    raw_input_text = sys.stdin.read()
    return json.loads(raw_input_text)
|
|
|
|
|
|
def scrapeURL(url):
    """Download ``url`` and return the page parsed with ``lxml.html.fromstring``."""
    page_source = scrapeUrlToString(url)
    return html.fromstring(page_source)
|
|
|
|
|
|
def scrapeUrlToString(url):
    """Fetch ``url`` through a cloudscraper session and return the response content.

    The process is terminated with exit status 1, after logging an error,
    when the request raises or the response status code is 400 or higher.
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
    except Exception as err:
        # "except Exception" instead of a bare "except" lets KeyboardInterrupt
        # and SystemExit propagate; the cause is logged to ease troubleshooting.
        log.error(f"scrape error: {err}")
        sys.exit(1)

    if scraped.status_code >= 400:
        log.error(f"HTTP Error: {scraped.status_code}")
        sys.exit(1)

    return scraped.content
|
|
|
|
|
|
def performerByName(query):
    """Search the site for characters matching ``query``.

    Returns a list of dicts, one per usable result link, with the keys
    ``name`` (link text), ``id`` (the href with ``characters.php?id=``
    removed) and ``url`` (the href prefixed with the site address).
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/searchall.php?in=characters&sq={cleanedQuery}"
    tree = scrapeURL(url)

    results = []
    # Name and href are read from the same <a> element. Collecting them with
    # two separate XPath queries and zipping the lists would pair the wrong
    # values as soon as one link has no text or more than one text node.
    for anchor in tree.xpath("//li/div[@class='tile3top']/a"):
        name = "".join(anchor.xpath("text()"))
        href = anchor.get("href")
        if not name or not href:
            continue  # a link without text or target cannot be offered as a result
        results.append({
            "name": name,
            "id": href.replace("characters.php?id=", ""),
            "url": "https://www.animecharactersdatabase.com/" + href,
        })
    log.info(f"scraped {len(results)} results on: {url}")
    return results
|
|
|
|
|
|
def addFranchise(query, results):
    """Append the franchise to the ``name`` of every search result.

    Args:
        query: The name that was searched for.
        results: Result dicts as produced by ``performerByName``; each must
            have the keys ``name`` and ``id``.

    Returns:
        The same list, modified in place: every ``name`` becomes
        ``"<name> (<franchise>)"`` and the ``id`` key is removed.
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_q={cleanedQuery}"
    data = json.loads(scrapeUrlToString(url))

    # Index the API search results by character id once instead of rescanning
    # the list for every result. These results are ordered by alphabet and
    # limited to 100, so short queries might not include the correct result.
    # The API query also does not seem to support any Kanji.
    search_results = data.get("search_results") if isinstance(data, dict) else None
    if not isinstance(search_results, list):
        search_results = []
    franchises = {}
    for entry in search_results:
        try:
            # setdefault keeps the first entry for an id, like a linear search would.
            franchises.setdefault(str(entry["id"]), entry["anime_name"])
        except (KeyError, TypeError):
            continue  # malformed entry: ignore it and rely on the fallback below

    count1 = 0
    count2 = 0
    for result in results:
        character_id = result.pop("id")
        if character_id in franchises:
            franchise = franchises[character_id]
            count1 += 1
        else:
            # Use separate API calls as a backup.
            # This might get you banned, since the API is rate limited.
            franchise = apiGetCharacter(character_id)["origin"]
            count2 += 1
        # Append franchise to character name for easier differentiation.
        result["name"] = f"{result['name']} ({franchise})"
    log.debug(f"scraped {count1} franchises by single API call")
    log.debug(f"scraped {count2} franchises by separate API calls")
    return results
|
|
|
|
|
|
def apiGetCharacter(id):
    """Request the API record for the character ``id`` and return the decoded JSON."""
    request_url = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_id={id}"
    response_body = scrapeUrlToString(request_url)
    return json.loads(response_body)
|
|
|
|
|
|
def performerByURL(url, result=None):
    """Scrape a character page and return its data as a performer dict.

    Args:
        url: Address of the character page.
        result: Optional dict to fill in; a new dict is created when omitted.

    Returns:
        The dict with the keys ``url``, ``name``, ``details``, ``image``,
        ``tags``, ``country``, ``height``, ``gender``, ``eye_color`` and
        ``hair_color``. ``birthdate`` is added only when the page has both a
        birthday and a birth year that form a valid date, ``measurements``
        only when bust, waist and hip are all present.
    """
    # Create the dict per call: a "{}" default would be shared between calls.
    if result is None:
        result = {}

    log.debug("performerByURL: " + url)
    tree = scrapeURL(url)
    result["url"] = url
    result["name"] = next(iter(tree.xpath(
        "//h3[@id='section001_summary']/following-sibling::p/a[contains(@href,'character')]/text()")), "").strip()
    result["details"] = "\n".join([s.strip() for s in tree.xpath(
        "//div[@style='padding: 0 15px 15px 15px; text-align: left;']/text()")])
    if not result["details"]:
        # Fall back to the text of the summary paragraph.
        summary_parts = [s.strip() for s in tree.xpath(
            "//h3[@id='section001_summary']/following-sibling::p[contains(a/@href,'character')]//text()") if
            s.strip()]
        # The dot is escaped so that only a literal trailing " ." is collapsed;
        # unescaped it would replace a space followed by ANY final character.
        result["details"] = re.sub(r" \.$", ".", " ".join(summary_parts))
    result["image"] = next(iter(tree.xpath("//meta[@property='og:image']/@content")), "")

    # left table, works for link and plain text fields, return result list
    def parse_left(field):
        template = "//table//th[text()='{0}' or a/text()='{0}']/following-sibling::td/a/text()"
        return tree.xpath(template.format(field))

    # Copy the configured list: extending it in place would modify the
    # module-level "additional_tags" itself.
    result["tags"] = list(additional_tags)
    # (enabled, tag prefix, table header) for every optional left-table tag group.
    tag_groups = (
        (include_tag, tag_prefix, "Tags "),
        (include_parody, parody_prefix, "From"),
        (include_blood_type, blood_type_prefix, "Blood Type"),
        (include_race, race_prefix, "Race"),
        (include_sign, sign_prefix, "Sign"),
        (include_ship_class, ship_class_prefix, "Ship Class"),
    )
    for enabled, group_prefix, header in tag_groups:
        if enabled:
            result["tags"] += [{"name": group_prefix + tag.strip()} for tag in parse_left(header)]

    result["country"] = next(iter(parse_left("Nationality")), "")

    birthday = parse_left("Birthday")
    birthyear = parse_left("Birthyear")
    if birthday and birthyear:
        try:
            # Month and day are parsed against a leap year so "February 29"
            # is accepted; strptime's default year (1900) would reject it.
            month_day = datetime.strptime(birthday[0].strip() + " 2000", "%B %d %Y")
            birthdate = month_day.replace(year=int(birthyear[0].strip()))
        except ValueError:
            # Unexpected format or impossible date: skip the field instead of
            # aborting the whole scrape.
            log.debug(f"could not build a birthdate from '{birthday[0]}' and '{birthyear[0]}'")
        else:
            result["birthdate"] = birthdate.strftime("%Y-%m-%d")

    bust = parse_left("Bust")
    waist = parse_left("Waist")
    hip = parse_left("Hip")
    if bust and waist and hip:
        bust = bust[0].strip().replace("cm", "")
        waist = waist[0].strip().replace("cm", "")
        hip = hip[0].strip().replace("cm", "")
        result["measurements"] = "{}-{}-{}".format(bust, waist, hip)
    result["height"] = next(iter(parse_left("Height")), "").strip().replace("cm", "")

    # middle/right table, reverse result list to prefer official appearance, return result or empty string
    def parse_right(field):
        template = "//table//th[text()='{}']/following-sibling::td/text()"
        return next(reversed(tree.xpath(template.format(field))), "").strip().replace("Unknown", "")

    # should be tagged anyway if yes
    # if parse_right("Animal Ears") == "Yes":
    #     result["tags"] += [{"name": "performer:animal ears"}]
    hair_length = parse_right("Hair Length")
    if include_hair_length and hair_length:
        result["tags"] += [{"name": hair_length_prefix + hair_length}]
    apparent_age = parse_right("Apparent Age")
    if include_apparent_age and apparent_age:
        result["tags"] += [{"name": apparent_age_prefix + apparent_age}]
    result["gender"] = parse_right("Gender")
    result["eye_color"] = parse_right("Eye Color")
    result["hair_color"] = parse_right("Hair Color")

    return result
|
|
|
|
|
|
# read the input
i = readJSONInput()

# The scrape mode is the first command line argument. A missing argument is
# treated like an unknown mode rather than raising an IndexError.
mode = sys.argv[1] if len(sys.argv) > 1 else None

if mode == "performerByURL":
    url = i["url"]
    result = performerByURL(url)
    print(json.dumps(result))
elif mode == "performerByName":
    name = i["name"]
    log.info(f"Searching for name: {name}")
    # Cut the result list down to the configured limit before the franchise lookup.
    results = performerByName(name)[:limit]
    results = addFranchise(name, results)
    print(json.dumps(results))
else:
    # Printing nothing and exiting with status 0 would hide the mistake.
    log.error(f"Unknown or missing scrape mode: {mode!r}")
    sys.exit(1)
|