# CommunityScrapers/scrapers/AnimeCharactersDatabase/AnimeCharactersDatabase.py

import json
import os
import re
import sys
from datetime import datetime

# py_common is imported from the parent directory of this script, so that
# directory has to be added to the module search path before the import below.
csd = os.path.dirname(os.path.realpath(__file__))  # directory containing this script
parent = os.path.dirname(csd)  # parent directory (should be the scrapers one)
sys.path.append(
    parent
)  # makes "import py_common" resolvable from the parent directory

# Third-party and project-local dependencies. Each import is guarded so that a
# missing module produces an installation hint on stderr instead of a traceback.
# Every failure path exits with status 1 so the caller can detect the failure.
try:
    import cloudscraper
except ModuleNotFoundError:
    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
    sys.exit(1)

try:
    import requests
except ModuleNotFoundError:
    print("You need to install the requests module. (https://docs.python-requests.org/en/latest/user/install/)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install requests", file=sys.stderr)
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
    sys.exit(1)

try:
    import py_common.log as log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr)
    sys.exit(1)

#  --------------------------------------

# This is a scraper for: animecharactersdatabase.com
#
# AnimeCharactersDatabase includes characters from:
# Anime, Hentai, (Mobile) Games, Eroge, Virtual Idols/YouTubers, Vocaloid
#
# These fields will be populated if available:
# Name, Gender, Birthdate, Country, Hair Color, Eye Color, Height, Measurements, URL, Details, Tags, Image
#
# A number of additional tags can be configured below.

# ---------------------------------------
# ---------- Tag Configuration ----------
# ---------------------------------------

# Maximum number of search results to return (between 1 and 30).
# Search by name appends the franchise to each result to make it easier to choose the correct one.
# For some names (non-ASCII, very short) the franchise has to be fetched with one API request per result.
# This might get you banned, since the API is rate limited.
# See: http://wiki.animecharactersdatabase.com/index.php?title=API_Access
limit = 15

# Common prefix reused by the performer tag prefixes below.
prefix = "performer:"

# Tags that are added to every scraped performer.
additional_tags = [{"name": "fictional"}]  # use [] for no additional tags

# Scrape the character's tags; these mostly are appearance indicators like: ahoge, dress, hat, twintails, etc.
include_tag = True
tag_prefix = prefix

# Scrape the source material as tag (name of anime/game): Kantai Collection, Idolmaster: Cinderella Girls, etc.
include_parody = True
parody_prefix = "parody:"

# Scrape Zodiac Sign as tag: Libra ♎, Sagittarius ♐, etc.
include_sign = True
sign_prefix = prefix + "sign:"

# Scrape race of non-human characters as tag: Orc, Elf, etc.
include_race = True
race_prefix = prefix + "race:"

# Scrape ship class of ship girls as tag (kancolle, etc.): Destroyer, etc.
include_ship_class = True
ship_class_prefix = prefix + "ship:"

# Scrape blood type as tag: A, B, etc.
include_blood_type = True
blood_type_prefix = prefix + "Blood Type "

# Scrape apparent age as tag: Adult, Teen, etc.
# The apparent age might differ from the canonical age.
# The canonical age is ignored, since it would result in too many tags.
# A birthdate is sometimes available, but an age calculated from it represents neither canonical nor apparent age.
include_apparent_age = True
apparent_age_prefix = prefix + "Apparent "

# Scrape Hair Length as tag: To Shoulders, To Neck, Past Waist, etc.
include_hair_length = True
hair_length_prefix = prefix + "Hair "


# ---------------------------------------
# ---------------------------------------
# ---------------------------------------

def readJSONInput():
    """Decode the JSON document supplied on standard input and return it."""
    return json.load(sys.stdin)


def scrapeURL(url):
    """Download ``url`` and return the response parsed with ``html.fromstring``."""
    markup = scrapeUrlToString(url)
    return html.fromstring(markup)


def scrapeUrlToString(url):
    """Fetch ``url`` through a cloudscraper session and return the response content.

    The process is terminated with exit status 1 (after logging the reason)
    when the request raises an exception or the response status is 400 or above.
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
    except Exception as error:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not reported as scrape failures.
        log.error(f"scrape error: {error}")
        sys.exit(1)

    if scraped.status_code >= 400:
        log.error('HTTP Error: %s' % scraped.status_code)
        sys.exit(1)

    return scraped.content


def performerByName(query):
    """Run a character search for ``query`` and return the hits as a list of dicts.

    Every dict carries the link text as "name", the href with the
    "characters.php?id=" part removed as "id", and the absolute page address
    as "url".
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/searchall.php?in=characters&sq={cleanedQuery}"
    page = scrapeURL(url)
    link_texts = page.xpath("//li/div[@class='tile3top']/a/text()")
    link_targets = page.xpath("//li/div[@class='tile3top']/a/@href")

    # Texts and hrefs are collected separately and paired up by position.
    results = [
        {
            "name": text,
            "id": href.replace("characters.php?id=", ""),
            "url": "https://www.animecharactersdatabase.com/" + href,
        }
        for text, href in zip(link_texts, link_targets)
    ]
    log.info(f"scraped {len(results)} results on: {url}")
    return results


def addFranchise(query, results):
    """Append the franchise to the name of every search result.

    One API search request is made for ``query``. For each result the franchise
    is looked up in that response by character id; when it cannot be found
    there, the character is requested individually from the API instead.

    Args:
        query: The name that was searched for.
        results: List of dicts with "name" and "id" keys. The dicts are
            modified in place: "name" becomes "name (franchise)" and "id" is
            removed.

    Returns:
        The same ``results`` list.
    """
    cleanedQuery = requests.utils.quote(query)
    url = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_q={cleanedQuery}"
    data = json.loads(scrapeUrlToString(url))
    count1 = 0
    count2 = 0
    for result in results:
        try:
            # Try to find the franchise in API search results.
            # These results are ordered by alphabet and limited to 100,
            # so short queries might not include the correct result.
            # The API query also does not seem to support any Kanji.
            franchise = next(e["anime_name"] for e in data["search_results"] if str(e["id"]) == result["id"])
        except (StopIteration, KeyError, TypeError):
            # No match (StopIteration) or a response without the expected
            # structure (KeyError/TypeError).
            # Use separate API calls as a backup.
            # This might get you banned, since the API is rate limited.
            franchise = apiGetCharacter(result["id"])["origin"]
            count2 += 1
        else:
            count1 += 1
        # Append franchise to character name for easier differentiation.
        result["name"] = f"{result['name']} ({franchise})"
        result.pop("id")
    log.debug(f"scraped {count1} franchises by single API call")
    log.debug(f"scraped {count2} franchises by separate API calls")
    return results


def apiGetCharacter(id):
    """Request a single character from the API by id and return the decoded JSON response."""
    endpoint = f"https://www.animecharactersdatabase.com/api_series_characters.php?character_id={id}"
    payload = scrapeUrlToString(endpoint)
    return json.loads(payload)


def _parse_birthdate(birthday, birthyear):
    """Combine a "Month day" string and a year string into "YYYY-MM-DD".

    Returns None when either part cannot be parsed or when the combination is
    not a valid calendar date.
    """
    try:
        year = int(birthyear.strip())
        # Parse month and day against a leap year: without an explicit year
        # strptime assumes 1900, which rejects "February 29".
        month_day = datetime.strptime(birthday.strip() + " 2000", "%B %d %Y")
        return datetime(year, month_day.month, month_day.day).date().isoformat()
    except (ValueError, OverflowError):
        return None


def performerByURL(url, result=None):
    """Scrape a character page and return the performer fields as a dict.

    Args:
        url: Address of the character page to scrape.
        result: Optional dict to fill in; a new dict is created when omitted.

    Returns:
        The dict with "url", "name", "details", "image", "tags", "country",
        "height", "gender", "eye_color" and "hair_color" set, plus "birthdate"
        and "measurements" when the page provides the values they need.
    """
    if result is None:
        # Create the dict per call; a mutable default would be shared between calls.
        result = {}
    log.debug("performerByURL: " + url)
    tree = scrapeURL(url)
    result["url"] = url
    result["name"] = next(iter(tree.xpath(
        "//h3[@id='section001_summary']/following-sibling::p/a[contains(@href,'character')]/text()")), "").strip()
    result["details"] = "\n".join([s.strip() for s in tree.xpath(
        "//div[@style='padding: 0 15px 15px 15px; text-align: left;']/text()")])
    if not result["details"]:
        # Fall back to the summary paragraph. Its text nodes are joined with
        # spaces, which can leave " ." at the end; only that literal trailing
        # period is tightened (the dot is escaped so other endings stay intact).
        summary = " ".join([s.strip() for s in tree.xpath(
            "//h3[@id='section001_summary']/following-sibling::p[contains(a/@href,'character')]//text()") if
                            s.strip()])
        result["details"] = re.sub(r" \.$", ".", summary)
    result["image"] = next(iter(tree.xpath("//meta[@property='og:image']/@content")), "")

    # left table, works for link and plain text fields, return result list
    def parse_left(field):
        template = "//table//th[text()='{0}' or a/text()='{0}']/following-sibling::td/a/text()"
        return tree.xpath(template.format(field))

    # Start from a copy so the module-level additional_tags list is not extended.
    result["tags"] = list(additional_tags)
    # (enabled, prefix, table header) for every tag source of the left table, in output order.
    # NOTE(review): "Tags " keeps its trailing space from the original selector — presumably
    # it matches the page markup; confirm before changing it.
    tag_sources = (
        (include_tag, tag_prefix, "Tags "),
        (include_parody, parody_prefix, "From"),
        (include_blood_type, blood_type_prefix, "Blood Type"),
        (include_race, race_prefix, "Race"),
        (include_sign, sign_prefix, "Sign"),
        (include_ship_class, ship_class_prefix, "Ship Class"),
    )
    for enabled, label_prefix, field in tag_sources:
        if enabled:
            result["tags"] += [{"name": label_prefix + tag.strip()} for tag in parse_left(field)]
    result["country"] = next(iter(parse_left("Nationality")), "")
    birthday = parse_left("Birthday")
    birthyear = parse_left("Birthyear")
    if birthday and birthyear:
        birthdate = _parse_birthdate(birthday[0], birthyear[0])
        if birthdate:
            result["birthdate"] = birthdate
        else:
            # An unusable date should not abort the whole scrape.
            log.debug(f"could not parse birthdate from: {birthday[0]!r} / {birthyear[0]!r}")
    bust = parse_left("Bust")
    waist = parse_left("Waist")
    hip = parse_left("Hip")
    if bust and waist and hip:
        bust = bust[0].strip().replace("cm", "")
        waist = waist[0].strip().replace("cm", "")
        hip = hip[0].strip().replace("cm", "")
        result["measurements"] = "{}-{}-{}".format(bust, waist, hip)
    result["height"] = next(iter(parse_left("Height")), "").strip().replace("cm", "")

    # middle/right table, reverse result list to prefer official appearance, return result or empty string
    def parse_right(field):
        template = "//table//th[text()='{}']/following-sibling::td/text()"
        return next(reversed(tree.xpath(template.format(field))), "").strip().replace("Unknown", "")

    # should be tagged anyway if yes
    # if parse_right("Animal Ears") == "Yes":
    #     result["tags"] += [{"name": "performer:animal ears"}]
    hair_length = parse_right("Hair Length")
    if include_hair_length and hair_length:
        result["tags"] += [{"name": hair_length_prefix + hair_length}]
    apparent_age = parse_right("Apparent Age")
    if include_apparent_age and apparent_age:
        result["tags"] += [{"name": apparent_age_prefix + apparent_age}]
    result["gender"] = parse_right("Gender")
    result["eye_color"] = parse_right("Eye Color")
    result["hair_color"] = parse_right("Hair Color")

    return result


# Script entry: the scrape mode is the first command line argument, the query
# is a JSON object on stdin, and the result is printed as JSON on stdout.
if len(sys.argv) < 2:
    # Checked before reading stdin so a wrong invocation fails with a clear message.
    log.error("Missing scrape mode argument (expected performerByURL or performerByName)")
    sys.exit(1)

# read the input
i = readJSONInput()

if sys.argv[1] == "performerByURL":
    url = i["url"]
    result = performerByURL(url)
    print(json.dumps(result))
elif sys.argv[1] == "performerByName":
    name = i["name"]
    log.info(f"Searching for name: {name}")
    results = performerByName(name)[:limit]
    results = addFranchise(name, results)
    print(json.dumps(results))
else:
    # Without this branch an unsupported mode would print nothing and exit with status 0.
    log.error(f"Unknown scrape mode: {sys.argv[1]}")
    sys.exit(1)