import json
import re
import sys
import time

import requests

# Initialize a shared session so connections and cookies are reused across requests
session = requests.session()

try:
    import py_common.log as log
except ModuleNotFoundError:
    print(
        "You need to download the folder 'py_common' from the community repo (CommunityScrapers/tree/master/scrapers/py_common)",
        file=sys.stderr)
    sys.exit(1)

try:
    from lxml import html
except ModuleNotFoundError:
    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml",
          file=sys.stderr)
    # Exit non-zero so the failure is visible to the caller
    sys.exit(1)

# --------------------------------------------
# This is a scraper for: RealityLovers sites
# (realitylovers.com and tsvirtuallovers.com)
# --------------------------------------------
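
# Stash invokes this script with the operation name as argv[1] and a JSON
# fragment on stdin. An illustrative manual invocation (the script file name
# and URL are placeholders, not taken from this repo):
#
#   echo '{"url": "https://realitylovers.com/..."}' | python RealityLovers.py performerByURL
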
def get_scraped(inp):
    if not inp.get('url'):
        log.error('No URL entered')
        return None

    # Fetch the URL from the input and validate the response code
    scraped = session.get(inp['url'])
    if scraped.status_code >= 400:
        log.error('HTTP Error: %s' % scraped.status_code)
        return None
    log.trace('Scraped the url: ' + inp['url'])
    return scraped

def performerByURL():
    # Read the input; a URL must be passed in for the performerByURL call
    inp = json.loads(sys.stdin.read())
    scraped = get_scraped(inp)
    if not scraped:
        return {}

    # Parse the fields we can scrape directly from the page
    tree = html.fromstring(scraped.content)
    name = "".join(tree.xpath('//div[@class="girlDetails-details"]/h1/text()')).strip()
    main_image_src = re.sub(r'.*1x,(.*) 2x', r'\1', tree.xpath('//img[@class="girlDetails-posterImage"]/@srcset')[0])
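    # The substitution above keeps the high-resolution candidate from the
    # srcset attribute. Illustrative value (not real site data): given
    #   "poster_small.jpg 1x,poster_large.jpg 2x"
    # the pattern captures everything between "1x," and " 2x", leaving
    # "poster_large.jpg".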
    # Strip the ordinal suffix (1st, 2nd, 3rd, ...) so strptime can parse the date
    birthdate = time.strptime(
        re.sub(r"(st|nd|rd|th)", r"", tree.xpath('//*[text()="Birthday:"]/../following-sibling::dd/text()')[0].strip()),
        '%b %d %Y')
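    # Illustrative example of the date handling above (made-up value):
    #   "Mar 3rd 1993" -> "Mar 3 1993" -> time.struct_time for 1993-03-03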
    country = tree.xpath('//*[text()="Country:"]/../following-sibling::dd/text()')[0].strip()
    measurements = tree.xpath('//*[text()="Cup:"]/../following-sibling::dd/text()')[0].strip()
    # Extract the metric value from strings like "... (170 cm)" / "... (55 kg)"
    height = re.sub(r".*\(([0-9]*) cm\)",
                    r"\1",
                    tree.xpath('//*[text()="Height:"]/../following-sibling::dd/text()')[0].strip())
    weight = re.sub(r".*\(([0-9]*) kg\)",
                    r"\1",
                    tree.xpath('//*[text()="Weight:"]/../following-sibling::dd/text()')[0].strip())
    socials = tree.xpath('//a[@class="girlDetails-socialIcon"]/@href')
    tags = tree.xpath('//a[@itemprop="keyword"]/text()')
    details = tree.xpath('//p[@class="girlDetails-bioText"]/text()')[0].strip()

    return {
        "Name": name,
        # Assumption carried over from the site split: performers on
        # tsvirtuallovers.com are listed as transgender female
        "Gender": "transgender_female" if re.match(r'.*tsvirtuallovers.*', inp['url'], re.IGNORECASE) else "female",
        "URL": inp['url'],
        "Twitter": next((social for social in socials if re.match(r'.*twitter.*', social, re.IGNORECASE)), ""),
        "Instagram": next((social for social in socials if re.match(r'.*instagram.*', social, re.IGNORECASE)), ""),
        "Birthdate": time.strftime("%Y-%m-%d", birthdate),
        "Country": country,
        "Height": height,
        "Weight": weight,
        "Measurements": measurements,
        "Tags": [{'name': x} for x in tags],
        "Image": main_image_src,
        "Details": details,
    }


def sceneByURL():
    # Read the input; a URL must be passed in for the sceneByURL call
    inp = json.loads(sys.stdin.read())
    scraped = get_scraped(inp)
    if not scraped:
        return {}

    # Parse the fields we can scrape directly from the page
    tree = html.fromstring(scraped.content)
    title = "".join(tree.xpath('//*[@class="video-detail-name"]/text()')).strip()
    details = tree.xpath('//*[@itemprop="description"]/text()[1]')[0].strip()
    details = details + " " + tree.xpath('//*[@itemprop="description"]/span/text()')[0].strip()
    images = tree.xpath('//*[@itemprop="thumbnail"]/@data-big')
    image_url = ""
    if len(images) > 0:
        # Each entry is a srcset-style value; keep only the URL before the first space
        image_url = re.sub(r' .*', r'', images[0])
    tags = tree.xpath('//a[@itemprop="keyword"]/text()')
    actors = tree.xpath('//a[@itemprop="actor"]/text()')
    studio = tree.xpath('//span[@class="icon header-logo"]/text()')[0]

    # Create our output
    return {
        'title': title,
        'tags': [{'name': x} for x in tags],
        'details': details,
        'image': image_url,
        'studio': {'name': studio},
        'performers': [{'name': x} for x in actors]
    }


# Get the scene by fragment; the title is used as the search term. Returns the list of matches.
# This is a script rather than a plain JSON scraper because the search endpoint requires a POST.
def sceneByName():
    # Read the input; a title or name must be passed in
    inp = json.loads(sys.stdin.read())
    log.trace("Input: " + json.dumps(inp))
    query_value = inp['title'] if 'title' in inp else inp['name']
    if not query_value:
        log.error('No title or name entered')
        return []
    log.trace("Query Value: " + query_value)

    # Call the search endpoint with the query and validate the response code.
    # The tc parameter carries the current timestamp, presumably to bypass caching.
    data = {
        "sortBy": "MOST_RELEVANT",
        "searchQuery": query_value,
        "offset": 0,
        "isInitialLoad": True,
        "videoView": "MEDIUM",
        "device": "DESKTOP"
    }
    scraped_scenes = session.post(f"https://realitylovers.com/videos/search?tc={time.time()}",
                                  data=data)
    log.debug("Called: " + scraped_scenes.url + " with body: " + json.dumps(data))
    if scraped_scenes.status_code >= 400:
        log.error('HTTP Error: %s' % scraped_scenes.status_code)
        return []

    # Parse the JSON search response
    scenes = json.loads(scraped_scenes.content)
    log.debug("Scenes: " + json.dumps(scenes))
    results = []
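    # Response shape assumed by the loop below, reconstructed from the fields
    # it reads (values are illustrative, not real data):
    #   {"contents": [{"title": "Example Scene",
    #                  "released": "Sep 27th 2018",
    #                  "mainImageSrcset": "thumb_small.jpg 1x,thumb_large.jpg 2x",
    #                  "videoUri": "example-scene"}]}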
    for scene in scenes['contents']:
        # Parse the date published; drop the ordinal suffix (as in "Sep 27th 2018") via a regex
        published = time.strptime(re.sub(r"(st|nd|rd|th)", r"", scene['released']), '%b %d %Y')
        # Keep the high-resolution candidate from the srcset
        main_image_src = re.sub(r'.*1x,(.*) 2x', r'\1', scene['mainImageSrcset'])
        log.debug("Image: " + main_image_src)
        # Add the new scene to the results
        results.append({
            'Title': scene['title'],
            'URL': "https://realitylovers.com/" + scene['videoUri'],
            'Image': main_image_src,
            'Date': time.strftime("%Y-%m-%d", published)
        })

    # Return all matches
    return results


# Figure out which operation Stash invoked and call the matching function
if sys.argv[1] == "performerByURL":
    print(json.dumps(performerByURL()))
elif sys.argv[1] == "sceneByURL":
    print(json.dumps(sceneByURL()))
elif sys.argv[1] == "sceneByName":
    scenes = sceneByName()
    print(json.dumps(scenes))
elif sys.argv[1] == "sceneByQueryFragment" or sys.argv[1] == "sceneByFragment":
    scenes = sceneByName()
    if len(scenes) > 0:
        # Return the first (most relevant) search result
        print(json.dumps(scenes[0]))
    else:
        # Empty object when there are no results
        log.info("No results")
        print("{}")
else:
    log.error("Unknown argument passed: " + sys.argv[1])
    print(json.dumps({}))