# Stash community scraper for MissaX / All Her Luv: scrapes scene metadata from the
# scene page, then searches the site by the scraped title to grab a hi-res cover
# from the search results.
import base64
import datetime
import json
import re
import sys
import urllib.parse
# extra modules below need to be installed
try:
    import py_common.log as log
except ModuleNotFoundError:
    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
    sys.exit()
try:
    import cloudscraper
except ModuleNotFoundError:
    log.error("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)")
    log.error("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper")
    sys.exit()
try:
    from lxml import html
except ModuleNotFoundError:
    log.error("You need to install the lxml module. (https://lxml.de/installation.html#installation)")
    log.error("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml")
    sys.exit()

STUDIO_MAP = {
    'https://missax.com/tour/': 'MissaX',
    'https://allherluv.com/tour/': 'All Her Luv',
}
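# optional proxies for the underlying requests session, e.g. {"https": "http://127.0.0.1:8080"}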
PROXIES = {}
TIMEOUT = 10
MAX_PAGES_TO_SEARCH = 20

def scraped_content(scraper, url):
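    """Fetch a URL through the cloudscraper session and return the raw response body."""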
    try:
        scraped = scraper.get(url, timeout=TIMEOUT, proxies=PROXIES)
    except Exception as e: #bail out here instead of hitting an unbound 'scraped' below
        log.error(f"scrape error: {e}")
        sys.exit()
    if scraped.status_code >= 400:
        log.error(f"HTTP Error: {scraped.status_code}")
    return scraped.content

def scrape_scene_page(scraper, url): #scrape the main url
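    """Scrape title, date, studio, performers, tags, details and cover from a scene page."""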
    tree = scraped_content(scraper, url) #get page content
    tree = html.fromstring(tree) #parse html
    title = tree.xpath('//p[@class="raiting-section__title"]/text()')[0].strip() #title scrape
    log.debug(f'Title: {title}')
    date = tree.xpath('//p[@class="dvd-scenes__data" and contains(text(), " Added:")]/text()[1]')[0] #get date
    date = re.sub(r"(?:.+Added:\s)([\d/]*).+", r'\g<1>', date).strip() #date cleanup
    date = datetime.datetime.strptime(date, "%m/%d/%Y").strftime("%Y-%m-%d") #date parse
    log.debug(f'Date: {date}')
    studio = tree.xpath('//base/@href')[0].strip() #studio scrape
    studio = studio.replace("www.", "")
    studio = STUDIO_MAP.get(studio) #map the base URL to a studio name
    log.debug(f'Studio: {studio}')
    performers = tree.xpath('//p[@class="dvd-scenes__data" and contains(text(), "Featuring:")]//a/text()') #performers scrape
    log.debug(f'Performers: {performers}')
    tags = tree.xpath('//p[@class="dvd-scenes__data" and contains(text(), "Categories:")]//a/text()') #tags scrape
    log.debug(f'Tags: {tags}')
    details = tree.xpath('//p[@class="dvd-scenes__title"]/following-sibling::p//text()') #details scrape
    details = ''.join(details) #join the text nodes
    details = '\n'.join(' '.join(line.split()) for line in details.split('\n')) #collapse double spaces
    details = re.sub(r"\r?\n\n?", r'\n', details) #collapse double newlines
    log.debug(f'Details: {details}')
    bad_cover_url = tree.xpath("//img[@src0_4x]/@src0_4x")[0] #low-res cover from the scene page, used only if the search below fails
    datauri = "data:image/jpeg;base64,"
    b64img = scrape_cover(scraper, studio, title, bad_cover_url)
    return output_json(title, tags, date, details, datauri, b64img, studio, performers)

def scrape_cover(scraper, studio, title, bad_cover_url):
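    """Search the studio site for the scene title and return a base64-encoded hi-res cover.

    Walks up to MAX_PAGES_TO_SEARCH pages of search results looking for an <img> whose
    alt text matches the title exactly; falls back to bad_cover_url if nothing matches.
    """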
    p = 1
    # loop through search result pages until an image is found
    while p < MAX_PAGES_TO_SEARCH:
        log.debug(f'Searching page {p} for cover')
        url = f'https://{studio.replace(" ", "")}.com/tour/search.php?st=advanced&qall=&qany=&qex={urllib.parse.quote(title)}&none=&tadded=0&cat%5B%5D=5&page={p}'
        tree = scraped_content(scraper, url) #get page content
        tree = html.fromstring(tree) #parse html
        if tree.xpath('//*[@class="photo-thumb video-thumb"]'): #any search results present?
            try:
                imgurl = tree.xpath(f'//img[@alt="{title}"]/@src0_4x')[0]
                img = scraped_content(scraper, imgurl)
                b64img = base64.b64encode(img)
                log.debug('Cover found!')
                return b64img
            except IndexError: #no exact title match on this page
                if tree.xpath('//li[@class="active"]/following-sibling::li'): #is there a next page?
                    p += 1
                else:
                    break
        else:
            break
    #failsafe: fall back to the low-res cover from the scene page
    log.warning('Better cover not found, returning the low-res one')
    img = scraped_content(scraper, bad_cover_url)
    b64img = base64.b64encode(img)
    return b64img

def output_json(title, tags, date, details, datauri, b64img, studio, performers):
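    """Assemble the scraped fields into the JSON fragment Stash expects on stdout."""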
    return {
        'title': title,
        'tags': [{'name': x} for x in tags],
        'date': date,
        'details': details.strip(),
        'image': datauri + b64img.decode('utf-8'),
        'studio': {'name': studio},
        'performers': [{'name': x.strip()} for x in performers],
    }

# FRAGMENT = {"url": "https://allherluv.com/tour/trailers/Like-I-Do.html"}
FRAGMENT = json.loads(sys.stdin.read())
if not FRAGMENT.get('url'):
    log.error('No URL entered.')
    sys.exit()
url = FRAGMENT["url"]
scraper = cloudscraper.create_scraper()
ret = scrape_scene_page(scraper, url)
print(json.dumps(ret))
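
# Manual test from a terminal (a sketch; assuming this file is saved as, e.g., MissaX.py):
#   echo '{"url": "https://allherluv.com/tour/trailers/Like-I-Do.html"}' | python MissaX.py
# In normal use Stash invokes the script itself and passes the scene fragment on stdin.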