Add JavHub xPath scene scraper (#919)

This commit is contained in:
bnkai 2022-02-28 18:50:13 +02:00 committed by GitHub
parent 72c24379dd
commit 472fa90ff3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 61 additions and 0 deletions

View File

@ -585,6 +585,7 @@ japanlust.com|Arx.yml|:heavy_check_mark:|:x:|:x:|:x:|Python|-
javdb.com|javdb.yml|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|-|Database
javdb36.com|javdb.yml|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|-|Database
javhd.com|JavHD.yml|:heavy_check_mark:|:x:|:x:|:x:|-|JAV Uncensored
javhub.com|JavHub.yml|:heavy_check_mark:|:x:|:x:|:x:|-|JAV Uncensored
jav.land|JavLand.yml|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|-|JAV
javlibrary.com|JavLibrary.yml|:heavy_check_mark:|:x:|:heavy_check_mark:|:x:|-|JAV
javlibrary.com|JavLibrary_python.yml|:heavy_check_mark:|:x:|:x:|:x:|Python|JAV

60
scrapers/JavHub.yml Normal file
View File

@ -0,0 +1,60 @@
# Scraper definition for JavHub: declares which lookups are supported and
# which xPath scraper handles each of them.
name: JavHub

# Scene lookup by URL: URLs matching javhub.com are handled by sceneScraper.
sceneByURL:
  - action: scrapeXPath
    url:
      - javhub.com
    scraper: sceneScraper

# Scene lookup by name: the site search page is handled by sceneSearch.
# `{}` is the placeholder for the search term (stash scraper convention).
sceneByName:
  action: scrapeXPath
  queryURL: 'https://tour.javhub.com/search?s={}'
  scraper: sceneSearch

# Follow-up lookup for a selected search result: the fragment's own URL is
# loaded and handled by sceneScraper.
sceneByQueryFragment:
  action: scrapeXPath
  queryURL: "{url}"
  scraper: sceneScraper
xPathScrapers:
  # Extracts the metadata of one scene from a scene page.
  sceneScraper:
    scene:
      Details: '//p[@class="MsoNormal"]'
      Performers:
        Name: '//div[@class="model-wrap"]//h5'
      # The poster attribute of the video element is used as the cover image.
      Image: //video/@poster
      Title: '//h1[@class="title"]'
      Date:
        selector: '//div[@class="container content-details-wrap"]//span[@class="pub-date"]/text()'
        # Anchored as ppDate so sceneSearch can reuse the same steps.
        postProcess: &ppDate
          - replace:
              # drop the leading text and keep only the "Month D, YYYY" tail
              - regex: '.+\s+([a-zA-Z]+\s+\d+,\s\d+)'
                with: $1
          # NOTE(review): this looks like a Go reference-date layout where
          # `02` expects a zero-padded day - confirm the site pads
          # single-digit days, otherwise `2` would be the safer layout.
          - parseDate: January 02, 2006
      Studio:
        Name:
          fixed: JavHub
      URL: '//input[starts-with(@id,"copy-url")]/@value'

  # Extracts one scene candidate per entry of the search results page.
  sceneSearch:
    common:
      # ignore search results that have join links (https://tour.javhub.com/join)
      $content: '//div[@class="content-item"][div[a[not(@href="https://tour.javhub.com/join")]]]'
    scene:
      Image:
        selector: $content//a/@data-images
        postProcess:
          - replace:
              # keep only the first https URL ending in 01.jpg
              - regex: '^.+(https:[^&]+01\.jpg).*'
                with: $1
              # turn each escaped slash (\/) into a plain slash
              - regex: '\\/'
                with: "/"
      Title: '$content//h3[@class="title"]'
      URL: '$content//h3[@class="title"]/a/@href'
      Date:
        selector: '$content//span[@class="pub-date"]/text()'
        postProcess: *ppDate
      # show duration to avoid false matches
      # there are duplicate scenes and scenes with identical titles
      Details:
        selector: '$content//span[@class="video-duration"]/text()'
        postProcess:
          - replace:
              # `^` matches the start of the value, so this prepends the label
              - regex: ^
                with: "Duration "
# Last Updated February 26, 2022