Adding initial support for a wikidata based scraper. (#673)

* Adding initial support for a wikidata based scraper.
Wikidata is the database behind Wikipedia and can be used to power the info boxes on Wikipedia pages.
This database can have a lot of useful info such as height, weight, country, twitter handles etc.

Currently this is a WIP.
I might rewrite the performer processing with a Python script, as the entries would be easier to process that way.

* rename yml, fix check due to date format

* Adding more fields, hair color, country.

* Adding images from wikimedia commons.

* Fetch the wikipedia page for descriptions and fetch label for gender

* Updating a few more fields and removing + from units.

* Updating search terms to include more professions.
The search now looks for people with profession Pornographic Actor (Q488111), Glamour Model (Q3286043), Playboy Playmate (Q728711), AV Idol (Q1079215).

* Small fixes: convert strings to lower case before comparing, convert spaces to %20, and remove "." from height, since the unit in Wikidata is metres while stash expects cm.

---------

Co-authored-by: Tweeticoats <Tweeticoats@github.com>
Co-authored-by: bnkai <48220860+bnkai@users.noreply.github.com>
This commit is contained in:
Tweeticoats 2024-02-07 19:59:00 +10:30 committed by GitHub
parent b360649e4f
commit 96698c06cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

153
scrapers/WikiData.yml Normal file
View File

@ -0,0 +1,153 @@
# Stash scraper for Wikidata (https://www.wikidata.org), the structured
# database behind Wikipedia.
name: Wikidata

# Performer search by name: sends a URL-encoded SPARQL query to the public
# Wikidata query service. The query selects entities with occupation
# (wdt:P106) "pornographic actor" (wd:Q488111) whose English label starts
# with the search term; {} is the placeholder stash substitutes, and lcase()
# makes the prefix match case-insensitive. Results are requested as JSON.
# NOTE(review): the commit message mentions additional professions
# (Q3286043, Q728711, Q1079215), but only Q488111 appears in this query —
# confirm whether the wider search was intended.
performerByName:
  action: scrapeJson
  queryURL: https://query.wikidata.org/sparql?query=SELECT%20%3Fpornographic_actor%20%3Fpornographic_actorLabel%20WHERE%20%7B%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20%3Fpornographic_actor%20wdt%3AP106%20wd%3AQ488111%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20rdfs%3Alabel%20%3Flabel.%0A%20%20FILTER(LANG(%3Flabel)%20%3D%20%22en%22).%0A%20%20FILTER(STRSTARTS(lcase(%3Flabel)%2C%20lcase(%22{}%22)))%7D&format=json
  scraper: performerSearch

# Performer lookup by URL: matches https://www.wikidata.org/wiki/Q... pages
# and rewrites them to the machine-readable Special:EntityData endpoint
# (.../Special:EntityData/Qnnn.json) before scraping.
performerByURL:
  - action: scrapeJson
    url:
      - https://www.wikidata.org/wiki/Q
    queryURL: "{url}"
    queryURLReplace:
      url:
        - regex: https://www.wikidata.org/wiki/
          with: https://www.wikidata.org/wiki/Special:EntityData/
        # Append .json to request the entity data as JSON.
        - regex: $
          with: .json
    scraper: performerScraper
jsonScrapers:
  # Parses the SPARQL JSON result set returned by performerByName.
  performerSearch:
    performer:
      # '#' iterates over every binding in the result set (GJSON path syntax).
      Name: results.bindings.#.pornographic_actorLabel.value
      URL:
        selector: results.bindings.#.pornographic_actor.value
        postProcess:
          # The query service returns canonical entity URIs
          # (http://www.wikidata.org/entity/Qnnn); rewrite them to the
          # browsable wiki URL so they match performerByURL above.
          - replace:
              - regex: http:\/\/www.wikidata.org\/entity\/
                with: https://www.wikidata.org/wiki/
performerScraper:
performer:
Name: entities.*.labels.en.value
Aliases:
selector: entities.*.aliases.en.#.value
concat: ", "
Image:
selector: entities.*.claims.P18.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: \s
with: "%20" # spaces cause 400 error
- regex: ^
with: "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/"
Weight:
selector: entities.*.claims.P2067.#.mainsnak.datavalue.value.amount
postProcess:
- replace:
- regex: \+
with:
Birthdate:
selector: entities.*.claims.P569.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}-\d{1,2}-\d{1,2}).*
with: $1
DeathDate:
selector: entities.*.claims.P570.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}-\d{1,2}-\d{1,2}).*
with: $1
Height:
selector: entities.*.claims.P2048.#.mainsnak.datavalue.value.amount
postProcess:
- replace:
- regex: \+
with:
- regex: \.
with:
CareerLength:
selector: entities.*.claims.P2031.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}).*
with: $1
Gender:
selector: entities.*.claims.P21.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
HairColor:
selector: entities.*.claims.P1884.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
EyeColor:
selector: entities.*.claims.P1340.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
Ethnicity:
selector: entities.*.claims.P172.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
Country:
selector: entities.*.claims.P27.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
# Personal preference, keep the wikidata url instead of the official website of the performer
# URL:
# selector: entities.*.claims.P856.#.mainsnak.datavalue.value
Twitter:
selector: entities.*.claims.P2002.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: ^
with: "https://twitter.com/"
Instagram:
selector: entities.*.claims.P2003.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: ^
with: "https://www.instagram.com/"
Details:
selector: entities.*.sitelinks.enwiki.title
postProcess:
- replace:
- regex: " "
with: "_"
- regex: ^
with: "https://en.wikipedia.org/w/api.php?action=query&origin=*&prop=extracts&explaintext&titles="
- regex: $
with: "&format=json"
- subScraper:
selector: query.pages.*.extract
# Last Updated August 29, 2021