Adding initial support for a wikidata based scraper. (#673)

* Adding initial support for a wikidata based scraper.
Wikidata is the database behind Wikipedia and can be used to power the info boxes on Wikipedia pages.
This database can have a lot of useful info such as height, weight, country, twitter handles etc.

Currently this is a WIP.
I might rewrite the performer processing with a Python script, as the entries would be easier to process that way.

* rename yml, fix check due to date format

* Adding more fields, hair color, country.

* Adding images from wikimedia commons.

* Fetch the wikipedia page for descriptions and fetch label for gender

* Updating a few more fields and removing + from units.

* Updating search terms to include more professions.
The search now looks for people with profession Pornographic Actor (Q488111), Glamour Model (Q3286043), Playboy Playmate (Q728711), AV Idol (Q1079215).

* Small fixes: convert strings to lower case before comparing, convert spaces to %20, and remove "." from height, since the unit in Wikidata is metres while stash expects cm.

---------

Co-authored-by: Tweeticoats <Tweeticoats@github.com>
Co-authored-by: bnkai <48220860+bnkai@users.noreply.github.com>
This commit is contained in:
Tweeticoats 2024-02-07 19:59:00 +10:30 committed by GitHub
parent b360649e4f
commit 96698c06cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

153
scrapers/WikiData.yml Normal file
View File

@ -0,0 +1,153 @@
# Stash scraper for Wikidata (https://www.wikidata.org), the structured
# database behind Wikipedia.
name: Wikidata

# Performer search by name: sends a URL-encoded SPARQL query to the public
# Wikidata query service. The query selects entities with occupation
# (wdt:P106) "pornographic actor" (wd:Q488111) whose English label starts
# with the search term; {} is the placeholder stash substitutes, and lcase()
# makes the prefix match case-insensitive. Results are requested as JSON.
# NOTE(review): the commit message mentions additional professions
# (Q3286043, Q728711, Q1079215), but only Q488111 appears in this query —
# confirm whether the wider search was intended.
performerByName:
  action: scrapeJson
  queryURL: https://query.wikidata.org/sparql?query=SELECT%20%3Fpornographic_actor%20%3Fpornographic_actorLabel%20WHERE%20%7B%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22.%20%7D%0A%20%20%3Fpornographic_actor%20wdt%3AP106%20wd%3AQ488111%3B%0A%20%20%20%20%20%20%20%20%20%20%20%20rdfs%3Alabel%20%3Flabel.%0A%20%20FILTER(LANG(%3Flabel)%20%3D%20%22en%22).%0A%20%20FILTER(STRSTARTS(lcase(%3Flabel)%2C%20lcase(%22{}%22)))%7D&format=json
  scraper: performerSearch

# Performer lookup by URL: matches https://www.wikidata.org/wiki/Q... pages
# and rewrites them to the machine-readable Special:EntityData endpoint
# (.../Special:EntityData/Qnnn.json) before scraping.
performerByURL:
  - action: scrapeJson
    url:
      - https://www.wikidata.org/wiki/Q
    queryURL: "{url}"
    queryURLReplace:
      url:
        - regex: https://www.wikidata.org/wiki/
          with: https://www.wikidata.org/wiki/Special:EntityData/
        # Append .json to request the entity data as JSON.
        - regex: $
          with: .json
    scraper: performerScraper
jsonScrapers:
  # Parses the SPARQL JSON result set returned by performerByName.
  performerSearch:
    performer:
      # '#' iterates over every binding in the result set (GJSON path syntax).
      Name: results.bindings.#.pornographic_actorLabel.value
      URL:
        selector: results.bindings.#.pornographic_actor.value
        postProcess:
          # The query service returns canonical entity URIs
          # (http://www.wikidata.org/entity/Qnnn); rewrite them to the
          # browsable wiki URL so they match performerByURL above.
          - replace:
              - regex: http:\/\/www.wikidata.org\/entity\/
                with: https://www.wikidata.org/wiki/
performerScraper:
performer:
Name: entities.*.labels.en.value
Aliases:
selector: entities.*.aliases.en.#.value
concat: ", "
Image:
selector: entities.*.claims.P18.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: \s
with: "%20" # spaces cause 400 error
- regex: ^
with: "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/"
Weight:
selector: entities.*.claims.P2067.#.mainsnak.datavalue.value.amount
postProcess:
- replace:
- regex: \+
with:
Birthdate:
selector: entities.*.claims.P569.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}-\d{1,2}-\d{1,2}).*
with: $1
DeathDate:
selector: entities.*.claims.P570.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}-\d{1,2}-\d{1,2}).*
with: $1
Height:
selector: entities.*.claims.P2048.#.mainsnak.datavalue.value.amount
postProcess:
- replace:
- regex: \+
with:
- regex: \.
with:
CareerLength:
selector: entities.*.claims.P2031.#.mainsnak.datavalue.value.time
postProcess:
- replace:
- regex: .*(\d{4}).*
with: $1
Gender:
selector: entities.*.claims.P21.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
HairColor:
selector: entities.*.claims.P1884.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
EyeColor:
selector: entities.*.claims.P1340.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
Ethnicity:
selector: entities.*.claims.P172.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
Country:
selector: entities.*.claims.P27.#.mainsnak.datavalue.value.numeric-id
postProcess:
- replace:
- regex: ^
with: "https://www.wikidata.org/wiki/Special:EntityData/Q"
- regex: $
with: .json
- subScraper:
selector: entities.*.labels.en.value
# Personal preference, keep the wikidata url instead of the official website of the performer
# URL:
# selector: entities.*.claims.P856.#.mainsnak.datavalue.value
Twitter:
selector: entities.*.claims.P2002.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: ^
with: "https://twitter.com/"
Instagram:
selector: entities.*.claims.P2003.#.mainsnak.datavalue.value
postProcess:
- replace:
- regex: ^
with: "https://www.instagram.com/"
Details:
selector: entities.*.sitelinks.enwiki.title
postProcess:
- replace:
- regex: " "
with: "_"
- regex: ^
with: "https://en.wikipedia.org/w/api.php?action=query&origin=*&prop=extracts&explaintext&titles="
- regex: $
with: "&format=json"
- subScraper:
selector: query.pages.*.extract
# Last Updated August 29, 2021