Merge pull request #2540 from spaceyuck/gelbooru-xpath-improvements

feederbox826 · web-flow · commit 3311eda91dc4 · 2025-10-24T19:11:25.000-04:00
[gelbooru-xpath] various improvements
diff --git a/scrapers/gelbooru-xpath.yml b/scrapers/gelbooru-xpath.yml
@@ -2,6 +2,34 @@ name: gelbooru-xpath
 # for gelbooru 0.2+
 # https://github.com/stashapp/CommunityScrapers/issues/2273
 # loosely based on danbooru
+
+# Builds the post URL from a gallery-dl style filename (<site>_<id>_<hash>.<ext>); the rules below are applied in order.
+sceneByFragment: &fragmentscraper
+  action: scrapeXPath
+  queryURL: "{filename}"
+  queryURLReplace:
+    filename:
+      - regex: "[^a-zA-Z\\d\\-._~]" # keep only URL-safe characters; this also strips any "/" from the filename
+        with: ""
+      - regex: "^gelbooru_(.*)" # map to domain by prefix
+        with: "https://gelbooru.com/index.php?page=post&s=view&id=$1"
+      - regex: "^tbib_(.*)" # map to domain by prefix (anchored so it cannot match in the middle of a name)
+        with: "https://tbib.org/index.php?page=post&s=view&id=$1"
+      - regex: "^rule34_(.*)" # map to domain by prefix
+        with: "https://rule34.xxx/index.php?page=post&s=view&id=$1"
+      - regex: "^xbooru_(.*)" # map to domain by prefix
+        with: "https://xbooru.com/post/show/$1"
+      - regex: "^safebooru_(.*)" # map to domain by prefix (no leading "/": slashes were already removed above)
+        with: "https://safebooru.org/post/show/$1"
+      - regex: "^hypnohub_(.*)" # map to domain by prefix (no leading "/": slashes were already removed above)
+        with: "https://hypnohub.net/post/show/$1"
+      - regex: "^yandere_(.*)" # map to domain by prefix
+        with: "https://yande.re/post/show/$1"
+      - regex: '^(.*(?:&id=|/post/show/))([0-9]+)_.*$' # keep only the leading numeric ID, dropping _<hash>.<ext>, for both URL styles
+        with: "$1$2"
+  scraper: postScraper
+imageByFragment: *fragmentscraper
+
 sceneByURL:
   - action: scrapeXPath
     url: &urls
@@ -21,8 +49,11 @@ imageByURL:
 xPathScrapers:
   postScraper:
     image:
+      # title intentionally excluded
+      #Title: &title
+      #  selector: //title
       Date: &date
-        selector: //li[contains(text(),"Posted")]/text()[1]
+        selector: //div[@id="post-view" or @id="container"]//li[contains(text(),"Posted")]/text()[1]
         postProcess:
           - replace:
             - regex: 'Posted:'
@@ -32,15 +63,24 @@ xPathScrapers:
           - parseDate: 2006-01-02
       Performers: &performers
         Name:
-          selector: //li[contains(@class,"tag-type-character")]/a[last()]/text()
+          selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-character")]/a[last()]/text()
       Studio: &artist
-        Name: //li[contains(@class,"tag-type-artist")]/a[last()]/text()
+        Name: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-artist")]/a[last()]/text()
       Tags: &tag_string
         Name:
-          selector: //li[contains(@class,"tag-type-general")]/a[last()]/text()
+          # Variant A: only pull tags
+          #selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-general")]/a[last()]/text()
+          # Variant B: also pull metadata tags (like 2D, 3D, AI generated)
+          selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-general") or contains(@class,"tag-type-metadata")]/a[last()]/text()
       URLs: &source
-        selector: '//li[contains(text(),"Source:")]/a/@href'
+        selector: '//div[@id="post-view" or @id="container"]//li[contains(text(),"Source:")]/a/@href'
+      # pulls note overlay texts (translations) into Details box
+      Details:
+        selector: //div[@id="post-view" or @id="container"]//div[contains(@class,"note-body")]/text()
+        concat: "\n"
     scene:
+      # title intentionally excluded
+      #Title: *title
       Date: *date
       Performers: *performers
       Studio: *artist
@@ -52,4 +92,4 @@ driver:
     - Key: User-Agent
       Value: stashapp/stash scraper
 
-# Last Updated April 19, 2025
+# Last Updated October 17, 2025