Skip to content

Commit 3311eda

Browse files
authored
Merge pull request #2540 from spaceyuck/gelbooru-xpath-improvements
[gelbooru-xpath] various improvements
2 parents 996457f + 9d25464 commit 3311eda

File tree

1 file changed

+46
-6
lines changed

1 file changed

+46
-6
lines changed

scrapers/gelbooru-xpath.yml

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,34 @@ name: gelbooru-xpath
22
# for gelbooru 0.2+
33
# https://github.com/stashapp/CommunityScrapers/issues/2273
44
# loosely based on danbooru
5+
6+
# intended to capture filename as produced by gallery-dl (rule34_<id>_<hash>.<ext>)
7+
sceneByFragment: &fragementscraper
8+
action: scrapeXPath
9+
queryURL: "{filename}"
10+
queryURLReplace:
11+
filename:
12+
- regex: "[^a-zA-Z\\d\\-._~]" # clean filename so that it can construct a valid url
13+
with: ""
14+
- regex: "^gelbooru_(.*)" # map to domain by prefix
15+
with: "https://gelbooru.com/index.php?page=post&s=view&id=$1"
16+
- regex: "^tbib_(.*)" # map to domain by prefix (anchored at start, like the other prefix rules, so it cannot match mid-string)
17+
with: "https://tbib.org/index.php?page=post&s=view&id=$1"
18+
- regex: "^rule34_(.*)" # map to domain by prefix
19+
with: "https://rule34.xxx/index.php?page=post&s=view&id=$1"
20+
- regex: "^xbooru_(.*)" # map to domain by prefix
21+
with: "https://xbooru.com/post/show/$1"
22+
- regex: "^safebooru_(.*)" # map to domain by prefix (no leading "/": the cleanup rule at the top of this list strips slashes, so "^/..." could never match)
23+
with: "https://safebooru.org/post/show/$1"
24+
- regex: "^hypnohub_(.*)" # map to domain by prefix (no leading "/": the cleanup rule at the top of this list strips slashes, so "^/..." could never match)
25+
with: "https://hypnohub.net/post/show/$1"
26+
- regex: "^yandere_(.*)" # map to domain by prefix
27+
with: "https://yande.re/post/show/$1"
28+
- regex: '^(.*(?:&id=|/post/show/))([0-9]+)_.*$' # keep only the numeric ID at the beginning, dropping the trailing _<hash>.<ext>; covers both the "&id=" and "/post/show/" URL forms built above
29+
with: "$1$2"
30+
scraper: postScraper
31+
imageByFragment: *fragementscraper
32+
533
sceneByURL:
634
- action: scrapeXPath
735
url: &urls
@@ -21,8 +49,11 @@ imageByURL:
2149
xPathScrapers:
2250
postScraper:
2351
image:
52+
# title intentionally excluded
53+
#Title: &title
54+
# selector: //title
2455
Date: &date
25-
selector: //li[contains(text(),"Posted")]/text()[1]
56+
selector: //div[@id="post-view" or @id="container"]//li[contains(text(),"Posted")]/text()[1]
2657
postProcess:
2758
- replace:
2859
- regex: 'Posted:'
@@ -32,15 +63,24 @@ xPathScrapers:
3263
- parseDate: 2006-01-02
3364
Performers: &performers
3465
Name:
35-
selector: //li[contains(@class,"tag-type-character")]/a[last()]/text()
66+
selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-character")]/a[last()]/text()
3667
Studio: &artist
37-
Name: //li[contains(@class,"tag-type-artist")]/a[last()]/text()
68+
Name: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-artist")]/a[last()]/text()
3869
Tags: &tag_string
3970
Name:
40-
selector: //li[contains(@class,"tag-type-general")]/a[last()]/text()
71+
# Variant A: only pull tags
72+
#selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-general")]/a[last()]/text()
73+
# Variant B: also pull metadata tags (like 2D, 3D, AI generated)
74+
selector: //div[@id="post-view" or @id="container"]//li[contains(@class,"tag-type-general") or contains(@class,"tag-type-metadata")]/a[last()]/text()
4175
URLs: &source
42-
selector: '//li[contains(text(),"Source:")]/a/@href'
76+
selector: '//div[@id="post-view" or @id="container"]//li[contains(text(),"Source:")]/a/@href'
77+
# pulls note overlay texts (translations) into Details box
78+
Details:
79+
selector: //div[@id="post-view" or @id="container"]//div[contains(@class,"note-body")]/text()
80+
concat: "\n"
4381
scene:
82+
# title intentionally excluded
83+
#Title: *title
4484
Date: *date
4585
Performers: *performers
4686
Studio: *artist
@@ -52,4 +92,4 @@ driver:
5292
- Key: User-Agent
5393
Value: stashapp/stash scraper
5494

55-
# Last Updated April 19, 2025
95+
# Last Updated October 17, 2025

0 commit comments

Comments
 (0)