Skip to content

Commit 991cf4a

Browse files
authored
Merge pull request #2610 from feederbox826/ersties
[Ersties] merge PRs
2 parents 85ea4d7 + f28482d commit 991cf4a

File tree

4 files changed

+123
-101
lines changed

4 files changed

+123
-101
lines changed

scrapers/Ersties/Ersties.py

Lines changed: 108 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,27 @@
22
import requests
33
import re
44
import json
5-
from py_common.util import guess_nationality
5+
from py_common.util import guess_nationality, scraper_args
6+
from py_common.types import ScrapedScene, ScrapedPerformer, ScrapedGroup
7+
from py_common import log
8+
from py_common.config import get_config
69
from datetime import datetime
710
from bs4 import BeautifulSoup as bs
811

9-
#Authentication tokens and cookies are needed for this scraper. Use the network console in your browsers developer tools to find this information in an api call header.
10-
#Auth Variables For Header
11-
authorization = ''
12-
cookie = ''
13-
x_visit_uid = ''
12+
config = get_config(default="""
13+
# Ersties auth configuration
14+
# Use the network console in your browsers developer tools to find this information in an api call header.
15+
AUTHORIZATION =
16+
COOKIE =
17+
X_VISIT_UID =
18+
""")
1419

1520
#Headers for Requests
1621
scrape_headers = {
17-
'authorization': authorization,
18-
'cookie': cookie,
19-
'x-visit-uid': x_visit_uid,
22+
'authorization': config.AUTHORIZATION,
23+
'cookie': config.COOKIE,
24+
'x-visit-uid': config.X_VISIT_UID,
2025
}
21-
22-
#Get JSON from Stash
23-
def readJSONInput():
24-
input = sys.stdin.read()
25-
return json.loads(input)
26-
27-
def debugPrint(t):
28-
sys.stderr.write(t + "\n")
29-
3026
def clean_text(details: str) -> str:
3127
"""
3228
remove escaped backslashes and html parse the details text
@@ -47,20 +43,22 @@ def clean_text(details: str) -> str:
4743
details = details.strip()
4844
return details
4945

50-
def get_scene(inputurl):
46+
def get_scene(inputurl) -> ScrapedScene:
5147

52-
# Use a regular expression to extract the number after '#play-' and before '-comments'
53-
match = re.search(r'#play-(\d+)-comments', inputurl)
48+
# Try to extract the scene ID from URLs like:
49+
# ...#play-6138
50+
# ...#play-6138-comments
51+
match = re.search(r'#play-(\d+)(?:-comments)?', inputurl)
5452

5553
# Check if the pattern was found and save it as a variable
5654
if match:
5755
sceneid = match.group(1)
5856
else:
59-
debugPrint('No scene ID found in URL. Please make sure you are using the ULR ending with "#play-nnnn-comments".')
57+
log.error('No scene ID found in URL. Please make sure the URL contains "#play-<id>".')
6058
sys.exit()
6159

62-
#Build URL to scrape
63-
scrape_url='https://api.ersties.com/videos/'+sceneid
60+
# Build URL to scrape
61+
scrape_url = 'https://api.ersties.com/videos/' + sceneid
6462

6563
#Scrape URL
6664
scrape = requests.get(scrape_url, headers=scrape_headers)
@@ -70,87 +68,102 @@ def get_scene(inputurl):
7068
if scrape.status_code ==200:
7169
scrape_data = scrape.json()
7270

73-
ret = {}
74-
75-
ret['title'] = scrape_data['title_en']
76-
ret['code'] = str(scrape_data['id'])
77-
ret['details'] = clean_text(str(scrape_data['gallery']['description_en']))
78-
ret['studio'] = {'name':'Ersties'}
79-
ret['tags'] = [{'name': x['name_en']} for x in scrape_data['tags']]
80-
ret['performers'] = [{'name':x['name_en'], 'details':x['description_en'], 'urls':['https://ersties.com/profile/'+str(x['id'])],'images':[f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Model_Cover_Image/backup/'+x['thumbnail']] } for x in scrape_data['participated_models']]
81-
for thumbnail in scrape_data['thumbnails']:
82-
if thumbnail['is_main']:
83-
ret['image'] = f'https://thumb.ersties.com/width=900,height=500,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/images_videothumbnails/backup/'+thumbnail['file_name']
71+
ret: ScrapedScene = {}
72+
ret['code'] = str(scrape_data.get('id', ''))
73+
ret['tags'] = [{'name': x.get('name_en', '')} for x in scrape_data.get('tags', [])]
74+
75+
gallery = scrape_data.get('gallery') or {}
76+
gallery_title = gallery.get('title_en') or gallery.get('title')
77+
scene_title = scrape_data.get('title_en') or scrape_data.get('title')
78+
if gallery_title and scene_title:
79+
ret['title'] = f"{gallery_title}: {scene_title}"
80+
ret['details'] = clean_text(str(gallery.get('description_en', '')))
81+
ret['studio'] = {'name': 'Ersties'}
82+
ret['performers'] = [
83+
{
84+
'name': model.get('name_en', ''),
85+
'details': model.get('description_en', ''),
86+
'urls': [f'https://ersties.com/profile/{model.get("id")}'],
87+
'images': [
88+
f'https://thumb.ersties.com/format=jpeg/content/images_mysql/Model_Cover_Image/backup/{model.get("thumbnail", "")}'
89+
],
90+
}
91+
for model in scrape_data.get('participated_models', [])
92+
]
93+
94+
# Main image
95+
for thumbnail in scrape_data.get('thumbnails', []):
96+
if thumbnail.get('is_main'):
97+
ret['image'] = f"https://thumb.ersties.com/format=jpeg/content/images_mysql/images_videothumbnails/backup/{thumbnail.get('file_name', '')}"
8498
break
85-
#Get Date
86-
epoch_time = scrape_data['gallery']['available_since']
87-
# Check if the date is returned as an integer.
99+
# Date (scene + group)
100+
epoch_time = gallery.get('available_since')
101+
group_date = None
88102
if isinstance(epoch_time, int):
89-
#Convert date from Epoch Time
90-
ret['date'] = datetime.fromtimestamp(epoch_time).strftime("%Y-%m-%d")
91-
#Get Group Information
92-
#Get Group Date
93-
group_epoch_time = scrape_data['gallery']['available_since']
94-
# Check if the date is returned as an integer.
95-
if isinstance(group_epoch_time, int):
96-
#Convert date from Epoch Time
97-
group_date = datetime.fromtimestamp(group_epoch_time).strftime("%Y-%m-%d")
98-
ret['groups'] = [{'name': scrape_data['gallery']['title_en'], 'synopsis': clean_text(str(scrape_data['gallery']['description_en'])), 'studio': {'name':'Ersties'}, 'urls':[f'https://ersties.com/shoot/'+str(scrape_data['gallery']['id'])], 'front_image': f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Shoot_Cover/'+scrape_data['gallery']['image'], 'date': group_date}]
103+
group_date = datetime.fromtimestamp(epoch_time).strftime("%Y-%m-%d")
104+
ret['date'] = group_date
105+
106+
ret['groups'] = [{
107+
'name': gallery.get('title_en', ''),
108+
'synopsis': clean_text(str(gallery.get('description_en', ''))),
109+
'studio': {'name': 'Ersties'},
110+
'urls': [f'https://ersties.com/shoot/{gallery.get("id", "")}'],
111+
'front_image': f"https://thumb.ersties.com/format=jpeg/content/images_mysql/Shoot_Cover/{gallery.get('image', '')}",
112+
'date': group_date,
113+
}]
114+
99115
else:
100-
debugPrint('Response: '+str(scrape.status_code)+'. Please check your auth header.')
101-
sys.exit()
116+
log.error(f"Response:{str(scrape.status_code)}. Please check your auth header.")
117+
sys.exit()
102118
return ret
103119

def get_group(inputurl) -> ScrapedGroup:
    """
    Scrape an Ersties shoot (gallery) and return it as a group.

    inputurl must end with "/shoot/<id>". The numeric id is used to query
    https://api.ersties.com/galleries/<id> with the module-level
    scrape_headers.

    Returns a ScrapedGroup holding name, synopsis and studio, plus
    front_image and date when the API response provides them.

    Logs an error and exits the process when the URL carries no shoot ID
    or when the API does not answer with HTTP 200.
    """
    # One anchored pattern both validates the URL and captures the ID, so
    # groupid can never be left unbound.
    match = re.search(r'/shoot/(\d+)$', inputurl)
    if not match:
        log.error('No shoot ID found in URL. Please make sure you are using the correct URL.')
        sys.exit()
    groupid = match.group(1)

    # Build URL to scrape group
    scrape_url = 'https://api.ersties.com/galleries/' + groupid

    # Scrape URL
    scrape = requests.get(scrape_url, headers=scrape_headers)

    # Anything other than 200 is reported as an auth problem
    if scrape.status_code != 200:
        log.error(f"Response: {str(scrape.status_code)}. Please check your auth header.")
        sys.exit()

    scrape_data = scrape.json()

    ret: ScrapedGroup = {}
    # `or ''` also covers keys that are present but null, which would
    # otherwise surface as None / the literal text "None"
    ret['name'] = scrape_data.get('title_en') or ''
    ret['synopsis'] = clean_text(str(scrape_data.get('description_en') or ''))
    ret['studio'] = {'name': 'Ersties'}

    # Only emit a cover URL when the API actually names an image file;
    # otherwise the URL would point at the bare Shoot_Cover directory
    image = scrape_data.get('image')
    if image:
        ret['front_image'] = f"https://thumb.ersties.com/format=jpeg/content/images_mysql/Shoot_Cover/{image}"

    # Get Date: only converted when the value is an integer epoch timestamp
    epoch_time = scrape_data.get('available_since')
    if isinstance(epoch_time, int):
        ret['date'] = datetime.fromtimestamp(epoch_time).strftime("%Y-%m-%d")

    return ret
144157

145-
def get_performer(inputurl):
158+
def get_performer(inputurl) -> ScrapedPerformer:
146159
# Use a regular expression to extract the numeric performer ID that follows 'profile/'
147160
match = re.search(r'profile/(\d+)', inputurl)
148161

149162
# Check if the pattern was found and save it as a variable
150163
if match:
151164
groupid = match.group(1)
152165
else:
153-
debugPrint('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".')
166+
log.error('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".')
154167
sys.exit()
155168

156169
#Build URL to scrape group
@@ -164,29 +177,30 @@ def get_performer(inputurl):
164177
if scrape.status_code ==200:
165178
scrape_data = scrape.json()
166179

167-
ret = {}
180+
ret: ScrapedPerformer = {
181+
"name": scrape_data['name_en'],
182+
"details": scrape_data['description_en'],
183+
"image": f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Model_Cover_Image/backup/'+scrape_data['thumbnail']
184+
}
168185

169-
ret['name'] = scrape_data['name_en']
170186
if scrape_data['location_en'] is not None:
171187
ret['country'] = guess_nationality(scrape_data['location_en'])
172-
ret['details'] = scrape_data['description_en']
173-
ret['image'] = f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Model_Cover_Image/backup/'+scrape_data['thumbnail']
174188
else:
175-
debugPrint('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".')
189+
log.error('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".')
176190
sys.exit()
177191
return ret
178192

179-
if sys.argv[1] == 'sceneByURL':
180-
i = readJSONInput()
181-
ret = get_scene(i.get('url'))
182-
print(json.dumps(ret))
183-
184-
if sys.argv[1] == 'groupByURL':
185-
i = readJSONInput()
186-
ret = get_group(i.get('url'))
187-
print(json.dumps(ret))
188-
189-
if sys.argv[1] == 'performerByURL':
190-
i = readJSONInput()
191-
ret = get_performer(i.get('url'))
192-
print(json.dumps(ret))
193+
if __name__ == '__main__':
194+
op, args = scraper_args()
195+
result = None
196+
match op, args:
197+
case 'scene-by-url', { "url": url } if url:
198+
result = get_scene(url)
199+
case 'group-by-url', { "url": url } if url:
200+
result = get_group(url)
201+
case 'performer-by-url', { "url": url } if url:
202+
result = get_performer(url)
203+
case _:
204+
log.debug(f'Unknown operation {op} with arguments {args}')
205+
sys.exit(1)
206+
print(json.dumps(result))

scrapers/Ersties/Ersties.yml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# requires: py_common
12
name: Ersties
23
sceneByURL:
34
- action: script
@@ -6,21 +7,21 @@ sceneByURL:
67
script:
78
- python
89
- Ersties.py
9-
- sceneByURL
10+
- scene-by-url
1011
groupByURL:
1112
- action: script
1213
url:
13-
- ersties.com/
14+
- ersties.com/shoot/
1415
script:
1516
- python
1617
- Ersties.py
17-
- groupByURL
18+
- group-by-url
1819
performerByURL:
1920
- action: script
2021
url:
21-
- ersties.com/
22+
- ersties.com/profile/
2223
script:
2324
- python
2425
- Ersties.py
25-
- performerByURL
26-
# Last Updated October 21, 2024
26+
- performer-by-url
27+
# Last Updated: December 7, 2025

scrapers/py_common/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ class ScrapedMovie(TypedDict, total=False):
106106
aliases: str
107107
tags: list[ScrapedTag]
108108

109+
ScrapedGroup = ScrapedMovie
109110

110111
class ScrapedGallery(TypedDict, total=False):
111112
title: str
@@ -130,6 +131,7 @@ class ScrapedScene(TypedDict, total=False):
130131
image: str
131132
studio: ScrapedStudio
132133
movies: list[ScrapedMovie]
134+
groups: list[ScrapedMovie]
133135
tags: list[ScrapedTag]
134136
performers: list[ScrapedPerformer]
135137
code: str

scrapers/py_common/util.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,14 @@ def __default_parser(**kwargs):
156156
).add_argument("--url")
157157

158158
# Filling in an URL and hitting the "Scrape" icon
159-
subparsers.add_parser(
159+
subparsers.add_parser( # DEPRECATED, USE group-by-url instead
160160
"movie-by-url", help="Scrape a movie by its URL"
161161
).add_argument("--url")
162162

163+
subparsers.add_parser(
164+
"group-by-url", help="Scrape a group by its URL"
165+
).add_argument("--url")
166+
163167
# The looking glass search icon
164168
# name field is guaranteed to be filled by Stash
165169
subparsers.add_parser("scene-by-name", help="Scrape a scene by name").add_argument(
@@ -232,6 +236,7 @@ def scraper_args(**kwargs):
232236
- performer-by-fragment
233237
- performer-by-url
234238
- movie-by-url
239+
- group-by-url
235240
- scene-by-name
236241
- scene-by-url
237242
- scene-by-fragment

0 commit comments

Comments
 (0)