22import requests
33import re
44import json
5- from py_common .util import guess_nationality
5+ from py_common .util import guess_nationality , scraper_args
6+ from py_common .types import ScrapedScene , ScrapedPerformer , ScrapedGroup
7+ from py_common import log
8+ from py_common .config import get_config
69from datetime import datetime
710from bs4 import BeautifulSoup as bs
811
# Authentication tokens and cookies are required by the Ersties API.
# They are read from the scraper's py_common config (presumably created
# from the template below on first run — TODO confirm get_config semantics).
config = get_config(default="""
# Ersties auth configuration
# Use the network console in your browsers developer tools to find this information in an api call header.
AUTHORIZATION =
COOKIE =
X_VISIT_UID =
""")

#Headers for Requests
# Sent with every API call; all three values come from an authenticated
# browser session (see the instructions in the config template above).
scrape_headers = {
    'authorization': config.AUTHORIZATION,
    'cookie': config.COOKIE,
    'x-visit-uid': config.X_VISIT_UID,
}
3026def clean_text (details : str ) -> str :
3127 """
3228 remove escaped backslashes and html parse the details text
@@ -47,20 +43,22 @@ def clean_text(details: str) -> str:
4743 details = details .strip ()
4844 return details
4945
def get_scene(inputurl) -> ScrapedScene:
    """Scrape a single Ersties scene from its URL.

    The scene ID is taken from the URL fragment (``#play-<id>`` or
    ``#play-<id>-comments``) and looked up against the Ersties API.
    Exits the process if no ID is present or the API call fails
    (e.g. missing/expired auth headers).
    """

    # Try to extract the scene ID from URLs like:
    # ...#play-6138
    # ...#play-6138-comments
    match = re.search(r'#play-(\d+)(?:-comments)?', inputurl)

    # Check if the pattern was found and save it as a variable
    if match:
        sceneid = match.group(1)
    else:
        log.error('No scene ID found in URL. Please make sure the URL contains "#play-<id>".')
        sys.exit()

    # Build URL to scrape
    scrape_url = 'https://api.ersties.com/videos/' + sceneid

    #Scrape URL
    scrape = requests.get(scrape_url, headers=scrape_headers)

    #Parse response
    #Check for valid response
    if scrape.status_code == 200:
        scrape_data = scrape.json()

        ret: ScrapedScene = {}
        ret['code'] = str(scrape_data.get('id', ''))
        ret['tags'] = [{'name': x.get('name_en', '')} for x in scrape_data.get('tags', [])]

        gallery = scrape_data.get('gallery') or {}
        gallery_title = gallery.get('title_en') or gallery.get('title')
        scene_title = scrape_data.get('title_en') or scrape_data.get('title')
        if gallery_title and scene_title:
            ret['title'] = f"{gallery_title}: {scene_title}"
        elif gallery_title or scene_title:
            # Fix: previously no title at all was set when only one of the
            # two titles was available — fall back to whichever exists.
            ret['title'] = scene_title or gallery_title
        ret['details'] = clean_text(str(gallery.get('description_en', '')))
        ret['studio'] = {'name': 'Ersties'}
        ret['performers'] = [
            {
                'name': model.get('name_en', ''),
                'details': model.get('description_en', ''),
                'urls': [f'https://ersties.com/profile/{model.get("id")}'],
                'images': [
                    f'https://thumb.ersties.com/format=jpeg/content/images_mysql/Model_Cover_Image/backup/{model.get("thumbnail", "")}'
                ],
            }
            for model in scrape_data.get('participated_models', [])
        ]

        # Main image
        for thumbnail in scrape_data.get('thumbnails', []):
            if thumbnail.get('is_main'):
                ret['image'] = f"https://thumb.ersties.com/format=jpeg/content/images_mysql/images_videothumbnails/backup/{thumbnail.get('file_name', '')}"
                break
        # Date (scene + group) — the API returns an epoch timestamp;
        # anything else (e.g. null) leaves both dates unset.
        epoch_time = gallery.get('available_since')
        group_date = None
        if isinstance(epoch_time, int):
            group_date = datetime.fromtimestamp(epoch_time).strftime("%Y-%m-%d")
            ret['date'] = group_date

        # The containing gallery ("shoot") is exposed as a Stash group.
        ret['groups'] = [{
            'name': gallery.get('title_en', ''),
            'synopsis': clean_text(str(gallery.get('description_en', ''))),
            'studio': {'name': 'Ersties'},
            'urls': [f'https://ersties.com/shoot/{gallery.get("id", "")}'],
            'front_image': f"https://thumb.ersties.com/format=jpeg/content/images_mysql/Shoot_Cover/{gallery.get('image', '')}",
            'date': group_date,
        }]

    else:
        # Fix: restore the space after "Response:" for consistency with
        # the identical message in get_group.
        log.error(f"Response: {scrape.status_code}. Please check your auth header.")
        sys.exit()
    return ret
103119
def get_group(inputurl) -> ScrapedGroup:
    """Scrape an Ersties shoot (gallery) as a Stash group.

    The shoot ID is taken from a URL ending in ``/shoot/<id>`` and looked
    up against the Ersties galleries API. Exits the process if the URL
    does not match or the API call fails.
    """
    # Fix: the original matched the URL twice (once to set a 'urltype'
    # flag, once to capture the ID), leaving 'groupid'/'urltype' potentially
    # unbound on divergent paths. A single anchored match does both jobs.
    match = re.search(r'/shoot/(\d+)$', inputurl)
    if match:
        groupid = match.group(1)
    else:
        log.error('No shoot ID found in URL. Please make sure you are using the correct URL.')
        sys.exit()

    # Build URL to scrape group
    scrape_url = 'https://api.ersties.com/galleries/' + groupid

    # Scrape URL
    scrape = requests.get(scrape_url, headers=scrape_headers)

    # Parse response
    # Check for valid response
    if scrape.status_code == 200:
        scrape_data = scrape.json()

        ret: ScrapedGroup = {}

        ret['name'] = scrape_data.get('title_en', '')
        ret['synopsis'] = clean_text(str(scrape_data.get('description_en', '')))
        ret['studio'] = {'name': 'Ersties'}
        ret['front_image'] = f"https://thumb.ersties.com/format=jpeg/content/images_mysql/Shoot_Cover/{scrape_data.get('image', '')}"
        # Get Date — only set when the API returns an epoch timestamp.
        epoch_time = scrape_data.get('available_since')
        if isinstance(epoch_time, int):
            ret['date'] = datetime.fromtimestamp(epoch_time).strftime("%Y-%m-%d")
    else:
        log.error(f"Response: {scrape.status_code}. Please check your auth header.")
        sys.exit()
    return ret
144157
145- def get_performer (inputurl ):
158+ def get_performer (inputurl ) -> ScrapedPerformer :
146159 # Use a regular expression to extract the number after '#play-' and before '-comments'
147160 match = re .search (r'profile/(\d+)' , inputurl )
148161
149162 # Check if the pattern was found and save it as a variable
150163 if match :
151164 groupid = match .group (1 )
152165 else :
153- debugPrint ('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".' )
166+ log . error ('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".' )
154167 sys .exit ()
155168
156169 #Build URL to scrape group
@@ -164,29 +177,30 @@ def get_performer(inputurl):
164177 if scrape .status_code == 200 :
165178 scrape_data = scrape .json ()
166179
167- ret = {}
180+ ret : ScrapedPerformer = {
181+ "name" : scrape_data ['name_en' ],
182+ "details" : scrape_data ['description_en' ],
183+ "image" : f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Model_Cover_Image/backup/' + scrape_data ['thumbnail' ]
184+ }
168185
169- ret ['name' ] = scrape_data ['name_en' ]
170186 if scrape_data ['location_en' ] is not None :
171187 ret ['country' ] = guess_nationality (scrape_data ['location_en' ])
172- ret ['details' ] = scrape_data ['description_en' ]
173- ret ['image' ] = f'https://thumb.ersties.com/width=510,height=660,fit=cover,quality=85,sharpen=1,format=jpeg/content/images_mysql/Model_Cover_Image/backup/' + scrape_data ['thumbnail' ]
174188 else :
175- debugPrint ('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".' )
189+ log . error ('No performer ID found in URL. Please make sure you are using the ULR ending with "profile/nnnn".' )
176190 sys .exit ()
177191 return ret
178192
if __name__ == '__main__':
    op, args = scraper_args()
    result = None
    # Each supported operation maps to a scraper function taking one URL.
    handlers = {
        'scene-by-url': get_scene,
        'group-by-url': get_group,
        'performer-by-url': get_performer,
    }
    handler = handlers.get(op)
    url = args.get('url') if isinstance(args, dict) else None
    if handler is not None and url:
        result = handler(url)
    else:
        log.debug(f'Unknown operation {op} with arguments {args}')
        sys.exit(1)
    print(json.dumps(result))
0 commit comments