 from stashapi.stashapp import StashInterface
+from collections.abc import Iterator
 
 
-
 def get_all_images(
-    client: StashInterface,
-    skip_tags: list[str],
-    exclude_organized: bool
+    client: StashInterface,
+    skip_tags: list[int],
+    exclude_organized: bool,
+    per_page: int = 100,
-) -> list[dict]:
+) -> Iterator[dict]:  # the function yields, so the old list[dict] annotation no longer fits
     """
-    Get all images with proper tag exclusion and organization filter
+    Generator that fetches images in pages from the Stash API.
     """
-    image_filter = {}
-    pagination = {
-        "page": 1,
-        "per_page": -1,  # -1 gets all results at once
-        "sort": "created_at",
-        "direction": "ASC",
-    }
-
-    # Convert tag names to IDs
-    tag_ids = []
-    for tag_name in skip_tags:
-        tag = get_or_create_tag(client, tag_name)
-        if tag:
-            tag_ids.append(tag["id"])
-
-    if tag_ids:
-        image_filter["tags"] = {
-            "value": [],
-            "excludes": tag_ids,
-            "modifier": "INCLUDES_ALL",
-            "depth": -1,
+    page = 1
+    while True:
+        image_filter = {}
+        pagination = {
+            "page": page,
+            "per_page": per_page,
+            "sort": "created_at",
+            "direction": "ASC",
         }
 
-    if exclude_organized:
-        image_filter["organized"] = False  # Correct field name
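+        # Stash hierarchical tag criterion: "excludes" drops images carrying
+        # any of these tag IDs, and "depth": -1 should extend the exclusion to
+        # child tags at any depth (my reading of the filter input; worth a check)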
+        if skip_tags:
+            image_filter["tags"] = {
+                "value": [],
+                "excludes": skip_tags,
+                "modifier": "INCLUDES_ALL",
+                "depth": -1,
+            }
+
+        if exclude_organized:
+            image_filter["organized"] = False
+
+        images = client.find_images(f=image_filter, filter=pagination)
+        if not images:
+            # no more pages
+            break
 
-    # Maintain original parameter structure
-    return client.find_images(f=image_filter, filter=pagination)
+        log.info(f"Fetched page {page} with {len(images)} images")
+        for img in images:
+            yield img
+
+        # move to next page
+        page += 1
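+        # Caveat: images yielded here get tagged by the caller and then stop
+        # matching the exclusion filter, so the filtered result set shrinks
+        # between requests and advancing `page` can skip unprocessed images;
+        # collecting all matching IDs up front would sidestep this.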
 
 
 def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
     """Process e621 metadata and update Stash records"""
-    # Skip already processed images
+    # Skip images a previous run already tagged or marked as failed
     image = stash.find_image(image_id)
-    if any(tag["name"] == "e621_tagged" for tag in image.get("tags", [])):
+    if any(t["name"] == "e621_tagged" for t in image.get("tags", [])):
+        return
+
+    if any(t["name"] == "e621_tag_failed" for t in image.get("tags", [])):
         return
 
     try:
-        time.sleep(2)  # Rate limiting
+        time.sleep(0.5)
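+        # e621's API guidelines cap clients at roughly two requests per
+        # second, so this delay sits right at that limit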
         response = requests.get(
             f"https://e621.net/posts.json?md5={image_md5}",
             headers={"User-Agent": "Stash-e621-Tagger/1.0"},
@@ -64,53 +71,49 @@ def process_e621_post(stash: StashInterface, image_id: str, image_md5: str) -> None:
         response.raise_for_status()
         post_data = response.json().get("post", {})
     except Exception as e:
-        log.error(f"e621 API error: {str(e)}")
+        log.error(f"Marking as failed. e621 API error: {str(e)}")
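+        # Merge the failure tag with the image's existing tag IDs: Stash's
+        # imageUpdate replaces the tag list outright rather than appending,
+        # so passing only the new tag would drop the others.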
+        e621_tag_failed = get_or_create_tag(stash, "e621_tag_failed")
+        fail_ids = [e621_tag_failed["id"]] + [t["id"] for t in image.get("tags", [])]
+        stash.update_image({"id": image_id, "tag_ids": list(set(fail_ids))})
         return
 
     if not post_data:
         return
 
-    # Create essential entities
     e621_tag = get_or_create_tag(stash, "e621_tagged")
     post_url = f"https://e621.net/posts/{post_data['id']}"
 
-    # Process tags
     tag_ids = [e621_tag["id"]]
-    for category in ["general", "species", "character", "artist", "copyright"]:
-        for tag in post_data.get("tags", {}).get(category, []):
-            # Clean and validate tag
+    for cat in ["general", "species", "character", "artist", "copyright"]:
+        for tag in post_data.get("tags", {}).get(cat, []):
             clean_tag = tag.strip()
             if not clean_tag:
                 continue
-
             stash_tag = get_or_create_tag(stash, clean_tag)
             if stash_tag:
                 tag_ids.append(stash_tag["id"])
 
-    # Process studio
     studio_id = None
     if artists := post_data.get("tags", {}).get("artist"):
         studio = get_or_create_studio(stash, artists[0])
         studio_id = studio["id"]
 
-    # Process performers
     performer_ids = []
-    for char_tag in post_data.get("tags", {}).get("character", []):
-        performer_name = char_tag.split('_(')[0]
-        performer = get_or_create_performer(stash, performer_name)
-        performer_ids.append(performer["id"])
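+    # e621 character tags carry a parenthesized series qualifier (e.g.
+    # "krystal_(star_fox)"); splitting on "_(" keeps only the character name.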
+    for char in post_data.get("tags", {}).get("character", []):
+        name = char.split('_(')[0]
+        perf = get_or_create_performer(stash, name)
+        performer_ids.append(perf["id"])
 
-    # Update image
     try:
         stash.update_image({
             "id": image_id,
+            "organized": True,
107111 "urls" : [post_url ],
108112 "tag_ids" : list (set (tag_ids )),
109113 "studio_id" : studio_id ,
110114 "performer_ids" : performer_ids
111115 })
112-
113- log .info ("Image updated: ${image_id}" )
116+ log .info (f"Image updated: { image_id } " )
114117 except Exception as e :
115118 log .error (f"Update failed: { str (e )} " )
116119
@@ -166,72 +169,58 @@ def get_or_create_performer(stash: StashInterface, name: str) -> dict:
 
 def scrape_image(client: StashInterface, image_id: str) -> None:
     """Main scraping handler"""
+    # same logic as before for MD5 extraction and the process_e621_post call
     image = client.find_image(image_id)
     if not image or not image.get("visual_files"):
         return
 
     file_data = image["visual_files"][0]
     filename = file_data["basename"]
     filename_md5 = filename.split('.')[0]
-    final_md5 = None
 
-    # First try filename-based MD5
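+    # e621 serves files under their MD5 hash, so a 32-char hex basename can be
+    # trusted as the MD5 without re-hashing the file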
     if re.match(r"^[a-f0-9]{32}$", filename_md5):
         final_md5 = filename_md5
         log.info(f"Using filename MD5: {final_md5}")
     else:
-        # Fallback to content-based MD5
         try:
-            file_path = file_data["path"]
-            log.info(f"Generating MD5 from file content: {file_path}")
-
             md5_hash = hashlib.md5()
-            with open(file_path, "rb") as f:
-                # Read file in 64kb chunks for memory efficiency
+            with open(file_data["path"], "rb") as f:
                 for chunk in iter(lambda: f.read(65536), b""):
                     md5_hash.update(chunk)
-
             final_md5 = md5_hash.hexdigest()
             log.info(f"Generated content MD5: {final_md5}")
         except Exception as e:
             log.error(f"Failed to generate MD5: {str(e)}")
             return
 
-    if final_md5:
-        process_e621_post(client, image_id, final_md5)
-    else:
-        log.warning("No valid MD5 available for processing")
+    process_e621_post(client, image_id, final_md5)
+
 
-# Plugin setup and execution
-# In the main execution block:
 if __name__ == "__main__":
+    log.info("Starting tagger with pagination...")
     json_input = json.loads(sys.stdin.read())
     stash = StashInterface(json_input["server_connection"])
 
     config = stash.get_configuration().get("plugins", {})
     settings = {
-        "SkipTags": "e621_tagged",  # Add automatic filtering
+        "SkipTags": "e621_tagged, e621_tag_failed",
         "ExcludeOrganized": False
     }
     settings.update(config.get("e621_tagger", {}))
 
-    log.info(settings)
-
-    # Get e621_tagged ID for filtering
-    e621_tag = get_or_create_tag(stash, "e621_tagged")
+    e621_tagged = get_or_create_tag(stash, "e621_tagged")
+    e621_failed = get_or_create_tag(stash, "e621_tag_failed")
 
-    # Existing tags + automatic e621_tagged exclusion
     skip_tags = [t.strip() for t in settings["SkipTags"].split(",") if t.strip()]
-    skip_tags.append(e621_tag["id"])  # Filter by ID instead of name
-
-    images = get_all_images(stash, skip_tags, settings["ExcludeOrganized"])
+    # get_all_images filters by tag ID, so resolve the configured names to IDs
+    # rather than mixing names and IDs in one list
+    skip_tag_ids = [get_or_create_tag(stash, name)["id"] for name in skip_tags]
+    skip_tag_ids.extend([e621_tagged["id"], e621_failed["id"]])
+    skip_tag_ids = list(set(skip_tag_ids))  # dedupe; the defaults appear in both
 
-    # Rest of the loop remains the same
-    for i, image in enumerate(images, 1):
-        image_tag_names = [tag["name"] for tag in image.get("tags", [])]
-        if any(tag in image_tag_names for tag in skip_tags):
+    log.info("Fetching images in pages...")
+    for idx, image in enumerate(
+        get_all_images(stash, skip_tag_ids, settings["ExcludeOrganized"], per_page=100),
+        start=1,
+    ):
+        current_tags = [t["name"] for t in image.get("tags", [])]
+        if any(t in current_tags for t in skip_tags):
             log.info(f"Skipping image {image['id']} - contains skip tag")
             continue
 
-        log.progress(i / len(images))
+        # log.progress expects a 0-1 fraction, but the total is unknown while
+        # streaming pages; log the running index instead
+        log.info(f"Processing image #{idx}")
         scrape_image(stash, image["id"])