@@ -130,49 +130,38 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
 
             image_repository_and_tag = state.image.split("/", 1)[1]
             repository_name, image_tag = image_repository_and_tag.split(":")
-            try:
-                if state.resource_state.gpus == 0 and (
-                    (
-                        state.image not in images_to_cache_priority["cpu"]
-                        or last_updated_at.replace(tzinfo=pytz.utc)
-                        > images_to_cache_priority["cpu"][state.image].last_updated_at.replace(
-                            tzinfo=pytz.utc
-                        )
+            if state.resource_state.gpus == 0 and (
+                (
+                    state.image not in images_to_cache_priority["cpu"]
+                    or last_updated_at.replace(tzinfo=pytz.utc)
+                    > images_to_cache_priority["cpu"][state.image].last_updated_at.replace(
+                        tzinfo=pytz.utc
                     )
-                    and self.docker_repository.image_exists(image_tag, repository_name)
-                ):
-                    images_to_cache_priority["cpu"][state.image] = cache_priority
-                elif state.resource_state.gpus > 0:
-                    for gpu_type, key in [
-                        (GpuType.NVIDIA_AMPERE_A10, "a10"),
-                        (GpuType.NVIDIA_AMPERE_A100, "a100"),
-                        (GpuType.NVIDIA_TESLA_T4, "t4"),
-                    ]:
-                        if state.resource_state.gpu_type == gpu_type and (
-                            (
-                                state.image not in images_to_cache_priority[key]
-                                or last_updated_at.replace(tzinfo=pytz.utc)
-                                > images_to_cache_priority[key][
-                                    state.image
-                                ].last_updated_at.replace(tzinfo=pytz.utc)
-                            )
-                            and self.docker_repository.image_exists(image_tag, repository_name)
-                        ):
-                            images_to_cache_priority[key][state.image] = cache_priority
-            except Exception as exc:
-                logger.warning(
-                    f"Endpoint {endpoint_id} had an error. Error message: {exc}. Skipping caching ..."
                 )
-                continue
-
+                and self.docker_repository.image_exists(image_tag, repository_name)
+            ):
+                images_to_cache_priority["cpu"][state.image] = cache_priority
+            elif state.resource_state.gpus > 0:
+                for gpu_type, key in [
+                    (GpuType.NVIDIA_AMPERE_A10, "a10"),
+                    (GpuType.NVIDIA_AMPERE_A100, "a100"),
+                    (GpuType.NVIDIA_TESLA_T4, "t4"),
+                ]:
+                    if state.resource_state.gpu_type == gpu_type and (
+                        (
+                            state.image not in images_to_cache_priority[key]
+                            or last_updated_at.replace(tzinfo=pytz.utc)
+                            > images_to_cache_priority[key][state.image].last_updated_at.replace(
+                                tzinfo=pytz.utc
+                            )
+                        )
+                        and self.docker_repository.image_exists(image_tag, repository_name)
+                    ):
+                        images_to_cache_priority[key][state.image] = cache_priority
         images_to_cache = CachedImages(cpu=[], a10=[], a100=[], t4=[])
-        try:
-            for key, val in images_to_cache_priority.items():
-                images_to_cache[key] = sorted(  # type: ignore
-                    val.keys(), key=lambda image: val[image], reverse=True
-                )[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
-            logger.info("sorted images to cache successfully")
-        except Exception as exc:
-            logger.warning(f"sorting had an error. Error message: {exc}. Skipping sorting...")
+        for key, val in images_to_cache_priority.items():
+            images_to_cache[key] = sorted(  # type: ignore
+                val.keys(), key=lambda image: val[image], reverse=True
+            )[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
 
         await self.image_cache_gateway.create_or_update_image_cache(images_to_cache)
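
With the broad try/except blocks removed, the surviving logic reduces to two steps: a timezone-aware freshness comparison, and a top-N-by-priority selection per instance type. The following is a minimal runnable sketch of those two steps; the is_newer helper name, the integer values standing in for CachePriority objects, the cap value, and the sample data are illustrative assumptions, not the service's real configuration.

from datetime import datetime

import pytz

IMAGES_TO_CACHE_PER_INSTANCE_TYPE = 32  # cap value assumed for illustration


def is_newer(candidate: datetime, cached: datetime) -> bool:
    # Same comparison as the diff: coerce both timestamps to UTC so a naive
    # datetime is never compared against an aware one (which raises TypeError).
    return candidate.replace(tzinfo=pytz.utc) > cached.replace(tzinfo=pytz.utc)


assert is_newer(datetime(2024, 1, 2), datetime(2024, 1, 1, tzinfo=pytz.utc))

# Plain ints stand in for the CachePriority values; they are assumed to be
# orderable the same way the real objects are.
images_to_cache_priority = {
    "cpu": {"repo/image:v2": 5, "repo/image:v1": 2},
    "a10": {"repo/other:v1": 1},
    "a100": {},
    "t4": {},
}

images_to_cache = {}
for key, val in images_to_cache_priority.items():
    # Highest priority first, truncated to the per-type cap; this is the
    # expression the second hunk keeps.
    images_to_cache[key] = sorted(
        val.keys(), key=lambda image: val[image], reverse=True
    )[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]

print(images_to_cache)
# {'cpu': ['repo/image:v2', 'repo/image:v1'], 'a10': ['repo/other:v1'], 'a100': [], 't4': []}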