11from datetime import datetime
22from typing import Dict , NamedTuple , Tuple
33
4+ import pytz
45from model_engine_server .common .config import hmi_config
56from model_engine_server .common .env_vars import GIT_TAG
67from model_engine_server .core .config import infra_config
@@ -108,7 +109,11 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
108109 if record is None :
109110 continue
110111
111- last_updated_at = record .last_updated_at or datetime .min
112+ last_updated_at = (
113+ record .last_updated_at .replace (tzinfo = pytz .utc )
114+ if record .last_updated_at is not None
115+ else datetime .min .replace (tzinfo = pytz .utc )
116+ )
112117 has_no_available_workers = int (state .deployment_state .available_workers == 0 )
113118 is_high_priority = int (state .high_priority is True )
114119
@@ -125,36 +130,36 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
125130
126131 image_repository_and_tag = state .image .split ("/" , 1 )[1 ]
127132 repository_name , image_tag = image_repository_and_tag .split (":" )
128- if state .resource_state .gpus == 0 and (
129- (
130- state .image not in images_to_cache_priority ["cpu" ]
131- or last_updated_at .replace (
132- tzinfo = images_to_cache_priority ["cpu" ][state .image ].last_updated_at .tzinfo
133+ try :
134+ if state .resource_state .gpus == 0 and (
135+ (
136+ state .image not in images_to_cache_priority ["cpu" ]
137+ or last_updated_at .replace (tzinfo = pytz .utc )
138+ > images_to_cache_priority ["cpu" ][state .image ].last_updated_at
133139 )
134- > images_to_cache_priority ["cpu" ][state .image ].last_updated_at
135- )
136- and self .docker_repository .image_exists (image_tag , repository_name )
137- ):
138- images_to_cache_priority ["cpu" ][state .image ] = cache_priority
139- elif state .resource_state .gpus > 0 :
140- for gpu_type , key in [
141- (GpuType .NVIDIA_AMPERE_A10 , "a10" ),
142- (GpuType .NVIDIA_AMPERE_A100 , "a100" ),
143- (GpuType .NVIDIA_TESLA_T4 , "t4" ),
144- ]:
145- if state .resource_state .gpu_type == gpu_type and (
146- (
147- state .image not in images_to_cache_priority [key ]
148- or last_updated_at .replace (
149- tzinfo = images_to_cache_priority [key ][
150- state .image
151- ].last_updated_at .tzinfo
140+ and self .docker_repository .image_exists (image_tag , repository_name )
141+ ):
142+ images_to_cache_priority ["cpu" ][state .image ] = cache_priority
143+ elif state .resource_state .gpus > 0 :
144+ for gpu_type , key in [
145+ (GpuType .NVIDIA_AMPERE_A10 , "a10" ),
146+ (GpuType .NVIDIA_AMPERE_A100 , "a100" ),
147+ (GpuType .NVIDIA_TESLA_T4 , "t4" ),
148+ ]:
149+ if state .resource_state .gpu_type == gpu_type and (
150+ (
151+ state .image not in images_to_cache_priority [key ]
152+ or last_updated_at .replace (tzinfo = pytz .utc )
153+ > images_to_cache_priority [key ][state .image ].last_updated_at
152154 )
153- > images_to_cache_priority [key ][state .image ].last_updated_at
154- )
155- and self .docker_repository .image_exists (image_tag , repository_name )
156- ):
157- images_to_cache_priority [key ][state .image ] = cache_priority
155+ and self .docker_repository .image_exists (image_tag , repository_name )
156+ ):
157+ images_to_cache_priority [key ][state .image ] = cache_priority
158+ except Exception as exc :
159+ logger .warning (
160+ f"Endpoint { endpoint_id } had an error. Error message: { exc } . Skipping caching ..."
161+ )
162+ continue
158163
159164 images_to_cache = CachedImages (cpu = [], a10 = [], a100 = [], t4 = [])
160165 for key , val in images_to_cache_priority .items ():
0 commit comments