Skip to content

Commit 6ad4c6a

Browse files
authored
fixing cacher, tested in prod version of cacher deployment in k8s and… (#278)
* Fix the cacher; tested in the prod version of the cacher deployment in k8s, and it seems to be working. * Update logging.
1 parent 9b720dc commit 6ad4c6a

File tree

1 file changed

+15
-7
lines changed

1 file changed

+15
-7
lines changed

model-engine/model_engine_server/infra/services/image_cache_service.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _cache_finetune_llm_images(
6464
is_high_priority=1, # make it a high priority
6565
has_no_available_workers=1,
6666
# assuming it has no available workers so that it will be at top after reverse sorting
67-
last_updated_at=datetime.max,
67+
last_updated_at=datetime.max.replace(tzinfo=pytz.utc),
6868
# setting it to max to ensure it will be at top after reverse sorting
6969
)
7070

@@ -135,7 +135,9 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
135135
(
136136
state.image not in images_to_cache_priority["cpu"]
137137
or last_updated_at.replace(tzinfo=pytz.utc)
138-
> images_to_cache_priority["cpu"][state.image].last_updated_at
138+
> images_to_cache_priority["cpu"][state.image].last_updated_at.replace(
139+
tzinfo=pytz.utc
140+
)
139141
)
140142
and self.docker_repository.image_exists(image_tag, repository_name)
141143
):
@@ -150,7 +152,9 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
150152
(
151153
state.image not in images_to_cache_priority[key]
152154
or last_updated_at.replace(tzinfo=pytz.utc)
153-
> images_to_cache_priority[key][state.image].last_updated_at
155+
> images_to_cache_priority[key][
156+
state.image
157+
].last_updated_at.replace(tzinfo=pytz.utc)
154158
)
155159
and self.docker_repository.image_exists(image_tag, repository_name)
156160
):
@@ -162,9 +166,13 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
162166
continue
163167

164168
images_to_cache = CachedImages(cpu=[], a10=[], a100=[], t4=[])
165-
for key, val in images_to_cache_priority.items():
166-
images_to_cache[key] = sorted( # type: ignore
167-
val.keys(), key=lambda image: val[image], reverse=True
168-
)[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
169+
try:
170+
for key, val in images_to_cache_priority.items():
171+
images_to_cache[key] = sorted( # type: ignore
172+
val.keys(), key=lambda image: val[image], reverse=True
173+
)[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
174+
logger.info("sorted images to cache successfully")
175+
except Exception as exc:
176+
logger.warning(f"sorting had an error. Error message: {exc}. Skipping sorting...")
169177

170178
await self.image_cache_gateway.create_or_update_image_cache(images_to_cache)

0 commit comments

Comments
 (0)