Skip to content

Commit 321d8b6

Browse files
authored
Ianmacleod/fix cacher (#279)
* Fixed the cacher; tested against the prod version of the cacher deployment in k8s, and it appears to be working. * Updated logging. * Removed unnecessary logging statements and try/except blocks.
1 parent 6ad4c6a commit 321d8b6

File tree

1 file changed

+30
-41
lines changed

1 file changed

+30
-41
lines changed

model-engine/model_engine_server/infra/services/image_cache_service.py

Lines changed: 30 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -130,49 +130,38 @@ async def execute(self, endpoint_infra_states: Dict[str, Tuple[bool, ModelEndpoi
130130

131131
image_repository_and_tag = state.image.split("/", 1)[1]
132132
repository_name, image_tag = image_repository_and_tag.split(":")
133-
try:
134-
if state.resource_state.gpus == 0 and (
135-
(
136-
state.image not in images_to_cache_priority["cpu"]
137-
or last_updated_at.replace(tzinfo=pytz.utc)
138-
> images_to_cache_priority["cpu"][state.image].last_updated_at.replace(
139-
tzinfo=pytz.utc
140-
)
133+
if state.resource_state.gpus == 0 and (
134+
(
135+
state.image not in images_to_cache_priority["cpu"]
136+
or last_updated_at.replace(tzinfo=pytz.utc)
137+
> images_to_cache_priority["cpu"][state.image].last_updated_at.replace(
138+
tzinfo=pytz.utc
141139
)
142-
and self.docker_repository.image_exists(image_tag, repository_name)
143-
):
144-
images_to_cache_priority["cpu"][state.image] = cache_priority
145-
elif state.resource_state.gpus > 0:
146-
for gpu_type, key in [
147-
(GpuType.NVIDIA_AMPERE_A10, "a10"),
148-
(GpuType.NVIDIA_AMPERE_A100, "a100"),
149-
(GpuType.NVIDIA_TESLA_T4, "t4"),
150-
]:
151-
if state.resource_state.gpu_type == gpu_type and (
152-
(
153-
state.image not in images_to_cache_priority[key]
154-
or last_updated_at.replace(tzinfo=pytz.utc)
155-
> images_to_cache_priority[key][
156-
state.image
157-
].last_updated_at.replace(tzinfo=pytz.utc)
158-
)
159-
and self.docker_repository.image_exists(image_tag, repository_name)
160-
):
161-
images_to_cache_priority[key][state.image] = cache_priority
162-
except Exception as exc:
163-
logger.warning(
164-
f"Endpoint {endpoint_id} had an error. Error message: {exc}. Skipping caching ..."
165140
)
166-
continue
167-
141+
and self.docker_repository.image_exists(image_tag, repository_name)
142+
):
143+
images_to_cache_priority["cpu"][state.image] = cache_priority
144+
elif state.resource_state.gpus > 0:
145+
for gpu_type, key in [
146+
(GpuType.NVIDIA_AMPERE_A10, "a10"),
147+
(GpuType.NVIDIA_AMPERE_A100, "a100"),
148+
(GpuType.NVIDIA_TESLA_T4, "t4"),
149+
]:
150+
if state.resource_state.gpu_type == gpu_type and (
151+
(
152+
state.image not in images_to_cache_priority[key]
153+
or last_updated_at.replace(tzinfo=pytz.utc)
154+
> images_to_cache_priority[key][state.image].last_updated_at.replace(
155+
tzinfo=pytz.utc
156+
)
157+
)
158+
and self.docker_repository.image_exists(image_tag, repository_name)
159+
):
160+
images_to_cache_priority[key][state.image] = cache_priority
168161
images_to_cache = CachedImages(cpu=[], a10=[], a100=[], t4=[])
169-
try:
170-
for key, val in images_to_cache_priority.items():
171-
images_to_cache[key] = sorted( # type: ignore
172-
val.keys(), key=lambda image: val[image], reverse=True
173-
)[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
174-
logger.info("sorted images to cache successfully")
175-
except Exception as exc:
176-
logger.warning(f"sorting had an error. Error message: {exc}. Skipping sorting...")
162+
for key, val in images_to_cache_priority.items():
163+
images_to_cache[key] = sorted( # type: ignore
164+
val.keys(), key=lambda image: val[image], reverse=True
165+
)[:IMAGES_TO_CACHE_PER_INSTANCE_TYPE]
177166

178167
await self.image_cache_gateway.create_or_update_image_cache(images_to_cache)

0 commit comments

Comments (0)