Skip to content

Commit a2bc8ce

Browse files
committed
ongoing
1 parent fad2a31 commit a2bc8ce

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ async def check_clusters(app: FastAPI) -> None:
112112
if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app))
113113
}
114114

115+
# set instance heartbeat if scheduler is busy
115116
for instance in connected_intances:
116117
with log_catch(_logger, reraise=False):
117118
# NOTE: some connected instance could in theory break between these 2 calls, therefore this is silenced and will
@@ -124,6 +125,7 @@ async def check_clusters(app: FastAPI) -> None:
124125
f"{instance.id=} for {instance.tags=}",
125126
)
126127
await set_instance_heartbeat(app, instance=instance)
128+
# clean any cluster that is not doing anything
127129
if terminateable_instances := await _find_terminateable_instances(
128130
app, connected_intances
129131
):
@@ -138,7 +140,7 @@ async def check_clusters(app: FastAPI) -> None:
138140
for instance in disconnected_instances
139141
if _get_instance_last_heartbeat(instance) is None
140142
}
141-
143+
# remove instances that were starting for too long
142144
if terminateable_instances := await _find_terminateable_instances(
143145
app, starting_instances
144146
):
@@ -149,7 +151,14 @@ async def check_clusters(app: FastAPI) -> None:
149151
)
150152
await delete_clusters(app, instances=terminateable_instances)
151153

152-
# the other instances are broken (they were at some point connected but now not anymore)
154+
# TODO: transmit command to start docker swarm/stack if needed
155+
# once the instance is connected to the SSM server,
156+
# use ssm client to send the command to these instances,
157+
# we send a command that contains:
158+
# the docker-compose file in binary,
159+
# the call to init the docker swarm and the call to deploy the stack
160+
161+
# the remaining instances are broken (they were at some point connected but now not anymore)
153162
broken_instances = disconnected_instances - starting_instances
154163
if terminateable_instances := await _find_terminateable_instances(
155164
app, broken_instances

0 commit comments

Comments
 (0)