Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
f336e55
mypy
sanderegg Jun 20, 2025
3f0e8ab
refactor: clean up unused functions and improve node state checks
sanderegg Jun 23, 2025
c85574c
fixed warning
sanderegg Jun 23, 2025
8c2e527
removed dependency
sanderegg Jun 23, 2025
48a2b63
remove circular dependency
sanderegg Jun 23, 2025
6a0d125
replaced ABC by a protocol
sanderegg Jun 23, 2025
7bfc7d5
moving files around
sanderegg Jun 23, 2025
6a02944
moving files
sanderegg Jun 23, 2025
d315253
fixed paths
sanderegg Jun 23, 2025
f6bcf7b
fixed paths
sanderegg Jun 23, 2025
c396b8b
removed old files
sanderegg Jun 23, 2025
a090bb7
rename class
sanderegg Jun 23, 2025
7b5c59f
renamed module
sanderegg Jun 23, 2025
a71338a
renamed provider for computational
sanderegg Jun 23, 2025
92177fa
renamed provider for dynamic
sanderegg Jun 23, 2025
96bf218
renamed classes
sanderegg Jun 23, 2025
1f175d4
fix imports
sanderegg Jun 23, 2025
a4b37e8
tests passing
sanderegg Jun 24, 2025
29b499d
renaming tests
sanderegg Jun 24, 2025
e92d0af
simplify
sanderegg Jun 24, 2025
ed1e8b6
moved computational utils to module
sanderegg Jun 27, 2025
2c1dcde
renaming utils
sanderegg Jun 27, 2025
4652882
mypy
sanderegg Jun 30, 2025
d5f78eb
linter
sanderegg Jun 30, 2025
728b2f4
sonar
sanderegg Jun 30, 2025
185930a
refactoring
sanderegg Jun 30, 2025
d974231
refactoring
sanderegg Jun 30, 2025
7cd39a7
fypo
sanderegg Jul 3, 2025
9da4a37
removed untrue comment
sanderegg Jul 3, 2025
667af3d
minor
sanderegg Jul 4, 2025
b934d39
minor
sanderegg Jul 4, 2025
5b663b6
minor
sanderegg Jul 4, 2025
8b21cb1
minor
sanderegg Jul 4, 2025
276dc70
renaming
sanderegg Jul 4, 2025
4ac43c7
renaming
sanderegg Jul 4, 2025
caae297
renaming
sanderegg Jul 4, 2025
7079f3e
renaming
sanderegg Jul 4, 2025
4a6f678
this will break the dashboard
sanderegg Jul 4, 2025
a7bcba5
renaming
sanderegg Jul 4, 2025
99d5f01
ruff
sanderegg Jul 4, 2025
2251d70
renaming
sanderegg Jul 4, 2025
ba2bdb0
sonar + time fix
sanderegg Jul 4, 2025
a4920a6
sonar
sanderegg Jul 4, 2025
0a19b3f
sonar
sanderegg Jul 4, 2025
a89070e
renaming
sanderegg Jul 4, 2025
c495e3c
renaming
sanderegg Jul 4, 2025
4422504
renamed
sanderegg Jul 4, 2025
3470fda
reduce complexity
sanderegg Jul 4, 2025
793026c
sonar
sanderegg Jul 4, 2025
97b0754
rename
sanderegg Jul 4, 2025
6d17828
sonar
sanderegg Jul 4, 2025
2afe9fd
renaming
sanderegg Jul 7, 2025
d8a0d4b
renaming
sanderegg Jul 7, 2025
ef60cad
Merge branch 'master' into autoscaling/refactoring
mergify[bot] Jul 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-testing-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ jobs:
unit-test-autoscaling:
needs: changes
if: ${{ needs.changes.outputs.autoscaling == 'true' || github.event_name == 'push' || github.event.inputs.force_all_builds == 'true' }}
timeout-minutes: 22 # temporary: mypy takes a huge amount of time to run here, maybe we should cache it
timeout-minutes: 18 # if this timeout gets too small, then split the tests
name: "[unit] autoscaling"
runs-on: ${{ matrix.os }}
strategy:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def create_fake_association(
):
fake_node_to_instance_map = {}

async def _fake_node_creator(
def _fake_node_creator(
_nodes: list[Node], ec2_instances: list[EC2InstanceData]
) -> tuple[list[AssociatedInstance], list[EC2InstanceData]]:
def _create_fake_node_with_labels(instance: EC2InstanceData) -> Node:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
APP_STARTED_DYNAMIC_BANNER_MSG,
)
from ..api.routes import setup_api_routes
from ..modules.auto_scaling_task import setup as setup_auto_scaler_background_task
from ..modules.buffer_machines_pool_task import setup as setup_buffer_machines_pool_task
from ..modules.cluster_scaling.auto_scaling_task import (
setup as setup_auto_scaler_background_task,
)
from ..modules.cluster_scaling.warm_buffer_machines_pool_task import (
setup as setup_warm_buffer_machines_pool_task,
)
from ..modules.docker import setup as setup_docker
from ..modules.ec2 import setup as setup_ec2
from ..modules.instrumentation import setup as setup_instrumentation
Expand Down Expand Up @@ -78,7 +82,7 @@ def create_app(settings: ApplicationSettings) -> FastAPI:
initialize_fastapi_app_tracing(app)

setup_auto_scaler_background_task(app)
setup_buffer_machines_pool_task(app)
setup_warm_buffer_machines_pool_task(app)

# ERROR HANDLERS

Expand Down
35 changes: 17 additions & 18 deletions services/autoscaling/src/simcore_service_autoscaling/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ class AssociatedInstance(_BaseInstance):


@dataclass(frozen=True, kw_only=True, slots=True)
class NonAssociatedInstance(_BaseInstance):
...
class NonAssociatedInstance(_BaseInstance): ...


@dataclass(frozen=True, kw_only=True, slots=True)
Expand All @@ -68,9 +67,9 @@ class Cluster: # pylint: disable=too-many-instance-attributes
"description": "This is a EC2-backed docker node which is drained (cannot accept tasks)"
}
)
buffer_drained_nodes: list[AssociatedInstance] = field(
hot_buffer_drained_nodes: list[AssociatedInstance] = field(
metadata={
"description": "This is a EC2-backed docker node which is drained in the reserve if this is enabled (with no tasks)"
"description": "This is a EC2-backed docker node which is drained in the reserve if this is enabled (with no tasks, a.k.a. hot buffer)"
}
)
pending_ec2s: list[NonAssociatedInstance] = field(
Expand All @@ -83,9 +82,9 @@ class Cluster: # pylint: disable=too-many-instance-attributes
"description": "This is an existing EC2 instance that never properly joined the cluster and is deemed as broken and will be terminated"
}
)
buffer_ec2s: list[NonAssociatedInstance] = field(
warm_buffer_ec2s: list[NonAssociatedInstance] = field(
metadata={
"description": "This is a prepared stopped EC2 instance, not yet associated to a docker node, ready to be used"
"description": "This is a prepared stopped EC2 instance, not yet associated to a docker node, ready to be used (a.k.a. warm buffer)"
}
)
disconnected_nodes: list[Node] = field(
Expand Down Expand Up @@ -121,7 +120,7 @@ def total_number_of_machines(self) -> int:
len(self.active_nodes)
+ len(self.pending_nodes)
+ len(self.drained_nodes)
+ len(self.buffer_drained_nodes)
+ len(self.hot_buffer_drained_nodes)
+ len(self.pending_ec2s)
+ len(self.broken_ec2s)
+ len(self.terminating_nodes)
Expand All @@ -138,10 +137,10 @@ def _get_instance_ids(
f"Cluster(active-nodes: count={len(self.active_nodes)} {_get_instance_ids(self.active_nodes)}, "
f"pending-nodes: count={len(self.pending_nodes)} {_get_instance_ids(self.pending_nodes)}, "
f"drained-nodes: count={len(self.drained_nodes)} {_get_instance_ids(self.drained_nodes)}, "
f"reserve-drained-nodes: count={len(self.buffer_drained_nodes)} {_get_instance_ids(self.buffer_drained_nodes)}, "
f"hot-buffer-drained-nodes: count={len(self.hot_buffer_drained_nodes)} {_get_instance_ids(self.hot_buffer_drained_nodes)}, "
f"pending-ec2s: count={len(self.pending_ec2s)} {_get_instance_ids(self.pending_ec2s)}, "
f"broken-ec2s: count={len(self.broken_ec2s)} {_get_instance_ids(self.broken_ec2s)}, "
f"buffer-ec2s: count={len(self.buffer_ec2s)} {_get_instance_ids(self.buffer_ec2s)}, "
f"warm-buffer-ec2s: count={len(self.warm_buffer_ec2s)} {_get_instance_ids(self.warm_buffer_ec2s)}, "
f"disconnected-nodes: count={len(self.disconnected_nodes)}, "
f"terminating-nodes: count={len(self.terminating_nodes)} {_get_instance_ids(self.terminating_nodes)}, "
f"retired-nodes: count={len(self.retired_nodes)} {_get_instance_ids(self.retired_nodes)}, "
Expand All @@ -159,7 +158,7 @@ class DaskTask:


@dataclass(kw_only=True, slots=True)
class BufferPool:
class WarmBufferPool:
ready_instances: set[EC2InstanceData] = field(default_factory=set)
pending_instances: set[EC2InstanceData] = field(default_factory=set)
waiting_to_pull_instances: set[EC2InstanceData] = field(default_factory=set)
Expand All @@ -170,7 +169,7 @@ class BufferPool:

def __repr__(self) -> str:
return (
f"BufferPool(ready-count={len(self.ready_instances)}, "
f"WarmBufferPool(ready-count={len(self.ready_instances)}, "
f"pending-count={len(self.pending_instances)}, "
f"waiting-to-pull-count={len(self.waiting_to_pull_instances)}, "
f"waiting-to-stop-count={len(self.waiting_to_stop_instances)}, "
Expand Down Expand Up @@ -213,20 +212,20 @@ def remove_instance(self, instance: EC2InstanceData) -> None:


@dataclass
class BufferPoolManager:
buffer_pools: dict[InstanceTypeType, BufferPool] = field(
default_factory=lambda: defaultdict(BufferPool)
class WarmBufferPoolManager:
buffer_pools: dict[InstanceTypeType, WarmBufferPool] = field(
default_factory=lambda: defaultdict(WarmBufferPool)
)

def __repr__(self) -> str:
return f"BufferPoolManager({dict(self.buffer_pools)})"
return f"WarmBufferPoolManager({dict(self.buffer_pools)})"

def flatten_buffer_pool(self) -> BufferPool:
def flatten_buffer_pool(self) -> WarmBufferPool:
"""returns a flattened buffer pool with all the EC2InstanceData"""
flat_pool = BufferPool()
flat_pool = WarmBufferPool()

for buffer_pool in self.buffer_pools.values():
for f in fields(BufferPool):
for f in fields(WarmBufferPool):
getattr(flat_pool, f.name).update(getattr(buffer_pool, f.name))

return flat_pool

This file was deleted.

Loading
Loading