Commit 99de33d

🎨Autoscaling monitoring tool: compatible with TIP, small improvements (#6311)
1 parent b7098eb commit 99de33d

7 files changed: +67 −53 lines changed
scripts/maintenance/computational-clusters/Makefile

Lines changed: 21 additions & 12 deletions

```diff
@@ -1,25 +1,34 @@
-.DEFAULT_GOAL := install
+.DEFAULT_GOAL := help
 
 SHELL := /bin/bash
 
+PYTHON_VERSION := $(or $(PYTHON), 3.11)
+
+
+.PHONY: hel%
+# thanks to https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+hel%:
+	@echo "usage: make [target] ..."
+	@echo ""
+	@echo "Targets for '$(notdir $(CURDIR))':"
+	@echo ""
+	@awk --posix 'BEGIN {FS = ":.*?## "} /^[[:alpha:][:space:]_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
+	@echo ""
+
 .venv:
 	# creating python virtual environment
-	@uv venv .venv
-	# installing python dependencies
-	@uv pip install --upgrade pip setuptools wheel
-
+	@uv venv --python=$(PYTHON_VERSION)
 
-install: .venv
-	# activating python virtual environment
-	@source .venv/bin/activate
+install: .venv ## installs using $PYTHON_VERSION or uses defaults
 	# installing package
-	@uv pip install .
+	@uv pip install --python=$(PYTHON_VERSION) .
 	# now you can call the maintenance scripts
 	# source .venv/bin/activate
 	# autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary
 
-install-dev: .venv
-	# activating python virtual environment
-	@source .venv/bin/activate
+install-dev: .venv ## installs in devel mode using PYTHON_VERSION or uses defaults
 	# installing package
 	@uv pip install -e .
+	# now you can call the maintenance scripts
+	# source .venv/bin/activate
+	# autoscaled-monitor --deploy-config PATH/TO/REPO.CONFIG summary
```

scripts/maintenance/computational-clusters/autoscaled_monitor/cli.py

Lines changed: 17 additions & 30 deletions
```diff
@@ -33,25 +33,14 @@
 
 def _parse_environment(deploy_config: Path) -> dict[str, str | None]:
     repo_config = deploy_config / "repo.config"
-    assert repo_config.exists()
-    environment = dotenv_values(repo_config)
-    if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "":
+    if not repo_config.exists():
         rich.print(
-            "Terraform variables detected, looking for repo.config.frozen as alternative."
-            " TIP: you are responsible for them being up to date!!"
+            f"[red]{repo_config} does not exist! Please run OPS code to generate it[/red]"
         )
-        repo_config = deploy_config / "repo.config.frozen"
-        assert repo_config.exists()
-        environment = dotenv_values(repo_config)
-
-        if environment["AUTOSCALING_EC2_ACCESS_KEY_ID"] == "":
-            error_msg = (
-                "Terraform is necessary in order to check into that deployment!\n"
-                f"install terraform (check README.md in {state.deploy_config} for instructions)"
-                "then run make repo.config.frozen, then re-run this code"
-            )
-            rich.print(error_msg)
-            raise typer.Abort(error_msg)
+        raise typer.Exit(1)
+
+    environment = dotenv_values(repo_config)
+
     assert environment
     return environment
 
@@ -77,28 +66,21 @@ def main(
     assert state.environment["EC2_INSTANCES_KEY_NAME"]
     dynamic_pattern = f"{state.environment['EC2_INSTANCES_NAME_PREFIX']}-{{key_name}}"
     state.dynamic_parser = parse.compile(dynamic_pattern)
-    rich.print(f"using dynamic-naming-regex: {dynamic_pattern}")
     if state.environment["CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX"]:
         state.computational_parser_primary = parse.compile(
-            f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}",
-            {"wallet_id_spec", wallet_id_spec},
+            rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT}",
+            {"wallet_id_spec": wallet_id_spec},
         )
         state.computational_parser_workers = parse.compile(
-            f"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}",
-            {"wallet_id_spec", wallet_id_spec},
+            rf"{state.environment['CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX'].strip('-')}-{DEFAULT_COMPUTATIONAL_EC2_FORMAT_WORKERS}",
+            {"wallet_id_spec": wallet_id_spec},
         )
-        rich.print(
-            f"compuational-primary-naming-regex: {state.computational_parser_primary._expression}"  # noqa: SLF001
-        )
-        rich.print(
-            f"compuational-workers-naming-regex: {state.computational_parser_workers._expression}"  # noqa: SLF001
-        )
 
     # locate ssh key path
     for file_path in deploy_config.glob("**/*.pem"):
-        if "license" in file_path.name:
+        if any(_ in file_path.name for _ in ["license", "pkcs8"]):
             continue
-        # very bad HACK
+        # very bad HACK where the license file contain openssh in the name
         if (
             any(_ in f"{file_path}" for _ in ("sim4life.io", "osparc-master"))
             and "openssh" not in f"{file_path}"
@@ -112,6 +94,11 @@ def main(
             )
             state.ssh_key_path = file_path
             break
+    if not state.ssh_key_path:
+        rich.print(
+            f"[red]could not find ssh key in {deploy_config}! Please run OPS code to generate it[/red]"
+        )
+        raise typer.Exit(1)
 
 
 @app.command()
```
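Besides the friendlier error handling (`typer.Exit(1)` with a red hint instead of bare asserts and `typer.Abort`), the change above also fixes how the custom `wallet_id_spec` converter is registered: the `parse` library expects its extra types as a mapping from type name to converter function, and the previous set literal `{"wallet_id_spec", wallet_id_spec}` is not such a mapping. A minimal standalone sketch of the dict form, not taken from the tool itself (the pattern, converter body, and sample string below are made-up for illustration):

```python
# Sketch only: parse.compile() with a custom type converter passed as a dict.
import parse


def wallet_id_spec(text: str) -> int | None:
    # hypothetical converter, looked up by the name used in "{wallet_id:wallet_id_spec}"
    return None if text == "None" else int(text)


parser = parse.compile(
    r"cluster-user_id:{user_id:d}-wallet_id:{wallet_id:wallet_id_spec}-{key_name}",
    {"wallet_id_spec": wallet_id_spec},  # must be a dict (name -> converter), not a set
)

result = parser.parse("cluster-user_id:3-wallet_id:1234-mykey")
assert result is not None
print(result["user_id"], result["wallet_id"], result["key_name"])  # 3 1234 mykey
```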

scripts/maintenance/computational-clusters/autoscaled_monitor/constants.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -19,7 +19,9 @@ def wallet_id_spec(text) -> None | int:
     str
 ] = r"osparc-computational-cluster-{role}-{swarm_stack_name}-user_id:{user_id:d}-wallet_id:{wallet_id:wallet_id_spec}-{key_name}"
 DEFAULT_DYNAMIC_EC2_FORMAT: Final[str] = r"osparc-dynamic-autoscaled-worker-{key_name}"
-DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile(r"osparc-{random_name}.pem")
+DEPLOY_SSH_KEY_PARSER: Final[parse.Parser] = parse.compile(
+    r"{prefix}-{random_name}.pem"
+)
 
 MINUTE: Final[int] = 60
 HOUR: Final[int] = 60 * MINUTE
```
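The relaxed `DEPLOY_SSH_KEY_PARSER` no longer hard-codes the `osparc-` prefix, so key files with a different naming prefix also match, in line with the TIP compatibility mentioned in the commit title. A small illustrative sketch (the filename below is a made-up example, not a real deployment key):

```python
# Sketch only: compares the old and the new ssh-key filename patterns.
import parse

old_parser = parse.compile(r"osparc-{random_name}.pem")
new_parser = parse.compile(r"{prefix}-{random_name}.pem")

filename = "tip-a1b2c3.pem"  # hypothetical key name
print(old_parser.parse(filename))  # None: the hard-coded "osparc-" prefix does not match
print(new_parser.parse(filename))  # matches, with prefix="tip" and random_name="a1b2c3"
```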

scripts/maintenance/computational-clusters/autoscaled_monitor/core.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -43,8 +43,6 @@ def _parse_computational(
         or state.computational_parser_primary.parse(name)
     ):
         assert isinstance(result, parse.Result)
-        # special handling for optional wallet
-        rich.print(result.named)
 
         last_heartbeat = utils.get_last_heartbeat(instance)
         return ComputationalInstance(
@@ -450,7 +448,7 @@ async def summary(state: AppState, user_id: int | None, wallet_id: int | None) -
 
 def _print_computational_tasks(
     user_id: int,
-    wallet_id: int,
+    wallet_id: int | None,
     tasks: list[tuple[ComputationalTask | None, DaskTask | None]],
 ) -> None:
     table = Table(
@@ -489,7 +487,7 @@ def _print_computational_tasks(
 
 
 async def _list_computational_clusters(
-    state: AppState, user_id: int, wallet_id: int
+    state: AppState, user_id: int, wallet_id: int | None
 ) -> list[ComputationalCluster]:
     assert state.ec2_resource_clusters_keeper
     computational_instances = await ec2.list_computational_instances_from_ec2(
@@ -501,7 +499,7 @@
 
 
 async def cancel_jobs(  # noqa: C901, PLR0912
-    state: AppState, user_id: int, wallet_id: int, *, force: bool
+    state: AppState, user_id: int, wallet_id: int | None, *, force: bool
 ) -> None:
     # get the theory
     computational_tasks = await db.list_computational_tasks_from_db(state, user_id)
```

scripts/maintenance/computational-clusters/autoscaled_monitor/dask.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 
 import distributed
 import rich
+import typer
 from mypy_boto3_ec2.service_resource import Instance
 from pydantic import AnyUrl
 
@@ -63,6 +64,25 @@ async def dask_client(
                 f"{url}", security=security, timeout="5", asynchronous=True
             )
         )
+        versions = await _wrap_dask_async_call(client.get_versions())
+        if versions["client"]["python"] != versions["scheduler"]["python"]:
+            rich.print(
+                f"[red]python versions do not match! TIP: install the correct version {versions['scheduler']['python']}[/red]"
+            )
+            raise typer.Exit(1)
+        if (
+            versions["client"]["distributed"]
+            != versions["scheduler"]["distributed"]
+        ):
+            rich.print(
+                f"[red]distributed versions do not match! TIP: install the correct version {versions['scheduler']['distributed']}[/red]"
+            )
+            raise typer.Exit(1)
+        if versions["client"]["dask"] != versions["scheduler"]["dask"]:
+            rich.print(
+                f"[red]dask versions do not match! TIP: install the correct version {versions['scheduler']['dask']}[/red]"
+            )
+            raise typer.Exit(1)
         yield client
 
     finally:
```
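The new guard in `dask_client` compares the Python, `distributed`, and `dask` versions reported by the client and the scheduler and aborts with a hint when they differ. For reference, a minimal standalone sketch of the same idea using `distributed`'s built-in check rather than comparing the entries by hand (the scheduler address is a placeholder, and this is not the tool's own helper code):

```python
# Sketch only: let distributed verify that client and scheduler run
# matching package versions before any work is submitted.
import asyncio

import distributed

SCHEDULER_URL = "tcp://127.0.0.1:8786"  # placeholder address


async def check_versions() -> None:
    async with distributed.Client(
        SCHEDULER_URL, asynchronous=True, timeout="5"
    ) as client:
        # check=True makes distributed raise if python/dask/distributed
        # versions differ between client, scheduler, and workers
        await client.get_versions(check=True)


asyncio.run(check_versions())
```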

scripts/maintenance/computational-clusters/autoscaled_monitor/ssh.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -164,7 +164,9 @@ async def get_dask_ip(
     if exit_status != 0:
         error_message = stderr.read().decode().strip()
         _logger.error(
-            "Command failed with exit status %s: %s", exit_status, error_message
+            "Inspecting dask IP Command failed with exit status %s: %s",
+            exit_status,
+            error_message,
         )
         return "Not Found / Drained / Not Ready"
 
```

scripts/maintenance/computational-clusters/pyproject.toml

Lines changed: 0 additions & 4 deletions
```diff
@@ -1,7 +1,3 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
 [project]
 dependencies = [
     "arrow",
```
