Skip to content

Commit d2b722c

Browse files
authored
✨Comp backend: add local prometheus in every cluster to gather metrics (ITISFoundation#5530)
1 parent c010232 commit d2b722c

File tree

5 files changed

+155
-2
lines changed

5 files changed

+155
-2
lines changed

services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ services:
44
image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG}
55
init: true
66
hostname: "{{.Node.Hostname}}-{{.Task.Slot}}"
7+
networks:
8+
- cluster
79
environment:
810
DASK_TLS_CA_FILE: ${DASK_TLS_CA_FILE}
911
DASK_TLS_CERT: ${DASK_TLS_CERT}
@@ -16,9 +18,15 @@ services:
1618
- 8786:8786 # dask-scheduler access
1719
- 8787:8787 # dashboard
1820
deploy:
21+
labels:
22+
prometheus-job: scheduler
23+
prometheus-port: 8787
1924
placement:
2025
constraints:
2126
- "node.role==manager"
27+
resources:
28+
limits:
29+
memory: 2048M
2230
secrets:
2331
- source: dask_tls_ca
2432
target: ${DASK_TLS_CA_FILE}
@@ -38,6 +46,8 @@ services:
3846
- computational_shared_data:${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data}
3947
- /var/run/docker.sock:/var/run/docker.sock:ro
4048
- ${ETC_HOSTNAME:-/etc/hostname}:/home/scu/hostname:ro
49+
networks:
50+
- cluster
4151
environment:
4252
DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1
4353
DASK_NPROCS: 1
@@ -54,6 +64,9 @@ services:
5464
SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME: computational_shared_data
5565
deploy:
5666
mode: global
67+
labels:
68+
prometheus-job: sidecars
69+
prometheus-port: 8787
5770
placement:
5871
constraints:
5972
- "node.role==worker"
@@ -95,11 +108,19 @@ services:
95108
REDIS_HOST: redis
96109
REDIS_PORT: 6379
97110
volumes:
98-
- "/var/run/docker.sock:/var/run/docker.sock"
111+
- "/var/run/docker.sock:/var/run/docker.sock:ro"
112+
networks:
113+
- cluster
99114
deploy:
115+
labels:
116+
prometheus-job: autoscaling
117+
prometheus-port: 8000
100118
placement:
101119
constraints:
102120
- "node.role==manager"
121+
resources:
122+
limits:
123+
memory: 512M
103124
secrets:
104125
- source: dask_tls_ca
105126
target: ${DASK_TLS_CA_FILE}
@@ -123,16 +144,53 @@ services:
123144
retries: 50
124145
volumes:
125146
- redis-data:/data
147+
networks:
148+
- cluster
126149
deploy:
127150
placement:
128151
constraints:
129152
- "node.role==manager"
153+
resources:
154+
limits:
155+
memory: 512M
156+
cpus: "0.5"
157+
158+
prometheus:
159+
image: prom/prometheus:v2.51.0@sha256:5ccad477d0057e62a7cd1981ffcc43785ac10c5a35522dc207466ff7e7ec845f
160+
command:
161+
- "--storage.tsdb.retention.size=1GB"
162+
ports:
163+
- 9090:9090
164+
configs:
165+
- source: prometheus-config
166+
target: /etc/prometheus/prometheus.yml
167+
volumes:
168+
- prometheus-data:/prometheus
169+
- /var/run/docker.sock:/var/run/docker.sock:ro
170+
user: root # because of docker
171+
networks:
172+
- cluster
173+
deploy:
174+
placement:
175+
constraints:
176+
- "node.role==manager"
177+
resources:
178+
limits:
179+
memory: 1024M
180+
cpus: "1.0"
181+
182+
networks:
183+
cluster:
184+
185+
configs:
186+
prometheus-config:
187+
file: ./prometheus.yml
130188

131189
volumes:
132190
computational_shared_data:
133191
name: computational_shared_data
134192
redis-data:
135-
193+
prometheus-data:
136194

137195
secrets:
138196
dask_tls_ca:
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
global:
2+
scrape_interval: "29s"
3+
scrape_configs:
4+
# Create a job for Docker Swarm containers.
5+
- job_name: 'docker nodes'
6+
dockerswarm_sd_configs:
7+
- host: unix:///var/run/docker.sock
8+
role: nodes
9+
relabel_configs:
10+
# Fetch metrics on port 9323.
11+
- source_labels: [__meta_dockerswarm_node_address]
12+
target_label: __address__
13+
replacement: $1:9323
14+
# Set hostname as instance label
15+
- source_labels: [__meta_dockerswarm_node_hostname]
16+
target_label: instance
17+
# Create a job for Docker Swarm containers.
18+
- job_name: 'docker tasks'
19+
dockerswarm_sd_configs:
20+
- host: unix:///var/run/docker.sock
21+
role: tasks
22+
relabel_configs:
23+
# Set hostname as instance label
24+
- source_labels: [__meta_dockerswarm_node_hostname]
25+
target_label: instance
26+
# Only keep containers that should be running.
27+
- source_labels: [__meta_dockerswarm_task_desired_state]
28+
regex: running
29+
action: keep
30+
# Only keep tasks with a `prometheus_port` label.
31+
- source_labels: [__meta_dockerswarm_service_label_prometheus_port]
32+
regex: .+
33+
action: keep
34+
# Only keep containers that have a `prometheus-job` label.
35+
- source_labels: [__meta_dockerswarm_service_label_prometheus_job]
36+
regex: .+
37+
action: keep
38+
# Use the prometheus-job Swarm label as Prometheus job label.
39+
- source_labels: [__meta_dockerswarm_service_label_prometheus_job]
40+
target_label: job
41+
# Specify the metric path if needed (optional)
42+
- source_labels: [__meta_dockerswarm_service_label_prometheus_path]
43+
target_label: __metrics_path__
44+
regex: (.+)
45+
# Use the `prometheus_port` Swarm label to set the __address__ for scraping.
46+
- source_labels: [__address__, __meta_dockerswarm_service_label_prometheus_port]
47+
target_label: __address__
48+
regex: ([^:]+)(?::\d+)?;(\d+)
49+
replacement: $1:$2
50+
action: replace

services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
from .dask import get_scheduler_url
2222

2323
_DOCKER_COMPOSE_FILE_NAME: Final[str] = "docker-compose.yml"
24+
_PROMETHEUS_FILE_NAME: Final[str] = "prometheus.yml"
2425
_HOST_DOCKER_COMPOSE_PATH: Final[Path] = Path(f"/{_DOCKER_COMPOSE_FILE_NAME}")
26+
_HOST_PROMETHEUS_PATH: Final[Path] = Path(f"/{_PROMETHEUS_FILE_NAME}")
2527
_HOST_CERTIFICATES_BASE_PATH: Final[Path] = Path("/.dask-sidecar-certificates")
2628
_HOST_TLS_CA_FILE_PATH: Final[Path] = _HOST_CERTIFICATES_BASE_PATH / "tls_dask_ca.pem"
2729
_HOST_TLS_CERT_FILE_PATH: Final[Path] = (
@@ -42,6 +44,12 @@ def _docker_compose_yml_base64_encoded() -> str:
4244
return _base_64_encode(file_path)
4345

4446

47+
@functools.lru_cache
48+
def _prometheus_yml_base64_encoded() -> str:
49+
file_path = PACKAGE_DATA_FOLDER / _PROMETHEUS_FILE_NAME
50+
return _base_64_encode(file_path)
51+
52+
4553
def _prepare_environment_variables(
4654
app_settings: ApplicationSettings,
4755
*,
@@ -117,6 +125,7 @@ def create_startup_script(
117125
# NOTE: https://stackoverflow.com/questions/41203492/solving-redis-warnings-on-overcommit-memory-and-transparent-huge-pages-for-ubunt
118126
"sysctl vm.overcommit_memory=1",
119127
f"echo '{_docker_compose_yml_base64_encoded()}' | base64 -d > {_HOST_DOCKER_COMPOSE_PATH}",
128+
f"echo '{_prometheus_yml_base64_encoded()}' | base64 -d > {_HOST_PROMETHEUS_PATH}",
120129
# NOTE: --default-addr-pool is necessary in order to prevent conflicts with AWS node IPs
121130
"docker swarm init --default-addr-pool 172.20.0.0/14",
122131
f"{' '.join(environment_variables)} docker stack deploy --with-registry-auth --compose-file={_HOST_DOCKER_COMPOSE_PATH} dask_stack",

services/clusters-keeper/tests/manual/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ flowchart TD
3434
1. build simcore
3535
```bash
3636
git clone https://github.com/ITISFoundation/osparc-simcore.git
37+
cd osparc-simcore
3738
make .env # generate initial .env file
3839
make build-devel # build for development mode or
3940
make build # for production mode
@@ -82,6 +83,13 @@ WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION="00:03:00"
8283
WORKERS_EC2_INSTANCES_CUSTOM_TAGS='{"osparc-tag": "some fun tag value"}'
8384
```
8485

86+
4. prepare dask TLS certificates
87+
NOTE: the dask TLS certificates are in AWS and shall be copied into the local stack such that the director-v2 can access the clusters
88+
these are defined by PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA, PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT and PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY
89+
1. one needs to go to the AWS Parameter Store (SSM)
90+
2. find these entries, then copy their contents respectively into services/dask-sidecar/.dask-certificates/dask-cert.pem and services/dask-sidecar/.dask-certificates/dask-key.pem
91+
92+
8593
5. start osparc
8694
```bash
8795
make up-devel # for devel mode

services/clusters-keeper/tests/unit/test_utils_clusters.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
NoAuthentication,
2323
TLSAuthentication,
2424
)
25+
from pydantic import ByteSize, parse_obj_as
2526
from pytest_simcore.helpers.utils_envs import EnvVarsDict
2627
from simcore_service_clusters_keeper.core.settings import ApplicationSettings
2728
from simcore_service_clusters_keeper.utils.clusters import (
@@ -140,6 +141,33 @@ def test_create_startup_script(
140141
)
141142

142143

144+
def test_create_startup_script_script_size_below_16kb(
145+
disabled_rabbitmq: None,
146+
mocked_ec2_server_envs: EnvVarsDict,
147+
mocked_redis_server: None,
148+
app_settings: ApplicationSettings,
149+
cluster_machines_name_prefix: str,
150+
clusters_keeper_docker_compose: dict[str, Any],
151+
ec2_boot_specs: EC2InstanceBootSpecific,
152+
):
153+
additional_custom_tags = {
154+
AWSTagKey("pytest-tag-key"): AWSTagValue("pytest-tag-value")
155+
}
156+
startup_script = create_startup_script(
157+
app_settings,
158+
cluster_machines_name_prefix=cluster_machines_name_prefix,
159+
ec2_boot_specific=ec2_boot_specs,
160+
additional_custom_tags=additional_custom_tags,
161+
)
162+
script_size_in_bytes = len(startup_script.encode("utf-8"))
163+
164+
print(
165+
f"current script size is {parse_obj_as(ByteSize, script_size_in_bytes).human_readable()}"
166+
)
167+
# NOTE: EC2 user data cannot be above 16KB, we keep some margin here
168+
assert script_size_in_bytes < 15 * 1024
169+
170+
143171
def test_startup_script_defines_all_envs_for_docker_compose(
144172
disabled_rabbitmq: None,
145173
mocked_ec2_server_envs: EnvVarsDict,

0 commit comments

Comments
 (0)