Skip to content

Commit 27a80ab

Browse files
committed
re-added monitoring
1 parent 1976f16 commit 27a80ab

File tree

3 files changed

+70
-66
lines changed

3 files changed

+70
-66
lines changed

services/director/src/simcore_service_director/instrumentation.py

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,57 @@
1+
from dataclasses import dataclass, field
2+
from typing import cast
3+
14
from fastapi import FastAPI
5+
from prometheus_client import CollectorRegistry, Counter
26
from servicelib.fastapi.prometheus_instrumentation import (
37
setup_prometheus_instrumentation,
48
)
9+
from servicelib.instrumentation import MetricsBase, get_metrics_namespace
510

11+
from ._meta import APP_NAME
12+
from .core.errors import ConfigurationError
613
from .core.settings import get_application_settings
714

15+
# Labels identifying a service; shared prefix of both counters' label sets.
_SERVICE_IDENTITY_LABELS: tuple[str, ...] = ("service_key", "service_tag")

# Label names of the "services started" counter (order is significant).
MONITOR_SERVICE_STARTED_LABELS: list[str] = [
    *_SERVICE_IDENTITY_LABELS,
    "simcore_user_agent",
]

# Label names of the "services stopped" counter; adds the stop "result".
MONITOR_SERVICE_STOPPED_LABELS: list[str] = [
    *_SERVICE_IDENTITY_LABELS,
    "result",
    "simcore_user_agent",
]
27+
28+
29+
@dataclass(slots=True, kw_only=True)
class DirectorV0Instrumentation(MetricsBase):
    """Prometheus counters for services started and stopped by the director.

    Both counters are created in ``__post_init__`` and registered on the
    ``registry`` passed at construction time.
    """

    # Collector registry on which both counters are registered.
    registry: CollectorRegistry

    # Created in __post_init__, hence excluded from the generated __init__.
    services_started: Counter = field(init=False)
    services_stopped: Counter = field(init=False)

    def __post_init__(self) -> None:
        """Create the started/stopped counters with a common namespace."""
        # NOTE(review): `subsystem` is not declared here; presumably it is
        # provided by MetricsBase -- confirm against servicelib.
        namespace = get_metrics_namespace(APP_NAME)
        subsystem = self.subsystem
        registry = self.registry

        self.services_started = Counter(
            "services_started_total",
            "Counts the services started",
            labelnames=MONITOR_SERVICE_STARTED_LABELS,
            namespace=namespace,
            subsystem=subsystem,
            registry=registry,
        )
        self.services_stopped = Counter(
            "services_stopped_total",
            "Counts the services stopped",
            labelnames=MONITOR_SERVICE_STOPPED_LABELS,
            namespace=namespace,
            subsystem=subsystem,
            registry=registry,
        )
54+
855

956
def setup(app: FastAPI) -> None:
1057
app_settings = get_application_settings(app)
@@ -15,15 +62,10 @@ def setup(app: FastAPI) -> None:
1562
instrumentator = setup_prometheus_instrumentation(app)
1663

1764
async def on_startup() -> None:
18-
# metrics_subsystem = (
19-
# "dynamic" if app_settings.AUTOSCALING_NODES_MONITORING else "computational"
20-
# )
21-
# app.state.instrumentation = (
22-
# AutoscalingInstrumentation( # pylint: disable=unexpected-keyword-arg
23-
# registry=instrumentator.registry, subsystem=metrics_subsystem
24-
# )
25-
# )
26-
...
65+
metrics_subsystem = ""
66+
app.state.instrumentation = DirectorV0Instrumentation(
67+
registry=instrumentator.registry, subsystem=metrics_subsystem
68+
)
2769

2870
async def on_shutdown() -> None:
2971
...
@@ -32,12 +74,12 @@ async def on_shutdown() -> None:
3274
app.add_event_handler("shutdown", on_shutdown)
3375

3476

35-
# def get_instrumentation(app: FastAPI) -> AutoscalingInstrumentation:
36-
# if not app.state.instrumentation:
37-
# raise ConfigurationError(
38-
# msg="Instrumentation not setup. Please check the configuration."
39-
# )
40-
# return cast(AutoscalingInstrumentation, app.state.instrumentation)
77+
def get_instrumentation(app: FastAPI) -> DirectorV0Instrumentation:
    """Return the instrumentation object stored on ``app.state``.

    Args:
        app: the application whose state is inspected.

    Returns:
        The value of ``app.state.instrumentation``, typed as
        ``DirectorV0Instrumentation``.

    Raises:
        ConfigurationError: if ``app.state.instrumentation`` is missing or
            falsy, i.e. the instrumentation was never set up.
    """
    # Starlette's State raises AttributeError for attributes that were never
    # assigned; getattr with a default makes a missing setup surface as the
    # intended ConfigurationError instead.
    instrumentation = getattr(app.state, "instrumentation", None)
    if not instrumentation:
        raise ConfigurationError(
            msg="Instrumentation not setup. Please check the configuration."
        )
    return cast(DirectorV0Instrumentation, instrumentation)
4183

4284

4385
def has_instrumentation(app: FastAPI) -> bool:

services/director/src/simcore_service_director/monitoring.py

Lines changed: 0 additions & 30 deletions
This file was deleted.

services/director/src/simcore_service_director/producer.py

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525
from fastapi import FastAPI
2626
from packaging.version import Version
2727
from servicelib.async_utils import run_sequentially_in_context
28-
29-
# from servicelib.monitor_services import service_started, service_stopped
3028
from settings_library.docker_registry import RegistrySettings
3129
from tenacity import retry
3230
from tenacity.retry import retry_if_exception_type
@@ -44,6 +42,7 @@
4442
)
4543
from .core.settings import ApplicationSettings, get_application_settings
4644
from .exceptions import ServiceStateSaveError
45+
from .instrumentation import get_instrumentation
4746
from .services_common import ServicesCommonSettings
4847
from .system_utils import get_system_extra_hosts_raw
4948
from .utils import parse_as_datetime
@@ -1033,15 +1032,12 @@ async def start_service(
10331032
)
10341033
node_details = containers_meta_data[0]
10351034
if app_settings.DIRECTOR_MONITORING_ENABLED:
1036-
...
1037-
# TODO: is monitoring necessary?
1038-
# service_started(
1039-
# app,
1040-
# "undefined_user", # NOTE: to prevent high cardinality metrics this is disabled
1041-
# service_key,
1042-
# service_tag,
1043-
# "DYNAMIC",
1044-
# )
1035+
get_instrumentation(app).services_started.labels(
1036+
service_key=service_key,
1037+
service_tag=service_tag,
1038+
simcore_user_agent="undefined_user",
1039+
).inc()
1040+
10451041
# we return only the info of the main service
10461042
return node_details
10471043

@@ -1294,13 +1290,9 @@ async def stop_service(app: FastAPI, *, node_uuid: str, save_state: bool) -> Non
12941290
log.debug("removed network")
12951291

12961292
if app_settings.DIRECTOR_MONITORING_ENABLED:
1297-
...
1298-
# TODO: is it necessary still?
1299-
# service_stopped(
1300-
# app,
1301-
# "undefined_user",
1302-
# service_details["service_key"],
1303-
# service_details["service_version"],
1304-
# "DYNAMIC",
1305-
# "SUCCESS",
1306-
# )
1293+
get_instrumentation(app).services_stopped.labels(
1294+
service_key=service_details["service_key"],
1295+
service_tag=service_details["service_version"],
1296+
simcore_user_agent="undefined_user",
1297+
result="SUCCESS",
1298+
).inc()

0 commit comments

Comments
 (0)