Skip to content

Commit a71811f

Browse files
author
maico
committed
Prometheus http and kernel startup/shutdown metrics
1 parent d01e84a commit a71811f

File tree

6 files changed

+99
-3
lines changed

6 files changed

+99
-3
lines changed

enterprise_gateway/base/handlers.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import List
99

1010
import jupyter_server._version
11+
import prometheus_client
1112
from jupyter_server.base.handlers import APIHandler
1213
from tornado import web
1314

@@ -31,6 +32,16 @@ def get(self):
3132
)
3233

3334

35+
class PrometheusMetricsHandler(CORSMixin, web.RequestHandler):
    """
    Expose the metrics of this enterprise gateway for Prometheus scraping.
    """

    def get(self):
        """Respond with the contents of ``prometheus_client.REGISTRY``."""
        self.set_header("Content-Type", prometheus_client.CONTENT_TYPE_LATEST)
        # Render the registry into a named payload, then send it as the body.
        metrics_payload = prometheus_client.generate_latest(prometheus_client.REGISTRY)
        self.write(metrics_payload)
43+
44+
3445
class NotFoundHandler(JSONErrorsMixin, web.RequestHandler):
3546
"""
3647
Catches all requests and responds with 404 JSON messages.
@@ -48,4 +59,8 @@ def prepare(self):
4859
raise web.HTTPError(404)
4960

5061

51-
default_handlers: List[tuple] = [(r"/api", APIVersionHandler), (r"/(.*)", NotFoundHandler)]
62+
# Route table for the base handlers.
# NOTE(review): the "/(.*)" pattern matches every path, so it presumably has
# to stay last for the more specific routes to be reachable - confirm against
# Tornado's routing rules before reordering.
default_handlers: List[tuple] = [
    (r"/api", APIVersionHandler),
    (r"/metrics", PrometheusMetricsHandler),
    (r"/(.*)", NotFoundHandler),
]

enterprise_gateway/enterprisegatewayapp.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
WebhookKernelSessionManager,
3838
)
3939
from .services.sessions.sessionmanager import SessionManager
40+
from .webapp import EnterpriseGatewayWebApp
4041

4142
try:
4243
from jupyter_server.auth.authorizer import AllowAllAuthorizer
@@ -219,7 +220,7 @@ def init_webapp(self) -> None:
219220

220221
handlers = self._create_request_handlers()
221222

222-
self.web_app = web.Application(
223+
self.web_app = EnterpriseGatewayWebApp(
223224
handlers=handlers,
224225
kernel_manager=self.kernel_manager,
225226
session_manager=self.session_manager,

enterprise_gateway/metrics.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import os
2+
3+
from prometheus_client import Histogram
4+
5+
# Value passed as the ``namespace`` of every metric defined in this module.
# Defaults to "enterprise_gateway"; override with the EG_METRICS_PREFIX
# environment variable.
metrics_prefix = os.environ.get("EG_METRICS_PREFIX", "enterprise_gateway")

# Bucket upper bounds (seconds) shared by the kernel start and shutdown
# histograms so both metrics are reported on the same scale.
_KERNEL_LIFECYCLE_BUCKETS = (0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0)

# Duration of each HTTP request, labelled by method, handler and status code.
HTTP_REQUEST_DURATION_SECONDS = Histogram(
    'http_request_duration_seconds',
    'Request duration for all HTTP requests',
    ['method', 'handler', 'status_code'],
    namespace=metrics_prefix,
)

# Duration of kernel startup, labelled by kernel name and process proxy.
KERNEL_START_DURATION_SECONDS = Histogram(
    'kernel_start_duration_seconds',
    'Kernel startup duration',
    ['kernel_name', 'process_proxy'],
    buckets=_KERNEL_LIFECYCLE_BUCKETS,
    namespace=metrics_prefix,
)

# Duration of kernel shutdown, labelled by kernel name and process proxy.
# The help text previously read "Kernel startup duration for all HTTP
# requests", a copy-paste slip that mislabelled this metric in the exposition.
KERNEL_SHUTDOWN_DURATION_SECONDS = Histogram(
    'kernel_shutdown_duration_seconds',
    'Kernel shutdown duration',
    ['kernel_name', 'process_proxy'],
    buckets=_KERNEL_LIFECYCLE_BUCKETS,
    namespace=metrics_prefix,
)

enterprise_gateway/services/kernels/remotemanager.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
from enterprise_gateway.mixins import EnterpriseGatewayConfigMixin
2525

26+
from ...metrics import KERNEL_SHUTDOWN_DURATION_SECONDS, KERNEL_START_DURATION_SECONDS
2627
from ..processproxies.processproxy import BaseProcessProxyABC, LocalProcessProxy, RemoteProcessProxy
2728
from ..sessions.kernelsessionmanager import KernelSessionManager
2829

@@ -501,7 +502,12 @@ async def start_kernel(self, **kwargs: dict[str, Any] | None):
501502
"""
502503
self._get_process_proxy()
503504
self._capture_user_overrides(**kwargs)
504-
await super().start_kernel(**kwargs)
505+
with KERNEL_START_DURATION_SECONDS.time() as timer:
506+
timer.labels(
507+
kernel_name=self.kernel_name,
508+
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
509+
)
510+
await super().start_kernel(**kwargs)
505511

506512
def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None:
507513
"""
@@ -588,6 +594,14 @@ def request_shutdown(self, restart: bool = False) -> None:
588594
if isinstance(self.process_proxy, RemoteProcessProxy):
589595
self.process_proxy.shutdown_listener()
590596

597+
async def shutdown_kernel(self, now: bool = False, restart: bool = False):
    """
    Shut the kernel down via the parent implementation while timing the call.

    The elapsed time is observed on ``KERNEL_SHUTDOWN_DURATION_SECONDS``,
    labelled with this manager's ``kernel_name`` and the module-qualified
    class name of its process proxy.

    :param now: forwarded unchanged to the parent ``shutdown_kernel``.
    :param restart: forwarded unchanged to the parent ``shutdown_kernel``.
    """
    with KERNEL_SHUTDOWN_DURATION_SECONDS.time() as shutdown_timer:
        proxy = self.process_proxy
        proxy_label = f'{proxy.__class__.__module__}.{type(proxy).__name__}'
        # Labels must be bound on the timer before the block exits.
        shutdown_timer.labels(kernel_name=self.kernel_name, process_proxy=proxy_label)
        await super().shutdown_kernel(now=now, restart=restart)
604+
591605
async def restart_kernel(self, now: bool = False, **kwargs: dict[str, Any] | None) -> None:
592606
"""
593607
Restarts a kernel with the arguments that were used to launch it.

enterprise_gateway/tests/test_handlers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,12 @@ def test_kernel_env_auth_token(self):
557557
if ws:
558558
ws.close()
559559

560+
@gen_test
def test_get_metrics(self):
    """Getting the Prometheus metrics endpoint should be ok"""
    response = yield self.http_client.fetch(self.get_url("/metrics"))
    self.assertEqual(response.code, 200)
    # A 200 alone does not prove metrics are exposed; the HTTP duration
    # histogram is registered at import time, so its name must be listed
    # whatever prefix is configured.
    self.assertIn(b"http_request_duration_seconds", response.body)
565+
560566

561567
class TestCustomDefaultKernel(TestHandlers):
562568
"""Tests gateway behavior when setting a custom default kernelspec."""

enterprise_gateway/webapp.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
from tornado import web
3+
from tornado.web import RequestHandler
4+
5+
from enterprise_gateway.metrics import HTTP_REQUEST_DURATION_SECONDS
6+
7+
8+
class EnterpriseGatewayWebApp(web.Application):
    """Tornado application that also records a duration metric per request."""

    def log_request(self, handler: RequestHandler) -> None:
        """
        Log the finished request, then record it for RED metrics.

        A single histogram observation covers all three RED signals:
        Rate: the number of requests, per second, being served.
        Errors: the number of failed requests per second.
        Duration: the amount of time each request takes.

        The handler label is the fully qualified class name of the handler
        rather than the URL path, which keeps label cardinality low.

        The regular request logging of ``web.Application`` runs first, so
        existing log output is unaffected by the metric recording.
        """
        super().log_request(handler)

        request = handler.request
        handler_label = f'{handler.__class__.__module__}.{type(handler).__name__}'
        duration_metric = HTTP_REQUEST_DURATION_SECONDS.labels(
            method=request.method,
            handler=handler_label,
            status_code=handler.get_status(),
        )
        duration_metric.observe(request.request_time())

0 commit comments

Comments
 (0)