Skip to content

Commit 7bbc50a

Browse files
committed
[feat/1316] Support Serving Metrics On an Alternate Port
1 parent 4ee6e1d commit 7bbc50a

File tree

3 files changed

+310
-6
lines changed

3 files changed

+310
-6
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""
2+
Prometheus metrics integration for Jupyter Server.
3+
4+
This module provides Prometheus metrics collection and exposure for Jupyter Server.
5+
"""
6+
7+
from .metrics import (
8+
HTTP_REQUEST_DURATION_SECONDS,
9+
KERNEL_CURRENTLY_RUNNING_TOTAL,
10+
TERMINAL_CURRENTLY_RUNNING_TOTAL,
11+
SERVER_INFO,
12+
SERVER_EXTENSION_INFO,
13+
LAST_ACTIVITY,
14+
SERVER_STARTED,
15+
ACTIVE_DURATION,
16+
)
17+
18+
from .server import PrometheusMetricsServer, start_metrics_server
19+
20+
__all__ = [
21+
"HTTP_REQUEST_DURATION_SECONDS",
22+
"KERNEL_CURRENTLY_RUNNING_TOTAL",
23+
"TERMINAL_CURRENTLY_RUNNING_TOTAL",
24+
"SERVER_INFO",
25+
"SERVER_EXTENSION_INFO",
26+
"LAST_ACTIVITY",
27+
"SERVER_STARTED",
28+
"ACTIVE_DURATION",
29+
"PrometheusMetricsServer",
30+
"start_metrics_server",
31+
]
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
"""
2+
Prometheus metrics server for Jupyter Server
3+
4+
This module provides functionality to start a separate Prometheus metrics server
5+
that exposes Jupyter-specific metrics on a dedicated port.
6+
7+
Note on HTTP Request Metrics:
8+
The separate metrics server uses the same prometheus registry as the main server.
9+
HTTP request duration metrics (http_request_duration_seconds) are recorded by the
10+
main server's logging system when record_http_request_metrics=True. Since both
11+
servers share the same registry, these metrics will be available in the separate
12+
metrics server as well.
13+
14+
The record_http_request_metrics parameter controls whether the main server records
15+
these metrics, and the separate metrics server will automatically reflect this
16+
setting since it uses the same underlying metrics collection.
17+
18+
Authentication:
19+
The separate metrics server reuses the main server's authentication settings and
20+
handler infrastructure, ensuring consistent behavior.
21+
"""
22+
23+
import threading
24+
import tornado.web
25+
import tornado.httpserver
26+
import tornado.ioloop
27+
import prometheus_client
28+
from typing import Optional
29+
30+
from jupyter_server._version import __version__
31+
from jupyter_server.base.handlers import PrometheusMetricsHandler
32+
from jupyter_server.prometheus.metrics import (
33+
SERVER_INFO,
34+
SERVER_EXTENSION_INFO,
35+
LAST_ACTIVITY,
36+
SERVER_STARTED,
37+
ACTIVE_DURATION,
38+
HTTP_REQUEST_DURATION_SECONDS,
39+
KERNEL_CURRENTLY_RUNNING_TOTAL,
40+
TERMINAL_CURRENTLY_RUNNING_TOTAL,
41+
)
42+
43+
44+
class PrometheusMetricsServer:
    """A separate server for exposing Prometheus metrics.

    The server publishes a single ``/metrics`` route, handled by
    ``PrometheusMetricsHandler``, through a Tornado application that is built
    with the main web application's settings.
    """

    # Seconds between two refreshes of the kernel/terminal gauges.
    RUNTIME_METRICS_INTERVAL = 30

    def __init__(self, server_app):
        """Initialize the metrics server.

        Parameters
        ----------
        server_app : ServerApp
            The main Jupyter server application instance
        """
        self.server_app = server_app
        self.port = None
        self.http_server = None
        self.thread = None
        # IOLoop created by the background thread; kept so that ``stop()``
        # is able to shut it down again.
        self._loop = None

    def initialize_metrics(self):
        """Initialize Jupyter-specific metrics for this server instance."""
        # Set server version info
        SERVER_INFO.info({"version": __version__})

        # Publish one info series per known server extension
        for ext in self.server_app.extension_manager.extensions.values():
            SERVER_EXTENSION_INFO.labels(
                name=ext.name, version=ext.version, enabled=str(ext.enabled).lower()
            ).info({})

        # Set server start time
        started = self.server_app.web_app.settings["started"]
        SERVER_STARTED.set(started.timestamp())

        # Activity gauges are computed lazily, each time they are collected
        LAST_ACTIVITY.set_function(lambda: self.server_app.web_app.last_activity().timestamp())
        ACTIVE_DURATION.set_function(
            lambda: (
                self.server_app.web_app.last_activity() - self.server_app.web_app.settings["started"]
            ).total_seconds()
        )

        # Set up kernel and terminal metrics
        self._setup_runtime_metrics()

        # Note: HTTP request metrics are recorded by the main server's logging system
        # via the log_request function when record_http_request_metrics=True.
        # The separate metrics server uses the same prometheus registry, so those
        # metrics will be available here as well.

    def _update_kernel_metrics(self):
        """Refresh the per-kernel-type gauge from the kernel manager.

        Best effort: any error is logged at debug level and otherwise ignored.
        """
        try:
            kernel_manager = self.server_app.kernel_manager
            if not hasattr(kernel_manager, "list_kernel_ids"):
                return

            # Count running kernels by type; a kernel that cannot be looked up
            # or that has no ``kernel_name`` is counted as 'unknown'.
            counts = {}
            for kernel_id in kernel_manager.list_kernel_ids():
                try:
                    kernel = kernel_manager.get_kernel(kernel_id)
                    kernel_type = getattr(kernel, "kernel_name", "unknown")
                except Exception:
                    kernel_type = "unknown"
                counts[kernel_type] = counts.get(kernel_type, 0) + 1

            # Reset every already-known series so that kernel types that are no
            # longer running drop back to 0. The keys of ``_metrics`` are tuples
            # of label values, so they have to be unpacked positionally; passing
            # the tuple itself as ``type=`` would create a bogus series and
            # leave the real one untouched.
            for label_values in list(KERNEL_CURRENTLY_RUNNING_TOTAL._metrics):
                KERNEL_CURRENTLY_RUNNING_TOTAL.labels(*label_values).set(0)

            for kernel_type, count in counts.items():
                KERNEL_CURRENTLY_RUNNING_TOTAL.labels(type=kernel_type).set(count)
        except Exception as e:
            self.server_app.log.debug(f"Error updating kernel metrics: {e}")

    def _update_terminal_metrics(self):
        """Refresh the running-terminal gauge from the terminal manager.

        The gauge is set to 0 when the server has no usable terminal manager.
        Best effort: any error is logged at debug level and otherwise ignored.
        """
        try:
            terminal_manager = getattr(self.server_app, "terminal_manager", None)
            if terminal_manager and hasattr(terminal_manager, "list"):
                TERMINAL_CURRENTLY_RUNNING_TOTAL.set(len(terminal_manager.list()))
            else:
                TERMINAL_CURRENTLY_RUNNING_TOTAL.set(0)
        except Exception as e:
            self.server_app.log.debug(f"Error updating terminal metrics: {e}")

    def _setup_runtime_metrics(self):
        """Set up metrics that track runtime state.

        Runs one update immediately and, when the server application exposes
        an ``io_loop``, schedules further updates on that loop every
        ``RUNTIME_METRICS_INTERVAL`` seconds.
        """
        interval = self.RUNTIME_METRICS_INTERVAL

        def periodic_update():
            self._update_kernel_metrics()
            self._update_terminal_metrics()

        # Run initial update
        periodic_update()

        def start_periodic_updates():
            loop = tornado.ioloop.IOLoop.current()

            def update():
                periodic_update()
                # Re-arm the timer after every run
                loop.call_later(interval, update)

            loop.call_later(interval, update)

        # Start periodic updates in the main server's IOLoop
        io_loop = getattr(self.server_app, "io_loop", None)
        if io_loop:
            io_loop.add_callback(start_periodic_updates)

    def start(self, port: int) -> None:
        """Start the metrics server on the specified port.

        Parameters
        ----------
        port : int
            The port to listen on for metrics requests
        """
        self.port = port

        # Initialize Jupyter metrics
        self.initialize_metrics()

        # Build an application that only serves the metrics endpoint but
        # shares the main web application's settings.
        main_app = self.server_app.web_app
        metrics_app = tornado.web.Application(
            [
                (r"/metrics", PrometheusMetricsHandler),
            ],
            **main_app.settings,
        )

        # Determine authentication status for logging
        authenticate_metrics = main_app.settings.get("authenticate_prometheus", True)
        auth_info = "with authentication" if authenticate_metrics else "without authentication"

        # Create and start the HTTP server
        # NOTE(review): listen() is called in the caller's thread, so the
        # accept handler appears to be registered on the caller's current
        # IOLoop rather than on the background loop below - confirm that this
        # is the intended design.
        self.http_server = tornado.httpserver.HTTPServer(metrics_app)
        self.http_server.listen(port)

        loop_ready = threading.Event()

        # Start the IOLoop in a separate thread
        def start_metrics_loop():
            loop = tornado.ioloop.IOLoop()
            loop.make_current()
            # Publish the loop before running it so that stop() can reach it
            self._loop = loop
            loop_ready.set()
            try:
                loop.start()
            finally:
                # Release the loop's resources once it has been stopped
                loop.close()

        self.thread = threading.Thread(target=start_metrics_loop, daemon=True)
        self.thread.start()
        # Wait (bounded) until the thread has published its loop; otherwise an
        # immediate stop() would find nothing to stop.
        loop_ready.wait(timeout=5)

        self.server_app.log.info(f"Metrics server started on port {port} {auth_info} (using Jupyter Prometheus integration)")

    def stop(self) -> None:
        """Stop the metrics server and shut down its background IOLoop."""
        if self.http_server:
            self.http_server.stop()
            self.http_server = None

        loop, self._loop = self._loop, None
        if loop is not None:
            # IOLoop.stop() must run on the loop's own thread; add_callback is
            # the way to hand it over from another thread.
            loop.add_callback(loop.stop)

        thread = self.thread
        if (
            thread is not None
            and thread.is_alive()
            and thread is not threading.current_thread()
        ):
            # Bounded wait: the thread is a daemon, so a slow shutdown must
            # not block the caller indefinitely.
            thread.join(timeout=1)

        self.server_app.log.info(f"Metrics server stopped on port {self.port}")
208+
209+
210+
def start_metrics_server(server_app, port: int) -> PrometheusMetricsServer:
    """Create a metrics server for ``server_app`` and start it on ``port``.

    Convenience wrapper that builds a ``PrometheusMetricsServer`` and calls
    its ``start`` method.

    Parameters
    ----------
    server_app : ServerApp
        The main Jupyter server application instance
    port : int
        The port to listen on for metrics requests

    Returns
    -------
    PrometheusMetricsServer
        The metrics server instance, already started
    """
    server = PrometheusMetricsServer(server_app)
    server.start(port)
    return server

0 commit comments

Comments
 (0)