21
21
"""
22
22
23
23
import threading
24
- import tornado .web
24
+ from typing import Optional
25
+
26
+ import prometheus_client
25
27
import tornado .httpserver
26
28
import tornado .ioloop
27
- import prometheus_client
28
- from typing import Optional
29
+ import tornado .web
29
30
30
31
from jupyter_server ._version import __version__
31
32
from jupyter_server .base .handlers import PrometheusMetricsHandler
32
33
from jupyter_server .prometheus .metrics import (
33
- SERVER_INFO ,
34
- SERVER_EXTENSION_INFO ,
35
- LAST_ACTIVITY ,
36
- SERVER_STARTED ,
37
34
ACTIVE_DURATION ,
38
35
HTTP_REQUEST_DURATION_SECONDS ,
39
36
KERNEL_CURRENTLY_RUNNING_TOTAL ,
37
+ LAST_ACTIVITY ,
38
+ SERVER_EXTENSION_INFO ,
39
+ SERVER_INFO ,
40
+ SERVER_STARTED ,
40
41
TERMINAL_CURRENTLY_RUNNING_TOTAL ,
41
42
)
42
43
43
44
44
45
class PrometheusMetricsServer :
45
46
"""A separate server for exposing Prometheus metrics."""
46
-
47
+
47
48
def __init__ (self , server_app ):
48
49
"""Initialize the metrics server.
49
-
50
+
50
51
Parameters
51
52
----------
52
53
server_app : ServerApp
@@ -56,172 +57,181 @@ def __init__(self, server_app):
56
57
self .port = None
57
58
self .http_server = None
58
59
self .thread = None
59
-
60
+
60
61
def initialize_metrics(self):
    """Initialize Jupyter-specific metrics for this server instance.

    Publishes the server version and one info entry per registered server
    extension, records the server start time, registers callbacks for the
    last-activity timestamp and the active duration, and finally sets up
    the kernel/terminal runtime gauges.
    """
    # Set server version info
    SERVER_INFO.info({"version": __version__})

    # Set up extension info: all details are carried by the labels, so the
    # info payload itself is empty.
    for ext in self.server_app.extension_manager.extensions.values():
        SERVER_EXTENSION_INFO.labels(
            name=ext.name, version=ext.version, enabled=str(ext.enabled).lower()
        ).info({})

    # Set server start time
    started = self.server_app.web_app.settings["started"]
    SERVER_STARTED.set(started.timestamp())

    # Set up activity tracking. The lambdas look up ``web_app`` on every
    # call rather than capturing it once, so they always read current state.
    LAST_ACTIVITY.set_function(lambda: self.server_app.web_app.last_activity().timestamp())
    ACTIVE_DURATION.set_function(
        lambda: (
            self.server_app.web_app.last_activity()
            - self.server_app.web_app.settings["started"]
        ).total_seconds()
    )

    # Set up kernel and terminal metrics
    self._setup_runtime_metrics()

    # Note: HTTP request metrics are recorded by the main server's logging system
    # via the log_request function when record_http_request_metrics=True.
    # The separate metrics server uses the same prometheus registry, so those
    # metrics will be available here as well.
90
-
92
+
91
93
def _setup_runtime_metrics(self):
    """Set up metrics that track runtime state.

    Defines updaters for the per-type running-kernel gauge and the
    running-terminal gauge, runs them once immediately, and, when the
    server application exposes an ``io_loop``, schedules them to run again
    every 30 seconds on that loop.
    """

    def update_kernel_metrics():
        """Recount running kernels per kernel type and publish the counts."""
        try:
            kernel_manager = self.server_app.kernel_manager
            if not hasattr(kernel_manager, "list_kernel_ids"):
                return
            kernel_ids = kernel_manager.list_kernel_ids()

            # Reset every existing series to 0 so that kernel types with no
            # running kernels do not keep a stale count. ``_metrics`` is
            # keyed by the tuple of label values, so the tuple must be
            # unpacked positionally: passing it as ``type=`` would
            # stringify the tuple and create a bogus new series instead of
            # resetting the real one. ``list(...)`` snapshots the keys
            # because ``labels()`` may insert into the mapping.
            for label_values in list(KERNEL_CURRENTLY_RUNNING_TOTAL._metrics):
                KERNEL_CURRENTLY_RUNNING_TOTAL.labels(*label_values).set(0)

            # Count kernels by type
            kernel_types = {}
            for kid in kernel_ids:
                try:
                    kernel = kernel_manager.get_kernel(kid)
                    kernel_type = getattr(kernel, "kernel_name", "unknown")
                except Exception:
                    # Best effort: a kernel that cannot be inspected is
                    # still counted, under the "unknown" type.
                    kernel_type = "unknown"
                kernel_types[kernel_type] = kernel_types.get(kernel_type, 0) + 1

            # Update metrics
            for kernel_type, count in kernel_types.items():
                KERNEL_CURRENTLY_RUNNING_TOTAL.labels(type=kernel_type).set(count)
        except Exception as e:
            self.server_app.log.debug(f"Error updating kernel metrics: {e}")

    def update_terminal_metrics():
        """Publish the number of terminals, or 0 when there is no manager."""
        try:
            terminal_manager = getattr(self.server_app, "terminal_manager", None)
            if terminal_manager and hasattr(terminal_manager, "list"):
                TERMINAL_CURRENTLY_RUNNING_TOTAL.set(len(terminal_manager.list()))
            else:
                TERMINAL_CURRENTLY_RUNNING_TOTAL.set(0)
        except Exception as e:
            self.server_app.log.debug(f"Error updating terminal metrics: {e}")

    def periodic_update():
        """Refresh both runtime gauges."""
        update_kernel_metrics()
        update_terminal_metrics()

    # Run initial update
    periodic_update()

    def start_periodic_updates():
        """Begin the self-rescheduling 30-second refresh on the current loop."""
        loop = tornado.ioloop.IOLoop.current()

        def update():
            periodic_update()
            # Re-arm after each run so the refresh keeps repeating.
            loop.call_later(30, update)

        loop.call_later(30, update)

    # Start periodic updates in the main server's IOLoop
    io_loop = getattr(self.server_app, "io_loop", None)
    if io_loop:
        io_loop.add_callback(start_periodic_updates)
153
-
158
+
154
159
def start(self, port: int) -> None:
    """Start the metrics server on the specified port.

    Initializes the Jupyter metrics, builds a Tornado application that
    exposes only ``/metrics`` while reusing the main web application's
    settings, starts listening on ``port``, and launches a daemon thread
    running a dedicated IOLoop.

    Parameters
    ----------
    port : int
        The port to listen on for metrics requests
    """
    self.port = port

    # Initialize Jupyter metrics
    self.initialize_metrics()

    # Reuse the main server's settings so the metrics handler is configured
    # the same way as on the main server.
    main_app = self.server_app.web_app

    # Create a new application that shares the same settings
    # but only serves the metrics endpoint
    metrics_app = tornado.web.Application(
        [
            (r"/metrics", PrometheusMetricsHandler),
        ],
        **main_app.settings,
    )

    # Determine authentication status for logging
    authenticate_metrics = main_app.settings.get("authenticate_prometheus", True)
    auth_info = "with authentication" if authenticate_metrics else "without authentication"

    # Create and start the HTTP server
    # NOTE(review): listen() is invoked on the calling thread, before the
    # dedicated loop below exists - confirm which IOLoop actually services
    # the listening socket.
    self.http_server = tornado.httpserver.HTTPServer(metrics_app)
    self.http_server.listen(port)

    # Start the IOLoop in a separate thread
    def start_metrics_loop():
        loop = tornado.ioloop.IOLoop()
        loop.make_current()
        loop.start()

    # Daemon thread: it does not keep the process alive at shutdown.
    self.thread = threading.Thread(target=start_metrics_loop, daemon=True)
    self.thread.start()

    self.server_app.log.info(
        f"Metrics server started on port {port} {auth_info} (using Jupyter Prometheus integration)"
    )
205
+
196
206
def stop(self) -> None:
    """Stop the metrics server.

    Stops the HTTP server if one is running and clears the reference, so
    calling this method again does not stop it a second time. The stop is
    logged on every call.
    """
    if self.http_server:
        self.http_server.stop()
        self.http_server = None

    # The loop thread is intentionally left alone: start() keeps no
    # reference to the IOLoop it creates, so it cannot be stopped from
    # here. The thread is a daemon and exits when the process ends.

    self.server_app.log.info(f"Metrics server stopped on port {self.port}")
208
218
209
219
210
220
def start_metrics_server(server_app, port: int) -> PrometheusMetricsServer:
    """Start a Prometheus metrics server for the given Jupyter server.

    Convenience wrapper that constructs a ``PrometheusMetricsServer`` and
    starts it immediately.

    Parameters
    ----------
    server_app : ServerApp
        The main Jupyter server application instance
    port : int
        The port to listen on for metrics requests

    Returns
    -------
    PrometheusMetricsServer
        The metrics server instance
    """
    metrics_server = PrometheusMetricsServer(server_app)
    metrics_server.start(port)
    return metrics_server
0 commit comments