Log HTTP request codes & timings to Prometheus

yuvipanda · yuvipanda · commit 8aa22d62f8eb · 2018-04-02T12:03:16.000-07:00
Code adapted from JupyterHub
diff --git a/notebook/log.py b/notebook/log.py
@@ -7,6 +7,7 @@
 
 import json
 from tornado.log import access_log
+from .metrics import prometheus_log_method
 
 def log_request(handler):
     """log a bit more information about each request than tornado's default
@@ -45,4 +46,4 @@ def log_request(handler):
         # log all headers if it caused an error
         log_method(json.dumps(dict(request.headers), indent=2))
     log_method(msg.format(**ns))
-
+    prometheus_log_method(handler)
diff --git a/notebook/metrics.py b/notebook/metrics.py
@@ -0,0 +1,39 @@
+"""
+Prometheus metrics exported by Jupyter Notebook Server
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+"""
+
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code'],  # label names; values are supplied in prometheus_log_method
+)
+
+def prometheus_log_method(handler):
+    """
+    Record a finished Tornado request in REQUEST_DURATION_SECONDS.
+
+    This is what provides the RED metrics for the server:
+       Rate – how many requests per second the service is serving.
+       Errors – how many of those requests fail per second.
+       Duration – how long each request takes, as a time interval.
+
+    The `handler` label holds the handler class's fully qualified name
+    instead of the URL path, which keeps label cardinality low.
+
+    Use this as tornado's 'log_function' setting, or call it from the
+    function that is, so that it runs at the end of every request.
+    """
+    request = handler.request
+    handler_name = '{}.{}'.format(handler.__class__.__module__, type(handler).__name__)
+    labelled = REQUEST_DURATION_SECONDS.labels(
+        method=request.method, handler=handler_name, code=handler.get_status()
+    )
+    labelled.observe(request.request_time())