test: change way api_server timings are determined

roypat · roypat · commit 06194e9ca540 · 2023-06-16T16:21:51.000Z
Previously, when checking that api server requests complete in under
700ms, we measured this client-side, which included whatever latency was
added by python/the network stack.

Now we instead directly utilize the timing information firecracker
provides in its logs in our integration tests (a log message for this
was previously present, and has been upgraded to info level so that we
can access it in the integration tests).

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
diff --git a/tests/framework/builder.py b/tests/framework/builder.py
@@ -214,7 +214,7 @@ def build_from_snapshot(
             )
             vm.jailer.daemonize = daemonize
             vm.spawn(log_level="Error", use_ramdisk=use_ramdisk)
-            vm.api_session.untime()
+            vm.time_api_requests = False
 
         metrics_file_path = os.path.join(vm.path, "metrics.log")
         metrics_fifo = log_tools.Fifo(metrics_file_path)
diff --git a/tests/framework/decorators.py b/tests/framework/decorators.py
diff --git a/tests/framework/http.py b/tests/framework/http.py
@@ -5,8 +5,6 @@
 import requests
 from requests_unixsocket import DEFAULT_SCHEME, UnixAdapter
 
-from framework import decorators
-
 
 class Session(requests.Session):
     """Wrapper over requests_unixsocket.Session limiting the call duration.
@@ -75,30 +73,3 @@ def is_status_payload_too_large(response: int):
         self.is_status_bad_request = is_status_bad_request
         self.is_status_not_found = is_status_not_found
         self.is_status_payload_too_large = is_status_payload_too_large
-
-    @decorators.timed_request
-    def get(self, url, **kwargs):
-        """Wrap the GET call with duration limit."""
-        # pylint: disable=method-hidden
-        # The `untime` method overrides this, and pylint disapproves.
-        return super().get(url, **kwargs)
-
-    @decorators.timed_request
-    def patch(self, url, data=None, **kwargs):
-        """Wrap the PATCH call with duration limit."""
-        # pylint: disable=method-hidden
-        # The `untime` method overrides this, and pylint disapproves.
-        return super().patch(url, data=data, **kwargs)
-
-    @decorators.timed_request
-    def put(self, url, data=None, **kwargs):
-        """Wrap the PUT call with duration limit."""
-        # pylint: disable=method-hidden
-        # The `untime` method overrides this, and pylint disapproves.
-        return super().put(url, data=data, **kwargs)
-
-    def untime(self):
-        """Restore the HTTP methods to their un-timed selves."""
-        self.get = super().get
-        self.patch = super().patch
-        self.put = super().put
diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py
@@ -11,11 +11,13 @@
 import json
 import logging
 import os
+import re
 import select
 import shutil
 import time
 import uuid
 import weakref
+from collections import namedtuple
 from functools import cached_property
 from pathlib import Path
 from threading import Lock
@@ -28,7 +30,7 @@
 import host_tools.memory as mem_tools
 import host_tools.network as net_tools
 from framework import utils
-from framework.defs import FC_PID_FILE_NAME
+from framework.defs import FC_PID_FILE_NAME, MAX_API_CALL_DURATION_MS
 from framework.http import Session
 from framework.jailer import JailerContext
 from framework.resources import (
@@ -149,6 +151,8 @@ def __init__(
         self._screen_pid = None
         self._screen_log = None
 
+        self.time_api_requests = True
+
         # Initialize memory monitor
         self.memory_monitor = None
 
@@ -205,6 +209,9 @@ def kill(self):
             self.expect_kill_by_signal = True
             utils.run_cmd("kill -9 {} || true".format(self.screen_pid))
 
+        if self.time_api_requests:
+            self._validate_api_response_times()
+
         # Check if Firecracker was launched by the jailer in a new pid ns.
         fc_pid_in_new_ns = self.pid_in_new_ns
 
@@ -219,6 +226,59 @@ def kill(self):
                 self.memory_monitor.join(timeout=1)
             self.memory_monitor.check_samples()
 
+    def _validate_api_response_times(self):
+        """
+        Parses the firecracker logs for information regarding api server request processing times, and asserts they
+        are within acceptable bounds.
+        """
+        # Log messages are either
+        # 2023-06-16T07:45:41.767987318 [fc44b23e-ce47-4635-9549-5779a6bd9cee:fc_api] The API server received a Get request on "/mmds".
+        # or
+        # 2023-06-16T07:47:31.204704732 [2f2427c7-e4de-4226-90e6-e3556402be84:fc_api] The API server received a Put request on "/actions" with body "{\"action_type\": \"InstanceStart\"}".
+        api_request_regex = re.compile(
+            r"\] The API server received a (?P<method>\w+) request on \"(?P<url>(/(\w|-)*)+)\"( with body (?P<body>.*))?\."
+        )
+        api_request_times_regex = re.compile(
+            r"\] Total previous API call duration: (?P<execution_time>\d+) us.$"
+        )
+
+        # Note: Processing of api requests is synchronous, so these messages cannot be torn by concurrency effects
+        log_lines = self.log_data.split("\n")
+
+        ApiCall = namedtuple("ApiCall", "method url body")
+
+        current_call = None
+
+        for log_line in log_lines:
+            match = api_request_regex.search(log_line)
+
+            if match:
+                if current_call is not None:
+                    raise Exception(
+                        f"API call duration log entry for {current_call.method} {current_call.url} with body {current_call.body} is missing!"
+                    )
+
+                current_call = ApiCall(
+                    match.group("method"), match.group("url"), match.group("body")
+                )
+
+            match = api_request_times_regex.search(log_line)
+
+            if match:
+                if current_call is None:
+                    raise Exception(
+                        "Got API call duration log entry before request entry"
+                    )
+
+                if current_call.url != "/snapshot/create":
+                    exec_time = float(match.group("execution_time")) / 1000.0
+
+                    assert (
+                        exec_time <= MAX_API_CALL_DURATION_MS
+                    ), f"{current_call.method} {current_call.url} API call exceeded maximum duration: {exec_time} ms. Body: {current_call.body}"
+
+                current_call = None
+
     @property
     def firecracker_version(self):
         """Return the version of the Firecracker executable."""
@@ -392,7 +452,7 @@ def spawn(
         self,
         create_logger=True,
         log_file="log_fifo",
-        log_level="Info",
+        log_level="Debug",
         use_ramdisk=False,
         metrics_path=None,
     ):
@@ -765,7 +825,6 @@ def pause_to_snapshot(
         response = self.vm.patch(state="Paused")
         assert self.api_session.is_status_no_content(response.status_code)
 
-        self.api_session.untime()
         response = self.snapshot.create(
             mem_file_path=mem_file_path,
             snapshot_path=snapshot_path,
diff --git a/tests/framework/resources.py b/tests/framework/resources.py
@@ -381,7 +381,6 @@ def __init__(self, api_usocket_full_name, api_session):
 
     def put(self, **args):
         """Create a snapshot of the microvm."""
-        self._api_session.untime()
         datax = self.create_json(**args)
         return self._api_session.put("{}".format(self._snapshot_cfg_url), json=datax)
 
diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py
@@ -1400,7 +1400,7 @@ def test_map_private_seccomp_regression(test_microvm_with_api):
         {"http-api-max-payload-size": str(1024 * 1024 * 2)}
     )
     test_microvm.spawn()
-    test_microvm.api_session.untime()
+    test_microvm.time_api_requests = False
 
     response = test_microvm.mmds.get()
     assert test_microvm.api_session.is_status_ok(response.status_code)
diff --git a/tests/integration_tests/functional/test_logging.py b/tests/integration_tests/functional/test_logging.py
@@ -124,6 +124,8 @@ def test_log_config_failure(test_microvm_with_api):
         show_level=True,
         show_log_origin=True,
     )
+    # only works if log level is Debug
+    microvm.time_api_requests = False
     assert microvm.api_session.is_status_bad_request(response.status_code)
     assert response.json()["fault_message"]
 
@@ -146,6 +148,8 @@ def test_api_requests_logs(test_microvm_with_api):
         show_level=True,
         show_log_origin=True,
     )
+    # only works if log level is Debug
+    microvm.time_api_requests = False
     assert microvm.api_session.is_status_no_content(response.status_code)
     microvm.start_console_logger(log_fifo)
 
@@ -206,6 +210,8 @@ def test_api_requests_logs(test_microvm_with_api):
 def _test_log_config(microvm, log_level="Info", show_level=True, show_origin=True):
     """Exercises different scenarios for testing the logging config."""
     microvm.spawn(create_logger=False)
+    # only works if log level is Debug
+    microvm.time_api_requests = False
 
     # Configure logging.
     log_fifo_path = os.path.join(microvm.path, "log_fifo")
diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py
@@ -191,6 +191,8 @@ def test_load_snapshot_failure_handling(test_microvm_with_api):
     logger = logging.getLogger("snapshot_load_failure")
     vm = test_microvm_with_api
     vm.spawn(log_level="Info")
+    # only works if log level is Debug
+    vm.time_api_requests = False
 
     # Create two empty files for snapshot state and snapshot memory
     chroot_path = vm.jailer.chroot_path()

Original file line number	Diff line number	Diff line change
`@@ -214,7 +214,7 @@ def build_from_snapshot(`
`214`	`214`	`)`
`215`	`215`	`vm.jailer.daemonize = daemonize`
`216`	`216`	`vm.spawn(log_level="Error", use_ramdisk=use_ramdisk)`
`217`		`- vm.api_session.untime()`
	`217`	`+ vm.time_api_requests = False`
`218`	`218`
`219`	`219`	`metrics_file_path = os.path.join(vm.path, "metrics.log")`
`220`	`220`	`metrics_fifo = log_tools.Fifo(metrics_file_path)`
Original file line number	Diff line number	Diff line change
`@@ -1400,7 +1400,7 @@ def test_map_private_seccomp_regression(test_microvm_with_api):`
`1400`	`1400`	`{"http-api-max-payload-size": str(1024 * 1024 * 2)}`
`1401`	`1401`	`)`
`1402`	`1402`	`test_microvm.spawn()`
`1403`		`- test_microvm.api_session.untime()`
	`1403`	`+ test_microvm.time_api_requests = False`
`1404`	`1404`
`1405`	`1405`	`response = test_microvm.mmds.get()`
`1406`	`1406`	`assert test_microvm.api_session.is_status_ok(response.status_code)`