tests: Check for stalls in balloon tests

JackThomson2 · JackThomson2 · commit 30c5c7ea4c07 · 2026-01-14T15:41:01.000Z
Update our balloon tests to check for stall messages in the guest.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py
@@ -16,6 +16,14 @@
 STATS_POLLING_INTERVAL_S = 1
 
 
+def check_guest_dmesg_for_stalls(ssh_connection):
+    """Assert that guest dmesg reports no RCU stalls or soft lockups."""
+    _, stdout, _ = ssh_connection.run("dmesg")  # only stdout is inspected
+    assert "rcu_sched self-detected stall on CPU" not in stdout
+    assert "rcu_preempt detected stalls on CPUs/tasks" not in stdout
+    assert "BUG: soft lockup -" not in stdout
+
+
 def lower_ssh_oom_chance(ssh_connection):
     """Lure OOM away from ssh process"""
     logger = logging.getLogger("lower_ssh_oom_chance")
@@ -76,6 +84,7 @@ def _test_rss_memory_lower(test_microvm):
 
     # Check that the ballooning reclaimed the memory.
     assert balloon_rss - init_rss <= 15000
+    check_guest_dmesg_for_stalls(ssh_connection)
 
 
 # pylint: disable=C0103
@@ -131,6 +140,7 @@ def test_inflate_reduces_free(uvm_plain_any):
 
     # Assert that ballooning reclaimed about 64 MB of memory.
     assert available_mem_inflated <= available_mem_deflated - 85 * 64000 / 100
+    check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
 # pylint: disable=C0103
@@ -192,6 +202,7 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom):
         print(f"size before: {balloon_size_before} size after: {balloon_size_after}")
         if deflate_on_oom:
             assert balloon_size_after < balloon_size_before, "Balloon did not deflate"
+            check_guest_dmesg_for_stalls(test_microvm.ssh)
         else:
             assert balloon_size_after >= balloon_size_before, "Balloon deflated"
             # Kill it here, letting the infrastructure know that the process might
@@ -255,6 +266,7 @@ def test_reinflate_balloon(uvm_plain_any):
     # is probably freed after the first inflation.
     assert (third_reading - first_reading) <= 20000
     assert abs(second_reading - fourth_reading) <= 20000
+    check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
 # pylint: disable=C0103
@@ -326,6 +338,7 @@ def test_stats(uvm_plain_any):
     # Ensure the stats reflect deflating the balloon.
     assert inflated_stats["free_memory"] < deflated_stats["free_memory"]
     assert inflated_stats["available_memory"] < deflated_stats["available_memory"]
+    check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
 def test_stats_update(uvm_plain_any):
@@ -377,6 +390,7 @@ def test_stats_update(uvm_plain_any):
 
     # Ensure that stats don't have unknown balloon stats fields
     assert "balloon: unknown stats update tag:" not in test_microvm.log_data
+    check_guest_dmesg_for_stalls(test_microvm.ssh)
 
 
 def test_balloon_snapshot(uvm_plain_any, microvm_factory):
@@ -453,6 +467,7 @@ def test_balloon_snapshot(uvm_plain_any, microvm_factory):
     # Ensure the stats are still working after restore and show
     # that the balloon inflated.
     assert stats_after_snap["available_memory"] > latest_stats["available_memory"]
+    check_guest_dmesg_for_stalls(microvm.ssh)
 
 
 @pytest.mark.parametrize("method", ["reporting", "hinting"])
@@ -532,6 +547,7 @@ def test_hinting_reporting_snapshot(uvm_plain_any, microvm_factory, method):
     # There should be a reduction in RSS, but it's inconsistent.
     # We only test that the reduction happens.
     assert third_reading > fourth_reading
+    check_guest_dmesg_for_stalls(microvm.ssh)
 
 
 @pytest.mark.parametrize("method", ["traditional", "hinting", "reporting"])
@@ -581,3 +597,4 @@ def test_memory_scrub(uvm_plain_any, method):
         _ = get_stable_rss_mem(microvm)
 
     microvm.ssh.check_output("/usr/local/bin/readmem {} {}".format(60, 1))
+    check_guest_dmesg_for_stalls(microvm.ssh)