chore(asyncio): fix flaky test_asyncio test (#5498)

tabgok · web-flow · commit c4ee0db5a5b4 · 2023-04-13T10:19:09.000-04:00
This change ensures that the MainThread Task with name 'None' is run long enough to be targeted by an event collector during this test. Prior to this change, the test tests/profiling/collector/test_stack_asyncio.py::test_asyncio failed occasionally due to a race condition between event collectors (which scan threads) and collecting data about the MainThread with name=None. If the collectors didn't run in time to catch the MainThread, no CPU time would be recorded. After this change, a sleep() statement guarantees the thread is running long enough to be collected, and pointers have been added to enable future changes in behavior. Sample error: ``` assert wall_time_ns[t1_name] > 0 assert wall_time_ns[t2_name] > 0 if sys.platform != "win32": # Windows seems to get 0 CPU for this > assert cpu_time_found E assert False tests/profiling/collector/test_stack_asyncio.py:90: AssertionError ``` This change also modified a second test, which was failing due to referenced code lines. Prior to this change, the test_asyncio test expected the profiler to identify exactly the first line of a method. After this change, the test ensures the profiler pointer is anywhere in the function. Testing: This change was tested by increasing the collection interval (StackSampleEvent interval) from (2 x sys.setswitchinterval) to (100 x sys.setswitchinterval). These settings guaranteed 0% test success prior to the fix and 100% test success after, over 50 runs. Risks: None, this is an update to testing only. ## Checklist - [x] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines) are followed. 
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [x] PR description includes explicit acknowledgement/acceptance of the performance implications of this PR as reported in the benchmarks PR comment. ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
diff --git a/tests/profiling/collector/test_stack_asyncio.py b/tests/profiling/collector/test_stack_asyncio.py
@@ -1,42 +1,69 @@
 import asyncio
 import collections
 import sys
+import types
 
 import pytest
 
 from ddtrace.profiling import _asyncio
 from ddtrace.profiling import profiler
 from ddtrace.profiling.collector import stack_event
+from ddtrace.profiling.collector.stack import StackCollector
 
 from . import _asyncio_compat
 
 
+def patch_stack_collector(stack_collector):
+    """
+    Patch a stack collector so we can count how many times it has run
+    """
+
+    def _collect(self):
+        self.run_count += 1
+        return self._orig_collect()
+
+    stack_collector.run_count = 0
+    orig = stack_collector.collect
+    stack_collector._orig_collect = orig
+    stack_collector.collect = types.MethodType(_collect, stack_collector)
+
+
 @pytest.mark.skipif(not _asyncio_compat.PY36_AND_LATER, reason="Python > 3.5 needed")
 def test_asyncio(tmp_path, monkeypatch) -> None:
     sleep_time = 0.2
-    sleep_times = 5
 
-    async def stuff() -> None:
-        await asyncio.sleep(sleep_time)
+    async def stuff(collector) -> None:
+        count = collector.run_count
+        while collector.run_count == count:
+            await asyncio.sleep(sleep_time)
 
-    async def hello() -> None:
-        t1 = _asyncio_compat.create_task(stuff(), name="sleep 1")
-        t2 = _asyncio_compat.create_task(stuff(), name="sleep 2")
-        for _ in range(sleep_times):
-            await stuff()
+    async def hello(collector) -> None:
+        t1 = _asyncio_compat.create_task(stuff(collector), name="sleep 1")
+        t2 = _asyncio_compat.create_task(stuff(collector), name="sleep 2")
+        await stuff(collector)
         return (t1, t2)
 
     monkeypatch.setenv("DD_PROFILING_CAPTURE_PCT", "100")
     monkeypatch.setenv("DD_PROFILING_OUTPUT_PPROF", str(tmp_path / "pprof"))
     # start a complete profiler so asyncio policy is setup
     p = profiler.Profiler()
+    stack_collector = [collector for collector in p._profiler._collectors if type(collector) == StackCollector][0]
+    patch_stack_collector(stack_collector)
+
     p.start()
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
     if _asyncio_compat.PY38_AND_LATER:
-        maintask = loop.create_task(hello(), name="main")
+        maintask = loop.create_task(hello(stack_collector), name="main")
     else:
-        maintask = loop.create_task(hello())
+        maintask = loop.create_task(hello(stack_collector))
+
+    # Wait for the collector to run at least once on this thread, while it is doing something
+    # 2.5+ seconds at times
+    count = stack_collector.run_count
+    while count == stack_collector.run_count:
+        pass
+
     t1, t2 = loop.run_until_complete(maintask)
     events = p._profiler._recorder.reset()
     p.stop()
@@ -55,17 +82,29 @@ async def hello() -> None:
 
         # This assertion does not work reliably on Python < 3.7
         if _asyncio_compat.PY37_AND_LATER:
+            first_line_this_test_class = test_asyncio.__code__.co_firstlineno
+            co_filename, lineno, co_name, class_name = event.frames[0]
             if event.task_name == "main":
                 assert event.thread_name == "MainThread"
-                assert event.frames == [(__file__, test_asyncio.__code__.co_firstlineno + 12, "hello", "")]
+                assert len(event.frames) == 1
+                assert co_filename == __file__
+                assert first_line_this_test_class + 9 <= lineno <= first_line_this_test_class + 13
+                assert co_name == "hello"
+                assert class_name == ""
                 assert event.nframes == 1
             elif event.task_name == t1_name:
                 assert event.thread_name == "MainThread"
-                assert event.frames == [(__file__, test_asyncio.__code__.co_firstlineno + 6, "stuff", "")]
+                assert co_filename == __file__
+                assert first_line_this_test_class + 4 <= lineno <= first_line_this_test_class + 7
+                assert co_name == "stuff"
+                assert class_name == ""
                 assert event.nframes == 1
             elif event.task_name == t2_name:
                 assert event.thread_name == "MainThread"
-                assert event.frames == [(__file__, test_asyncio.__code__.co_firstlineno + 6, "stuff", "")]
+                assert co_filename == __file__
+                assert first_line_this_test_class + 4 <= lineno <= first_line_this_test_class + 7
+                assert co_name == "stuff"
+                assert class_name == ""
                 assert event.nframes == 1
 
         if event.thread_name == "MainThread" and event.task_name is None: