fix(runtime metrics): ensure collection lambda reference isn't reused between loops [backport 1.17] (#6603)

github-actions[bot] · sanchda · web-flow · commit f393fc6d58f9 · 2023-08-15T16:13:55.000Z
Backport 238ca96 from #6490 to 1.17. A typo in the new runtime metrics implementation caused all instantaneous-type metrics to shadow the value (in procfs) of a "delta" metric: the instantaneous loop bound `fun` but called `func`, the variable left over from the delta loop. As a result, values such as `num_threads` increased monotonically over the lifetime of a process and were outrageously incorrect. ## Checklist - [X] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [X] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [X] Change is maintainable (easy to change, telemetry, documentation). - [X] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [X] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [X] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) Co-authored-by: David Sanchez <838104+sanchda@users.noreply.github.com>
diff --git a/ddtrace/internal/runtime/metric_collectors.py b/ddtrace/internal/runtime/metric_collectors.py
@@ -71,9 +71,9 @@ def collect_fn(self, keys):
             metrics = {}
 
             # Populate metrics for which we compute delta values
-            for metric, func in self.delta_funs.items():
+            for metric, delta_fun in self.delta_funs.items():
                 try:
-                    value = func(self.proc)
+                    value = delta_fun(self.proc)
                 except Exception:
                     value = 0
 
@@ -82,9 +82,9 @@ def collect_fn(self, keys):
                 metrics[metric] = delta
 
             # Populate metrics that just take instantaneous reading
-            for metric, fun in self.abs_funs.items():
+            for metric, abs_fun in self.abs_funs.items():
                 try:
-                    value = func(self.proc)
+                    value = abs_fun(self.proc)
                 except Exception:
                     value = 0
 
diff --git a/tests/tracer/runtime/test_metric_collectors.py b/tests/tracer/runtime/test_metric_collectors.py
@@ -1,6 +1,9 @@
+from ddtrace.internal.runtime.constants import CPU_PERCENT
 from ddtrace.internal.runtime.constants import GC_COUNT_GEN0
 from ddtrace.internal.runtime.constants import GC_RUNTIME_METRICS
+from ddtrace.internal.runtime.constants import MEM_RSS
 from ddtrace.internal.runtime.constants import PSUTIL_RUNTIME_METRICS
+from ddtrace.internal.runtime.constants import THREAD_COUNT
 from ddtrace.internal.runtime.metric_collectors import GCRuntimeMetricCollector
 from ddtrace.internal.runtime.metric_collectors import PSUtilRuntimeMetricCollector
 from ddtrace.internal.runtime.metric_collectors import RuntimeMetricCollector
@@ -28,6 +31,95 @@ def test_metrics(self):
         for (key, value) in collector.collect(PSUTIL_RUNTIME_METRICS):
             self.assertIsNotNone(value)
 
+    def test_static_metrics(self):
+        import os
+        import threading
+        import time
+
+        from ddtrace.vendor import psutil
+
+        # Something to bump CPU utilization
+        def busy_wait(duration_ms):
+            end_time = time.time() + (duration_ms / 1000.0)
+            while time.time() < end_time:
+                pass
+
+        def get_metrics():
+            # Need to discard one psutil reading because some of its readings are
+            # stateful and need a previous sample to compare against
+            collector = PSUtilRuntimeMetricCollector()
+            collector.collect_fn(None)  # wasted
+            proc = psutil.Process(os.getpid())
+            proc.cpu_percent()  # wasted
+
+            # Create some load.  If the duration is too low, then it can cause
+            # wildly different values between readings.
+            busy_wait(50)
+
+            runtime_metrics = dict(collector.collect_fn(None))
+
+            with proc.oneshot():
+                psutil_metrics = {
+                    CPU_PERCENT: proc.cpu_percent(),
+                    MEM_RSS: proc.memory_info().rss,
+                    THREAD_COUNT: proc.num_threads(),
+                }
+            return runtime_metrics, psutil_metrics
+
+        def check_metrics(runtime_metrics, psutil_metrics):
+            def within_threshold(a, b, epsilon):
+                return abs(a - b) <= epsilon * max(abs(a), abs(b))
+
+            # Number of threads should be precise
+            if psutil_metrics[THREAD_COUNT] != runtime_metrics[THREAD_COUNT]:
+                return False
+
+            # CPU and RAM should be approximate.  These tests are checking that the category of
+            # the value is correct, rather than the specific value itself.
+            epsilon = 0.25
+            if not within_threshold(psutil_metrics[CPU_PERCENT], runtime_metrics[CPU_PERCENT], epsilon):
+                return False
+
+            if not within_threshold(psutil_metrics[MEM_RSS], runtime_metrics[MEM_RSS], epsilon):
+                return False
+
+            return True
+
+        # Sanity-check that the num_threads comparison works
+        rt_metrics, pu_metrics = get_metrics()
+        pu_metrics[THREAD_COUNT] += 1
+        self.assertFalse(check_metrics(rt_metrics, pu_metrics))
+
+        # Check that the CPU comparison works
+        rt_metrics, pu_metrics = get_metrics()
+        pu_metrics[CPU_PERCENT] *= 2
+        self.assertFalse(check_metrics(rt_metrics, pu_metrics))
+
+        # Check that the memory comparison works
+        rt_metrics, pu_metrics = get_metrics()
+        pu_metrics[MEM_RSS] *= 2
+        self.assertFalse(check_metrics(rt_metrics, pu_metrics))
+
+        # Baseline check
+        self.assertTrue(check_metrics(*get_metrics()))
+
+        # Check for threads.  Rather than using a sleep() which might be brittle in CI, use an explicit
+        # threading.Event, shared by all of the threads, as the stop condition.
+        def thread_stopper(stop_event):
+            stop_event.wait()
+
+        stop_event = threading.Event()
+        threads = [threading.Thread(target=thread_stopper, args=(stop_event,)) for _ in range(10)]
+        _ = [thread.start() for thread in threads]
+        self.assertTrue(check_metrics(*get_metrics()))
+        stop_event.set()
+        _ = [thread.join() for thread in threads]
+
+        # Check for RSS
+        wasted_memory = [" "] * 16 * 1024 ** 2  # 16 megs
+        self.assertTrue(check_metrics(*get_metrics()))
+        del wasted_memory
+
 
 class TestGCRuntimeMetricCollector(BaseTestCase):
     def test_metrics(self):