Skip to content

Commit f393fc6

Browse files
fix(runtime metrics): ensure collection lambda reference isn't reused between loops [backport 1.17] (#6603)
Backport 238ca96 from #6490 to 1.17. A typo in the new runtime metrics implementation led to all instantaneous-type metrics to shadow the value (in procfs) of a "delta" metric. This led to values such as `num_threads` monotonically increasing over the lifetime of a process, as well as just being outrageously incorrect. ## Checklist - [X] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [X] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [X] Change is maintainable (easy to change, telemetry, documentation). - [X] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [X] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [X] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) Co-authored-by: David Sanchez <[email protected]>
1 parent 8501ee0 commit f393fc6

File tree

2 files changed

+96
-4
lines changed

2 files changed

+96
-4
lines changed

ddtrace/internal/runtime/metric_collectors.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,9 @@ def collect_fn(self, keys):
7171
metrics = {}
7272

7373
# Populate metrics for which we compute delta values
74-
for metric, func in self.delta_funs.items():
74+
for metric, delta_fun in self.delta_funs.items():
7575
try:
76-
value = func(self.proc)
76+
value = delta_fun(self.proc)
7777
except Exception:
7878
value = 0
7979

@@ -82,9 +82,9 @@ def collect_fn(self, keys):
8282
metrics[metric] = delta
8383

8484
# Populate metrics that just take instantaneous reading
85-
for metric, fun in self.abs_funs.items():
85+
for metric, abs_fun in self.abs_funs.items():
8686
try:
87-
value = func(self.proc)
87+
value = abs_fun(self.proc)
8888
except Exception:
8989
value = 0
9090

tests/tracer/runtime/test_metric_collectors.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
from ddtrace.internal.runtime.constants import CPU_PERCENT
12
from ddtrace.internal.runtime.constants import GC_COUNT_GEN0
23
from ddtrace.internal.runtime.constants import GC_RUNTIME_METRICS
4+
from ddtrace.internal.runtime.constants import MEM_RSS
35
from ddtrace.internal.runtime.constants import PSUTIL_RUNTIME_METRICS
6+
from ddtrace.internal.runtime.constants import THREAD_COUNT
47
from ddtrace.internal.runtime.metric_collectors import GCRuntimeMetricCollector
58
from ddtrace.internal.runtime.metric_collectors import PSUtilRuntimeMetricCollector
69
from ddtrace.internal.runtime.metric_collectors import RuntimeMetricCollector
@@ -28,6 +31,95 @@ def test_metrics(self):
2831
for (key, value) in collector.collect(PSUTIL_RUNTIME_METRICS):
2932
self.assertIsNotNone(value)
3033

34+
def test_static_metrics(self):
35+
import os
36+
import threading
37+
import time
38+
39+
from ddtrace.vendor import psutil
40+
41+
# Something to bump CPU utilization
42+
def busy_wait(duration_ms):
43+
end_time = time.time() + (duration_ms / 1000.0)
44+
while time.time() < end_time:
45+
pass
46+
47+
def get_metrics():
48+
# need to waste a reading of psutil because some of its reading have
49+
# memory and need a previous state
50+
collector = PSUtilRuntimeMetricCollector()
51+
collector.collect_fn(None) # wasted
52+
proc = psutil.Process(os.getpid())
53+
proc.cpu_percent() # wasted
54+
55+
# Create some load. If the duration is too low, then it can cause
56+
# wildly different values between readings.
57+
busy_wait(50)
58+
59+
runtime_metrics = dict(collector.collect_fn(None))
60+
61+
with proc.oneshot():
62+
psutil_metrics = {
63+
CPU_PERCENT: proc.cpu_percent(),
64+
MEM_RSS: proc.memory_info().rss,
65+
THREAD_COUNT: proc.num_threads(),
66+
}
67+
return runtime_metrics, psutil_metrics
68+
69+
def check_metrics(runtime_metrics, psutil_metrics):
70+
def within_threshold(a, b, epsilon):
71+
return abs(a - b) <= epsilon * max(abs(a), abs(b))
72+
73+
# Number of threads should be precise
74+
if psutil_metrics[THREAD_COUNT] != runtime_metrics[THREAD_COUNT]:
75+
return False
76+
77+
# CPU and RAM should be approximate. These tests are checking that the category of
78+
# the value is correct, rather than the specific value itself.
79+
epsilon = 0.25
80+
if not within_threshold(psutil_metrics[CPU_PERCENT], runtime_metrics[CPU_PERCENT], epsilon):
81+
return False
82+
83+
if not within_threshold(psutil_metrics[MEM_RSS], runtime_metrics[MEM_RSS], epsilon):
84+
return False
85+
86+
return True
87+
88+
# Sanity-check that the num_threads comparison works
89+
rt_metrics, pu_metrics = get_metrics()
90+
pu_metrics[THREAD_COUNT] += 1
91+
self.assertFalse(check_metrics(rt_metrics, pu_metrics))
92+
93+
# Check that the CPU comparison works
94+
rt_metrics, pu_metrics = get_metrics()
95+
pu_metrics[CPU_PERCENT] *= 2
96+
self.assertFalse(check_metrics(rt_metrics, pu_metrics))
97+
98+
# Check that the memory comparison works
99+
rt_metrics, pu_metrics = get_metrics()
100+
pu_metrics[MEM_RSS] *= 2
101+
self.assertFalse(check_metrics(rt_metrics, pu_metrics))
102+
103+
# Baseline check
104+
self.assertTrue(check_metrics(*get_metrics()))
105+
106+
# Check for threads. Rather than using a sleep() which might be brittle in CI, use an explicit
107+
# semaphore as a stop condition per thread.
108+
def thread_stopper(stop_event):
109+
stop_event.wait()
110+
111+
stop_event = threading.Event()
112+
threads = [threading.Thread(target=thread_stopper, args=(stop_event,)) for _ in range(10)]
113+
_ = [thread.start() for thread in threads]
114+
self.assertTrue(check_metrics(*get_metrics()))
115+
stop_event.set()
116+
_ = [thread.join() for thread in threads]
117+
118+
# Check for RSS
119+
wasted_memory = [" "] * 16 * 1024 ** 2 # 16 megs
120+
self.assertTrue(check_metrics(*get_metrics()))
121+
del wasted_memory
122+
31123

32124
class TestGCRuntimeMetricCollector(BaseTestCase):
33125
def test_metrics(self):

0 commit comments

Comments
 (0)