Skip to content

Commit c65f29a

Browse files
committed
[WIP] Temporary PerformanceMonitor race condition fix
Signed-off-by: alvaro <[email protected]>
1 parent b027d1c commit c65f29a

File tree

1 file changed

+69
-38
lines changed

1 file changed

+69
-38
lines changed

art/performance_monitor.py

Lines changed: 69 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
"""
22
Performance monitoring utilities for ART benchmarking and testing.
33
"""
4-
import time
4+
import os
55
import threading
6-
from typing import Dict, List, Optional, Tuple, Union, Any
6+
import time
77
from datetime import datetime
8+
from typing import Dict, List, Optional, Any
9+
10+
import numpy as np
811
import pandas as pd
912
import psutil
10-
import os
11-
import numpy as np
1213
from matplotlib import pyplot as plt
1314

1415
# GPU monitoring support
@@ -72,6 +73,8 @@ def __init__(self, interval: float = 0.1) -> None:
7273
# Check for GPU availability
7374
self.has_gpu = HAS_NVML and GPU_COUNT > 0
7475

76+
self.data_lock = threading.Lock()
77+
7578
def start(self) -> None:
7679
"""Start monitoring resources in a background thread."""
7780
self.stop_flag = False
@@ -88,50 +91,78 @@ def stop(self) -> None:
8891
def _monitor_resources(self) -> None:
8992
"""Resource monitoring loop that runs in a background thread."""
9093
while not self.stop_flag:
91-
# CPU usage (percent)
92-
cpu_percent = self.process.cpu_percent()
93-
self.cpu_percentages.append(cpu_percent)
94-
95-
# Memory usage (MB)
96-
memory_info = self.process.memory_info()
97-
memory_mb = memory_info.rss / (1024 * 1024)
98-
self.memory_usages.append(memory_mb)
9994

100-
# Timestamp
101-
self.timestamps.append(time.time())
102-
103-
# GPUs
104-
if self.has_gpu:
105-
usages = []
106-
memories = []
107-
for i in range(GPU_COUNT):
108-
handle = nvmlDeviceGetHandleByIndex(i)
109-
util = nvmlDeviceGetUtilizationRates(handle)
110-
mem_info = nvmlDeviceGetMemoryInfo(handle)
111-
usages.append(util.gpu)
112-
# use used memory in MB
113-
memories.append(mem_info.used / (1024**2))
114-
self.gpu_usages.append(usages)
115-
self.gpu_memories.append(memories)
116-
117-
time.sleep(self.interval)
95+
with self.data_lock:
96+
# CPU usage (percent)
97+
cpu_percent = self.process.cpu_percent()
98+
self.cpu_percentages.append(cpu_percent)
99+
100+
# Memory usage (MB)
101+
memory_info = self.process.memory_info()
102+
memory_mb = memory_info.rss / (1024 * 1024)
103+
self.memory_usages.append(memory_mb)
104+
105+
# Timestamp
106+
self.timestamps.append(time.time())
107+
108+
# GPUs
109+
if self.has_gpu:
110+
usages = []
111+
memories = []
112+
for i in range(GPU_COUNT):
113+
handle = nvmlDeviceGetHandleByIndex(i)
114+
util = nvmlDeviceGetUtilizationRates(handle)
115+
mem_info = nvmlDeviceGetMemoryInfo(handle)
116+
usages.append(util.gpu)
117+
# use used memory in MB
118+
memories.append(mem_info.used / (1024**2))
119+
self.gpu_usages.append(usages)
120+
self.gpu_memories.append(memories)
121+
122+
time.sleep(self.interval)
118123

119124
def get_data(self) -> Dict[str, List[float]]:
    """
    Get the collected monitoring data.

    Takes a snapshot of the time-series lists under ``self.data_lock``
    (so the background sampler cannot mutate them mid-read), truncates
    every series to a common length, and rebases the timestamps so the
    first sample is at t=0.

    :return: Dictionary containing resource usage time series.
    """
    # Copy under the lock; all further work happens on the copies so
    # the lock is held only briefly.
    with self.data_lock:
        timestamps = self.timestamps.copy()
        cpu_percentages = self.cpu_percentages.copy()
        memory_usages = self.memory_usages.copy()
        # Bind unconditionally so the names always exist below.
        gpu_usages = self.gpu_usages.copy() if self.has_gpu else []
        gpu_memories = self.gpu_memories.copy() if self.has_gpu else []

    # A snapshot may catch some series one element longer than others
    # (the sampler appends list-by-list), so truncate all of them to
    # the shortest common length.
    min_length = min(len(timestamps), len(cpu_percentages), len(memory_usages))
    if self.has_gpu:
        min_length = min(min_length, len(gpu_usages), len(gpu_memories))

    data: Dict[str, List[float]] = {}

    if self.has_gpu:
        data['gpu_percent'] = gpu_usages[:min_length]
        data['gpu_memory_mb'] = gpu_memories[:min_length]

    # Guard the empty snapshot: indexing timestamps[0] unconditionally
    # raised IndexError when get_data() was called before the first
    # sample was collected.
    if min_length:
        start = timestamps[0]
        data['time'] = [t - start for t in timestamps[:min_length]]
    else:
        data['time'] = []
    data['cpu_percent'] = cpu_percentages[:min_length]
    data['memory_mb'] = memory_usages[:min_length]

    return data
137168

0 commit comments

Comments
 (0)