"""
Performance monitoring utilities for ART benchmarking and testing.
"""
- import time
+ import os
import threading
- from typing import Dict, List, Optional, Tuple, Union, Any
+ import time
from datetime import datetime
+ from typing import Dict, List, Optional, Any
+
+ import numpy as np
import pandas as pd
import psutil
- import os
- import numpy as np
from matplotlib import pyplot as plt

# GPU monitoring support
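The hunk is cut off at the `# GPU monitoring support` comment, while the later hunks rely on HAS_NVML, GPU_COUNT and the nvmlDeviceGet* helpers. As a rough sketch only (this guard is not shown in the diff), such flags are typically initialized with pynvml along these lines:

# Sketch only (assumed, not part of this diff): guard NVML availability at import time.
try:
    from pynvml import (
        nvmlInit,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetUtilizationRates,
        nvmlDeviceGetMemoryInfo,
    )

    nvmlInit()                        # initialise NVML once per process
    GPU_COUNT = nvmlDeviceGetCount()  # number of visible NVIDIA devices
    HAS_NVML = True
except Exception:                     # pynvml missing or no usable driver
    HAS_NVML = False
    GPU_COUNT = 0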
@@ -72,6 +73,8 @@ def __init__(self, interval: float = 0.1) -> None:
        # Check for GPU availability
        self.has_gpu = HAS_NVML and GPU_COUNT > 0

+         self.data_lock = threading.Lock()
+
    def start(self) -> None:
        """Start monitoring resources in a background thread."""
        self.stop_flag = False
@@ -88,50 +91,78 @@ def stop(self) -> None:
    def _monitor_resources(self) -> None:
        """Resource monitoring loop that runs in a background thread."""
        while not self.stop_flag:
-             # CPU usage (percent)
-             cpu_percent = self.process.cpu_percent()
-             self.cpu_percentages.append(cpu_percent)
-
-             # Memory usage (MB)
-             memory_info = self.process.memory_info()
-             memory_mb = memory_info.rss / (1024 * 1024)
-             self.memory_usages.append(memory_mb)
-
-             # Timestamp
-             self.timestamps.append(time.time())
-
-             # GPUs
-             if self.has_gpu:
-                 usages = []
-                 memories = []
-                 for i in range(GPU_COUNT):
-                     handle = nvmlDeviceGetHandleByIndex(i)
-                     util = nvmlDeviceGetUtilizationRates(handle)
-                     mem_info = nvmlDeviceGetMemoryInfo(handle)
-                     usages.append(util.gpu)
-                     # use used memory in MB
-                     memories.append(mem_info.used / (1024 ** 2))
-                 self.gpu_usages.append(usages)
-                 self.gpu_memories.append(memories)
-
-             time.sleep(self.interval)
+             with self.data_lock:
+                 # CPU usage (percent)
+                 cpu_percent = self.process.cpu_percent()
+                 self.cpu_percentages.append(cpu_percent)
+
+                 # Memory usage (MB)
+                 memory_info = self.process.memory_info()
+                 memory_mb = memory_info.rss / (1024 * 1024)
+                 self.memory_usages.append(memory_mb)
+
+                 # Timestamp
+                 self.timestamps.append(time.time())
+
+                 # GPUs
+                 if self.has_gpu:
+                     usages = []
+                     memories = []
+                     for i in range(GPU_COUNT):
+                         handle = nvmlDeviceGetHandleByIndex(i)
+                         util = nvmlDeviceGetUtilizationRates(handle)
+                         mem_info = nvmlDeviceGetMemoryInfo(handle)
+                         usages.append(util.gpu)
+                         # use used memory in MB
+                         memories.append(mem_info.used / (1024 ** 2))
+                     self.gpu_usages.append(usages)
+                     self.gpu_memories.append(memories)
+
+             time.sleep(self.interval)

    def get_data(self) -> Dict[str, List[float]]:
        """
        Get the collected monitoring data.

        :return: Dictionary containing resource usage time series
        """
-         relative_times = [t - self.timestamps[0] for t in self.timestamps] if self.timestamps else []
-         data = {
-             'time': relative_times,
-             'cpu_percent': self.cpu_percentages,
-             'memory_mb': self.memory_usages,
-         }
+         with self.data_lock:
+             timestamps = self.timestamps.copy()
+             cpu_percentages = self.cpu_percentages.copy()
+             memory_usages = self.memory_usages.copy()
+
+             if self.has_gpu:
+                 gpu_usages = self.gpu_usages.copy()
+                 gpu_memories = self.gpu_memories.copy()
+
+
+         min_length = min(len(timestamps),
+                          len(cpu_percentages),
+                          len(memory_usages))
+
+         timestamps = [t - timestamps[0] for t in timestamps[:min_length]]
+         cpu_percentages = cpu_percentages[:min_length]
+         memory_usages = memory_usages[:min_length]
+
+         data = {}

        if self.has_gpu:
-             data['gpu_percent'] = self.gpu_usages
-             data['gpu_memory_mb'] = self.gpu_memories
+             min_length = min(min_length,
+                              len(gpu_usages),
+                              len(gpu_memories))
+             timestamps = timestamps[:min_length]
+             cpu_percentages = cpu_percentages[:min_length]
+             memory_usages = memory_usages[:min_length]
+             gpu_usages = gpu_usages[:min_length]
+             gpu_memories = gpu_memories[:min_length]
+
+             data['gpu_percent'] = gpu_usages
+             data['gpu_memory_mb'] = gpu_memories
+
+         data['time'] = timestamps
+         data['cpu_percent'] = cpu_percentages
+         data['memory_mb'] = memory_usages

        return data
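For orientation, a minimal usage sketch of the monitor as it behaves after this change. The class and module names (ResourceMonitor, performance_monitor) and the workload placeholder are assumptions for illustration; start(), stop(), get_data() and the interval argument are taken from the diff.

# Hypothetical driver; ResourceMonitor is an assumed name for the class patched above.
import time

import pandas as pd

from performance_monitor import ResourceMonitor  # assumed module/class names

monitor = ResourceMonitor(interval=0.1)   # sample CPU/memory (and GPUs, if present) every 100 ms
monitor.start()                           # begins sampling in a background thread

time.sleep(2.0)                           # stand-in for the workload being profiled

monitor.stop()
data = monitor.get_data()                 # snapshot taken under data_lock, so series lengths stay consistent

# Tabulate the per-process series; the GPU entries are nested lists and are left out here.
df = pd.DataFrame({k: data[k] for k in ('time', 'cpu_percent', 'memory_mb')})
print(df.describe())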