Skip to content

Commit f245a52

Browse files
committed
Track memory deltas directly
Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
1 parent 54d9a15 commit f245a52

File tree

5 files changed

+1439
-870
lines changed

5 files changed

+1439
-870
lines changed

scripts/plot_resource_tracker.py

Lines changed: 233 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -31,115 +31,258 @@ def plot_resource_tracker(csv_file):
3131
# Read CSV file
3232
df = pd.read_csv(csv_file)
3333

34+
# Check if CSV has any data rows
35+
if len(df) == 0:
36+
print(f"Error: CSV file '{csv_file}' contains no data rows (only header).")
37+
print("Make sure the application runs with ZEL_ENABLE_SYSTEM_RESOURCE_TRACKER_CHECKER=1")
38+
return
39+
3440
# Convert time from milliseconds to seconds
3541
df['TimeSec'] = df['TimeMs'] / 1000.0
3642

43+
# Detect potential memory leaks - track memory increases that don't get freed
44+
# For create/destroy operations, memory should return to baseline
45+
# Look for APIs that destroy resources but memory doesn't decrease
46+
df['PotentialLeak_KB'] = 0
47+
48+
# Create/Destroy pairs to monitor for leaks
49+
create_destroy_pairs = {
50+
'zeContextCreate': 'zeContextDestroy',
51+
'zeCommandQueueCreate': 'zeCommandQueueDestroy',
52+
'zeModuleCreate': 'zeModuleDestroy',
53+
'zeKernelCreate': 'zeKernelDestroy',
54+
'zeEventPoolCreate': 'zeEventPoolDestroy',
55+
'zeCommandListCreate': 'zeCommandListDestroy',
56+
'zeCommandListCreateImmediate': 'zeCommandListDestroy',
57+
'zeEventCreate': 'zeEventDestroy',
58+
'zeFenceCreate': 'zeFenceDestroy',
59+
'zeImageCreate': 'zeImageDestroy',
60+
'zeSamplerCreate': 'zeSamplerDestroy',
61+
'zeMemAllocDevice': 'zeMemFree',
62+
'zeMemAllocHost': 'zeMemFree',
63+
'zeMemAllocShared': 'zeMemFree',
64+
}
65+
66+
# Track memory at create and check if it decreased at destroy
67+
for idx, row in df.iterrows():
68+
api = row['APICall']
69+
# If this is a destroy operation, check if memory decreased
70+
if api in create_destroy_pairs.values():
71+
# Memory should decrease on destroy - if it increased, it's a leak
72+
if row['Delta_VmRSS_KB'] > 100: # Threshold: 100KB increase on destroy = leak
73+
df.at[idx, 'PotentialLeak_KB'] = row['Delta_VmRSS_KB']
74+
75+
# Calculate cumulative leaks over time
76+
df['CumulativeLeak_KB'] = df['PotentialLeak_KB'].cumsum()
77+
3778
# Create figure with multiple subplots
38-
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
79+
fig, axes = plt.subplots(4, 3, figsize=(18, 16))
3980
fig.suptitle(f'Level Zero System Resource Tracking\n{Path(csv_file).name}', fontsize=16)
4081

41-
# Plot 1: Memory Usage Over Time (VmRSS, VmSize, VmData)
82+
# Plot 1: Memory Leak Detection Over Time
4283
ax1 = axes[0, 0]
43-
ax1.plot(df['TimeSec'], df['VmRSS_KB'], label='VmRSS', linewidth=2)
44-
ax1.plot(df['TimeSec'], df['VmSize_KB'], label='VmSize', linewidth=2, alpha=0.7)
45-
ax1.plot(df['TimeSec'], df['VmData_KB'], label='VmData', linewidth=2, alpha=0.7)
84+
# Plot cumulative leaks
85+
ax1.plot(df['TimeSec'], df['CumulativeLeak_KB'] / 1024, label='Cumulative Leaks',
86+
linewidth=3, color='red')
87+
# Mark individual leak events
88+
leak_events = df[df['PotentialLeak_KB'] > 0]
89+
if not leak_events.empty:
90+
ax1.scatter(leak_events['TimeSec'], leak_events['CumulativeLeak_KB'] / 1024,
91+
color='darkred', s=100, marker='x', linewidths=3,
92+
label=f'Leak Events ({len(leak_events)})', zorder=5)
4693
ax1.set_xlabel('Time (s)')
47-
ax1.set_ylabel('Memory (KB)')
48-
ax1.set_title('System Memory Usage Over Time')
94+
ax1.set_ylabel('Memory Leaked (MB)')
95+
ax1.set_title('Memory Leak Detection Over Time', fontweight='bold', color='darkred')
4996
ax1.legend()
5097
ax1.grid(True, alpha=0.3)
98+
if df['CumulativeLeak_KB'].max() > 0:
99+
ax1.set_facecolor('#fff5f5') # Light red background if leaks detected
51100

52-
# Plot 2: Memory Deltas (showing per-call changes)
101+
# Plot 2: Memory Usage Over Time (VmRSS, VmSize, VmData)
53102
ax2 = axes[0, 1]
54-
ax2.plot(df['TimeSec'], df['Delta_VmRSS_KB'] / 1024, label='Delta VmRSS', linewidth=1.5)
55-
ax2.plot(df['TimeSec'], df['Delta_VmSize_KB'] / 1024, label='Delta VmSize', linewidth=1.5, alpha=0.7)
56-
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
103+
ax2.plot(df['TimeSec'], df['VmRSS_KB'] / 1024, label='VmRSS', linewidth=2)
104+
ax2.plot(df['TimeSec'], df['VmSize_KB'] / 1024, label='VmSize', linewidth=2, alpha=0.7)
105+
ax2.plot(df['TimeSec'], df['VmData_KB'] / 1024, label='VmData', linewidth=2, alpha=0.7)
57106
ax2.set_xlabel('Time (s)')
58-
ax2.set_ylabel('Memory Change (MB)')
59-
ax2.set_title('Memory Deltas Per API Call')
107+
ax2.set_ylabel('Memory (MB)')
108+
ax2.set_title('System Memory Usage Over Time')
60109
ax2.legend()
61110
ax2.grid(True, alpha=0.3)
62111

63-
# Plot 3: Level Zero Resource Counts
64-
ax3 = axes[1, 0]
112+
# Plot 3: Leak Events by API Type
113+
ax3 = axes[0, 2]
114+
if not leak_events.empty:
115+
leak_by_api = leak_events.groupby('APICall')['PotentialLeak_KB'].sum().sort_values(ascending=True) / 1024
116+
if not leak_by_api.empty:
117+
leak_by_api.plot(kind='barh', ax=ax3, color='orangered')
118+
ax3.set_xlabel('Total Leaked Memory (MB)')
119+
ax3.set_title('Memory Leaks by API Call', fontweight='bold')
120+
ax3.grid(True, alpha=0.3, axis='x')
121+
ax3.set_facecolor('#fff5f5')
122+
else:
123+
ax3.text(0.5, 0.5, 'No Leaks Detected!', ha='center', va='center',
124+
fontsize=14, color='green', fontweight='bold', transform=ax3.transAxes)
125+
ax3.set_title('Memory Leaks by API Call')
126+
ax3.axis('off')
127+
else:
128+
ax3.text(0.5, 0.5, 'No Leaks Detected!', ha='center', va='center',
129+
fontsize=14, color='green', fontweight='bold', transform=ax3.transAxes)
130+
ax3.set_title('Memory Leaks by API Call')
131+
ax3.axis('off')
132+
133+
# Plot 4: Memory Deltas (showing per-call changes)
134+
ax4 = axes[1, 0]
135+
ax4.plot(df['TimeSec'], df['Delta_VmRSS_KB'] / 1024, label='Delta VmRSS', linewidth=1.5)
136+
ax4.plot(df['TimeSec'], df['Delta_VmSize_KB'] / 1024, label='Delta VmSize', linewidth=1.5, alpha=0.7)
137+
ax4.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
138+
ax4.set_xlabel('Time (s)')
139+
ax4.set_ylabel('Memory Change (MB)')
140+
ax4.set_title('Memory Deltas Per API Call')
141+
ax4.legend()
142+
ax4.grid(True, alpha=0.3)
143+
144+
# Plot 5: Level Zero Resource Counts
145+
ax5 = axes[1, 1]
146+
has_resources = False
65147
if df['Contexts'].max() > 0:
66-
ax3.plot(df['TimeSec'], df['Contexts'], label='Contexts', linewidth=2)
148+
ax5.plot(df['TimeSec'], df['Contexts'], label='Contexts', linewidth=2)
149+
has_resources = True
67150
if df['CommandQueues'].max() > 0:
68-
ax3.plot(df['TimeSec'], df['CommandQueues'], label='CommandQueues', linewidth=2)
151+
ax5.plot(df['TimeSec'], df['CommandQueues'], label='CommandQueues', linewidth=2)
152+
has_resources = True
69153
if df['Modules'].max() > 0:
70-
ax3.plot(df['TimeSec'], df['Modules'], label='Modules', linewidth=2)
154+
ax5.plot(df['TimeSec'], df['Modules'], label='Modules', linewidth=2)
155+
has_resources = True
71156
if df['Kernels'].max() > 0:
72-
ax3.plot(df['TimeSec'], df['Kernels'], label='Kernels', linewidth=2)
73-
ax3.set_xlabel('Time (s)')
74-
ax3.set_ylabel('Resource Count')
75-
ax3.set_title('L0 Resource Counts (Contexts, Queues, Modules, Kernels)')
76-
ax3.legend()
77-
ax3.grid(True, alpha=0.3)
78-
79-
# Plot 4: Command Lists and Event Resources
80-
ax4 = axes[1, 1]
157+
ax5.plot(df['TimeSec'], df['Kernels'], label='Kernels', linewidth=2)
158+
has_resources = True
159+
ax5.set_xlabel('Time (s)')
160+
ax5.set_ylabel('Resource Count')
161+
ax5.set_title('L0 Resource Counts (Contexts, Queues, Modules, Kernels)')
162+
if has_resources:
163+
ax5.legend()
164+
ax5.grid(True, alpha=0.3)
165+
166+
# Plot 6: Command Lists and Event Resources
167+
ax6 = axes[1, 2]
168+
has_cmd_resources = False
81169
if df['CommandLists'].max() > 0:
82-
ax4.plot(df['TimeSec'], df['CommandLists'], label='CommandLists', linewidth=2)
170+
ax6.plot(df['TimeSec'], df['CommandLists'], label='CommandLists', linewidth=2)
171+
has_cmd_resources = True
83172
if df['EventPools'].max() > 0:
84-
ax4.plot(df['TimeSec'], df['EventPools'], label='EventPools', linewidth=2)
173+
ax6.plot(df['TimeSec'], df['EventPools'], label='EventPools', linewidth=2)
174+
has_cmd_resources = True
85175
if df['Events'].max() > 0:
86-
ax4.plot(df['TimeSec'], df['Events'], label='Events', linewidth=2)
176+
ax6.plot(df['TimeSec'], df['Events'], label='Events', linewidth=2)
177+
has_cmd_resources = True
87178
if df['Fences'].max() > 0:
88-
ax4.plot(df['TimeSec'], df['Fences'], label='Fences', linewidth=2)
89-
ax4.set_xlabel('Time (s)')
90-
ax4.set_ylabel('Resource Count')
91-
ax4.set_title('L0 Command Lists and Events')
92-
ax4.legend()
93-
ax4.grid(True, alpha=0.3)
179+
ax6.plot(df['TimeSec'], df['Fences'], label='Fences', linewidth=2)
180+
has_cmd_resources = True
181+
ax6.set_xlabel('Time (s)')
182+
ax6.set_ylabel('Resource Count')
183+
ax6.set_title('L0 Command Lists and Events')
184+
if has_cmd_resources:
185+
ax6.legend()
186+
ax6.grid(True, alpha=0.3)
94187

95-
# Plot 5: Total Memory Allocations
96-
ax5 = axes[2, 0]
97-
ax5.plot(df['TimeSec'], df['TotalMemory_Bytes'] / (1024*1024), label='Total Memory',
188+
# Plot 7: Total Memory Allocations
189+
ax7 = axes[2, 0]
190+
ax7.plot(df['TimeSec'], df['TotalMemory_Bytes'] / (1024*1024), label='Total Memory',
98191
linewidth=2, color='red')
99-
ax5.set_xlabel('Time (s)')
100-
ax5.set_ylabel('Memory (MB)')
101-
ax5.set_title('Total L0 Memory Allocations')
102-
ax5.legend()
103-
ax5.grid(True, alpha=0.3)
192+
ax7.set_xlabel('Time (s)')
193+
ax7.set_ylabel('Memory (MB)')
194+
ax7.set_title('Total L0 Memory Allocations')
195+
ax7.legend()
196+
ax7.grid(True, alpha=0.3)
104197

105-
# Plot 6: API Call Distribution (top 10 most frequent)
106-
ax6 = axes[2, 1]
198+
# Plot 8: API Call Distribution (top 10 most frequent)
199+
ax8 = axes[2, 1]
107200
api_counts = df['APICall'].value_counts().head(10).sort_values(ascending=True)
108-
api_counts.plot(kind='barh', ax=ax6, color='steelblue')
109-
ax6.set_xlabel('Call Count')
110-
ax6.set_title('Top 10 Most Frequent API Calls')
111-
ax6.grid(True, alpha=0.3, axis='x')
201+
if len(api_counts) > 0:
202+
api_counts.plot(kind='barh', ax=ax8, color='steelblue')
203+
ax8.set_xlabel('Call Count')
204+
ax8.set_title('Top 10 Most Frequent API Calls')
205+
ax8.grid(True, alpha=0.3, axis='x')
206+
else:
207+
ax8.text(0.5, 0.5, 'No API calls recorded', ha='center', va='center',
208+
fontsize=12, transform=ax8.transAxes)
209+
ax8.set_title('Top 10 Most Frequent API Calls')
210+
ax8.axis('off')
112211

113-
# Plot 7: Top 10 API Calls by Memory Usage
114-
ax7 = axes[2, 2]
212+
# Plot 9: Top 10 API Calls by Memory Usage
213+
ax9 = axes[2, 2]
115214
# Calculate total memory delta per API call type
116215
memory_by_api = (df.groupby('APICall')['Delta_VmRSS_KB'].sum() / 1024).sort_values(ascending=True).tail(10)
117-
memory_by_api.plot(kind='barh', ax=ax7, color='coral')
118-
ax7.set_xlabel('Total Memory Delta (MB)')
119-
ax7.set_title('Top 10 API Calls by Memory Impact')
120-
ax7.grid(True, alpha=0.3, axis='x')
216+
if len(memory_by_api) > 0:
217+
memory_by_api.plot(kind='barh', ax=ax9, color='coral')
218+
ax9.set_xlabel('Total Memory Delta (MB)')
219+
ax9.set_title('Top 10 API Calls by Memory Impact')
220+
ax9.grid(True, alpha=0.3, axis='x')
221+
else:
222+
ax9.text(0.5, 0.5, 'No API calls recorded', ha='center', va='center',
223+
fontsize=12, transform=ax9.transAxes)
224+
ax9.set_title('Top 10 API Calls by Memory Impact')
225+
ax9.axis('off')
121226

122-
# Plot 8: Memory Usage by API Call (average per call)
123-
ax8 = axes[1, 2]
227+
# Plot 10: Memory Usage by API Call (average per call)
228+
ax10 = axes[3, 0]
124229
# Calculate average memory delta per API call type
125230
avg_memory_by_api = (df.groupby('APICall')['Delta_VmRSS_KB'].mean() / 1024).sort_values(ascending=True).tail(10)
126-
avg_memory_by_api.plot(kind='barh', ax=ax8, color='mediumseagreen')
127-
ax8.set_xlabel('Avg Memory Delta per Call (MB)')
128-
ax8.set_title('Top 10 API Calls by Avg Memory per Call')
129-
ax8.grid(True, alpha=0.3, axis='x')
231+
if len(avg_memory_by_api) > 0:
232+
avg_memory_by_api.plot(kind='barh', ax=ax10, color='mediumseagreen')
233+
ax10.set_xlabel('Avg Memory Delta per Call (MB)')
234+
ax10.set_title('Top 10 API Calls by Avg Memory per Call')
235+
ax10.grid(True, alpha=0.3, axis='x')
236+
else:
237+
ax10.text(0.5, 0.5, 'No API calls recorded', ha='center', va='center',
238+
fontsize=12, transform=ax10.transAxes)
239+
ax10.set_title('Top 10 API Calls by Avg Memory per Call')
240+
ax10.axis('off')
130241

131-
# Plot 9: Cumulative memory by API over time
132-
ax9 = axes[0, 2]
242+
# Plot 11: Cumulative memory by API over time
243+
ax11 = axes[3, 1]
133244
# Get top 5 API calls by total memory impact
134245
top5_apis = df.groupby('APICall')['Delta_VmRSS_KB'].sum().nlargest(5).index
135-
for api in top5_apis:
136-
api_data = df[df['APICall'] == api]
137-
ax9.plot(api_data['TimeSec'], (api_data['Delta_VmRSS_KB'].cumsum() / 1024), label=api, linewidth=2)
138-
ax9.set_xlabel('Time (s)')
139-
ax9.set_ylabel('Cumulative Memory Delta (MB)')
140-
ax9.set_title('Cumulative Memory Impact by Top 5 APIs')
141-
ax9.legend(fontsize=8)
142-
ax9.grid(True, alpha=0.3)
246+
if len(top5_apis) > 0:
247+
for api in top5_apis:
248+
api_data = df[df['APICall'] == api]
249+
ax11.plot(api_data['TimeSec'], (api_data['Delta_VmRSS_KB'].cumsum() / 1024), label=api, linewidth=2)
250+
ax11.set_xlabel('Time (s)')
251+
ax11.set_ylabel('Cumulative Memory Delta (MB)')
252+
ax11.set_title('Cumulative Memory Impact by Top 5 APIs')
253+
ax11.legend(fontsize=8)
254+
ax11.grid(True, alpha=0.3)
255+
else:
256+
ax11.text(0.5, 0.5, 'No API calls recorded', ha='center', va='center',
257+
fontsize=12, transform=ax11.transAxes)
258+
ax11.set_title('Cumulative Memory Impact by Top 5 APIs')
259+
ax11.axis('off')
260+
261+
# Plot 12: Leak detection timeline with annotations
262+
ax12 = axes[3, 2]
263+
if not leak_events.empty:
264+
# Show individual leak magnitudes over time
265+
ax12.bar(leak_events['TimeSec'], leak_events['PotentialLeak_KB'] / 1024,
266+
width=0.01, color='red', alpha=0.7, label='Leak Magnitude')
267+
ax12.set_xlabel('Time (s)')
268+
ax12.set_ylabel('Leaked Memory (MB)')
269+
ax12.set_title('Individual Leak Events Timeline', fontweight='bold')
270+
ax12.legend()
271+
ax12.grid(True, alpha=0.3, axis='y')
272+
ax12.set_facecolor('#fff5f5')
273+
274+
# Add text annotation for total
275+
total_leaked = leak_events['PotentialLeak_KB'].sum() / 1024
276+
ax12.text(0.95, 0.95, f'Total: {total_leaked:.2f} MB\n{len(leak_events)} events',
277+
transform=ax12.transAxes, ha='right', va='top',
278+
bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
279+
fontsize=10, fontweight='bold', color='darkred')
280+
else:
281+
ax12.text(0.5, 0.5, 'No Memory Leaks Detected!\n✓ All resources properly cleaned up',
282+
ha='center', va='center', fontsize=14, color='green',
283+
fontweight='bold', transform=ax12.transAxes)
284+
ax12.set_title('Individual Leak Events Timeline')
285+
ax12.axis('off')
143286

144287
plt.tight_layout()
145288

@@ -160,6 +303,24 @@ def plot_resource_tracker(csv_file):
160303
print(f"Total memory allocated: {df['TotalMemory_Bytes'].max():.2f} bytes "
161304
f"({df['TotalMemory_Bytes'].max()/(1024*1024):.2f} MB)")
162305
print(f"Number of threads: {df['Threads'].max()}")
306+
307+
# Print leak detection summary
308+
print(f"\n=== MEMORY LEAK DETECTION ===")
309+
if df['CumulativeLeak_KB'].max() > 0:
310+
print(f"⚠️ LEAKS DETECTED!")
311+
print(f"Total leaked memory: {df['CumulativeLeak_KB'].max() / 1024:.2f} MB ({df['CumulativeLeak_KB'].max():.2f} KB)")
312+
print(f"Number of leak events: {len(leak_events)}")
313+
if not leak_events.empty:
314+
print(f"\nLeak events by API:")
315+
leak_summary = leak_events.groupby('APICall')['PotentialLeak_KB'].agg(['count', 'sum', 'mean'])
316+
leak_summary.columns = ['Count', 'Total_KB', 'Avg_KB']
317+
leak_summary = leak_summary.sort_values('Total_KB', ascending=False)
318+
for api, row in leak_summary.iterrows():
319+
print(f" {api}: {row['Count']} events, {row['Total_KB']/1024:.2f} MB total, {row['Avg_KB']/1024:.2f} MB avg")
320+
else:
321+
print(f"✓ No memory leaks detected!")
322+
print(f" All resources were properly cleaned up.")
323+
163324
print(f"\nPeak resource counts:")
164325
print(f" Contexts: {df['Contexts'].max()}")
165326
print(f" CommandQueues: {df['CommandQueues'].max()}")
87.9 KB
Loading

0 commit comments

Comments
 (0)