1
+ #!/usr/bin/env python3
2
+ """
3
+ Capture autotune min/mid/max from first two generations only.
4
+ Kills the process after seeing Generation 2 data to save time.
5
+ """
6
+
7
+ import subprocess
8
+ import re
9
+ import time
10
+ import os
11
+ import sys
12
+ import signal
13
+ from pathlib import Path
14
+ from concurrent .futures import ProcessPoolExecutor , ThreadPoolExecutor
15
+ import threading
16
+ import json
17
+ try :
18
+ import psutil
19
+ except ImportError :
20
+ print ("Warning: psutil not installed. CPU affinity detection will be limited." )
21
+ psutil = None
22
+
23
class AutotuneCapture:
    """Parses benchmark output lines for the autotune initial-generation stats."""

    def __init__(self):
        # Populated by parse_line() once the initial generation line is seen;
        # stays None if no matching line was ever observed.
        self.initial_pop = None
        # Matches "Initial population:"/"Initial generation:" followed by
        # min=<float> mid=<float> max=<float> anywhere later on the line.
        self.pattern = re.compile(
            r'(?:Initial population|Initial generation):.*?'
            r'min=([\d.]+)\s+mid=([\d.]+)\s+max=([\d.]+)'
        )

    def parse_line(self, line):
        """Extract initial generation data from a line.

        Returns 'initial' when the line carried initial-generation stats
        (storing them on self.initial_pop); otherwise returns None.
        """
        match = self.pattern.search(line)
        if match is None:
            return None

        stats = dict(zip(('min', 'mid', 'max'), map(float, match.groups())))

        # The regex already requires one of these labels, but keep the
        # explicit check to mirror the intended classification.
        if 'Initial population' in line or 'Initial generation' in line:
            self.initial_pop = stats
            return 'initial'
        return None
45
+
46
def monitor_process_output(proc, gpu_id, capture):
    """Watch *proc*'s stdout and stop the process once the initial generation appears.

    Feeds each non-empty line to capture.parse_line(); when the initial
    generation stats show up, prints them, terminates (then kills, if
    needed) the process, and returns the capture object.
    """
    print(f"[GPU {gpu_id}] Monitoring autotune output...")

    for raw_line in proc.stdout:
        text = raw_line.strip()
        if not text:
            continue

        if capture.parse_line(text) != 'initial':
            continue

        stats = capture.initial_pop
        print(f"[GPU {gpu_id}] Initial generation: min={stats['min']:.4f} "
              f"mid={stats['mid']:.4f} max={stats['max']:.4f}")

        # We only need the initial generation — stop the benchmark here.
        print(f"[GPU {gpu_id}] Got initial generation data, terminating...")
        proc.terminate()
        time.sleep(1)  # grace period before escalating to SIGKILL
        if proc.poll() is None:
            proc.kill()
        break

    return capture
69
+
70
def get_numa_cpu_affinity(gpu_id, total_gpus=8):
    """Get NUMA node and CPU core range for a GPU.

    Args:
        gpu_id: index of the GPU (0-based).
        total_gpus: total number of GPUs being distributed across the CPUs.

    Returns:
        (numa_node, cpu_list) where cpu_list is a "start-end" string of
        logical CPU indices. The split is a simple heuristic: at least 4
        logical CPUs per GPU, and two NUMA nodes assumed, with GPUs divided
        evenly between them.
    """
    # BUGFIX: the original did a bare `import psutil` here, which crashed
    # when psutil was missing even though the module-level import has a
    # graceful fallback. Fall back to os.cpu_count() instead.
    try:
        import psutil
        cpu_count_logical = psutil.cpu_count(logical=True)
    except ImportError:
        cpu_count_logical = os.cpu_count()

    # Simple heuristic: distribute GPUs across available logical CPUs.
    cpus_per_gpu = max(4, cpu_count_logical // total_gpus)
    start_cpu = gpu_id * cpus_per_gpu
    end_cpu = min(start_cpu + cpus_per_gpu - 1, cpu_count_logical - 1)

    # Assume 2 NUMA nodes for simplicity; max(1, ...) guards against a
    # ZeroDivisionError when total_gpus < 2.
    numa_node = gpu_id // max(1, total_gpus // 2)

    return numa_node, f"{start_cpu}-{end_cpu}"
86
+
87
def run_autotune_capture(gpu_id, kernel='gemm', log_file=None, use_isolation=True):
    """Run the benchmark on one GPU and capture the initial autotune generation.

    Args:
        gpu_id: GPU index, exposed to the child via CUDA_VISIBLE_DEVICES.
        kernel: kernel name passed to benchmarks/run.py.
        log_file: optional path; captured stats are written there on success.
        use_isolation: wrap the command in taskset/numactl CPU+NUMA pinning.

    Returns:
        The AutotuneCapture instance (capture.initial_pop is None on failure).
    """
    env = os.environ.copy()
    env['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    # Enable Helion autotune logging (10 == DEBUG level).
    env['HELION_AUTOTUNE_LOG_LEVEL'] = '10'

    # Size thread pools to our CPU slice to prevent oversubscription.
    numa_node, cpu_list = get_numa_cpu_affinity(gpu_id)
    first_cpu, last_cpu = (int(part) for part in cpu_list.split('-'))
    num_cpus = last_cpu - first_cpu + 1
    env['OMP_NUM_THREADS'] = str(num_cpus)
    env['MKL_NUM_THREADS'] = str(num_cpus)

    cmd = []

    if use_isolation:
        # CPU pinning with taskset, then NUMA binding with numactl.
        cmd.extend(['taskset', '-c', cpu_list])
        cmd.extend(['numactl', f'--cpunodebind={numa_node}', f'--membind={numa_node}'])

    # Python command.
    cmd.extend([
        sys.executable,
        'benchmarks/run.py',
        '--kernel', kernel,
        '--num-inputs', '1'
    ])

    if use_isolation:
        print(f"[GPU {gpu_id}] Starting autotune capture with isolation:")
        print(f"  NUMA node: {numa_node}, CPU cores: {cpu_list}")
    else:
        print(f"[GPU {gpu_id}] Starting autotune capture...")

    capture = AutotuneCapture()

    proc = subprocess.Popen(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1  # line buffered
    )

    try:
        monitor_process_output(proc, gpu_id, capture)
    except Exception as e:
        print(f"[GPU {gpu_id}] Error: {e}")
        proc.kill()
    finally:
        # BUGFIX: always reap the child so terminated/killed benchmark
        # processes do not linger as zombies.
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    # Save to log file if specified.
    if log_file and capture.initial_pop:
        with open(log_file, 'w') as f:
            f.write(f"GPU {gpu_id} Autotune Results\n")
            f.write("=" * 40 + "\n")
            f.write(f"Initial: min={capture.initial_pop['min']:.4f} "
                    f"mid={capture.initial_pop['mid']:.4f} "
                    f"max={capture.initial_pop['max']:.4f}\n")

    return capture
154
+
155
def compare_single_vs_concurrent(kernel='gemm', num_gpus=4, use_isolation=True):
    """Compare autotune results between single-GPU and concurrent execution.

    Runs one baseline capture on GPU 0, cools down, then launches
    *num_gpus* captures concurrently, and prints a contention report based
    on the initial-generation min times. Per-GPU logs and summary.json are
    written under autotune_comparison/.
    """
    results_dir = Path('autotune_comparison')
    results_dir.mkdir(exist_ok=True)

    print("AUTOTUNE INITIAL GENERATION COMPARISON")
    print("=" * 60)
    print(f"Kernel: {kernel}")
    print("Capturing: Initial population/generation only")

    # Step 1: single-GPU baseline (no isolation needed when nothing competes).
    print("\nStep 1: Single GPU baseline (GPU 0)")
    print("-" * 40)

    baseline_log = results_dir / 'baseline_gpu0.txt'
    baseline = run_autotune_capture(0, kernel, baseline_log, use_isolation=False)

    if not baseline.initial_pop:
        print("ERROR: Failed to capture initial generation for baseline")
        return

    # Let clocks/thermals settle before the concurrent phase.
    print("\nCooling down for 30s...")
    time.sleep(30)

    # Step 2: concurrent execution across num_gpus GPUs.
    isolation_msg = "with process isolation" if use_isolation else "without isolation"
    print(f"\nStep 2: Concurrent execution on {num_gpus} GPUs {isolation_msg}")
    print("-" * 40)

    concurrent_results = {}

    with ProcessPoolExecutor(max_workers=num_gpus) as executor:
        futures = {
            executor.submit(
                run_autotune_capture,
                gpu_id,
                kernel,
                results_dir / f'concurrent_gpu{gpu_id}.txt',
                use_isolation
            ): gpu_id
            for gpu_id in range(num_gpus)
        }

        for future in futures:
            gpu_id = futures[future]
            try:
                concurrent_results[gpu_id] = future.result()
            except Exception as e:
                print(f"[GPU {gpu_id}] Failed: {e}")

    # Analysis.
    print("\n" + "=" * 60)
    print("RESULTS COMPARISON")
    print("=" * 60)

    print("\nBaseline (Single GPU):")
    print(f"  Initial: min={baseline.initial_pop['min']:.4f} "
          f"mid={baseline.initial_pop['mid']:.4f} "
          f"max={baseline.initial_pop['max']:.4f}")

    print("\nConcurrent GPUs:")

    # Collect all concurrent min times for the degradation statistic.
    concurrent_mins = []
    for gpu_id in sorted(concurrent_results.keys()):
        result = concurrent_results[gpu_id]
        if result.initial_pop:
            concurrent_mins.append(result.initial_pop['min'])
            print(f"  GPU {gpu_id} Initial: min={result.initial_pop['min']:.4f} "
                  f"mid={result.initial_pop['mid']:.4f} "
                  f"max={result.initial_pop['max']:.4f}")

    if concurrent_mins:
        # Compare the best (min) times; positive degradation means the
        # concurrent runs autotuned to slower configurations.
        baseline_min = baseline.initial_pop['min']
        avg_concurrent_min = sum(concurrent_mins) / len(concurrent_mins)

        degradation = ((avg_concurrent_min - baseline_min) / baseline_min) * 100

        print("\nInitial Generation Min Time Comparison:")
        print(f"  Baseline: {baseline_min:.4f}")
        print(f"  Concurrent avg: {avg_concurrent_min:.4f}")
        print(f"  Degradation: {degradation:+.1f}%")

        if degradation > 5:
            print("\n⚠️ SIGNIFICANT CONTENTION DETECTED!")
            print(f"  Concurrent autotuning shows {degradation:.1f}% worse min times")
            print("  This indicates resource contention is affecting autotune quality")
        elif degradation > 2:
            print(f"\n⚡ MODERATE CONTENTION ({degradation:.1f}% degradation)")
        else:
            print(f"\n✅ MINIMAL CONTENTION ({degradation:.1f}% degradation)")

    # Persist a machine-readable summary.
    # (json is imported at module top; the original re-imported it here.)
    summary = {
        'kernel': kernel,
        'baseline': {
            'initial': baseline.initial_pop
        },
        'concurrent': {
            f'gpu_{gpu_id}': {
                'initial': result.initial_pop
            }
            for gpu_id, result in concurrent_results.items()
            if result.initial_pop
        }
    }

    with open(results_dir / 'summary.json', 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nDetailed results saved to {results_dir}/")
270
+
271
def main():
    """CLI entry point: parse arguments and run the comparison."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Capture autotune generations for contention analysis')
    parser.add_argument('--kernel', default='gemm', help='Kernel to test')
    parser.add_argument('--num-gpus', type=int, default=4,
                        help='Number of GPUs for concurrent test')
    parser.add_argument('--no-isolation', action='store_true',
                        help='Disable process isolation for concurrent runs')

    opts = parser.parse_args()

    # --no-isolation flips the default-on isolation off.
    compare_single_vs_concurrent(opts.kernel, opts.num_gpus,
                                 use_isolation=not opts.no_isolation)


if __name__ == '__main__':
    main()