-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlaunch_demon.py
More file actions
165 lines (143 loc) · 6.96 KB
/
launch_demon.py
File metadata and controls
165 lines (143 loc) · 6.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
cat > launch_demon.py <<'EOF'
#!/usr/bin/env python3
"""
CUDAgraveyard Lite v1.0: runs on Ubuntu 24.04 with 2× RTX 3060, without Shinka.
Uses OpenAI for mutations, torch for matrices, hyperfine for timing.
"""
import argparse
import json
import subprocess
import os
import time
import torch
from openai import OpenAI
# API key: it must be present in the environment before the client is created.
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    # Raising SystemExit with a message writes it to stderr and exits with
    # status 1. Unlike the interactive `exit` helper, it does not depend on the
    # `site` module having been loaded.
    raise SystemExit("ERREUR: Set OPENAI_API_KEY=sk-... (obligatoire pour mutations)")
client = OpenAI(api_key=_api_key)
def load_template(template_path):
    """Load the task template from a JSON file.

    Args:
        template_path: Path to the JSON template file.

    Returns:
        The decoded JSON document (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding.
    with open(template_path, encoding="utf-8") as f:
        return json.load(f)
def generate_hypothesis(task):
    """Ask the chat model for candidate GEMM kernel mutations.

    Args:
        task: Template mapping; only ``task["task"]`` is read, to fill the prompt.

    Returns:
        A dict whose ``"mutations"`` value is a non-empty list of dicts, each
        with a string ``"code_snippet"`` and a string ``"desc"``. When the
        model's reply is empty, is not valid JSON, or does not have that shape,
        a single built-in fallback mutation (a naive kernel) is returned instead.

    Raises:
        Any exception raised by the OpenAI client call itself is propagated.
    """
    fallback = {"mutations": [{"desc": "Fallback mutation", "code_snippet": "#include <cuda_runtime.h>\n__global__ void gemm(float *C, float *A, float *B, int N) {\n // simple loop\n int tx = threadIdx.x, ty = threadIdx.y;\n int bx = blockIdx.x, by = blockIdx.y;\n float sum = 0;\n for (int k = 0; k < N; k++) sum += A[(bx * blockDim.x + tx) * N + k] * B[k * N + (by * blockDim.y + ty)];\n C[(bx * blockDim.x + tx) * N + (by * blockDim.y + ty)] = sum;\n}"}]}

    # The host program in this file launches gemm(C, A, B, N) with 16x16 thread
    # blocks, so the prompt pins that signature; otherwise candidates rarely link.
    prompt = f"""Task: Optimise GEMM CUDA pour {task['task']}.
Propose 3 mutations simples (ex: tile size 32, unroll 4, basic async copy).
Output strict JSON: {{"mutations": [{{"desc": "Change tile to 32x32", "code_snippet": "__global__ void gemm(...) {{ /* ton code CUDA ici */ }}"}}]}}
Each code_snippet must be the complete kernel definition, with exactly this signature: __global__ void gemm(float *C, float *A, float *B, int N). It is launched with 16x16 thread blocks.
Keep code short, compilable with nvcc -arch=sm_86."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=800,
    )

    choices = response.choices
    raw = choices[0].message.content if choices else None
    if not raw:
        # The content can be absent (e.g. a refusal); json.loads(None) would
        # raise TypeError, which the JSON error handler below does not cover.
        return fallback

    text = raw.strip()
    if text.startswith("```"):
        # Models often wrap JSON in a Markdown fence (``` or ```json): drop the
        # opening fence line and, when present, the closing fence.
        text = text.partition("\n")[2]
        inner, fence, _ = text.rpartition("```")
        if fence:
            text = inner

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return fallback

    candidates = parsed.get("mutations") if isinstance(parsed, dict) else None
    if not isinstance(candidates, list):
        return fallback

    # Keep only entries that can actually be compiled, and make sure each one
    # has a printable description.
    mutations = [
        {**entry, "desc": str(entry.get("desc", "Unnamed mutation"))}
        for entry in candidates
        if isinstance(entry, dict) and isinstance(entry.get("code_snippet"), str)
    ]
    if not mutations:
        return fallback

    parsed["mutations"] = mutations
    return parsed
def compile_kernel(code_snippet, filename="temp.cu", output="killer", arch="sm_86"):
    """Write CUDA source to disk and compile it with nvcc.

    Args:
        code_snippet: Complete CUDA source text to compile.
        filename: Path the source is written to before compiling.
        output: Path of the executable nvcc is asked to produce.
        arch: Value passed to ``-arch=`` (the default ``sm_86`` targets RTX 30xx).

    Returns:
        ``(output, None)`` on success, or ``(None, message)`` on failure.
        ``message`` is always a non-empty string, so callers may test either
        element of the pair.

    Raises:
        OSError: If ``filename`` cannot be written.
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write(code_snippet)

    cmd = ["nvcc", "-O3", f"-arch={arch}", filename, "-o", output]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # nvcc missing from PATH or not executable: report it as a compile
        # failure instead of crashing the caller's loop.
        return None, f"nvcc could not be started: {exc}"

    if result.returncode != 0:
        # Never hand back an empty error string: callers that test `if err:`
        # would mistake the failure for a success.
        message = (
            result.stderr.strip()
            or result.stdout.strip()
            or f"nvcc exited with status {result.returncode}"
        )
        return None, message
    return output, None
def _read_gpu_power(gpu_id):
    """Return the power draw in watts that nvidia-smi reports for one GPU, or 0 if unavailable."""
    cmd = [
        "nvidia-smi",
        "--query-gpu=power.draw",
        "--format=csv,noheader,nounits",
        "-i",
        str(gpu_id),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        return 0
    if result.returncode != 0:
        return 0
    lines = result.stdout.strip().splitlines()
    if not lines:
        return 0
    try:
        return float(lines[-1].split()[0])
    except (IndexError, ValueError):
        # e.g. "[N/A]" on boards that do not expose a power reading.
        return 0


def benchmark_kernel(binary, gpus, matrix_size):
    """Time a compiled binary with hyperfine and derive an approximate GFLOPS figure.

    The binary is run as ``./<binary>`` in a separate process, so the time
    measured is that of the whole process, not of the kernel alone. Nothing is
    allocated on the GPU from Python.

    Args:
        binary: Name of the executable, relative to the current directory.
        gpus: Sequence of GPU ids; only ``gpus[0]`` is used, for the power query.
        matrix_size: Matrix dimension N used in the ``2 * N**3`` FLOP estimate.

    Returns:
        ``(gflops, power_watts, None)`` on success, or ``(0, 0, message)`` with
        a non-empty ``message`` when hyperfine cannot be run, fails, or
        produces an unusable report. ``power_watts`` is 0 when nvidia-smi
        gives no reading.
    """
    # Machine-readable report: hyperfine's human-readable output switches units
    # (s / ms / µs) and is not meant to be parsed.
    report_path = f"{binary}.hyperfine.json"
    cmd = [
        "hyperfine",
        "--warmup=3",
        "--min-runs=5",
        "--export-json",
        report_path,
        f"./{binary}",
    ]
    try:
        try:
            run = subprocess.run(cmd, capture_output=True, text=True)
        except OSError as exc:
            return 0, 0, f"hyperfine could not be started: {exc}"
        if run.returncode != 0:
            message = run.stderr.strip() or f"hyperfine exited with status {run.returncode}"
            return 0, 0, message
        try:
            with open(report_path, encoding="utf-8") as f:
                # The exported mean is expressed in seconds.
                mean_seconds = float(json.load(f)["results"][0]["mean"])
        except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
            return 0, 0, f"could not read hyperfine report: {exc!r}"
    finally:
        try:
            os.remove(report_path)
        except OSError:
            pass

    if mean_seconds <= 0:
        return 0, 0, "hyperfine reported a non-positive mean time"

    # GFLOPS approx = 2 * N^3 / time
    gflops = 2 * (matrix_size ** 3) / mean_seconds / 1e9
    # NOTE(review): the reading is taken after the runs have finished, so it is
    # presumably closer to idle power than to power under load.
    power = _read_gpu_power(gpus[0])
    return gflops, power, None
# Improvement over the baseline, in percent, required to record a "kill".
_KILL_THRESHOLD_PCT = 8

# Naive reference kernel: one thread per output element. The bounds check keeps
# matrix sizes that are not a multiple of the block size in range.
_BASELINE_KERNEL = """#include <cuda_runtime.h>
__global__ void gemm(float *C, float *A, float *B, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= N || col >= N) return;
    float sum = 0;
    for (int k = 0; k < N; k++) sum += A[row * N + k] * B[k * N + col];
    C[row * N + col] = sum;
}
"""


def _host_harness(size):
    """Return the CUDA host program (includes + ``main``) that launches ``gemm`` once."""
    return (
        """#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
int main() {
    int N = """
        + str(size)
        + """;
    size_t count = (size_t)N * N;
    size_t bytes = count * sizeof(float);
    float *A, *B, *C;
    if (cudaMallocManaged(&A, bytes) != cudaSuccess ||
        cudaMallocManaged(&B, bytes) != cudaSuccess ||
        cudaMallocManaged(&C, bytes) != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed\\n");
        return 1;
    }
    for (size_t i = 0; i < count; i++) { A[i] = rand() / (float)RAND_MAX; B[i] = rand() / (float)RAND_MAX; }
    dim3 block(16, 16);
    dim3 grid((N + 15) / 16, (N + 15) / 16);
    gemm<<<grid, block>>>(C, A, B, N);
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) status = cudaDeviceSynchronize();
    cudaFree(A); cudaFree(B); cudaFree(C);
    if (status != cudaSuccess) {
        fprintf(stderr, "gemm failed: %s\\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
"""
    )


def _max_iterations(task):
    """Return the iteration budget: the second-to-last word of ``constraints[2]``, as an int."""
    return int(task["constraints"][2].split()[-2])


def main(args):
    """Run the mutate / compile / benchmark loop described by a JSON template.

    A naive baseline kernel is compiled and timed first. Then, for each
    iteration, candidate kernels from ``generate_hypothesis`` are compiled and
    timed; the first iteration whose best candidate beats the baseline by more
    than the kill threshold is saved under ``graves/`` and ends the run.

    Args:
        args: Parsed command line; ``args.template`` is the template path and
            ``args.gpus`` a comma-separated list of GPU ids.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    task = load_template(args.template)
    gpus_list = [g.strip() for g in args.gpus.split(",") if g.strip()] or ["0"]
    # The size is spliced into C source, so force it to be an integer.
    size = int(task.get("matrix_size", 4096))

    # Validate the iteration budget before spending time on compilation and API calls.
    try:
        max_iters = _max_iterations(task)
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as exc:
        print(f"Template error: cannot read max iterations from constraints[2]: {exc!r}")
        return

    # The kernel must come before main() in the translation unit: CUDA C++ needs
    # gemm to be declared before the <<<...>>> launch that references it.
    harness = _host_harness(size)
    baseline_code = _BASELINE_KERNEL + harness
    baseline_bin, err = compile_kernel(baseline_code, "baseline.cu")
    if baseline_bin is None:
        print(f"Baseline compile error: {err}")
        return
    baseline_gflops, _, err = benchmark_kernel(baseline_bin, gpus_list, size)
    if err or baseline_gflops <= 0:
        # Every speedup below is a ratio against this figure; it has to be valid.
        print(f"Baseline bench error: {err}")
        return
    print(f"Baseline cuBLAS-like: {baseline_gflops:.2f} GFLOPS")

    for iteration in range(max_iters):
        print(f"\n--- Iteration {iteration + 1} ---")
        hyps = generate_hypothesis(task)
        mutations = hyps.get("mutations", []) if isinstance(hyps, dict) else []
        best_speedup = 0
        best_code = baseline_code
        for mut in mutations:
            if not isinstance(mut, dict) or not isinstance(mut.get("code_snippet"), str):
                continue
            desc = mut.get("desc", "?")
            # Each candidate is a complete kernel definition; pair it with the
            # shared host program rather than patching text into the baseline.
            # NOTE(review): the candidate's output is never checked against the
            # baseline's, so a kernel that computes nothing would look fast.
            full_code = mut["code_snippet"] + "\n" + harness
            bin_name, _ = compile_kernel(full_code)
            if bin_name is None:
                print(f" Mut {desc}: Compile error - skipped")
                continue
            gflops, power, err = benchmark_kernel(bin_name, gpus_list, size)
            if err or gflops <= 0:
                print(f" Mut {desc}: Bench error - skipped")
                continue
            speedup = (gflops / baseline_gflops - 1) * 100
            # Signed format: a slowdown prints as "-3.2%" instead of "+-3.2%".
            print(f" Mut {desc}: {gflops:.2f} GFLOPS ({speedup:+.1f}%) - {power}W")
            if speedup > best_speedup:
                best_speedup = speedup
                best_code = full_code

        if best_speedup > _KILL_THRESHOLD_PCT:
            os.makedirs("graves", exist_ok=True)
            ts = int(time.time())
            with open(f"graves/gemm_kill_{ts}.cu", "w", encoding="utf-8") as f:
                f.write(best_code)
            with open(f"graves/metrics_{ts}.md", "w", encoding="utf-8") as f:
                f.write(f"# Kill {ts}\nBaseline: {baseline_gflops:.2f} GFLOPS\nBest: {baseline_gflops * (1 + best_speedup/100):.2f} GFLOPS (+{best_speedup:.1f}%)\nPower saved: estimated -{best_speedup*0.5:.1f}%")
            print(f"\n🎉 KILL! +{best_speedup:.1f}% - Enterré dans graves/gemm_kill_{ts}.cu")
            return

        if iteration + 1 < max_iters:
            time.sleep(10)  # Pause between iterations; pointless after the last one.

    print("\nTest fini (pas de kill >8%, mais loop OK). Augmente iters pour plus.")
if __name__ == "__main__":
    # Script entry point: read the template path and the GPU list from the
    # command line, then hand the parsed options to main().
    parser = argparse.ArgumentParser(
        description="Launch CUDAgraveyard lite",
    )
    parser.add_argument(
        "--template",
        help="Path to JSON template",
        required=True,
    )
    parser.add_argument(
        "--gpus",
        help="GPU IDs comma sep",
        default="0",
    )
    args = parser.parse_args()
    main(args)
EOF