Skip to content

Commit 4de83d0

Browse files
committed
Added tune code for IN_WG, IN_SIZEX, OUT_WG, OUT_SIZEX
1 parent 47ef597 commit 4de83d0

File tree

1 file changed

+44
-0
lines changed

1 file changed

+44
-0
lines changed

src/tune.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,50 @@ void Tune::tune() {
338338
u32 variant = 101;
339339
//GW: If the FFT spec on the command line specifies a variant, then we should use that variant. (I get some interesting results with 000 vs 101 vs 201 vs 202, likely due to the ROCm optimizer.)
340340

341+
// Find best IN_WG,IN_SIZEX setting
342+
if (1) {
343+
const FFTShape& shape = shapes[0];
344+
FFTConfig fft{shape, variant, CARRY_32};
345+
u32 exponent = primes.prevPrime(fft.maxExp());
346+
u32 best_in_wg = 0;
347+
u32 best_in_sizex = 0;
348+
double best_cost = -1.0;
349+
for (u32 in_wg : {64, 128, 256}) {
350+
for (u32 in_sizex : {8, 16, 32}) {
351+
shared.args->flags["IN_WG"] = to_string(in_wg);
352+
shared.args->flags["IN_SIZEX"] = to_string(in_sizex);
353+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
354+
log("Time for %12s using IN_WG=%u, IN_SIZEX=%u is %6.1f\n", fft.spec().c_str(), in_wg, in_sizex, cost);
355+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_in_wg = in_wg; best_in_sizex = in_sizex; }
356+
}
357+
}
358+
log("Best IN_WG, IN_SIZEX is %u, %u. Default is 128, 16.\n", best_in_wg, best_in_sizex);
359+
shared.args->flags["IN_WG"] = to_string(best_in_wg);
360+
shared.args->flags["IN_SIZEX"] = to_string(best_in_sizex);
361+
}
362+
363+
// Find best OUT_WG,OUT_SIZEX setting
364+
if (1) {
365+
const FFTShape& shape = shapes[0];
366+
FFTConfig fft{shape, variant, CARRY_32};
367+
u32 exponent = primes.prevPrime(fft.maxExp());
368+
u32 best_out_wg = 0;
369+
u32 best_out_sizex = 0;
370+
double best_cost = -1.0;
371+
for (u32 out_wg : {64, 128, 256}) {
372+
for (u32 out_sizex : {8, 16, 32}) {
373+
shared.args->flags["OUT_WG"] = to_string(out_wg);
374+
shared.args->flags["OUT_SIZEX"] = to_string(out_sizex);
375+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
376+
log("Time for %12s using OUT_WG=%u, OUT_SIZEX=%u is %6.1f\n", fft.spec().c_str(), out_wg, out_sizex, cost);
377+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_out_wg = out_wg; best_out_sizex = out_sizex; }
378+
}
379+
}
380+
log("Best OUT_WG, OUT_SIZEX is %u, %u. Default is 128, 16.\n", best_out_wg, best_out_sizex);
381+
shared.args->flags["OUT_WG"] = to_string(best_out_wg);
382+
shared.args->flags["OUT_SIZEX"] = to_string(best_out_sizex);
383+
}
384+
341385
// Find best FAST_BARRIER setting
342386
if (1 && AMDGPU) {
343387
const FFTShape& shape = shapes[0];

0 commit comments

Comments
 (0)