Skip to content

Commit 47ef597

Browse files
committed
Added -tune code to time all important -use options!
1 parent b1deb06 commit 47ef597

File tree

3 files changed

+242
-12
lines changed

3 files changed

+242
-12
lines changed

src/TrigBufCache.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ TrigBufCache::~TrigBufCache() = default;
282282
TrigPtr TrigBufCache::smallTrig(u32 W, u32 nW) {
283283
lock_guard lock{mut};
284284
auto& m = small;
285-
decay_t<decltype(m)>::key_type key{W, nW, 0, 0};
285+
decay_t<decltype(m)>::key_type key{W, nW, 0, 0, 0, 0};
286286

287287
TrigPtr p{};
288288
auto it = m.find(key);
@@ -300,9 +300,9 @@ TrigPtr TrigBufCache::smallTrigCombo(u32 width, u32 middle, u32 W, u32 nW, u32 v
300300

301301
lock_guard lock{mut};
302302
auto& m = small;
303-
decay_t<decltype(m)>::key_type key1{W, nW, width, middle};
303+
decay_t<decltype(m)>::key_type key1{W, nW, width, middle, tail_single_wide, tail_trigs};
304304
// We write the "combo" under two keys, so it can also be retrieved as non-combo by smallTrig()
305-
decay_t<decltype(m)>::key_type key2{W, nW, 0, 0};
305+
decay_t<decltype(m)>::key_type key2{W, nW, 0, 0, 0, 0};
306306

307307
TrigPtr p{};
308308
auto it = m.find(key1);

src/TrigBufCache.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class TrigBufCache {
2727
const Context* context;
2828
std::mutex mut;
2929

30-
std::map<tuple<u32, u32, u32, u32>, TrigPtr::weak_type> small;
30+
std::map<tuple<u32, u32, u32, u32, bool, u32>, TrigPtr::weak_type> small;
3131
std::map<tuple<u32, u32, u32>, TrigPtr::weak_type> middle;
3232

3333
// The shared-pointers below keep the most recent set of buffers alive even without any Gpu instance

src/tune.cpp

Lines changed: 238 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -327,9 +327,243 @@ void Tune::ctune() {
327327

328328
void Tune::tune() {
329329
Args *args = shared.args;
330-
string fftSpec = args->fftSpec;
330+
vector<FFTShape> shapes = FFTShape::multiSpec(args->fftSpec);
331+
332+
// There are some options and variants that are different based on GPU manufacturer
333+
bool AMDGPU = isAmdGpu(q->context->deviceId());
334+
335+
// Look for best settings of various options
336+
337+
if (1) {
338+
u32 variant = 101;
339+
//GW: If the FFT spec on the command line specifies a variant, then we should use that variant (I get some interesting results with 000 vs 101 vs 201 vs 202, likely due to the ROCm optimizer)
340+
341+
// Find best FAST_BARRIER setting
342+
if (1 && AMDGPU) {
343+
const FFTShape& shape = shapes[0];
344+
FFTConfig fft{shape, variant, CARRY_32};
345+
u32 exponent = primes.prevPrime(fft.maxExp());
346+
u32 best_fast_barrier = 0;
347+
double best_cost = -1.0;
348+
for (u32 fast_barrier : {0, 1}) {
349+
shared.args->flags["FAST_BARRIER"] = to_string(fast_barrier);
350+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
351+
log("Time for %12s using FAST_BARRIER=%u is %6.1f\n", fft.spec().c_str(), fast_barrier, cost);
352+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_fast_barrier = fast_barrier; }
353+
}
354+
log("Best FAST_BARRIER is %u. Default FAST_BARRIER is 0.\n", best_fast_barrier);
355+
shared.args->flags["FAST_BARRIER"] = to_string(best_fast_barrier);
356+
}
357+
358+
// Find best TAIL_TRIGS setting
359+
if (1) {
360+
const FFTShape& shape = shapes[0];
361+
FFTConfig fft{shape, variant, CARRY_32};
362+
u32 exponent = primes.prevPrime(fft.maxExp());
363+
u32 best_tail_trigs = 0;
364+
double best_cost = -1.0;
365+
for (u32 tail_trigs : {0, 1, 2}) {
366+
shared.args->flags["TAIL_TRIGS"] = to_string(tail_trigs);
367+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
368+
log("Time for %12s using TAIL_TRIGS=%u is %6.1f\n", fft.spec().c_str(), tail_trigs, cost);
369+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tail_trigs = tail_trigs; }
370+
}
371+
log("Best TAIL_TRIGS is %u. Default TAIL_TRIGS is 2.\n", best_tail_trigs);
372+
shared.args->flags["TAIL_TRIGS"] = to_string(best_tail_trigs);
373+
}
374+
375+
// Find best TAIL_KERNELS setting
376+
if (1) {
377+
const FFTShape& shape = shapes[0];
378+
FFTConfig fft{shape, variant, CARRY_32};
379+
u32 exponent = primes.prevPrime(fft.maxExp());
380+
u32 best_tail_kernels = 0;
381+
double best_cost = -1.0;
382+
for (u32 tail_kernels : {0, 1, 2, 3}) {
383+
shared.args->flags["TAIL_KERNELS"] = to_string(tail_kernels);
384+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
385+
log("Time for %12s using TAIL_KERNELS=%u is %6.1f\n", fft.spec().c_str(), tail_kernels, cost);
386+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tail_kernels = tail_kernels; }
387+
}
388+
if (best_tail_kernels & 1)
389+
log("Best TAIL_KERNELS is %u. Default TAIL_KERNELS is 2.\n", best_tail_kernels);
390+
else
391+
log("Best TAIL_KERNELS is %u (but best may be %u when running two workers on one GPU). Default TAIL_KERNELS is 2.\n", best_tail_kernels, best_tail_kernels | 1);
392+
shared.args->flags["TAIL_KERNELS"] = to_string(best_tail_kernels);
393+
}
331394

332-
//GW: detail all the configs we should auto-time first
395+
// Find best TABMUL_CHAIN setting
396+
if (1) {
397+
const FFTShape& shape = shapes[0];
398+
FFTConfig fft{shape, 101, CARRY_32};
399+
u32 exponent = primes.prevPrime(fft.maxExp());
400+
u32 best_tabmul_chain = 0;
401+
double best_cost = -1.0;
402+
for (u32 tabmul_chain : {0, 1}) {
403+
shared.args->flags["TABMUL_CHAIN"] = to_string(tabmul_chain);
404+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
405+
log("Time for %12s using TABMUL_CHAIN=%u is %6.1f\n", fft.spec().c_str(), tabmul_chain, cost);
406+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tabmul_chain = tabmul_chain; }
407+
}
408+
log("Best TABMUL_CHAIN is %u. Default TABMUL_CHAIN is 0.\n", best_tabmul_chain);
409+
shared.args->flags["TABMUL_CHAIN"] = to_string(best_tabmul_chain);
410+
}
411+
412+
// Find best PAD setting. Default is 256 bytes for AMD, 0 for all others.
413+
if (1) {
414+
const FFTShape& shape = shapes[0];
415+
FFTConfig fft{shape, variant, CARRY_32};
416+
u32 exponent = primes.prevPrime(fft.maxExp());
417+
u32 best_pad = 0;
418+
double best_cost = -1.0;
419+
for (u32 pad : {0, 64, 128, 256, 512}) {
420+
shared.args->flags["PAD"] = to_string(pad);
421+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
422+
log("Time for %12s using PAD=%u is %6.1f\n", fft.spec().c_str(), pad, cost);
423+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_pad = pad; }
424+
}
425+
log("Best PAD is %u bytes. Default PAD is %u bytes.\n", best_pad, AMDGPU ? 256 : 0);
426+
shared.args->flags["PAD"] = to_string(best_pad);
427+
}
428+
429+
// Find best NONTEMPORAL setting
430+
if (1) {
431+
const FFTShape& shape = shapes[0];
432+
FFTConfig fft{shape, variant, CARRY_32};
433+
u32 exponent = primes.prevPrime(fft.maxExp());
434+
u32 best_nontemporal = 0;
435+
double best_cost = -1.0;
436+
for (u32 nontemporal : {0, 1}) {
437+
shared.args->flags["NONTEMPORAL"] = to_string(nontemporal);
438+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
439+
log("Time for %12s using NONTEMPORAL=%u is %6.1f\n", fft.spec().c_str(), nontemporal, cost);
440+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_nontemporal = nontemporal; }
441+
}
442+
log("Best NONTEMPORAL is %u. Default NONTEMPORAL is 0.\n", best_nontemporal);
443+
shared.args->flags["NONTEMPORAL"] = to_string(best_nontemporal);
444+
}
445+
446+
// Find best UNROLL_W setting
447+
if (1) {
448+
const FFTShape& shape = shapes[0];
449+
FFTConfig fft{shape, variant, CARRY_32};
450+
u32 exponent = primes.prevPrime(fft.maxExp());
451+
u32 best_unroll_w = 0;
452+
double best_cost = -1.0;
453+
for (u32 unroll_w : {0, 1}) {
454+
shared.args->flags["UNROLL_W"] = to_string(unroll_w);
455+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
456+
log("Time for %12s using UNROLL_W=%u is %6.1f\n", fft.spec().c_str(), unroll_w, cost);
457+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_unroll_w = unroll_w; }
458+
}
459+
log("Best UNROLL_W is %u. Default UNROLL_W is %u.\n", best_unroll_w, AMDGPU ? 0 : 1);
460+
shared.args->flags["UNROLL_W"] = to_string(best_unroll_w);
461+
}
462+
463+
// Find best UNROLL_H setting
464+
if (1) {
465+
const FFTShape& shape = shapes[0];
466+
FFTConfig fft{shape, variant, CARRY_32};
467+
u32 exponent = primes.prevPrime(fft.maxExp());
468+
u32 best_unroll_h = 0;
469+
double best_cost = -1.0;
470+
for (u32 unroll_h : {0, 1}) {
471+
shared.args->flags["UNROLL_H"] = to_string(unroll_h);
472+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
473+
log("Time for %12s using UNROLL_H=%u is %6.1f\n", fft.spec().c_str(), unroll_h, cost);
474+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_unroll_h = unroll_h; }
475+
}
476+
log("Best UNROLL_H is %u. Default UNROLL_H is %u.\n", best_unroll_h, AMDGPU && shape.height >= 1024 ? 0 : 1);
477+
shared.args->flags["UNROLL_H"] = to_string(best_unroll_h);
478+
}
479+
480+
// Find best ZEROHACK_W setting
481+
if (1) {
482+
const FFTShape& shape = shapes[0];
483+
FFTConfig fft{shape, variant, CARRY_32};
484+
u32 exponent = primes.prevPrime(fft.maxExp());
485+
u32 best_zerohack_w = 0;
486+
double best_cost = -1.0;
487+
for (u32 zerohack_w : {0, 1}) {
488+
shared.args->flags["ZEROHACK_W"] = to_string(zerohack_w);
489+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
490+
log("Time for %12s using ZEROHACK_W=%u is %6.1f\n", fft.spec().c_str(), zerohack_w, cost);
491+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_zerohack_w = zerohack_w; }
492+
}
493+
log("Best ZEROHACK_W is %u. Default ZEROHACK_W is 1.\n", best_zerohack_w);
494+
shared.args->flags["ZEROHACK_W"] = to_string(best_zerohack_w);
495+
}
496+
497+
// Find best ZEROHACK_H setting
498+
if (1) {
499+
const FFTShape& shape = shapes[0];
500+
FFTConfig fft{shape, variant, CARRY_32};
501+
u32 exponent = primes.prevPrime(fft.maxExp());
502+
u32 best_zerohack_h = 0;
503+
double best_cost = -1.0;
504+
for (u32 zerohack_h : {0, 1}) {
505+
shared.args->flags["ZEROHACK_H"] = to_string(zerohack_h);
506+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
507+
log("Time for %12s using ZEROHACK_H=%u is %6.1f\n", fft.spec().c_str(), zerohack_h, cost);
508+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_zerohack_h = zerohack_h; }
509+
}
510+
log("Best ZEROHACK_H is %u. Default ZEROHACK_H is 1.\n", best_zerohack_h);
511+
shared.args->flags["ZEROHACK_H"] = to_string(best_zerohack_h);
512+
}
513+
514+
// Find best MIDDLE_IN_LDS_TRANSPOSE setting
515+
if (1) {
516+
const FFTShape& shape = shapes[0];
517+
FFTConfig fft{shape, variant, CARRY_32};
518+
u32 exponent = primes.prevPrime(fft.maxExp());
519+
u32 best_middle_in_lds_transpose = 0;
520+
double best_cost = -1.0;
521+
for (u32 middle_in_lds_transpose : {0, 1}) {
522+
shared.args->flags["MIDDLE_IN_LDS_TRANSPOSE"] = to_string(middle_in_lds_transpose);
523+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
524+
log("Time for %12s using MIDDLE_IN_LDS_TRANSPOSE=%u is %6.1f\n", fft.spec().c_str(), middle_in_lds_transpose, cost);
525+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_middle_in_lds_transpose = middle_in_lds_transpose; }
526+
}
527+
log("Best MIDDLE_IN_LDS_TRANSPOSE is %u. Default MIDDLE_IN_LDS_TRANSPOSE is 1.\n", best_middle_in_lds_transpose);
528+
shared.args->flags["MIDDLE_IN_LDS_TRANSPOSE"] = to_string(best_middle_in_lds_transpose);
529+
}
530+
531+
// Find best MIDDLE_OUT_LDS_TRANSPOSE setting
532+
if (1) {
533+
const FFTShape& shape = shapes[0];
534+
FFTConfig fft{shape, variant, CARRY_32};
535+
u32 exponent = primes.prevPrime(fft.maxExp());
536+
u32 best_middle_out_lds_transpose = 0;
537+
double best_cost = -1.0;
538+
for (u32 middle_out_lds_transpose : {0, 1}) {
539+
shared.args->flags["MIDDLE_OUT_LDS_TRANSPOSE"] = to_string(middle_out_lds_transpose);
540+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
541+
log("Time for %12s using MIDDLE_OUT_LDS_TRANSPOSE=%u is %6.1f\n", fft.spec().c_str(), middle_out_lds_transpose, cost);
542+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_middle_out_lds_transpose = middle_out_lds_transpose; }
543+
}
544+
log("Best MIDDLE_OUT_LDS_TRANSPOSE is %u. Default MIDDLE_OUT_LDS_TRANSPOSE is 1.\n", best_middle_out_lds_transpose);
545+
shared.args->flags["MIDDLE_OUT_LDS_TRANSPOSE"] = to_string(best_middle_out_lds_transpose);
546+
}
547+
548+
// Find best BIGLIT setting
549+
if (1) {
550+
const FFTShape& shape = shapes[0];
551+
FFTConfig fft{shape, variant, CARRY_32};
552+
u32 exponent = primes.prevPrime(fft.maxExp());
553+
u32 best_biglit = 0;
554+
double best_cost = -1.0;
555+
for (u32 biglit : {0, 1}) {
556+
shared.args->flags["BIGLIT"] = to_string(biglit);
557+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
558+
log("Time for %12s using BIGLIT=%u is %6.1f\n", fft.spec().c_str(), biglit, cost);
559+
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_biglit = biglit; }
560+
}
561+
log("Best BIGLIT is %u. Default BIGLIT is 1. The BIGLIT=0 option will probably be deprecated.\n", best_biglit);
562+
shared.args->flags["BIGLIT"] = to_string(best_biglit);
563+
}
564+
565+
//GW: Time some IN/OUT_WG/SIZEX combos?
566+
}
333567

334568
// Flags that prune the amount of shapes and variants to time.
335569
// These should be computed automatically and saved in the tune.txt or config.txt file.
@@ -341,11 +575,8 @@ void Tune::tune() {
341575
// The width = height = 512 FFT shape is so good, we probably don't need to time the width = 1024, height = 256 shape.
342576
bool skip_1K_256 = 1;
343577

344-
// There are some variants only AMD GPUs can execute
345-
bool AMDGPU = isAmdGpu(q->context->deviceId());
346-
347-
// make command line args for this?
348-
skip_some_WH_variants = 2;
578+
// make command line args for this?
579+
skip_some_WH_variants = 2; // should default be 1??
349580
skip_1K_256 = 0;
350581

351582
//GW: Suggest tuning with TAIL_KERNELS=2 even if production runs use TAIL_KERNELS=3
@@ -359,7 +590,6 @@ skip_1K_256 = 0;
359590
map<int, u32> fastest_height_variants;
360591

361592
vector<TuneEntry> results = TuneEntry::readTuneFile(*args);
362-
vector<FFTShape> shapes = FFTShape::multiSpec(args->fftSpec);
363593

364594
// Loop through all possible FFT shapes
365595
for (const FFTShape& shape : shapes) {

0 commit comments

Comments
 (0)