@@ -327,9 +327,243 @@ void Tune::ctune() {
327327
328328void Tune::tune () {
329329 Args *args = shared.args ;
330- string fftSpec = args->fftSpec ;
330+ vector<FFTShape> shapes = FFTShape::multiSpec (args->fftSpec );
331+
332+ // There are some options and variants that are different based on GPU manufacturer
333+ bool AMDGPU = isAmdGpu (q->context ->deviceId ());
334+
335+ // Look for best settings of various options
336+
337+ if (1 ) {
338+ u32 variant = 101 ;
339+ // GW: If the FFT spec on the command line specifies a variant, then we should use that variant. (I get some interesting results with 000 vs 101 vs 201 vs 202, likely due to the ROCm optimizer.)
340+
341+ // Find best FAST_BARRIER setting
342+ if (1 && AMDGPU) {
343+ const FFTShape& shape = shapes[0 ];
344+ FFTConfig fft{shape, variant, CARRY_32};
345+ u32 exponent = primes.prevPrime (fft.maxExp ());
346+ u32 best_fast_barrier = 0 ;
347+ double best_cost = -1.0 ;
348+ for (u32 fast_barrier : {0 , 1 }) {
349+ shared.args ->flags [" FAST_BARRIER" ] = to_string (fast_barrier);
350+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
351+ log (" Time for %12s using FAST_BARRIER=%u is %6.1f\n " , fft.spec ().c_str (), fast_barrier, cost);
352+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_fast_barrier = fast_barrier; }
353+ }
354+ log (" Best FAST_BARRIER is %u. Default FAST_BARRIER is 0.\n " , best_fast_barrier);
355+ shared.args ->flags [" FAST_BARRIER" ] = to_string (best_fast_barrier);
356+ }
357+
358+ // Find best TAIL_TRIGS setting
359+ if (1 ) {
360+ const FFTShape& shape = shapes[0 ];
361+ FFTConfig fft{shape, variant, CARRY_32};
362+ u32 exponent = primes.prevPrime (fft.maxExp ());
363+ u32 best_tail_trigs = 0 ;
364+ double best_cost = -1.0 ;
365+ for (u32 tail_trigs : {0 , 1 , 2 }) {
366+ shared.args ->flags [" TAIL_TRIGS" ] = to_string (tail_trigs);
367+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
368+ log (" Time for %12s using TAIL_TRIGS=%u is %6.1f\n " , fft.spec ().c_str (), tail_trigs, cost);
369+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tail_trigs = tail_trigs; }
370+ }
371+ log (" Best TAIL_TRIGS is %u. Default TAIL_TRIGS is 2.\n " , best_tail_trigs);
372+ shared.args ->flags [" TAIL_TRIGS" ] = to_string (best_tail_trigs);
373+ }
374+
375+ // Find best TAIL_KERNELS setting
376+ if (1 ) {
377+ const FFTShape& shape = shapes[0 ];
378+ FFTConfig fft{shape, variant, CARRY_32};
379+ u32 exponent = primes.prevPrime (fft.maxExp ());
380+ u32 best_tail_kernels = 0 ;
381+ double best_cost = -1.0 ;
382+ for (u32 tail_kernels : {0 , 1 , 2 , 3 }) {
383+ shared.args ->flags [" TAIL_KERNELS" ] = to_string (tail_kernels);
384+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
385+ log (" Time for %12s using TAIL_KERNELS=%u is %6.1f\n " , fft.spec ().c_str (), tail_kernels, cost);
386+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tail_kernels = tail_kernels; }
387+ }
388+ if (best_tail_kernels & 1 )
389+ log (" Best TAIL_KERNELS is %u. Default TAIL_KERNELS is 2.\n " , best_tail_kernels);
390+ else
391+ log (" Best TAIL_KERNELS is %u (but best may be %u when running two workers on one GPU). Default TAIL_KERNELS is 2.\n " , best_tail_kernels, best_tail_kernels | 1 );
392+ shared.args ->flags [" TAIL_KERNELS" ] = to_string (best_tail_kernels);
393+ }
331394
332- // GW: detail all the configs we should auto-time first
395+ // Find best TABMUL_CHAIN setting
396+ if (1 ) {
397+ const FFTShape& shape = shapes[0 ];
398+ FFTConfig fft{shape, 101 , CARRY_32};
399+ u32 exponent = primes.prevPrime (fft.maxExp ());
400+ u32 best_tabmul_chain = 0 ;
401+ double best_cost = -1.0 ;
402+ for (u32 tabmul_chain : {0 , 1 }) {
403+ shared.args ->flags [" TABMUL_CHAIN" ] = to_string (tabmul_chain);
404+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
405+ log (" Time for %12s using TABMUL_CHAIN=%u is %6.1f\n " , fft.spec ().c_str (), tabmul_chain, cost);
406+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_tabmul_chain = tabmul_chain; }
407+ }
408+ log (" Best TABMUL_CHAIN is %u. Default TABMUL_CHAIN is 0.\n " , best_tabmul_chain);
409+ shared.args ->flags [" TABMUL_CHAIN" ] = to_string (best_tabmul_chain);
410+ }
411+
412+ // Find best PAD setting. Default is 256 bytes for AMD, 0 for all others.
413+ if (1 ) {
414+ const FFTShape& shape = shapes[0 ];
415+ FFTConfig fft{shape, variant, CARRY_32};
416+ u32 exponent = primes.prevPrime (fft.maxExp ());
417+ u32 best_pad = 0 ;
418+ double best_cost = -1.0 ;
419+ for (u32 pad : {0 , 64 , 128 , 256 , 512 }) {
420+ shared.args ->flags [" PAD" ] = to_string (pad);
421+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
422+ log (" Time for %12s using PAD=%u is %6.1f\n " , fft.spec ().c_str (), pad, cost);
423+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_pad = pad; }
424+ }
425+ log (" Best PAD is %u bytes. Default PAD is %u bytes.\n " , best_pad, AMDGPU ? 256 : 0 );
426+ shared.args ->flags [" PAD" ] = to_string (best_pad);
427+ }
428+
429+ // Find best NONTEMPORAL setting
430+ if (1 ) {
431+ const FFTShape& shape = shapes[0 ];
432+ FFTConfig fft{shape, variant, CARRY_32};
433+ u32 exponent = primes.prevPrime (fft.maxExp ());
434+ u32 best_nontemporal = 0 ;
435+ double best_cost = -1.0 ;
436+ for (u32 nontemporal : {0 , 1 }) {
437+ shared.args ->flags [" NONTEMPORAL" ] = to_string (nontemporal);
438+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
439+ log (" Time for %12s using NONTEMPORAL=%u is %6.1f\n " , fft.spec ().c_str (), nontemporal, cost);
440+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_nontemporal = nontemporal; }
441+ }
442+ log (" Best NONTEMPORAL is %u. Default NONTEMPORAL is 0.\n " , best_nontemporal);
443+ shared.args ->flags [" NONTEMPORAL" ] = to_string (best_nontemporal);
444+ }
445+
446+ // Find best UNROLL_W setting
447+ if (1 ) {
448+ const FFTShape& shape = shapes[0 ];
449+ FFTConfig fft{shape, variant, CARRY_32};
450+ u32 exponent = primes.prevPrime (fft.maxExp ());
451+ u32 best_unroll_w = 0 ;
452+ double best_cost = -1.0 ;
453+ for (u32 unroll_w : {0 , 1 }) {
454+ shared.args ->flags [" UNROLL_W" ] = to_string (unroll_w);
455+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
456+ log (" Time for %12s using UNROLL_W=%u is %6.1f\n " , fft.spec ().c_str (), unroll_w, cost);
457+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_unroll_w = unroll_w; }
458+ }
459+ log (" Best UNROLL_W is %u. Default UNROLL_W is %u.\n " , best_unroll_w, AMDGPU ? 0 : 1 );
460+ shared.args ->flags [" UNROLL_W" ] = to_string (best_unroll_w);
461+ }
462+
463+ // Find best UNROLL_H setting
464+ if (1 ) {
465+ const FFTShape& shape = shapes[0 ];
466+ FFTConfig fft{shape, variant, CARRY_32};
467+ u32 exponent = primes.prevPrime (fft.maxExp ());
468+ u32 best_unroll_h = 0 ;
469+ double best_cost = -1.0 ;
470+ for (u32 unroll_h : {0 , 1 }) {
471+ shared.args ->flags [" UNROLL_H" ] = to_string (unroll_h);
472+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
473+ log (" Time for %12s using UNROLL_H=%u is %6.1f\n " , fft.spec ().c_str (), unroll_h, cost);
474+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_unroll_h = unroll_h; }
475+ }
476+ log (" Best UNROLL_H is %u. Default UNROLL_H is %u.\n " , best_unroll_h, AMDGPU && shape.height >= 1024 ? 0 : 1 );
477+ shared.args ->flags [" UNROLL_H" ] = to_string (best_unroll_h);
478+ }
479+
480+ // Find best ZEROHACK_W setting
481+ if (1 ) {
482+ const FFTShape& shape = shapes[0 ];
483+ FFTConfig fft{shape, variant, CARRY_32};
484+ u32 exponent = primes.prevPrime (fft.maxExp ());
485+ u32 best_zerohack_w = 0 ;
486+ double best_cost = -1.0 ;
487+ for (u32 zerohack_w : {0 , 1 }) {
488+ shared.args ->flags [" ZEROHACK_W" ] = to_string (zerohack_w);
489+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
490+ log (" Time for %12s using ZEROHACK_W=%u is %6.1f\n " , fft.spec ().c_str (), zerohack_w, cost);
491+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_zerohack_w = zerohack_w; }
492+ }
493+ log (" Best ZEROHACK_W is %u. Default ZEROHACK_W is 1.\n " , best_zerohack_w);
494+ shared.args ->flags [" ZEROHACK_W" ] = to_string (best_zerohack_w);
495+ }
496+
497+ // Find best ZEROHACK_H setting
498+ if (1 ) {
499+ const FFTShape& shape = shapes[0 ];
500+ FFTConfig fft{shape, variant, CARRY_32};
501+ u32 exponent = primes.prevPrime (fft.maxExp ());
502+ u32 best_zerohack_h = 0 ;
503+ double best_cost = -1.0 ;
504+ for (u32 zerohack_h : {0 , 1 }) {
505+ shared.args ->flags [" ZEROHACK_H" ] = to_string (zerohack_h);
506+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
507+ log (" Time for %12s using ZEROHACK_H=%u is %6.1f\n " , fft.spec ().c_str (), zerohack_h, cost);
508+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_zerohack_h = zerohack_h; }
509+ }
510+ log (" Best ZEROHACK_H is %u. Default ZEROHACK_H is 1.\n " , best_zerohack_h);
511+ shared.args ->flags [" ZEROHACK_H" ] = to_string (best_zerohack_h);
512+ }
513+
514+ // Find best MIDDLE_IN_LDS_TRANSPOSE setting
515+ if (1 ) {
516+ const FFTShape& shape = shapes[0 ];
517+ FFTConfig fft{shape, variant, CARRY_32};
518+ u32 exponent = primes.prevPrime (fft.maxExp ());
519+ u32 best_middle_in_lds_transpose = 0 ;
520+ double best_cost = -1.0 ;
521+ for (u32 middle_in_lds_transpose : {0 , 1 }) {
522+ shared.args ->flags [" MIDDLE_IN_LDS_TRANSPOSE" ] = to_string (middle_in_lds_transpose);
523+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
524+ log (" Time for %12s using MIDDLE_IN_LDS_TRANSPOSE=%u is %6.1f\n " , fft.spec ().c_str (), middle_in_lds_transpose, cost);
525+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_middle_in_lds_transpose = middle_in_lds_transpose; }
526+ }
527+ log (" Best MIDDLE_IN_LDS_TRANSPOSE is %u. Default MIDDLE_IN_LDS_TRANSPOSE is 1.\n " , best_middle_in_lds_transpose);
528+ shared.args ->flags [" MIDDLE_IN_LDS_TRANSPOSE" ] = to_string (best_middle_in_lds_transpose);
529+ }
530+
531+ // Find best MIDDLE_OUT_LDS_TRANSPOSE setting
532+ if (1 ) {
533+ const FFTShape& shape = shapes[0 ];
534+ FFTConfig fft{shape, variant, CARRY_32};
535+ u32 exponent = primes.prevPrime (fft.maxExp ());
536+ u32 best_middle_out_lds_transpose = 0 ;
537+ double best_cost = -1.0 ;
538+ for (u32 middle_out_lds_transpose : {0 , 1 }) {
539+ shared.args ->flags [" MIDDLE_OUT_LDS_TRANSPOSE" ] = to_string (middle_out_lds_transpose);
540+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
541+ log (" Time for %12s using MIDDLE_OUT_LDS_TRANSPOSE=%u is %6.1f\n " , fft.spec ().c_str (), middle_out_lds_transpose, cost);
542+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_middle_out_lds_transpose = middle_out_lds_transpose; }
543+ }
544+ log (" Best MIDDLE_OUT_LDS_TRANSPOSE is %u. Default MIDDLE_OUT_LDS_TRANSPOSE is 1.\n " , best_middle_out_lds_transpose);
545+ shared.args ->flags [" MIDDLE_OUT_LDS_TRANSPOSE" ] = to_string (best_middle_out_lds_transpose);
546+ }
547+
548+ // Find best BIGLIT setting
549+ if (1 ) {
550+ const FFTShape& shape = shapes[0 ];
551+ FFTConfig fft{shape, variant, CARRY_32};
552+ u32 exponent = primes.prevPrime (fft.maxExp ());
553+ u32 best_biglit = 0 ;
554+ double best_cost = -1.0 ;
555+ for (u32 biglit : {0 , 1 }) {
556+ shared.args ->flags [" BIGLIT" ] = to_string (biglit);
557+ double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
558+ log (" Time for %12s using BIGLIT=%u is %6.1f\n " , fft.spec ().c_str (), biglit, cost);
559+ if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_biglit = biglit; }
560+ }
561+ log (" Best BIGLIT is %u. Default BIGLIT is 1. The BIGLIT=0 option will probably be deprecated.\n " , best_biglit);
562+ shared.args ->flags [" BIGLIT" ] = to_string (best_biglit);
563+ }
564+
565+ // GW: Time some IN/OUT_WG/SIZEX combos?
566+ }
333567
334568 // Flags that prune the amount of shapes and variants to time.
335569 // These should be computed automatically and saved in the tune.txt or config.txt file.
@@ -341,11 +575,8 @@ void Tune::tune() {
341575 // The width = height = 512 FFT shape is so good, we probably don't need to time the width = 1024, height = 256 shape.
342576 bool skip_1K_256 = 1 ;
343577
344- // There are some variants only AMD GPUs can execute
345- bool AMDGPU = isAmdGpu (q->context ->deviceId ());
346-
347- // make command line args for this?
348- skip_some_WH_variants = 2 ;
578+ // make command line args for this?
579+ skip_some_WH_variants = 2 ; // should default be 1??
349580skip_1K_256 = 0 ;
350581
351582// GW: Suggest tuning with TAIL_KERNELS=2 even if production runs use TAIL_KERNELS=3
@@ -359,7 +590,6 @@ skip_1K_256 = 0;
359590 map<int , u32 > fastest_height_variants;
360591
361592 vector<TuneEntry> results = TuneEntry::readTuneFile (*args);
362- vector<FFTShape> shapes = FFTShape::multiSpec (args->fftSpec );
363593
364594 // Loop through all possible FFT shapes
365595 for (const FFTShape& shape : shapes) {
0 commit comments