@@ -347,6 +347,7 @@ void Tune::tune() {
347347 bool tune_config = 1 ;
348348 bool time_FFTs = 0 ;
349349 bool time_NTTs = 0 ;
350+ bool time_FP32 = 1 ;
350351 int quick = 7 ; // Run config from slowest (quick=1) to fastest (quick=10)
351352 u64 min_exponent = 75000000 ;
352353 u64 max_exponent = 350000000 ;
@@ -358,6 +359,7 @@ void Tune::tune() {
358359 if (s == " noconfig" ) tune_config = 0 ;
359360 if (s == " fp64" ) time_FFTs = 1 ;
360361 if (s == " ntt" ) time_NTTs = 1 ;
362+ if (s == " nofp32" ) time_FP32 = 0 ;
361363 auto keyVal = split (s, ' =' );
362364 if (keyVal.size () == 2 ) {
363365 if (keyVal.front () == " quick" ) quick = stod (keyVal.back ());
@@ -554,7 +556,7 @@ void Tune::tune() {
554556
555557 // Find best INPLACE setting
556558 if (1 ) {
557- FFTConfig fft{*defaultShape, 101 , CARRY_AUTO};
559+ FFTConfig fft{*defaultShape, variant , CARRY_AUTO};
558560 u32 exponent = primes.prevPrime (fft.maxExp ());
559561 u32 best_inplace = 0 ;
560562 double best_cost = -1.0 ;
@@ -566,7 +568,7 @@ void Tune::tune() {
566568 if (inplace == current_inplace) current_cost = cost;
567569 if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_inplace = inplace; }
568570 }
569- log (" Best INPLACE is %u. Default INPLACE is 0. Best INPLACE setting may change when using larger FFTs .\n " , best_inplace);
571+ log (" Best INPLACE is %u. Default INPLACE is 0. Best INPLACE setting may be different for other FFT lengths .\n " , best_inplace);
570572 configsUpdate (current_cost, best_cost, 0.002 , " INPLACE" , best_inplace, newConfigKeyVals, suggestedConfigKeyVals);
571573 args->flags [" INPLACE" ] = to_string (best_inplace);
572574 }
@@ -676,7 +678,7 @@ void Tune::tune() {
676678 }
677679
678680 // Find best TAIL_TRIGS32 setting
679- if (time_NTTs) {
681+ if (time_NTTs && time_FP32 ) {
680682 FFTConfig fft{defaultNTTShape, 202 , CARRY_AUTO};
681683 if (!fft.FFT_FP32 ) fft = FFTConfig (FFTShape (FFT3261, 512 , 8 , 512 ), 202 , CARRY_AUTO);
682684 u32 exponent = primes.prevPrime (fft.maxBpw () * 0.95 * fft.shape .size ()); // Back off the maxExp as different settings will have different maxBpw
@@ -759,7 +761,7 @@ void Tune::tune() {
759761 }
760762
761763 // Find best TABMUL_CHAIN32 setting
762- if (time_NTTs) {
764+ if (time_NTTs && time_FP32 ) {
763765 FFTConfig fft{defaultNTTShape, 202 , CARRY_AUTO};
764766 if (!fft.FFT_FP32 ) fft = FFTConfig (FFTShape (FFT3261, 512 , 8 , 512 ), 202 , CARRY_AUTO);
765767 u32 exponent = primes.prevPrime (fft.maxBpw () * 0.95 * fft.shape .size ()); // Back off the maxExp as different settings will have different maxBpw
@@ -945,9 +947,10 @@ void Tune::tune() {
945947 config.write (" \n -log 1000000\n " );
946948 }
947949 if (args->workers < 2 ) {
948- config.write (" \n # Running two workers sometimes gives better throughput." );
949- config.write (" \n # Changing TAIL_KERNELS to 3 with two workers may be better." );
950- config.write (" \n # -workers 2 -use TAIL_KERNELS=3\n " );
950+ config.write (" \n # Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file." );
951+ config.write (" \n # -workers 2\n " );
952+ config.write (" \n # Changing TAIL_KERNELS to 3 when running two workers may be better." );
953+ config.write (" \n # -use TAIL_KERNELS=3\n " );
951954 }
952955 }
953956
@@ -981,6 +984,7 @@ skip_1K_256 = 0;
981984 // Skip some FFTs and NTTs
982985 if (shape.fft_type == FFT64 && !time_FFTs) continue ;
983986 if (shape.fft_type != FFT64 && !time_NTTs) continue ;
987+ if ((shape.fft_type == FFT3261 || shape.fft_type == FFT323161 || shape.fft_type == FFT3231 || shape.fft_type == FFT32) && !time_FP32) continue ;
984988
985989 // Time an exponent that's good for all variants and carry-config.
986990 u32 exponent = primes.prevPrime (FFTConfig{shape, shape.width <= 1024 ? 0u : 100u , CARRY_32}.maxExp ());
0 commit comments