Skip to content

Commit 09ed544

Browse files
committed
Added a "nofp32" tune option that skips FP32 timings (a Windows user found the OpenCL compiler choking on use of the fma function on floats).
Minor wording changes to the tune output.
1 parent 4db49dc commit 09ed544

File tree

1 file changed

+11
-7
lines changed

1 file changed

+11
-7
lines changed

src/tune.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ void Tune::tune() {
347347
bool tune_config = 1;
348348
bool time_FFTs = 0;
349349
bool time_NTTs = 0;
350+
bool time_FP32 = 1;
350351
int quick = 7; // Run config from slowest (quick=1) to fastest (quick=10)
351352
u64 min_exponent = 75000000;
352353
u64 max_exponent = 350000000;
@@ -358,6 +359,7 @@ void Tune::tune() {
358359
if (s == "noconfig") tune_config = 0;
359360
if (s == "fp64") time_FFTs = 1;
360361
if (s == "ntt") time_NTTs = 1;
362+
if (s == "nofp32") time_FP32 = 0;
361363
auto keyVal = split(s, '=');
362364
if (keyVal.size() == 2) {
363365
if (keyVal.front() == "quick") quick = stod(keyVal.back());
@@ -554,7 +556,7 @@ void Tune::tune() {
554556

555557
// Find best INPLACE setting
556558
if (1) {
557-
FFTConfig fft{*defaultShape, 101, CARRY_AUTO};
559+
FFTConfig fft{*defaultShape, variant, CARRY_AUTO};
558560
u32 exponent = primes.prevPrime(fft.maxExp());
559561
u32 best_inplace = 0;
560562
double best_cost = -1.0;
@@ -566,7 +568,7 @@ void Tune::tune() {
566568
if (inplace == current_inplace) current_cost = cost;
567569
if (best_cost < 0.0 || cost < best_cost) { best_cost = cost; best_inplace = inplace; }
568570
}
569-
log("Best INPLACE is %u. Default INPLACE is 0. Best INPLACE setting may change when using larger FFTs.\n", best_inplace);
571+
log("Best INPLACE is %u. Default INPLACE is 0. Best INPLACE setting may be different for other FFT lengths.\n", best_inplace);
570572
configsUpdate(current_cost, best_cost, 0.002, "INPLACE", best_inplace, newConfigKeyVals, suggestedConfigKeyVals);
571573
args->flags["INPLACE"] = to_string(best_inplace);
572574
}
@@ -676,7 +678,7 @@ void Tune::tune() {
676678
}
677679

678680
// Find best TAIL_TRIGS32 setting
679-
if (time_NTTs) {
681+
if (time_NTTs && time_FP32) {
680682
FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO};
681683
if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO);
682684
u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw
@@ -759,7 +761,7 @@ void Tune::tune() {
759761
}
760762

761763
// Find best TABMUL_CHAIN32 setting
762-
if (time_NTTs) {
764+
if (time_NTTs && time_FP32) {
763765
FFTConfig fft{defaultNTTShape, 202, CARRY_AUTO};
764766
if (!fft.FFT_FP32) fft = FFTConfig(FFTShape(FFT3261, 512, 8, 512), 202, CARRY_AUTO);
765767
u32 exponent = primes.prevPrime(fft.maxBpw() * 0.95 * fft.shape.size()); // Back off the maxExp as different settings will have different maxBpw
@@ -945,9 +947,10 @@ void Tune::tune() {
945947
config.write("\n -log 1000000\n");
946948
}
947949
if (args->workers < 2) {
948-
config.write("\n# Running two workers sometimes gives better throughput.");
949-
config.write("\n# Changing TAIL_KERNELS to 3 with two workers may be better.");
950-
config.write("\n# -workers 2 -use TAIL_KERNELS=3\n");
950+
config.write("\n# Running two workers sometimes gives better throughput. Autoprimenet will need to create up a second worktodo file.");
951+
config.write("\n# -workers 2\n");
952+
config.write("\n# Changing TAIL_KERNELS to 3 when running two workers may be better.");
953+
config.write("\n# -use TAIL_KERNELS=3\n");
951954
}
952955
}
953956

@@ -981,6 +984,7 @@ skip_1K_256 = 0;
981984
// Skip some FFTs and NTTs
982985
if (shape.fft_type == FFT64 && !time_FFTs) continue;
983986
if (shape.fft_type != FFT64 && !time_NTTs) continue;
987+
if ((shape.fft_type == FFT3261 || shape.fft_type == FFT323161 || shape.fft_type == FFT3231 || shape.fft_type == FFT32) && !time_FP32) continue;
984988

985989
// Time an exponent that's good for all variants and carry-config.
986990
u32 exponent = primes.prevPrime(FFTConfig{shape, shape.width <= 1024 ? 0u : 100u, CARRY_32}.maxExp());

0 commit comments

Comments
 (0)