@@ -162,7 +162,7 @@ void Tune::ztune() {
162162 double bpw[4 ];
163163 double A[4 ];
164164 for (u32 variant = 0 ; variant < FFTConfig::N_VARIANT; ++variant) {
165- FFTConfig fft{shape, variant, FFTConfig:: CARRY_AUTO};
165+ FFTConfig fft{shape, variant, CARRY_AUTO};
166166 std::tie (bpw[variant], A[variant]) = maxBpw (fft);
167167 }
168168 string s = " \" " s + shape.spec () + " \" " ;
@@ -178,7 +178,7 @@ void Tune::carryTune() {
178178 shared.args ->flags [" STATS" ] = " 1" ;
179179 u32 prevSize = 0 ;
180180 for (FFTShape shape : FFTShape::multiSpec (shared.args ->fftSpec )) {
181- FFTConfig fft{shape, 3 , FFTConfig:: CARRY_AUTO};
181+ FFTConfig fft{shape, 3 , CARRY_AUTO};
182182 if (prevSize == fft.size ()) { continue ; }
183183 prevSize = fft.size ();
184184
@@ -194,7 +194,9 @@ void Tune::carryTune() {
194194 }
195195
196196 double avg = (zv[0 ] + zv[1 ]) / 2 ;
197- log (" %14s %.3f : %.3f (%.3f %.3f) %f\n " , fft.spec ().c_str (), mid, avg, zv[0 ], zv[1 ], m);
197+ u32 exponent = fft.shape .carry32BPW () * fft.size ();
198+ double pErr100 = -expm1 (-exp (-avg) * exponent * 100 );
199+ log (" %14s %.3f : %.3f (%.3f %.3f) %f %.0f%%\n " , fft.spec ().c_str (), mid, avg, zv[0 ], zv[1 ], m, pErr100 * 100 );
198200 fo.printf (" %f %f\n " , log2 (fft.size ()), avg);
199201 }
200202}
@@ -223,7 +225,7 @@ void Tune::ctune() {
223225 }
224226
225227 for (FFTShape shape : shapes) {
226- u32 exponent = primes.prevPrime (FFTConfig{shape, 0 , FFTConfig:: CARRY_AUTO}.maxExp ());
228+ u32 exponent = primes.prevPrime (FFTConfig{shape, 0 , CARRY_AUTO}.maxExp ());
227229 // log("tuning %10s with exponent %u\n", fft.shape.spec().c_str(), exponent);
228230
229231 vector<int > bestPos (configsVect.size ());
@@ -240,7 +242,7 @@ void Tune::ctune() {
240242 for (u32 k = i + 1 ; k < configsVect.size (); ++k) {
241243 add (c, configsVect[k][bestPos[k]]);
242244 }
243- auto cost = Gpu::make (q, exponent, shared, FFTConfig{shape, 0 , FFTConfig:: CARRY_AUTO}, c, false )->timePRP ();
245+ auto cost = Gpu::make (q, exponent, shared, FFTConfig{shape, 0 , CARRY_AUTO}, c, false )->timePRP ();
244246
245247 bool isBest = (cost < best.cost );
246248 if (isBest) {
@@ -267,28 +269,28 @@ void Tune::tune() {
267269 double minCost = -1 ;
268270
269271 // Time an exponent that's good for all variants and carry-config.
270- u32 exponent = primes.prevPrime (FFTConfig{shape, 0 , FFTConfig:: CARRY_32}.maxExp ());
272+ u32 exponent = primes.prevPrime (FFTConfig{shape, 0 , CARRY_32}.maxExp ());
271273
272274 for (u32 variant = 0 ; variant < FFTConfig::N_VARIANT; ++variant) {
273- vector carryToTest{FFTConfig:: CARRY_32};
274- // We need to test both carry-32 and carry-64 only when the carry cutoff BPW is within the range.
275- if (shape. carry32BPW () < FFTConfig{shape, variant, FFTConfig::CARRY_64 }.maxBpw ()) {
276- carryToTest.push_back (FFTConfig:: CARRY_64);
275+ vector carryToTest{CARRY_32};
276+ // We need to test both carry-32 and carry-64 only when the carry transition is within the BPW range.
277+ if (FFTConfig{ shape, variant, CARRY_64}. maxBpw () > FFTConfig{shape, variant, CARRY_32 }.maxBpw ()) {
278+ carryToTest.push_back (CARRY_64);
277279 }
278280
279281 for (auto carry : carryToTest) {
280282 FFTConfig fft{shape, variant, carry};
281283
282284 if (minCost > 0 && !TuneEntry{minCost, fft}.willUpdate (results)) {
283- log (" skipped %s %9u\n " , fft.spec ().c_str (), fft.maxExp ());
285+ // log("skipped %s %9u\n", fft.spec().c_str(), fft.maxExp());
284286 continue ;
285287 }
286288
287289 double cost = Gpu::make (q, exponent, shared, fft, {}, false )->timePRP ();
288290 if (minCost <= 0 ) { minCost = cost; }
289291
290292 bool isUseful = TuneEntry{cost, fft}.update (results);
291- log (" %c %6.0f %12s %9u\n " , isUseful ? ' *' : ' ' , cost, fft.spec ().c_str (), fft.maxExp ());
293+ log (" %c %6.1f %12s %9u\n " , isUseful ? ' *' : ' ' , cost, fft.spec ().c_str (), fft.maxExp ());
292294 }
293295 }
294296 }
0 commit comments