@@ -156,6 +156,7 @@ named "config.txt" in the prpll run directory.
156156-prp <exponent> : run a single PRP test and exit, ignoring worktodo.txt
157157-ll <exponent> : run a single LL test and exit, ignoring worktodo.txt
158158-verify <file> : verify PRP-proof contained in <file>
159+ -smallest : work on smallest exponent in worktodo.txt rather than the first exponent in worktodo.txt
159160-proof <power> : generate proof of power <power> (default: optimal depending on exponent).
160161 A lower power reduces disk space requirements but increases the verification cost.
161162 A higher power increases disk usage a lot.
@@ -185,6 +186,8 @@ named "config.txt" in the prpll run directory.
185186 2 = calculate from scratch, no memory read
186187 1 = calculate using one complex multiply from cached memory and uncached memory
187188 0 = read trig values from memory
189+ -use INPLACE=n : Perform transforms in-place. Great if the reduced memory usage fits in the GPU's L2 cache.
190+ 0 = not in-place, 1 = nVidia friendly access pattern, 2 = AMD friendly access pattern.
188191 -use PAD=<val> : insert pad bytes to possibly improve memory access patterns. Val is the number of bytes to pad.
189192 -use MIDDLE_IN_LDS_TRANSPOSE=0|1 : Transpose values in local memory before writing to global memory
190193 -use MIDDLE_OUT_LDS_TRANSPOSE=0|1 : Transpose values in local memory before writing to global memory
@@ -194,23 +197,14 @@ named "config.txt" in the prpll run directory.
194197
195198 -use DEBUG : enable asserts in OpenCL kernels (slow, developers)
196199
197- -tune : measures the speed of the FFTs specified in -fft <spec> to find the best FFT for each exponent.
198-
199- -ctune <configs> : finds the best configuration for each FFT specified in -fft <spec>.
200- Prints the results in a form that can be incorporated in config.txt
201- -fft 6.5M -ctune "OUT_SIZEX=32,8;OUT_WG=64,128,256"
202-
203- It is possible to specify -ctune multiple times on the same command in order to define multiple
204- sets of parameters to be combined, e.g.:
205- -ctune "IN_WG=256,128,64" -ctune "OUT_WG=256,64;OUT_SIZEX=32,16,8"
206- which would try only 8 combinations among those two sets.
207-
208- The tunable parameters (with the default value emphasized) are:
209- IN_WG, OUT_WG: 64, 128, *256*
210- IN_SIZEX, OUT_SIZEX: 4, 8, 16, *32*
211- UNROLL_W: *0*, 1
212- UNROLL_H: 0, 1
213-
200+ -tune <options> : Looks for best settings to include in config.txt. Times many FFTs to find fastest one to test exponents -- written to tune.txt.
201+ An -fft <spec> can be given on the command line to limit which FFTs are timed.
202+ Options are not required. If present, the options are a comma separated list from below.
203+ noconfig - Skip timings to find best config.txt settings
204+ fp64 - Tune for settings that affect FP64 FFTs. Time FP64 FFTs for tune.txt.
205+ ntt - Tune for settings that affect integer NTTs. Time integer NTTs for tune.txt.
206+ minexp=<val> - Time FFTs to find the best one for exponents greater than <val>.
207+ maxexp=<val> - Time FFTs to find the best one for exponents less than <val>.
214208-device <N> : select the GPU at position N in the list of devices
215209-uid <UID> : select the GPU with the given UID (on ROCm/AMDGPU, Linux)
216210-pci <BDF> : select the GPU with the given PCI BDF, e.g. "0c:00.0"
@@ -236,31 +230,34 @@ Device selection : use one of -uid <UID>, -pci <BDF>, -device <N>, see the list
236230 );
237231
238232 }
239- printf (" \n FFT Configurations (specify with -fft <width>:<middle>:<height> from the set below):\n "
233+ printf (" \n FFT Configurations (specify with -fft <type>:< width>:<middle>:<height> from the set below):\n "
240234 " Size MaxExp BPW FFT\n " );
241-
235+
242236 vector<FFTShape> configs = FFTShape::allShapes ();
243237 configs.push_back (configs.front ()); // dummy guard for the loop below.
244- string variants;
245238 u32 activeSize = 0 ;
246- double maxBpw = 0 ;
247- for (auto c : configs) {
248- if (c.size () != activeSize) {
249- if (!variants.empty ()) {
250- printf (" %5s %7.2fM %.2f %s\n " ,
251- numberK (activeSize).c_str (),
252- // activeSize * FFTShape::MIN_BPW / 1'000'000,
253- activeSize * maxBpw / 1'000'000.0 ,
254- maxBpw,
255- variants.c_str ());
256- variants.clear ();
239+ float maxBpw = 0 ;
240+ string variants;
241+ for (enum FFT_TYPES type : {FFT64, FFT3161, FFT3261, FFT61}) {
242+ for (auto c : configs) {
243+ if (c.fft_type != type) continue ;
244+ if (c.size () != activeSize) {
245+ if (!variants.empty ()) {
246+ printf (" %5s %7.2fM %.2f %s\n " ,
247+ numberK (activeSize).c_str (),
248+ // activeSize * FFTShape::MIN_BPW / 1'000'000,
249+ activeSize * maxBpw / 1'000'000.0 ,
250+ maxBpw,
251+ variants.c_str ());
252+ variants.clear ();
253+ }
254+ activeSize = c.size ();
255+ maxBpw = 0 ;
257256 }
258- activeSize = c.size ();
259- maxBpw = 0 ;
257+ maxBpw = max (maxBpw, c.maxBpw ());
258+ if (!variants.empty ()) { variants.push_back (' ,' ); }
259+ variants += c.spec ();
260260 }
261- maxBpw = max (maxBpw, c.maxBpw ());
262- if (!variants.empty ()) { variants.push_back (' ,' ); }
263- variants += c.spec ();
264261 }
265262}
266263
@@ -295,9 +292,10 @@ void Args::parse(const string& line) {
295292 log (" -info expects an FFT spec, e.g. -info 1K:13:256\n " );
296293 throw " -info <fft>" ;
297294 }
298- log (" FFT | BPW | Max exp (M)\n " );
295+ log (" FFT | BPW | Max exp (M)\n " );
299296 for (const FFTShape& shape : FFTShape::multiSpec (s)) {
300297 for (u32 variant = 0 ; variant <= LAST_VARIANT; variant = next_variant (variant)) {
298+ if (variant != LAST_VARIANT && shape.fft_type != FFT64) continue ;
301299 FFTConfig fft{shape, variant, CARRY_AUTO};
302300 log (" %12s | %.2f | %5.1f\n " , fft.spec ().c_str (), fft.maxBpw (), fft.maxExp () / 1'000'000.0 );
303301 }
@@ -310,8 +308,8 @@ void Args::parse(const string& line) {
310308 assert (s.empty ());
311309 logROE = true ;
312310 } else if (key == " -tune" ) {
313- assert (s.empty ());
314311 doTune = true ;
312+ if (!s.empty ()) { tune = s; }
315313 } else if (key == " -ctune" ) {
316314 doCtune = true ;
317315 if (!s.empty ()) { ctune.push_back (s); }
@@ -372,6 +370,7 @@ void Args::parse(const string& line) {
372370 else if (key == " -iters" ) { iters = stoi (s); assert (iters && (iters % 10000 == 0 )); }
373371 else if (key == " -prp" || key == " -PRP" ) { prpExp = stoll (s); }
374372 else if (key == " -ll" || key == " -LL" ) { llExp = stoll (s); }
373+ else if (key == " -smallest" ) { smallest = true ; }
375374 else if (key == " -fft" ) { fftSpec = s; }
376375 else if (key == " -dump" ) { dump = s; }
377376 else if (key == " -user" ) { user = s; }
0 commit comments