Skip to content

Commit 4d0e759

Browse files
authored
Merge pull request #340 from gwoltman/master
NTT
2 parents 6c16f7e + 09ed544 commit 4d0e759

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+13225
-1247
lines changed

Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,12 @@ else
1919
CXX = g++
2020
endif
2121

22-
COMMON_FLAGS = -Wall -std=c++20
23-
# -static-libstdc++ -static-libgcc
22+
ifneq ($(findstring MINGW, $(HOST_OS)), MINGW)
23+
COMMON_FLAGS = -Wall -std=c++20 -static-libstdc++ -static-libgcc
24+
else
25+
# For mingw-64 use this:
26+
COMMON_FLAGS = -Wall -std=c++20 -static-libstdc++ -static-libgcc -static
27+
endif
2428
# -fext-numeric-literals
2529

2630
ifeq ($(HOST_OS), Darwin)

src/Args.cpp

Lines changed: 37 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ named "config.txt" in the prpll run directory.
156156
-prp <exponent> : run a single PRP test and exit, ignoring worktodo.txt
157157
-ll <exponent> : run a single LL test and exit, ignoring worktodo.txt
158158
-verify <file> : verify PRP-proof contained in <file>
159+
-smallest : work on smallest exponent in worktodo.txt rather than the first exponent in worktodo.txt
159160
-proof <power> : generate proof of power <power> (default: optimal depending on exponent).
160161
A lower power reduces disk space requirements but increases the verification cost.
161162
A higher power increases disk usage a lot.
@@ -185,6 +186,8 @@ named "config.txt" in the prpll run directory.
185186
2 = calculate from scratch, no memory read
186187
1 = calculate using one complex multiply from cached memory and uncached memory
187188
0 = read trig values from memory
189+
-use INPLACE=n : Perform transforms in-place. Great if the reduced memory usage fits in the GPU's L2 cache.
190+
0 = not in-place, 1 = nVidia friendly access pattern, 2 = AMD friendly access pattern.
188191
-use PAD=<val> : insert pad bytes to possibly improve memory access patterns. Val is the number of bytes to pad.
189192
-use MIDDLE_IN_LDS_TRANSPOSE=0|1 : Transpose values in local memory before writing to global memory
190193
-use MIDDLE_OUT_LDS_TRANSPOSE=0|1 : Transpose values in local memory before writing to global memory
@@ -194,23 +197,14 @@ named "config.txt" in the prpll run directory.
194197
195198
-use DEBUG : enable asserts in OpenCL kernels (slow, developers)
196199
197-
-tune : measures the speed of the FFTs specified in -fft <spec> to find the best FFT for each exponent.
198-
199-
-ctune <configs> : finds the best configuration for each FFT specified in -fft <spec>.
200-
Prints the results in a form that can be incorporated in config.txt
201-
-fft 6.5M -ctune "OUT_SIZEX=32,8;OUT_WG=64,128,256"
202-
203-
It is possible to specify -ctune multiple times on the same command in order to define multiple
204-
sets of parameters to be combined, e.g.:
205-
-ctune "IN_WG=256,128,64" -ctune "OUT_WG=256,64;OUT_SIZEX=32,16,8"
206-
which would try only 8 combinations among those two sets.
207-
208-
The tunable parameters (with the default value emphasized) are:
209-
IN_WG, OUT_WG: 64, 128, *256*
210-
IN_SIZEX, OUT_SIZEX: 4, 8, 16, *32*
211-
UNROLL_W: *0*, 1
212-
UNROLL_H: 0, 1
213-
200+
-tune <options> : Looks for the best settings to include in config.txt. Times many FFTs to find the fastest one for testing exponents -- results are written to tune.txt.
201+
An -fft <spec> can be given on the command line to limit which FFTs are timed.
202+
Options are not required. If present, the options are a comma-separated list of the items below.
203+
noconfig - Skip timings to find best config.txt settings
204+
fp64 - Tune for settings that affect FP64 FFTs. Time FP64 FFTs for tune.txt.
205+
ntt - Tune for settings that affect integer NTTs. Time integer NTTs for tune.txt.
206+
minexp=<val> - Time FFTs to find the best one for exponents greater than <val>.
207+
maxexp=<val> - Time FFTs to find the best one for exponents less than <val>.
214208
-device <N> : select the GPU at position N in the list of devices
215209
-uid <UID> : select the GPU with the given UID (on ROCm/AMDGPU, Linux)
216210
-pci <BDF> : select the GPU with the given PCI BDF, e.g. "0c:00.0"
@@ -236,31 +230,34 @@ Device selection : use one of -uid <UID>, -pci <BDF>, -device <N>, see the list
236230
);
237231

238232
}
239-
printf("\nFFT Configurations (specify with -fft <width>:<middle>:<height> from the set below):\n"
233+
printf("\nFFT Configurations (specify with -fft <type>:<width>:<middle>:<height> from the set below):\n"
240234
" Size MaxExp BPW FFT\n");
241-
235+
242236
vector<FFTShape> configs = FFTShape::allShapes();
243237
configs.push_back(configs.front()); // dummy guard for the loop below.
244-
string variants;
245238
u32 activeSize = 0;
246-
double maxBpw = 0;
247-
for (auto c : configs) {
248-
if (c.size() != activeSize) {
249-
if (!variants.empty()) {
250-
printf("%5s %7.2fM %.2f %s\n",
251-
numberK(activeSize).c_str(),
252-
// activeSize * FFTShape::MIN_BPW / 1'000'000,
253-
activeSize * maxBpw / 1'000'000.0,
254-
maxBpw,
255-
variants.c_str());
256-
variants.clear();
239+
float maxBpw = 0;
240+
string variants;
241+
for (enum FFT_TYPES type : {FFT64, FFT3161, FFT3261, FFT61}) {
242+
for (auto c : configs) {
243+
if (c.fft_type != type) continue;
244+
if (c.size() != activeSize) {
245+
if (!variants.empty()) {
246+
printf("%5s %7.2fM %.2f %s\n",
247+
numberK(activeSize).c_str(),
248+
// activeSize * FFTShape::MIN_BPW / 1'000'000,
249+
activeSize * maxBpw / 1'000'000.0,
250+
maxBpw,
251+
variants.c_str());
252+
variants.clear();
253+
}
254+
activeSize = c.size();
255+
maxBpw = 0;
257256
}
258-
activeSize = c.size();
259-
maxBpw = 0;
257+
maxBpw = max(maxBpw, c.maxBpw());
258+
if (!variants.empty()) { variants.push_back(','); }
259+
variants += c.spec();
260260
}
261-
maxBpw = max(maxBpw, c.maxBpw());
262-
if (!variants.empty()) { variants.push_back(','); }
263-
variants += c.spec();
264261
}
265262
}
266263

@@ -295,9 +292,10 @@ void Args::parse(const string& line) {
295292
log("-info expects an FFT spec, e.g. -info 1K:13:256\n");
296293
throw "-info <fft>";
297294
}
298-
log(" FFT | BPW | Max exp (M)\n");
295+
log(" FFT | BPW | Max exp (M)\n");
299296
for (const FFTShape& shape : FFTShape::multiSpec(s)) {
300297
for (u32 variant = 0; variant <= LAST_VARIANT; variant = next_variant (variant)) {
298+
if (variant != LAST_VARIANT && shape.fft_type != FFT64) continue;
301299
FFTConfig fft{shape, variant, CARRY_AUTO};
302300
log("%12s | %.2f | %5.1f\n", fft.spec().c_str(), fft.maxBpw(), fft.maxExp() / 1'000'000.0);
303301
}
@@ -310,8 +308,8 @@ void Args::parse(const string& line) {
310308
assert(s.empty());
311309
logROE = true;
312310
} else if (key == "-tune") {
313-
assert(s.empty());
314311
doTune = true;
312+
if (!s.empty()) { tune = s; }
315313
} else if (key == "-ctune") {
316314
doCtune = true;
317315
if (!s.empty()) { ctune.push_back(s); }
@@ -372,6 +370,7 @@ void Args::parse(const string& line) {
372370
else if (key == "-iters") { iters = stoi(s); assert(iters && (iters % 10000 == 0)); }
373371
else if (key == "-prp" || key == "-PRP") { prpExp = stoll(s); }
374372
else if (key == "-ll" || key == "-LL") { llExp = stoll(s); }
373+
else if (key == "-smallest") { smallest = true; }
375374
else if (key == "-fft") { fftSpec = s; }
376375
else if (key == "-dump") { dump = s; }
377376
else if (key == "-user") { user = s; }

src/Args.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class Args {
4343
string uid;
4444
string verifyPath;
4545

46+
string tune;
4647
vector<string> ctune;
4748

4849
bool doCtune{};
@@ -53,14 +54,15 @@ class Args {
5354

5455
std::map<std::string, std::string> flags;
5556
std::map<std::string, vector<KeyVal>> perFftConfig;
56-
57+
5758
int device = 0;
5859

5960
bool safeMath = true;
6061
bool clean = true;
6162
bool verbose = false;
6263
bool useCache = false;
6364
bool profile = false;
65+
bool smallest = false;
6466

6567
fs::path masterDir;
6668
fs::path proofResultDir = "proof";

src/Background.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
class Background {
1717
unsigned maxSize;
1818
std::deque<std::function<void()> > tasks;
19-
std::jthread thread;
2019
std::mutex mut;
2120
std::condition_variable cond;
22-
bool stopRequested{};
21+
bool stopRequested;
22+
std::jthread thread;
2323

2424
void run() {
2525
std::function<void()> task;
@@ -59,6 +59,7 @@ class Background {
5959
public:
6060
Background(unsigned size = 2) :
6161
maxSize{size},
62+
stopRequested(false),
6263
thread{&Background::run, this} {
6364
}
6465

src/Buffer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class Buffer {
2626

2727
Queue* queue;
2828
TimeInfo *tInfo;
29-
29+
3030
Buffer(cl_context context, TimeInfo *tInfo, Queue* queue, size_t size, unsigned flags, const T* ptr = nullptr)
3131
: ptr{size == 0 ? NULL : makeBuf_(context, flags, size * sizeof(T), ptr)}
3232
, size{size}

0 commit comments

Comments
 (0)