Skip to content

Commit 49af97c

Browse files
committed
-tune (FFT tuning) takes carry 32/64 into account
1 parent 900e538 commit 49af97c

File tree

6 files changed: 188 additions and 31 deletions.

src/Args.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -272,7 +272,7 @@ void Args::parse(const string& line) {
272272
log(" FFT | BPW | Max exp (M)\n");
273273
for (const FFTShape& shape : FFTShape::multiSpec(s)) {
274274
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
275-
FFTConfig fft{shape, variant};
275+
FFTConfig fft{shape, variant, FFTConfig::CARRY_AUTO};
276276
log("%12s | %.2f | %5.1f\n", fft.spec().c_str(), fft.maxBpw(), fft.maxExp() / 1'000'000.0);
277277
}
278278
}

src/FFTConfig.cpp

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ FFTShape::FFTShape(const string& w, const string& m, const string& h) :
108108
FFTShape{parseInt(w), parseInt(m), parseInt(h)}
109109
{}
110110

111-
double FFTShape::carryLimitBPW() const {
111+
double FFTShape::carry32BPW() const {
112112
// The formula below was validated empirically with -carryTune
113113

114114
// We observe that FFT 6.5M (1024:13:256) has safe carry32 up to 18.3 BPW
@@ -119,7 +119,7 @@ double FFTShape::carryLimitBPW() const {
119119
}
120120

121121
bool FFTShape::needsLargeCarry(u32 E) const {
122-
return E / double(size()) > carryLimitBPW();
122+
return E / double(size()) > carry32BPW();
123123
}
124124

125125
FFTShape::FFTShape(u32 w, u32 m, u32 h) :
@@ -149,9 +149,9 @@ FFTConfig::FFTConfig(const string& spec) {
149149
assert(v.size() == 3 || v.size() == 4 || v.size() == 5);
150150

151151
if (v.size() == 3) {
152-
*this = {FFTShape{v[0], v[1], v[2]}, 3};
152+
*this = {FFTShape{v[0], v[1], v[2]}, 3, CARRY_AUTO};
153153
} else if (v.size() == 4) {
154-
*this = {FFTShape{v[0], v[1], v[2]}, parseInt(v[3])};
154+
*this = {FFTShape{v[0], v[1], v[2]}, parseInt(v[3]), CARRY_AUTO};
155155
} else if (v.size() == 5) {
156156
int c = parseInt(v[4]);
157157
assert(c == 0 || c == 1);
@@ -161,7 +161,7 @@ FFTConfig::FFTConfig(const string& spec) {
161161
}
162162
}
163163

164-
FFTConfig::FFTConfig(FFTShape shape, u32 variant, CARRY_KIND carry) :
164+
FFTConfig::FFTConfig(FFTShape shape, u32 variant, u32 carry) :
165165
shape{shape},
166166
variant{variant},
167167
carry{carry}
@@ -176,7 +176,7 @@ string FFTConfig::spec() const {
176176

177177
double FFTConfig::maxBpw() const {
178178
double b = shape.bpw[variant];
179-
return carry == CARRY_32 ? std::min(shape.carryLimitBPW(), b) : b;
179+
return carry == CARRY_32 ? std::min(shape.carry32BPW(), b) : b;
180180
}
181181

182182
FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) {
@@ -211,7 +211,7 @@ FFTConfig FFTConfig::bestFit(const Args& args, u32 E, const string& spec) {
211211
// Take the first FFT that can handle E
212212
for (const FFTShape& shape : FFTShape::allShapes()) {
213213
for (u32 v = 0; v < 4; ++v) {
214-
if (FFTConfig fft{shape, v}; fft.maxExp() >= E) { return fft; }
214+
if (FFTConfig fft{shape, v, CARRY_AUTO}; fft.maxExp() >= E) { return fft; }
215215
}
216216
}
217217

src/FFTConfig.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ class FFTShape {
4343
double maxBpw() const { return *max_element(bpw.begin(), bpw.end()); }
4444
std::string spec() const { return numberK(width) + ':' + numberK(middle) + ':' + numberK(height); }
4545

46-
double carryLimitBPW() const;
46+
double carry32BPW() const;
4747
bool needsLargeCarry(u32 E) const;
4848
};
4949

@@ -56,10 +56,10 @@ struct FFTConfig {
5656

5757
FFTShape shape{};
5858
u32 variant;
59-
CARRY_KIND carry;
59+
u32 carry;
6060

6161
explicit FFTConfig(const string& spec);
62-
FFTConfig(FFTShape shape, u32 variant, CARRY_KIND carry = CARRY_AUTO);
62+
FFTConfig(FFTShape shape, u32 variant, u32 carry /* = CARRY_AUTO*/);
6363

6464
std::string spec() const;
6565
u32 size() const { return shape.size(); }

src/TuneEntry.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,9 +76,9 @@ void TuneEntry::writeTuneFile(const vector<TuneEntry>& results) {
7676
CycleFile tune{"tune.txt"};
7777
for (const TuneEntry& r : results) {
7878
u32 maxExp = r.fft.maxExp();
79-
assert(r.cost > prevCost && maxExp > prevMaxExp);
79+
assert(r.cost >= prevCost && maxExp > prevMaxExp);
8080
prevCost = r.cost;
8181
prevMaxExp = maxExp;
82-
tune->printf("%6.0f %12s # %u\n", r.cost, r.fft.spec().c_str(), maxExp);
82+
tune->printf("%6.0f %14s # %u\n", r.cost, r.fft.spec().c_str(), maxExp);
8383
}
8484
}

src/tune.cpp

Lines changed: 25 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,7 @@ void Tune::ztune() {
162162
double bpw[4];
163163
double A[4];
164164
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
165-
FFTConfig fft{shape, variant};
165+
FFTConfig fft{shape, variant, FFTConfig::CARRY_AUTO};
166166
std::tie(bpw[variant], A[variant]) = maxBpw(fft);
167167
}
168168
string s = "\""s + shape.spec() + "\"";
@@ -178,13 +178,13 @@ void Tune::carryTune() {
178178
shared.args->flags["STATS"] = "1";
179179
u32 prevSize = 0;
180180
for (FFTShape shape : FFTShape::multiSpec(shared.args->fftSpec)) {
181-
FFTConfig fft{shape, 3};
181+
FFTConfig fft{shape, 3, FFTConfig::CARRY_AUTO};
182182
if (prevSize == fft.size()) { continue; }
183183
prevSize = fft.size();
184184

185185
vector<double> zv;
186186
double m = 0;
187-
const double mid = fft.shape.carryLimitBPW();
187+
const double mid = fft.shape.carry32BPW();
188188
for (double bpw : {mid - 0.05, mid + 0.05}) {
189189
u32 exponent = primes.nearestPrime(fft.size() * bpw);
190190
auto [ok, carry] = Gpu::make(q, exponent, shared, fft, {}, false)->measureCarry();
@@ -223,7 +223,7 @@ void Tune::ctune() {
223223
}
224224

225225
for (FFTShape shape : shapes) {
226-
u32 exponent = primes.prevPrime(FFTConfig{shape, 0}.maxExp());
226+
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, FFTConfig::CARRY_AUTO}.maxExp());
227227
// log("tuning %10s with exponent %u\n", fft.shape.spec().c_str(), exponent);
228228

229229
vector<int> bestPos(configsVect.size());
@@ -240,7 +240,7 @@ void Tune::ctune() {
240240
for (u32 k = i + 1; k < configsVect.size(); ++k) {
241241
add(c, configsVect[k][bestPos[k]]);
242242
}
243-
auto cost = Gpu::make(q, exponent, shared, FFTConfig{shape, 0}, c, false)->timePRP();
243+
auto cost = Gpu::make(q, exponent, shared, FFTConfig{shape, 0, FFTConfig::CARRY_AUTO}, c, false)->timePRP();
244244

245245
bool isBest = (cost < best.cost);
246246
if (isBest) {
@@ -264,22 +264,29 @@ void Tune::tune() {
264264
vector<TuneEntry> results = TuneEntry::readTuneFile(*args);
265265

266266
for (const FFTShape& shape : FFTShape::multiSpec(args->fftSpec)) {
267-
double costZero{};
268-
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
269-
FFTConfig fft{shape, variant};
267+
double minCost = -1;
270268

271-
if (variant > 0 && !TuneEntry{costZero, fft}.willUpdate(results)) {
272-
log("skipped %s\n", fft.spec().c_str());
273-
continue;
274-
}
269+
// Time an exponent that's good for all variants and carry-config.
270+
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, FFTConfig::CARRY_32}.maxExp());
271+
272+
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
273+
// We need to test both carry-32 and carry-64 only when the carry cutoff BPW is within the range.
274+
bool testBoth = shape.carry32BPW() < FFTConfig{shape, variant, FFTConfig::CARRY_64}.maxBpw();
275+
auto carries = testBoth ? vector{FFTConfig::CARRY_32, FFTConfig::CARRY_64} : vector{FFTConfig::CARRY_AUTO};
276+
for (auto carry : carries) {
277+
FFTConfig fft{shape, variant, carry};
278+
279+
if (minCost > 0 && !TuneEntry{minCost, fft}.willUpdate(results)) {
280+
log("skipped %s %9u\n", fft.spec().c_str(), fft.maxExp());
281+
continue;
282+
}
275283

276-
u32 maxExp = fft.maxExp();
277-
u32 exponent = primes.prevPrime(maxExp);
278-
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
279-
if (variant == 0) { costZero = cost; }
284+
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
285+
if (minCost <= 0) { minCost = cost; }
280286

281-
bool isUseful = TuneEntry{cost, fft}.update(results);
282-
log("%c %6.0f %12s %9u\n", isUseful ? '*' : ' ', cost, fft.spec().c_str(), exponent);
287+
bool isUseful = TuneEntry{cost, fft}.update(results);
288+
log("%c %6.0f %12s %9u\n", isUseful ? '*' : ' ', cost, fft.spec().c_str(), fft.maxExp());
289+
}
283290
}
284291
}
285292

tune.txt

Lines changed: 150 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,150 @@
1+
68 256:2:256:2 # 5151391
2+
84 256:3:256:0 # 7613448
3+
85 256:3:256:3 # 7625244
4+
111 256:4:256:2 # 10140778
5+
112 256:4:256:3 # 10153361
6+
127 256:5:256:2 # 12587499
7+
129 256:5:256:3 # 12627476
8+
150 512:3:256:0 # 14983888
9+
150 512:3:256:2 # 15019278
10+
153 512:3:256:3 # 15023996
11+
153 256:3:512:2 # 15032647
12+
155 256:6:256:3 # 15042084
13+
166 256:7:256:2 # 17427988
14+
166 256:7:256:3 # 17483038
15+
193 512:4:256:0 # 19791872
16+
194 256:8:256:0 # 19828572
17+
195 256:8:256:2 # 19918749
18+
196 512:4:256:2 # 19938672
19+
198 512:4:256:3 # 19960692
20+
203 256:9:256:2 # 22238724
21+
207 256:9:256:3 # 22375563
22+
224 512:5:256:0 # 24537989
23+
225 512:5:256:2 # 24718868
24+
228 512:5:256:3 # 24806686
25+
235 256:10:256:3 # 24936448
26+
241 256:11:256:2 # 27125874
27+
252 256:11:256:3 # 27383955
28+
269 512:6:256:0 # 29500637
29+
269 512:6:256:2 # 29547823
30+
270 1K:3:256:0 # 29617029
31+
272 1K:3:256:2 # 29646913
32+
272 1K:3:256:3 # 29678370
33+
279 256:13:256:2 # 31965839
34+
293 512:7:256:0 # 34259599
35+
293 512:7:256:2 # 34309144
36+
295 512:7:256:1 # 34377039
37+
296 512:7:256:3 # 34439430
38+
322 256:15:256:0 # 36588748
39+
322 256:15:256:2 # 36679188
40+
331 256:15:256:3 # 36991795
41+
345 512:8:256:0 # 38944112
42+
350 512:8:256:2 # 39097204
43+
361 512:9:256:0 # 43774377
44+
363 512:9:256:2 # 43864031
45+
368 512:9:256:1 # 43993792
46+
369 512:9:256:3 # 44052774
47+
396 256:9:512:3 # 44102320
48+
414 1K:5:256:0 # 48625090
49+
415 1K:5:256:1 # 48735191
50+
419 1K:5:256:2 # 48958013
51+
420 1K:5:256:3 # 49112678
52+
434 512:11:256:0 # 53063712
53+
435 512:11:256:2 # 53285748
54+
442 512:11:256:1 # 53401092
55+
448 512:11:256:3 # 53744238
56+
492 512:12:256:0 # 58029244
57+
494 512:12:256:2 # 58129907
58+
502 512:12:256:1 # 58337525
59+
503 512:13:256:0 # 62452662
60+
507 512:13:256:2 # 62841159
61+
528 512:13:256:3 # 63168315
62+
557 1K:7:256:0 # 67704455
63+
557 1K:7:256:1 # 67924656
64+
564 1K:7:256:3 # 68009066
65+
586 512:15:256:1 # 72418590
66+
590 512:15:256:3 # 72831467
67+
672 512:8:512:0 # 76319555
68+
679 512:8:512:2 # 76822872
69+
682 1K:8:256:0 # 77007421
70+
691 1K:8:256:2 # 77468794
71+
702 1K:9:256:0 # 86411575
72+
706 1K:9:256:1 # 86940057
73+
713 1K:9:256:3 # 87081615
74+
810 1K:10:256:0 # 95635374
75+
823 1K:10:256:2 # 96254033
76+
836 1K:5:512:3 # 96322191
77+
838 1K:10:256:3:0 # 96936949
78+
846 1K:11:256:0 # 104899018
79+
856 1K:11:256:2 # 105331556
80+
862 1K:11:256:1 # 105723723
81+
871 1K:11:256:3:0 # 106234141
82+
927 1K:11:256:3:1 # 106467688
83+
968 1K:12:256:0 # 114517082
84+
987 1K:12:256:1 # 115209142
85+
988 1K:13:256:0 # 123733016
86+
999 1K:13:256:2 # 124305539
87+
1032 1K:13:256:3:0 # 124728115
88+
1097 1K:13:256:3:1 # 125293821
89+
1125 1K:7:512:0 # 133309661
90+
1129 1K:7:512:1 # 133647302
91+
1148 1K:7:512:3 # 133911543
92+
1160 1K:15:256:1:0 # 143105258
93+
1228 1K:15:256:1:1 # 143287910
94+
1243 1K:15:256:3:1 # 143712583
95+
1372 1K:8:512:0 # 151120773
96+
1387 1K:8:512:1 # 151749918
97+
1390 1K:8:512:2 # 152236457
98+
1406 1K:8:512:3:0 # 152255079
99+
1433 1K:9:512:0 # 170124115
100+
1437 1K:9:512:1:0 # 170485157
101+
1521 1K:9:512:1:1 # 170982899
102+
1544 1K:9:512:3:1 # 171426447
103+
1644 1K:10:512:0 # 187789475
104+
1664 1K:10:512:1 # 188512993
105+
1676 1K:10:512:2:0 # 188631019
106+
1740 1K:11:512:0 # 206153187
107+
1763 1K:11:512:2:0 # 206701114
108+
1872 1K:11:512:1:1 # 207594979
109+
1902 1K:11:512:3:1 # 209094443
110+
1990 1K:12:512:0:0 # 224702353
111+
2032 1K:13:512:0:0 # 242640486
112+
2150 1K:13:512:0:1 # 242722275
113+
2196 1K:13:512:2:1 # 244439842
114+
2273 1K:13:512:3:1 # 245639413
115+
2306 4K:7:256:0:0 # 260520378
116+
2387 1K:15:512:0:0 # 278346196
117+
2503 1K:15:512:1:1 # 281290997
118+
2559 1K:15:512:3:1 # 283052605
119+
2726 4K:8:256:0:0 # 296121551
120+
2913 4K:9:256:0:0 # 331533131
121+
3290 1K:9:1K:1:1 # 337700192
122+
3350 1K:9:1K:3:1 # 338625036
123+
3352 4K:10:256:0:0 # 366776278
124+
3569 4K:11:256:0:0 # 401867893
125+
4018 4K:12:256:0:0 # 436821794
126+
4178 4K:13:256:0:0 # 471649484
127+
4728 4K:14:256:0:0 # 506360692
128+
4879 4K:15:256:0:0 # 540963752
129+
5514 1K:15:1K:1:1 # 555913052
130+
5617 1K:15:1K:3:1 # 558272348
131+
5683 4K:8:512:0:0 # 575465886
132+
6303 4K:9:512:0:0 # 644191894
133+
7048 4K:10:512:0:0 # 712581036
134+
7710 4K:11:512:0:0 # 780667115
135+
8473 4K:12:512:0:0 # 848477765
136+
8847 4K:13:512:0:0 # 916035993
137+
10176 4K:14:512:0:0 # 983361256
138+
10555 4K:15:512:0:0 # 1050470224
139+
12494 4K:15:512:1:1 # 1083892039
140+
12506 4K:15:512:3:1 # 1093203394
141+
12683 4K:8:1K:0:0 # 1117377340
142+
13761 4K:9:1K:0:0 # 1250635053
143+
15480 4K:10:1K:0:0 # 1383219032
144+
16880 4K:11:1K:0:0 # 1515196887
145+
18631 4K:12:1K:0:0 # 1646623883
146+
20056 4K:13:1K:0:0 # 1777546035
147+
22309 4K:14:1K:0:0 # 1908002257
148+
23235 4K:15:1K:0:0 # 2038025889
149+
27010 4K:15:1K:1:1 # 2145386496
150+
27238 4K:15:1K:3:1 # 2159731015

0 commit comments

Comments
 (0)