Skip to content

Commit 7deb35a

Browse files
committed
Update carry32 bpw and tune.txt
1 parent e2e88cc commit 7deb35a

File tree

7 files changed

+194
-173
lines changed

7 files changed

+194
-173
lines changed

src/Args.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ void Args::parse(const string& line) {
278278
log(" FFT | BPW | Max exp (M)\n");
279279
for (const FFTShape& shape : FFTShape::multiSpec(s)) {
280280
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
281-
FFTConfig fft{shape, variant, FFTConfig::CARRY_AUTO};
281+
FFTConfig fft{shape, variant, CARRY_AUTO};
282282
log("%12s | %.2f | %5.1f\n", fft.spec().c_str(), fft.maxBpw(), fft.maxExp() / 1'000'000.0);
283283
}
284284
}

src/FFTConfig.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,11 @@ FFTShape::FFTShape(const string& w, const string& m, const string& h) :
112112
double FFTShape::carry32BPW() const {
113113
// The formula below was validated empirically with -carryTune
114114

115-
// We observe that FFT 6.5M (1024:13:256) has safe carry32 up to 18.3 BPW
115+
// We observe that FFT 6.5M (1024:13:256) has safe carry32 up to 18.35 BPW
116116
// while the 0.5*log2() models the impact of FFT size changes.
117117
// We model carry with a Gumbel distribution similar to the one used for ROE, and measure carry with
118118
// -use STATS=1. See -carryTune
119-
return 18.3 + 0.5 * (log2(13 * 1024 * 512) - log2(size()));
119+
return 18.35 + 0.5 * (log2(13 * 1024 * 512) - log2(size()));
120120
}
121121

122122
bool FFTShape::needsLargeCarry(u32 E) const {

src/FFTConfig.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,25 +47,23 @@ class FFTShape {
4747
bool needsLargeCarry(u32 E) const;
4848
};
4949

50+
enum CARRY_KIND { CARRY_32=0, CARRY_64=1, CARRY_AUTO=2};
51+
5052
struct FFTConfig {
5153
public:
5254
static const u32 N_VARIANT = 4;
5355
static FFTConfig bestFit(const Args& args, u32 E, const std::string& spec);
5456

55-
enum CARRY_KIND { CARRY_AUTO, CARRY_32, CARRY_64};
56-
5757
FFTShape shape{};
5858
u32 variant;
5959
u32 carry;
6060

6161
explicit FFTConfig(const string& spec);
62-
FFTConfig(FFTShape shape, u32 variant, u32 carry /* = CARRY_AUTO*/);
62+
FFTConfig(FFTShape shape, u32 variant, u32 carry);
6363

6464
std::string spec() const;
6565
u32 size() const { return shape.size(); }
6666
u32 maxExp() const { return maxBpw() * shape.size(); }
6767

6868
double maxBpw() const;
69-
70-
// bool needsLargeCarry(u32 E) const { return shape.needsLargeCarry(E); }
7169
};

src/Gpu.cpp

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ string clDefines(const Args& args, cl_device_id id, FFTConfig fft, const vector<
188188

189189
if (isAmdGpu(id)) { defines += toDefine("AMDGPU", 1); }
190190

191-
if ((fft.carry == FFTConfig::CARRY_AUTO && fft.shape.needsLargeCarry(E)) || (fft.carry == FFTConfig::CARRY_64)) {
191+
if ((fft.carry == CARRY_AUTO && fft.shape.needsLargeCarry(E)) || (fft.carry == CARRY_64)) {
192192
if (doLog) { log("Using CARRY64\n"); }
193193
defines += toDefine("CARRY64", 1);
194194
}
@@ -475,7 +475,31 @@ RoeInfo Gpu::readCarryStats() {
475475
assert(carry.size() == carryPos);
476476
bufStatsCarry.zero(carryPos);
477477
carryPos = 0;
478-
return roeStat(carry);
478+
479+
RoeInfo ret = roeStat(carry);
480+
481+
#if 0
482+
log("%s\n", ret.toString().c_str());
483+
484+
std::sort(carry.begin(), carry.end());
485+
File fo = File::openAppend("carry.txt");
486+
auto it = carry.begin();
487+
u32 n = carry.size();
488+
u32 c = 0;
489+
for (int i=0; i < 500; ++i) {
490+
double y = 0.23 + (0.48 - 0.23) / 500 * i;
491+
while (it < carry.end() && *it < y) {
492+
++c;
493+
++it;
494+
}
495+
fo.printf("%f %f\n", y, c / double(n));
496+
}
497+
498+
// for (auto x : carry) { fo.printf("%f\n", x); }
499+
fo.printf("\n\n");
500+
#endif
501+
502+
return ret;
479503
}
480504

481505
template<typename T>

src/TuneEntry.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,6 @@ void TuneEntry::writeTuneFile(const vector<TuneEntry>& results) {
7979
assert(r.cost >= prevCost && maxExp > prevMaxExp);
8080
prevCost = r.cost;
8181
prevMaxExp = maxExp;
82-
tune->printf("%6.0f %14s # %u\n", r.cost, r.fft.spec().c_str(), maxExp);
82+
tune->printf("%6.1f %14s # %u\n", r.cost, r.fft.spec().c_str(), maxExp);
8383
}
8484
}

src/tune.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ void Tune::ztune() {
162162
double bpw[4];
163163
double A[4];
164164
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
165-
FFTConfig fft{shape, variant, FFTConfig::CARRY_AUTO};
165+
FFTConfig fft{shape, variant, CARRY_AUTO};
166166
std::tie(bpw[variant], A[variant]) = maxBpw(fft);
167167
}
168168
string s = "\""s + shape.spec() + "\"";
@@ -178,7 +178,7 @@ void Tune::carryTune() {
178178
shared.args->flags["STATS"] = "1";
179179
u32 prevSize = 0;
180180
for (FFTShape shape : FFTShape::multiSpec(shared.args->fftSpec)) {
181-
FFTConfig fft{shape, 3, FFTConfig::CARRY_AUTO};
181+
FFTConfig fft{shape, 3, CARRY_AUTO};
182182
if (prevSize == fft.size()) { continue; }
183183
prevSize = fft.size();
184184

@@ -194,7 +194,9 @@ void Tune::carryTune() {
194194
}
195195

196196
double avg = (zv[0] + zv[1]) / 2;
197-
log("%14s %.3f : %.3f (%.3f %.3f) %f\n", fft.spec().c_str(), mid, avg, zv[0], zv[1], m);
197+
u32 exponent = fft.shape.carry32BPW() * fft.size();
198+
double pErr100 = -expm1(-exp(-avg) * exponent * 100);
199+
log("%14s %.3f : %.3f (%.3f %.3f) %f %.0f%%\n", fft.spec().c_str(), mid, avg, zv[0], zv[1], m, pErr100 * 100);
198200
fo.printf("%f %f\n", log2(fft.size()), avg);
199201
}
200202
}
@@ -223,7 +225,7 @@ void Tune::ctune() {
223225
}
224226

225227
for (FFTShape shape : shapes) {
226-
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, FFTConfig::CARRY_AUTO}.maxExp());
228+
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, CARRY_AUTO}.maxExp());
227229
// log("tuning %10s with exponent %u\n", fft.shape.spec().c_str(), exponent);
228230

229231
vector<int> bestPos(configsVect.size());
@@ -240,7 +242,7 @@ void Tune::ctune() {
240242
for (u32 k = i + 1; k < configsVect.size(); ++k) {
241243
add(c, configsVect[k][bestPos[k]]);
242244
}
243-
auto cost = Gpu::make(q, exponent, shared, FFTConfig{shape, 0, FFTConfig::CARRY_AUTO}, c, false)->timePRP();
245+
auto cost = Gpu::make(q, exponent, shared, FFTConfig{shape, 0, CARRY_AUTO}, c, false)->timePRP();
244246

245247
bool isBest = (cost < best.cost);
246248
if (isBest) {
@@ -267,28 +269,28 @@ void Tune::tune() {
267269
double minCost = -1;
268270

269271
// Time an exponent that's good for all variants and carry-config.
270-
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, FFTConfig::CARRY_32}.maxExp());
272+
u32 exponent = primes.prevPrime(FFTConfig{shape, 0, CARRY_32}.maxExp());
271273

272274
for (u32 variant = 0; variant < FFTConfig::N_VARIANT; ++variant) {
273-
vector carryToTest{FFTConfig::CARRY_32};
274-
// We need to test both carry-32 and carry-64 only when the carry cutoff BPW is within the range.
275-
if (shape.carry32BPW() < FFTConfig{shape, variant, FFTConfig::CARRY_64}.maxBpw()) {
276-
carryToTest.push_back(FFTConfig::CARRY_64);
275+
vector carryToTest{CARRY_32};
276+
// We need to test both carry-32 and carry-64 only when the carry transition is within the BPW range.
277+
if (FFTConfig{shape, variant, CARRY_64}.maxBpw() > FFTConfig{shape, variant, CARRY_32}.maxBpw()) {
278+
carryToTest.push_back(CARRY_64);
277279
}
278280

279281
for (auto carry : carryToTest) {
280282
FFTConfig fft{shape, variant, carry};
281283

282284
if (minCost > 0 && !TuneEntry{minCost, fft}.willUpdate(results)) {
283-
log("skipped %s %9u\n", fft.spec().c_str(), fft.maxExp());
285+
// log("skipped %s %9u\n", fft.spec().c_str(), fft.maxExp());
284286
continue;
285287
}
286288

287289
double cost = Gpu::make(q, exponent, shared, fft, {}, false)->timePRP();
288290
if (minCost <= 0) { minCost = cost; }
289291

290292
bool isUseful = TuneEntry{cost, fft}.update(results);
291-
log("%c %6.0f %12s %9u\n", isUseful ? '*' : ' ', cost, fft.spec().c_str(), fft.maxExp());
293+
log("%c %6.1f %12s %9u\n", isUseful ? '*' : ' ', cost, fft.spec().c_str(), fft.maxExp());
292294
}
293295
}
294296
}

0 commit comments

Comments
 (0)