Skip to content

Commit cca201a

Browse files
authored
Merge pull request #319 from gwoltman/master
New fft_height and fft_width (backward compatible)
2 parents 4c35daf + 67ae312 commit cca201a

File tree

8 files changed

+460
-27
lines changed

8 files changed

+460
-27
lines changed

src/TrigBufCache.cpp

Lines changed: 160 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
1515
#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels
1616

17+
#define SAVE_ONE_MORE_WIDTH_MUL 0 // I want to make saving the only option -- but rocm optimizer is inexplicably making it slower in carryfused
18+
#define SAVE_ONE_MORE_HEIGHT_MUL 1 // In tailSquare this is the fastest option
19+
1720
#define _USE_MATH_DEFINES
1821
#include <cmath>
1922

@@ -93,23 +96,100 @@ double2 root1(u32 N, u32 k) {
9396
namespace {
9497
static const constexpr bool LOG_TRIG_ALLOC = false;
9598

99+
// Interleave two lines of trig values so that AMD GPUs can use global_load_dwordx4 instructions
100+
void T2shuffle(u32 size, u32 radix, u32 line, vector<double> &tab) {
101+
vector<double> line1, line2;
102+
u32 line_size = size / radix;
103+
for (u32 col = 0; col < line_size; ++col) {
104+
line1.push_back(tab[line*line_size + col]);
105+
line2.push_back(tab[(line+1)*line_size + col]);
106+
}
107+
for (u32 col = 0; col < line_size; ++col) {
108+
tab[line*line_size + 2*col] = line1[col];
109+
tab[line*line_size + 2*col + 1] = line2[col];
110+
}
111+
}
112+
96113
vector<double2> genSmallTrig(u32 size, u32 radix) {
97114
if (LOG_TRIG_ALLOC) { log("genSmallTrig(%u, %u)\n", size, radix); }
98115

99116
vector<double2> tab;
100-
#if 1
117+
// old fft_WIDTH
101118
for (u32 line = 1; line < radix; ++line) {
102119
for (u32 col = 0; col < size / radix; ++col) {
103120
tab.push_back(radix / line >= 8 ? root1Fancy(size, col * line) : root1(size, col * line));
104121
}
105122
}
106123
tab.resize(size);
107-
#else
108-
tab.resize(size);
109-
auto *p = tab.data() + radix;
110-
for (u32 w = radix; w < size; w *= radix) { p = smallTrigBlock(w, std::min(radix, size / w), p); }
111-
assert(p - tab.data() == size);
124+
125+
// New fft_WIDTH
126+
vector<double> tab1;
127+
// Epsilon value, 2^-250, should have an exact representation as a double
128+
const double epsilon = 5.5271478752604445602472651921923E-76; // Protect against divide by zero
129+
// Sine/cosine values for first fft8
130+
//TODO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
131+
for (u32 line = 1; line < radix; ++line) {
132+
for (u32 col = 0; col < size / radix; ++col) {
133+
double2 root = root1(size, col * line); root.first += epsilon;
134+
tab1.push_back(root.second / root.first);
135+
}
136+
}
137+
// Interleave trig values for faster AMD GPU access
138+
for (u32 i = 0; i < 6; i += 2) T2shuffle(size, radix, i, tab1);
139+
// Sine/cosine values for second fft8
140+
for (u32 line = 0; line < radix; ++line) {
141+
for (u32 col = 0; col < size / radix / 8; ++col) {
142+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
143+
tab1.push_back(root.second / root.first);
144+
}
145+
}
146+
// Cosine values for first fft8 (output in post-shuffle order)
147+
//TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
148+
for (u32 col = 0; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
149+
for (u32 line = 0; line < radix; ++line) {
150+
double2 root = root1(size, col * line); root.first += epsilon;
151+
#if SAVE_ONE_MORE_WIDTH_MUL
152+
if (col / 8 == 3) { // Compute cosine3 / cosine1
153+
root.first /= root1(size, (col - 16) * line).first + epsilon;
154+
}
155+
if (col / 8 == 5) { // Compute cosine5 / cosine1
156+
root.first /= root1(size, (col - 32) * line).first + epsilon;
157+
}
112158
#endif
159+
if (col / 8 == 6) { // Compute cosine6 / cosine2
160+
root.first /= root1(size, (col - 32) * line).first + epsilon;
161+
}
162+
if (col / 8 == 7) { // Compute cosine7 / cosine3
163+
root.first /= root1(size, (col - 32) * line).first + epsilon;
164+
}
165+
tab1.push_back(root.first);
166+
}
167+
}
168+
// Interleave trig values for faster AMD GPU access
169+
for (u32 i = 8; i < 16; i += 2) T2shuffle(size, radix, i, tab1);
170+
// Cosine values for second fft8 (output in post-shuffle order)
171+
for (u32 col = 0; col < size / radix / 8; ++col) {
172+
for (u32 line = 0; line < radix; ++line) {
173+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
174+
#if SAVE_ONE_MORE_WIDTH_MUL
175+
if (col == 3) { // Compute cosine3 / cosine1
176+
root.first /= root1(size, 8 * (col - 2) * line).first + epsilon;
177+
}
178+
if (col == 5) { // Compute cosine5 / cosine1
179+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
180+
}
181+
#endif
182+
if (col == 6) { // Compute cosine6 / cosine2
183+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
184+
}
185+
if (col == 7) { // Compute cosine7 / cosine3
186+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
187+
}
188+
tab1.push_back(root.first);
189+
}
190+
}
191+
// Convert to a vector of double2
192+
for (u32 i = 0; i < tab1.size(); i += 2) tab.push_back({tab1[i], tab1[i+1]});
113193

114194
return tab;
115195
}
@@ -119,11 +199,85 @@ vector<double2> genSmallTrigCombo(u32 width, u32 middle, u32 size, u32 radix) {
119199
if (LOG_TRIG_ALLOC) { log("genSmallTrigCombo(%u, %u)\n", size, radix); }
120200

121201
vector<double2> tab;
202+
// old fft_HEIGHT
122203
for (u32 line = 1; line < radix; ++line) {
123204
for (u32 col = 0; col < size / radix; ++col) {
124205
tab.push_back(radix / line >= 8 ? root1Fancy(size, col * line) : root1(size, col * line));
125206
}
126207
}
208+
tab.resize(size);
209+
210+
// New fft_HEIGHT
211+
vector<double> tab1;
212+
// Epsilon value, 2^-250, should have an exact representation as a double
213+
const double epsilon = 5.5271478752604445602472651921923E-76; // Protect against divide by zero
214+
// Sine/cosine values for first fft8
215+
//TODO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
216+
for (u32 line = 1; line < radix; ++line) {
217+
for (u32 col = 0; col < size / radix; ++col) {
218+
double2 root = root1(size, col * line); root.first += epsilon;
219+
tab1.push_back(root.second / root.first);
220+
}
221+
}
222+
// Interleave trig values for faster AMD GPU access
223+
for (u32 i = 0; i < 6; i += 2) T2shuffle(size, radix, i, tab1);
224+
// Sine/cosine values for second fft8
225+
for (u32 line = 0; line < radix; ++line) {
226+
for (u32 col = 0; col < size / radix / 8; ++col) {
227+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
228+
tab1.push_back(root.second / root.first);
229+
}
230+
}
231+
// Cosine values for first fft8 (output in post-shuffle order)
232+
//TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
233+
for (u32 col = 0; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
234+
for (u32 line = 0; line < radix; ++line) {
235+
double2 root = root1(size, col * line); root.first += epsilon;
236+
#if SAVE_ONE_MORE_HEIGHT_MUL
237+
if (col / 8 == 3) { // Compute cosine3 / cosine1
238+
root.first /= root1(size, (col - 16) * line).first + epsilon;
239+
}
240+
if (col / 8 == 5) { // Compute cosine5 / cosine1
241+
root.first /= root1(size, (col - 32) * line).first + epsilon;
242+
}
243+
#endif
244+
if (col / 8 == 6) { // Compute cosine6 / cosine2
245+
root.first /= root1(size, (col - 32) * line).first + epsilon;
246+
}
247+
if (col / 8 == 7) { // Compute cosine7 / cosine3
248+
root.first /= root1(size, (col - 32) * line).first + epsilon;
249+
}
250+
tab1.push_back(root.first);
251+
}
252+
}
253+
// Interleave trig values for faster AMD GPU access
254+
for (u32 i = 8; i < 16; i += 2) T2shuffle(size, radix, i, tab1);
255+
// Cosine values for second fft8 (output in post-shuffle order)
256+
for (u32 col = 0; col < size / radix / 8; ++col) {
257+
for (u32 line = 0; line < radix; ++line) {
258+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
259+
#if SAVE_ONE_MORE_HEIGHT_MUL
260+
if (col == 3) { // Compute cosine3 / cosine1
261+
root.first /= root1(size, 8 * (col - 2) * line).first + epsilon;
262+
}
263+
if (col == 5) { // Compute cosine5 / cosine1
264+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
265+
}
266+
#endif
267+
if (col == 6) { // Compute cosine6 / cosine2
268+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
269+
}
270+
if (col == 7) { // Compute cosine7 / cosine3
271+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
272+
}
273+
tab1.push_back(root.first);
274+
}
275+
}
276+
// Convert to a vector of double2
277+
for (u32 i = 0; i < tab1.size(); i += 2) tab.push_back({tab1[i], tab1[i+1]});
278+
279+
tab.resize(size*4);
280+
127281
// From tailSquare pre-calculate some or all of these: T2 trig = slowTrig_N(line + H * lowMe, ND / NH * 2);
128282
#if PREFER_DP_TO_MEM == 2 // No pre-computed trig values
129283
#elif PREFER_DP_TO_MEM == 1 // best option on a Radeon VII.

src/cl/base.cl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,10 @@ typedef double2 T2;
145145
// For reasons unknown, loading trig values into nVidia's constant cache has terrible performance
146146
#if AMDGPU
147147
typedef constant const T2* Trig;
148+
typedef constant const T* TrigSingle;
148149
#else
149150
typedef global const T2* Trig;
151+
typedef global const T* TrigSingle;
150152
#endif
151153
// However, caching weights in nVidia's constant cache improves performance.
152154
// Even better is to not pollute the constant cache with weights that are used only once.

src/cl/carryfused.cl

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,14 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
2929
u32 line = gr % H;
3030

3131
T2 u[NW];
32-
32+
3333
readCarryFusedLine(in, u, line);
3434

35-
// Split 32 bits into NW groups of 2 bits. See later for different way to do this.
35+
#if HAS_ASM
36+
__asm("s_setprio 3");
37+
#endif
38+
39+
// Split 32 bits into NW groups of 2 bits. See later for different way to do this.
3640
#if !BIGLIT
3741
#define GPW (16 / NW)
3842
u32 b = bits[(G_W * line + me) / GPW] >> (me % GPW * (2 * NW));
@@ -43,7 +47,18 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
4347
// common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs
4448
// which causes a terrible reduction in occupancy.
4549
// fft_WIDTH(lds + (get_group_id(0) / 131072), u, smallTrig + (get_group_id(0) / 131072));
46-
fft_WIDTH(lds, u, smallTrig);
50+
51+
// A temporary hack until we figure out which combinations we want to finally offer:
52+
// UNROLL_W=0: old fft_WIDTH, no loop unrolling
53+
// UNROLL_W=1: old fft_WIDTH, loop unrolling
54+
// UNROLL_W=2: old fft_WIDTH, loop unrolling with "hidden zero" hack to thwart rocm optimizer. I'm seeing this as best R7Pro option.
55+
// UNROLL_W=3: new fft_WIDTH if applicable, hidden zero hack. Slightly better on Radeon VII -- more study needed as to why results weren't better.
56+
// UNROLL_W=4: new fft_WIDTH if applicable, no hidden zero hack. Best on Titan V.
57+
#if UNROLL_W == 2 || UNROLL_W == 3
58+
new_fft_WIDTH1(lds + (get_group_id(0) / 131072), u, smallTrig + (get_group_id(0) / 131072));
59+
#else
60+
new_fft_WIDTH1(lds, u, smallTrig);
61+
#endif
4762

4863
Word2 wu[NW];
4964
#if AMDGPU
@@ -86,7 +101,7 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
86101
}
87102

88103
// On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather read the flags from memory.
89-
// On Radeon VII this code is about he same speed. Not sure which is better on other GPUs.
104+
// On Radeon VII this code is about the same speed. Not sure which is better on other GPUs.
90105
#if BIGLIT
91106
// Calculate the most significant 32-bits of FRAC_BPW * the index of the FFT word. Also add FRAC_BPW_HI to test first biglit flag.
92107
u32 fft_word_index = (me * H + line) * 2;
@@ -147,10 +162,16 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
147162
}
148163
}
149164
if (gr == 0) { return; }
165+
#if HAS_ASM
166+
__asm("s_setprio 0");
167+
#endif
150168
u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT;
151169
if (me % WAVEFRONT == 0) {
152-
do { spin(); } while(atomic_load((atomic_uint *) &ready[pos]) == 0);
170+
do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0);
153171
}
172+
#if HAS_ASM
173+
__asm("s_setprio 1");
174+
#endif
154175
mem_fence(CLK_GLOBAL_MEM_FENCE);
155176

156177
// Clear carry ready flag for next iteration
@@ -201,6 +222,8 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
201222

202223
bar();
203224

204-
fft_WIDTH(lds, u, smallTrig);
225+
// fft_WIDTH(lds, u, smallTrig);
226+
new_fft_WIDTH2(lds, u, smallTrig);
227+
205228
writeCarryFusedLine(u, out, line);
206229
}

0 commit comments

Comments
 (0)