Skip to content

Commit 4dd475f

Browse files
committed
Coded up a new MM_CHAIN=1 version for TRIG_HI (fft spec :1 and :3).
This version uses fewer F64 ops but is slower on the Radeon VII -- probably the ROCm optimizer acting up. The new version is disabled for now; I'll ask some users to test whether it is beneficial on other GPUs.
1 parent 947c51b commit 4dd475f

File tree

1 file changed

+34
-14
lines changed

1 file changed

+34
-14
lines changed

src/cl/fft-middle.cl

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -130,17 +130,37 @@ void middleMul(T2 *u, u32 s, Trig trig) {
130130
WADD(k, base);
131131
}
132132

133-
#elif MM_CHAIN == 1
133+
#elif 0 && MM_CHAIN == 1 // This is fewer F64 ops, but slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
134134
for (u32 k = 3 + (MIDDLE - 2) % 3; k < MIDDLE; k += 3) {
135-
T2 base = slowTrig_N(WIDTH * k * s, WIDTH * SMALL_HEIGHT * k);
136-
WADD(k-1, base);
135+
T2 base, base_minus1, base_plus1;
136+
base = slowTrig_N(WIDTH * k * s, WIDTH * SMALL_HEIGHT * k);
137+
cmul_a_by_fancyb_and_conjfancyb(&base_plus1, &base_minus1, base, w);
138+
WADD(k-1, base_minus1);
137139
WADD(k, base);
138-
WADD(k+1, base);
140+
WADD(k+1, base_plus1);
139141
}
140142

143+
WADDF(1, w);
144+
145+
T2 w2;
146+
if ((MIDDLE - 2) % 3 > 0) {
147+
w2 = csqTrigFancy(w);
148+
WADDF(2, w2);
149+
}
150+
151+
if ((MIDDLE - 2) % 3 == 2) {
152+
T2 w3 = ccubeTrigFancy(w2, w);
153+
WADDF(3, w3);
154+
}
155+
156+
#elif MM_CHAIN == 1
141157
for (u32 k = 3 + (MIDDLE - 2) % 3; k < MIDDLE; k += 3) {
142-
WSUBF(k-1, w);
143-
WADDF(k+1, w);
158+
T2 base, base_minus1, base_plus1;
159+
base = slowTrig_N(WIDTH * k * s, WIDTH * SMALL_HEIGHT * k);
160+
cmul_a_by_fancyb_and_conjfancyb(&base_plus1, &base_minus1, base, w);
161+
WADD(k-1, base_minus1);
162+
WADD(k, base);
163+
WADD(k+1, base_plus1);
144164
}
145165

146166
WADDF(1, w);
@@ -230,22 +250,22 @@ void middleMul2(T2 *u, u32 x, u32 y, double factor, Trig trig) {
230250
T2 base, base_minus1, base_plus1;
231251
for (u32 i = 1; ; i += 3) {
232252
if (i-1 == MIDDLE-1) {
233-
base_minus1 = slowTrig_N(x * y + x * SMALL_HEIGHT * (i - 1), ND / MIDDLE * i) * factor;
234-
WADD(i-1, base_minus1);
235-
break;
253+
base_minus1 = slowTrig_N(x * y + x * SMALL_HEIGHT * (i - 1), ND / MIDDLE * i) * factor;
254+
WADD(i-1, base_minus1);
255+
break;
236256
} else if (i == MIDDLE-1) {
237-
base_minus1 = slowTrig_N(x * y + x * SMALL_HEIGHT * (i - 1), ND / MIDDLE * i) * factor;
257+
base_minus1 = slowTrig_N(x * y + x * SMALL_HEIGHT * (i - 1), ND / MIDDLE * i) * factor;
238258
base = cmulFancy(base_minus1, w);
239259
WADD(i-1, base_minus1);
240-
WADD(i, base);
241-
break;
260+
WADD(i, base);
261+
break;
242262
} else {
243263
base = slowTrig_N(x * y + x * SMALL_HEIGHT * i, ND / MIDDLE * (i + 1)) * factor;
244264
cmul_a_by_fancyb_and_conjfancyb(&base_plus1, &base_minus1, base, w);
245265
WADD(i-1, base_minus1);
246266
WADD(i, base);
247-
WADD(i+1, base_plus1);
248-
if (i+1 == MIDDLE-1) break;
267+
WADD(i+1, base_plus1);
268+
if (i+1 == MIDDLE-1) break;
249269
}
250270
}
251271
#else

0 commit comments

Comments
 (0)