1414#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
1515#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels
1616
17+ #define SAVE_ONE_MORE_WIDTH_MUL 0 // I want to make saving the only option -- but rocm optimizer is inexplicably making it slower in carryfused
18+ #define SAVE_ONE_MORE_HEIGHT_MUL 1 // In tailSquar this is the fastest option
19+
1720#define _USE_MATH_DEFINES
1821#include < cmath>
1922
@@ -93,23 +96,100 @@ double2 root1(u32 N, u32 k) {
9396namespace {
9497static const constexpr bool LOG_TRIG_ALLOC = false ;
9598
99+ // Interleave two lines of trig values so that AMD GPUs can use global_load_dwordx4 instructions
100+ void T2shuffle (u32 size, u32 radix, u32 line, vector<double > &tab) {
101+ vector<double > line1, line2;
102+ u32 line_size = size / radix;
103+ for (u32 col = 0 ; col < line_size; ++col) {
104+ line1.push_back (tab[line*line_size + col]);
105+ line2.push_back (tab[(line+1 )*line_size + col]);
106+ }
107+ for (u32 col = 0 ; col < line_size; ++col) {
108+ tab[line*line_size + 2 *col] = line1[col];
109+ tab[line*line_size + 2 *col + 1 ] = line2[col];
110+ }
111+ }
112+
96113vector<double2> genSmallTrig (u32 size, u32 radix) {
97114 if (LOG_TRIG_ALLOC) { log (" genSmallTrig(%u, %u)\n " , size, radix); }
98115
99116 vector<double2> tab;
100- # if 1
117+ // old fft_WIDTH
101118 for (u32 line = 1 ; line < radix; ++line) {
102119 for (u32 col = 0 ; col < size / radix; ++col) {
103120 tab.push_back (radix / line >= 8 ? root1Fancy (size, col * line) : root1 (size, col * line));
104121 }
105122 }
106123 tab.resize (size);
107- #else
108- tab.resize(size);
109- auto *p = tab.data() + radix;
110- for (u32 w = radix; w < size; w *= radix) { p = smallTrigBlock(w, std::min(radix, size / w), p); }
111- assert(p - tab.data() == size);
124+
125+ // New fft_WIDTH
126+ vector<double > tab1;
127+ // Epsilon value, 2^-250, should have an exact representation as a double
128+ const double epsilon = 5.5271478752604445602472651921923E-76 ; // Protect against divide by zero
129+ // Sine/cosine values for first fft8
130+ // TO DO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
131+ for (u32 line = 1 ; line < radix; ++line) {
132+ for (u32 col = 0 ; col < size / radix; ++col) {
133+ double2 root = root1 (size, col * line); root.first += epsilon;
134+ tab1.push_back (root.second / root.first );
135+ }
136+ }
137+ // Interleave trig values for faster AMD GPU access
138+ for (u32 i = 0 ; i < 6 ; i += 2 ) T2shuffle (size, radix, i, tab1);
139+ // Sine/cosine values for second fft8
140+ for (u32 line = 0 ; line < radix; ++line) {
141+ for (u32 col = 0 ; col < size / radix / 8 ; ++col) {
142+ double2 root = root1 (size, 8 * col * line); root.first += epsilon;
143+ tab1.push_back (root.second / root.first );
144+ }
145+ }
146+ // Cosine values for first fft8 (output in post-shufl order)
147+ // TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
148+ for (u32 col = 0 ; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
149+ for (u32 line = 0 ; line < radix; ++line) {
150+ double2 root = root1 (size, col * line); root.first += epsilon;
151+ #if SAVE_ONE_MORE_WIDTH_MUL
152+ if (col / 8 == 3 ) { // Compute cosine3 / cosine1
153+ root.first /= root1 (size, (col - 16 ) * line).first + epsilon;
154+ }
155+ if (col / 8 == 5 ) { // Compute cosine5 / cosine1
156+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
157+ }
112158#endif
159+ if (col / 8 == 6 ) { // Compute cosine6 / cosine2
160+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
161+ }
162+ if (col / 8 == 7 ) { // Compute cosine7 / cosine3
163+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
164+ }
165+ tab1.push_back (root.first );
166+ }
167+ }
168+ // Interleave trig values for faster AMD GPU access
169+ for (u32 i = 8 ; i < 16 ; i += 2 ) T2shuffle (size, radix, i, tab1);
170+ // Cosine values for second fft8 (output in post-shufl order)
171+ for (u32 col = 0 ; col < size / radix / 8 ; ++col) {
172+ for (u32 line = 0 ; line < radix; ++line) {
173+ double2 root = root1 (size, 8 * col * line); root.first += epsilon;
174+ #if SAVE_ONE_MORE_WIDTH_MUL
175+ if (col == 3 ) { // Compute cosine3 / cosine1
176+ root.first /= root1 (size, 8 * (col - 2 ) * line).first + epsilon;
177+ }
178+ if (col == 5 ) { // Compute cosine5 / cosine1
179+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
180+ }
181+ #endif
182+ if (col == 6 ) { // Compute cosine6 / cosine2
183+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
184+ }
185+ if (col == 7 ) { // Compute cosine7 / cosine3
186+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
187+ }
188+ tab1.push_back (root.first );
189+ }
190+ }
191+ // Convert to a vector of double2
192+ for (u32 i = 0 ; i < tab1.size (); i += 2 ) tab.push_back ({tab1[i], tab1[i+1 ]});
113193
114194 return tab;
115195}
@@ -119,11 +199,85 @@ vector<double2> genSmallTrigCombo(u32 width, u32 middle, u32 size, u32 radix) {
119199 if (LOG_TRIG_ALLOC) { log (" genSmallTrigCombo(%u, %u)\n " , size, radix); }
120200
121201 vector<double2> tab;
202+ // old fft_HEIGHT
122203 for (u32 line = 1 ; line < radix; ++line) {
123204 for (u32 col = 0 ; col < size / radix; ++col) {
124205 tab.push_back (radix / line >= 8 ? root1Fancy (size, col * line) : root1 (size, col * line));
125206 }
126207 }
208+ tab.resize (size);
209+
210+ // New fft_HEIGHT
211+ vector<double > tab1;
212+ // Epsilon value, 2^-250, should have an exact representation as a double
213+ const double epsilon = 5.5271478752604445602472651921923E-76 ; // Protect against divide by zero
214+ // Sine/cosine values for first fft8
215+ // TO DO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
216+ for (u32 line = 1 ; line < radix; ++line) {
217+ for (u32 col = 0 ; col < size / radix; ++col) {
218+ double2 root = root1 (size, col * line); root.first += epsilon;
219+ tab1.push_back (root.second / root.first );
220+ }
221+ }
222+ // Interleave trig values for faster AMD GPU access
223+ for (u32 i = 0 ; i < 6 ; i += 2 ) T2shuffle (size, radix, i, tab1);
224+ // Sine/cosine values for second fft8
225+ for (u32 line = 0 ; line < radix; ++line) {
226+ for (u32 col = 0 ; col < size / radix / 8 ; ++col) {
227+ double2 root = root1 (size, 8 * col * line); root.first += epsilon;
228+ tab1.push_back (root.second / root.first );
229+ }
230+ }
231+ // Cosine values for first fft8 (output in post-shufl order)
232+ // TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
233+ for (u32 col = 0 ; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
234+ for (u32 line = 0 ; line < radix; ++line) {
235+ double2 root = root1 (size, col * line); root.first += epsilon;
236+ #if SAVE_ONE_MORE_HEIGHT_MUL
237+ if (col / 8 == 3 ) { // Compute cosine3 / cosine1
238+ root.first /= root1 (size, (col - 16 ) * line).first + epsilon;
239+ }
240+ if (col / 8 == 5 ) { // Compute cosine5 / cosine1
241+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
242+ }
243+ #endif
244+ if (col / 8 == 6 ) { // Compute cosine6 / cosine2
245+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
246+ }
247+ if (col / 8 == 7 ) { // Compute cosine7 / cosine3
248+ root.first /= root1 (size, (col - 32 ) * line).first + epsilon;
249+ }
250+ tab1.push_back (root.first );
251+ }
252+ }
253+ // Interleave trig values for faster AMD GPU access
254+ for (u32 i = 8 ; i < 16 ; i += 2 ) T2shuffle (size, radix, i, tab1);
255+ // Cosine values for second fft8 (output in post-shufl order)
256+ for (u32 col = 0 ; col < size / radix / 8 ; ++col) {
257+ for (u32 line = 0 ; line < radix; ++line) {
258+ double2 root = root1 (size, 8 * col * line); root.first += epsilon;
259+ #if SAVE_ONE_MORE_HEIGHT_MUL
260+ if (col == 3 ) { // Compute cosine3 / cosine1
261+ root.first /= root1 (size, 8 * (col - 2 ) * line).first + epsilon;
262+ }
263+ if (col == 5 ) { // Compute cosine5 / cosine1
264+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
265+ }
266+ #endif
267+ if (col == 6 ) { // Compute cosine6 / cosine2
268+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
269+ }
270+ if (col == 7 ) { // Compute cosine7 / cosine3
271+ root.first /= root1 (size, 8 * (col - 4 ) * line).first + epsilon;
272+ }
273+ tab1.push_back (root.first );
274+ }
275+ }
276+ // Convert to a vector of double2
277+ for (u32 i = 0 ; i < tab1.size (); i += 2 ) tab.push_back ({tab1[i], tab1[i+1 ]});
278+
279+ tab.resize (size*4 );
280+
127281 // From tailSquare pre-calculate some or all of these: T2 trig = slowTrig_N(line + H * lowMe, ND / NH * 2);
128282#if PREFER_DP_TO_MEM == 2 // No pre-computed trig values
129283#elif PREFER_DP_TO_MEM == 1 // best option on a Radeon VII.
0 commit comments