@@ -130,17 +130,37 @@ void middleMul(T2 *u, u32 s, Trig trig) {
130130 WADD (k , base );
131131 }
132132
133- #elif MM_CHAIN == 1
133+ #elif 0 && MM_CHAIN == 1 // This is fewer F64 ops, but slower on Radeon 7 -- probably the optimizer being weird. It also has somewhat worse Z.
134134 for (u32 k = 3 + (MIDDLE - 2 ) % 3 ; k < MIDDLE ; k += 3 ) {
135- T2 base = slowTrig_N (WIDTH * k * s , WIDTH * SMALL_HEIGHT * k );
136- WADD (k - 1 , base );
135+ T2 base , base_minus1 , base_plus1 ;
136+ base = slowTrig_N (WIDTH * k * s , WIDTH * SMALL_HEIGHT * k );
137+ cmul_a_by_fancyb_and_conjfancyb (& base_plus1 , & base_minus1 , base , w );
138+ WADD (k - 1 , base_minus1 );
137139 WADD (k , base );
138- WADD (k + 1 , base );
140+ WADD (k + 1 , base_plus1 );
139141 }
140142
143+ WADDF (1 , w );
144+
145+ T2 w2 ;
146+ if ((MIDDLE - 2 ) % 3 > 0 ) {
147+ w2 = csqTrigFancy (w );
148+ WADDF (2 , w2 );
149+ }
150+
151+ if ((MIDDLE - 2 ) % 3 == 2 ) {
152+ T2 w3 = ccubeTrigFancy (w2 , w );
153+ WADDF (3 , w3 );
154+ }
155+
156+ #elif MM_CHAIN == 1
141157 for (u32 k = 3 + (MIDDLE - 2 ) % 3 ; k < MIDDLE ; k += 3 ) {
142- WSUBF (k - 1 , w );
143- WADDF (k + 1 , w );
158+ T2 base , base_minus1 , base_plus1 ;
159+ base = slowTrig_N (WIDTH * k * s , WIDTH * SMALL_HEIGHT * k );
160+ cmul_a_by_fancyb_and_conjfancyb (& base_plus1 , & base_minus1 , base , w );
161+ WADD (k - 1 , base_minus1 );
162+ WADD (k , base );
163+ WADD (k + 1 , base_plus1 );
144164 }
145165
146166 WADDF (1 , w );
@@ -230,22 +250,22 @@ void middleMul2(T2 *u, u32 x, u32 y, double factor, Trig trig) {
230250 T2 base , base_minus1 , base_plus1 ;
231251 for (u32 i = 1 ; ; i += 3 ) {
232252 if (i - 1 == MIDDLE - 1 ) {
233- base_minus1 = slowTrig_N (x * y + x * SMALL_HEIGHT * (i - 1 ), ND / MIDDLE * i ) * factor ;
234- WADD (i - 1 , base_minus1 );
235- break ;
253+ base_minus1 = slowTrig_N (x * y + x * SMALL_HEIGHT * (i - 1 ), ND / MIDDLE * i ) * factor ;
254+ WADD (i - 1 , base_minus1 );
255+ break ;
236256 } else if (i == MIDDLE - 1 ) {
237- base_minus1 = slowTrig_N (x * y + x * SMALL_HEIGHT * (i - 1 ), ND / MIDDLE * i ) * factor ;
257+ base_minus1 = slowTrig_N (x * y + x * SMALL_HEIGHT * (i - 1 ), ND / MIDDLE * i ) * factor ;
238258 base = cmulFancy (base_minus1 , w );
239259 WADD (i - 1 , base_minus1 );
240- WADD (i , base );
241- break ;
260+ WADD (i , base );
261+ break ;
242262 } else {
243263 base = slowTrig_N (x * y + x * SMALL_HEIGHT * i , ND / MIDDLE * (i + 1 )) * factor ;
244264 cmul_a_by_fancyb_and_conjfancyb (& base_plus1 , & base_minus1 , base , w );
245265 WADD (i - 1 , base_minus1 );
246266 WADD (i , base );
247- WADD (i + 1 , base_plus1 );
248- if (i + 1 == MIDDLE - 1 ) break ;
267+ WADD (i + 1 , base_plus1 );
268+ if (i + 1 == MIDDLE - 1 ) break ;
249269 }
250270 }
251271#else
0 commit comments