Skip to content

Commit d4a7205

Browse files
committed
Same optimization for readTailSquareLine. Saved 100 bytes of code in TailSquareZero (who cares) and surprisingly saved
nothing in TailSquare, as the ROCm optimizer was able to optimize the old code.
1 parent db77003 commit d4a7205

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

src/cl/middle.cl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,17 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
146146
u32 fftMiddleIn_i = line / WIDTH; // The i in fftMiddleIn's u[i]
147147
in += fftMiddleIn_i * IN_WG; // Adjust in pointer the same way writeMiddleInLine did
148148

149-
// Adjust in pointer based on the y value used in writeMiddleInLine
149+
// Adjust in pointer based on the y value used in writeMiddleInLine. This code is a little obscure because the ROCm compiler has trouble optimizing the straightforward (commented-out) code.
150150
in += me % SIZEY; // Adjust in pointer to read SIZEY consecutive values
151+
u32 fftMiddleIn_y = me; // The i=0 fftMiddleIn y value
152+
u32 chunk_y = fftMiddleIn_y / SIZEY; // The i=0 fftMiddleIn chunk_y value
153+
u32 fftMiddleIn_y_incr = G_H; // The increment to next fftMiddleIn y value
154+
u32 chunk_y_incr = fftMiddleIn_y_incr / SIZEY; // The increment to next fftMiddleIn chunk_y value
151155
for (i32 i = 0; i < NH; ++i) {
152-
u32 fftMiddleIn_y = i * G_H + me; // The fftMiddleIn y value
153-
u32 chunk_y = fftMiddleIn_y / SIZEY; // The fftMiddleIn chunk_y value
156+
// u32 fftMiddleIn_y = i * G_H + me; // The fftMiddleIn y value
157+
// u32 chunk_y = fftMiddleIn_y / SIZEY; // The fftMiddleIn chunk_y value
154158
u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG + PAD_SIZE)]); // Adjust in pointer the same way writeMiddleInLine did
159+
chunk_y += chunk_y_incr;
155160
}
156161

157162
#else // Read data that was not rotated or padded
@@ -167,12 +172,17 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
167172
u32 fftMiddleIn_i = line / WIDTH; // The i in fftMiddleIn's u[i]
168173
in += fftMiddleIn_i * IN_WG; // Adjust in pointer the same way writeMiddleInLine did
169174

170-
// Adjust in pointer based on the y value used in writeMiddleInLine
175+
// Adjust in pointer based on the y value used in writeMiddleInLine. This code is a little obscure because the ROCm compiler has trouble optimizing the straightforward per-iteration computation.
171176
in += me % SIZEY; // Adjust in pointer to read SIZEY consecutive values
177+
u32 fftMiddleIn_y = me; // The i=0 fftMiddleIn y value
178+
u32 chunk_y = fftMiddleIn_y / SIZEY; // The i=0 fftMiddleIn chunk_y value
179+
u32 fftMiddleIn_y_incr = G_H; // The increment to next fftMiddleIn y value
180+
u32 chunk_y_incr = fftMiddleIn_y_incr / SIZEY; // The increment to next fftMiddleIn chunk_y value
172181
for (i32 i = 0; i < NH; ++i) {
173182
u32 fftMiddleIn_y = i * G_H + me; // The fftMiddleIn y value
174183
u32 chunk_y = fftMiddleIn_y / SIZEY; // The fftMiddleIn chunk_y value
175184
u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG)]); // Adjust in pointer the same way writeMiddleInLine did
185+
chunk_y += chunk_y_incr;
176186
}
177187

178188
#endif

0 commit comments

Comments
 (0)