Skip to content

Commit e4709ec

Browse files
gwoltman authored and preda committed
An even better solution to carryStep. Optimizer was having trouble generating v_bfe_i32 instructions.
Used the approved amdgcn_builtin. Saved 30 bytes of assembly code. (Yes, I know that is not terribly important. I'm hoping that by reducing the complexity of carryFused the optimizer won't go bonkers and generate poor code quite as often.) The key to getting reduced code is creating sequences where both nBits and 32-nBits need to be generated.
1 parent 8285e9f commit e4709ec

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

src/cl/carryutil.cl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@ void updateStats(global uint *bufROE, u32 posROE, float roundMax) {
1515
}
1616
#endif
1717

18+
#if defined(__has_builtin) && __has_builtin(__builtin_amdgcn_sbfe)
19+
// Returns the low `bits` bits of `u` via the AMD signed bit-field-extract
// builtin, called with offset 0 and width `bits`.
// NOTE(review): expected to match the shift-based fallback in the #else branch
// (low field, sign-extended) — confirm against the builtin's documentation.
i32 lowBits(i32 u, u32 bits) {
  i32 field = __builtin_amdgcn_sbfe(u, 0, bits);
  return field;
}
20+
#else
1821
// Portable fallback: returns the low `bits` bits of `u`.
// The value is shifted left by (32 - bits) and then shifted right by the same
// amount, which discards everything above the low `bits` bits.
// NOTE(review): the right shift is presumably arithmetic (i32 signed), making
// the result sign-extended from bit (bits - 1) — confirm the i32 typedef.
i32 lowBits(i32 u, u32 bits) {
  u32 shift = 32 - bits;
  i32 raised = u << shift;
  return raised >> shift;
}
22+
#endif
23+
1924
#if defined(__has_builtin) && __has_builtin(__builtin_amdgcn_alignbit)
2025
// Extracts a 32-bit value from the 64-bit `x` using the AMD alignbit builtin.
// `x` is reinterpreted as two 32-bit halves; the .y half is passed as the first
// operand, the .x half as the second, and `bits` as the shift amount.
// NOTE(review): presumably yields bits [bits, bits+32) of x — confirm against
// the builtin's documentation and the valid range of `bits`.
i32 xtract32(i64 x, u32 bits) {
  int2 halves = as_int2(x);
  return __builtin_amdgcn_alignbit(halves.y, halves.x, bits);
}
2126
#else
@@ -69,9 +74,8 @@ Word OVERLOAD carryStep(i64 x, i64 *outCarry, bool isBigWord) {
6974

7075
Word OVERLOAD carryStep(i64 x, i32 *outCarry, bool isBigWord) {
7176
u32 nBits = bitlen(isBigWord);
72-
x <<= 32 - nBits;
73-
Word w = as_int2(x).x >> (32 - nBits);
74-
*outCarry = as_int2(x).y + (w < 0);
77+
Word w = lowBits(x, nBits);
78+
*outCarry = xtract32(x, nBits) + (w < 0);
7579
return w;
7680
}
7781

0 commit comments

Comments
 (0)