@@ -1725,7 +1725,7 @@ uint __builtin_spirv_OpGroupNonUniformBallotFindMSB_i32_v4i32(uint Execution, ui
17251725{
17261726 if (Execution == Subgroup )
17271727 {
1728- return ( sizeof ( uint ) * 8 ) - __builtin_spirv_OpenCL_clz_i32 (Value .x );
1728+ return __builtin_spirv_OpenCL_clz_i32 (Value .x );
17291729 }
17301730 return 0 ;
17311731}
@@ -2083,117 +2083,6 @@ DEFN_UNIFORM_GROUP_FUNC(SMax, int, i32, __builtin_spirv_OpenCL_s_max_i32_i32,
20832083DEFN_UNIFORM_GROUP_FUNC (SMax , long , i64 , __builtin_spirv_OpenCL_s_max_i64_i64 , LONG_MIN )
20842084
20852085#if defined(cl_khr_subgroup_non_uniform_arithmetic ) || defined(cl_khr_subgroup_clustered_reduce )
2086- #define DEFN_SUB_GROUP_REDUCE_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2087- { \
2088- uint activeChannels = __builtin_IB_WaveBallot(true); \
2089- uint firstActive = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2090- \
2091- type result = identity; \
2092- while (activeChannels) \
2093- { \
2094- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2095- \
2096- type value = intel_sub_group_shuffle(X, activeId); \
2097- result = op(value, result); \
2098- \
2099- uint disable = 1 << activeId; \
2100- activeChannels ^= disable; \
2101- } \
2102- \
2103- uint3 vec3; \
2104- vec3.s0 = firstActive; \
2105- X = __builtin_spirv_OpGroupBroadcast_i32_##type_abbr##_v3i32(Subgroup, result, vec3); \
2106- }
2107-
2108- #define DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2109- { \
2110- uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId(); \
2111- uint activeChannels = __builtin_IB_WaveBallot(true); \
2112- \
2113- while (activeChannels) \
2114- { \
2115- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2116- \
2117- type value = intel_sub_group_shuffle(X, activeId); \
2118- if (sglid > activeId) \
2119- X = op(value, X); \
2120- \
2121- uint disable = 1 << activeId; \
2122- activeChannels ^= disable; \
2123- } \
2124- }
2125-
2126- #define DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM (type , type_abbr , op , identity , X ) \
2127- { \
2128- uint sglid = __builtin_spirv_BuiltInSubgroupLocalInvocationId(); \
2129- uint activeChannels = __builtin_IB_WaveBallot(true); \
2130- \
2131- uint mask = (1 << sglid) - 1; \
2132- uint sglidPrev = (sizeof(uint) * 8 - __builtin_spirv_OpenCL_clz_i32(activeChannels & mask)) - 1; \
2133- uint offsetToPrevActive = sglid - sglidPrev; \
2134- X = intel_sub_group_shuffle_up((type)identity, X, offsetToPrevActive); \
2135- \
2136- while (activeChannels) \
2137- { \
2138- uint activeId = __builtin_spirv_OpenCL_ctz_i32(activeChannels); \
2139- \
2140- type value = intel_sub_group_shuffle(X, activeId); \
2141- if (sglid > activeId) \
2142- X = op(value, X); \
2143- \
2144- uint disable = 1 << activeId; \
2145- activeChannels ^= disable; \
2146- } \
2147- }
2148-
2149- #define DEFN_SUB_GROUP_CLUSTERED_REDUCE (type , type_abbr , op , identity , X , ClusterSize ) \
2150- { \
2151- uint clusterIndex = 0; \
2152- uint activeChannels = __builtin_IB_WaveBallot(true); \
2153- uint numActive = __builtin_spirv_OpenCL_popcount_i32(activeChannels); \
2154- uint numClusters = numActive / ClusterSize; \
2155- \
2156- for (uint clusterIndex = 0; clusterIndex < numClusters; clusterIndex++) \
2157- { \
2158- uint Counter = ClusterSize; \
2159- uint Ballot = activeChannels; \
2160- uint clusterBallot = 0; \
2161- while (Counter--) \
2162- { \
2163- uint trailingOne = 1 << __builtin_spirv_OpenCL_ctz_i32(Ballot); \
2164- clusterBallot |= trailingOne; \
2165- Ballot ^= trailingOne; \
2166- } \
2167- uint active = __builtin_spirv_OpGroupNonUniformInverseBallot_i32_v4i32(Subgroup, clusterBallot); \
2168- if (active) \
2169- { \
2170- DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X) \
2171- } \
2172- activeChannels ^= clusterBallot; \
2173- } \
2174- }
2175-
2176- #define SUB_GROUP_SWITCH_NON_UNIFORM (type , type_abbr , op , identity , X , Operation , ClusterSize ) \
2177- { \
2178- switch (Operation){ \
2179- case GroupOperationReduce: \
2180- DEFN_SUB_GROUP_REDUCE_NON_UNIFORM(type, type_abbr, op, identity, X) \
2181- break; \
2182- case GroupOperationInclusiveScan: \
2183- DEFN_SUB_GROUP_SCAN_INCL_NON_UNIFORM(type, type_abbr, op, identity, X) \
2184- break; \
2185- case GroupOperationExclusiveScan: \
2186- DEFN_SUB_GROUP_SCAN_EXCL_NON_UNIFORM(type, type_abbr, op, identity, X) \
2187- break; \
2188- case GroupOperationClusteredReduce: \
2189- DEFN_SUB_GROUP_CLUSTERED_REDUCE(type, type_abbr, op, identity, X, ClusterSize) \
2190- break; \
2191- default: \
2192- return 0; \
2193- break; \
2194- } \
2195- }
2196-
21972086// ClusterSize is an optional parameter
21982087#define DEFN_NON_UNIFORM_GROUP_FUNC (func , type , type_abbr , op , identity ) \
21992088type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint Execution, uint Operation, type X, uint ClusterSize) \
@@ -2220,8 +2109,7 @@ type __builtin_spirv_OpGroupNonUniform##func##_i32_i32_##type_abbr##_i32(uint E
22202109 } \
22212110 } \
22222111 else { \
2223- SUB_GROUP_SWITCH_NON_UNIFORM(type, type_abbr, op, identity, X, Operation, ClusterSize) \
2224- return X; \
2112+ SUB_GROUP_SWITCH(type, type_abbr, op, identity, X, Operation) \
22252113 } \
22262114 return 0; \
22272115 } \
@@ -2283,36 +2171,31 @@ DEFN_NON_UNIFORM_GROUP_FUNC(FMax, half, f16, __builtin_spirv_OpenCL_fmax_f16_f
22832171#endif // defined(cl_khr_fp16)
22842172
22852173// OpGroupNonUniformIMul, OpGroupNonUniformFMul
2286- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uchar , i8 , __intel_mul , 1 )
2287- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ushort , i16 , __intel_mul , 1 )
2288- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uint , i32 , __intel_mul , 1 )
2289- DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ulong , i64 , __intel_mul , 1 )
2290- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , float , f32 , __intel_mul , 1 )
2174+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uchar , i8 , __intel_mul , 0 )
2175+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ushort , i16 , __intel_mul , 0 )
2176+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , uint , i32 , __intel_mul , 0 )
2177+ DEFN_NON_UNIFORM_GROUP_FUNC (IMul , ulong , i64 , __intel_mul , 0 )
2178+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , float , f32 , __intel_mul , 0 )
22912179#if defined(cl_khr_fp64 )
2292- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , double , f64 , __intel_mul , 1 )
2180+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , double , f64 , __intel_mul , 0 )
22932181#endif // defined(cl_khr_fp64)
22942182#if defined(cl_khr_fp16 )
2295- DEFN_NON_UNIFORM_GROUP_FUNC (FMul , half , f16 , __intel_mul , 1 )
2183+ DEFN_NON_UNIFORM_GROUP_FUNC (FMul , half , f16 , __intel_mul , 0 )
22962184#endif // defined(cl_khr_fp16)
22972185
22982186// OpGroupNonUniformBitwiseAnd, OpGroupNonUniformBitwiseOr, OpGroupNonUniformBitwiseXor
2299- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uchar , i8 , __intel_and , 0xFF )
2300- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ushort , i16 , __intel_and , 0xFFFF )
2301- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , uint , i32 , __intel_and , 0xFFFFFFFF )
2302- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseAnd , ulong , i64 , __intel_and , 0xFFFFFFFFFFFFFFFF )
2303-
2304- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , uchar , i8 , __intel_or , 0 )
2305- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , ushort , i16 , __intel_or , 0 )
2306- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , uint , i32 , __intel_or , 0 )
2307- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseOr , ulong , i64 , __intel_or , 0 )
2187+ #define DEFN_NON_UNIFORM_BITWISE_OPERATION (func , op ) \
2188+ DEFN_NON_UNIFORM_GROUP_FUNC(func, uchar, i8, __intel_##op, 0) \
2189+ DEFN_NON_UNIFORM_GROUP_FUNC(func, ushort, i16, __intel_##op, 0) \
2190+ DEFN_NON_UNIFORM_GROUP_FUNC(func, uint, i32, __intel_##op, 0) \
2191+ DEFN_NON_UNIFORM_GROUP_FUNC(func, ulong, i64, __intel_##op, 0)
23082192
2309- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , uchar , i8 , __intel_xor , 0 )
2310- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , ushort , i16 , __intel_xor , 0 )
2311- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , uint , i32 , __intel_xor , 0 )
2312- DEFN_NON_UNIFORM_GROUP_FUNC (BitwiseXor , ulong , i64 , __intel_xor , 0 )
2193+ DEFN_NON_UNIFORM_BITWISE_OPERATION (BitwiseAnd , and )
2194+ DEFN_NON_UNIFORM_BITWISE_OPERATION (BitwiseOr , or )
2195+ DEFN_NON_UNIFORM_BITWISE_OPERATION (BitwiseXor , xor )
23132196
23142197// OpGroupNonUniformLogicalAnd, OpGroupNonUniformLogicalOr, OpGroupNonUniformLogicalXor
2315- DEFN_NON_UNIFORM_GROUP_FUNC (LogicalAnd , bool , i1 , __intel_and , 1 )
2198+ DEFN_NON_UNIFORM_GROUP_FUNC (LogicalAnd , bool , i1 , __intel_and , 0 )
23162199DEFN_NON_UNIFORM_GROUP_FUNC (LogicalOr , bool , i1 , __intel_or , 0 )
23172200DEFN_NON_UNIFORM_GROUP_FUNC (LogicalXor , bool , i1 , __intel_xor , 0 )
23182201#endif // defined(cl_khr_subgroup_non_uniform_arithmetic) || defined(cl_khr_subgroup_clustered_reduce)
0 commit comments