Skip to content

Commit d6739d3

Browse files
authored
[AMD] Update HIP headers to 6.2.2 (#5077)
This pulls in some new symbols so we don't need to use hardcoded numbers later. This cherry-picks triton-lang/triton#3989 again to fix macOS builds.
1 parent 627ebbb commit d6739d3

27 files changed

+4709
-658
lines changed

third_party/amd/backend/include/hip/amd_detail/amd_device_functions.h

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -266,14 +266,14 @@ __device__ static inline int __mul24(int x, int y) {
266266
}
267267

268268
__device__ static inline long long __mul64hi(long long int x, long long int y) {
269-
ulong x0 = (ulong)x & 0xffffffffUL;
270-
long x1 = x >> 32;
271-
ulong y0 = (ulong)y & 0xffffffffUL;
272-
long y1 = y >> 32;
273-
ulong z0 = x0*y0;
274-
long t = x1*y0 + (z0 >> 32);
275-
long z1 = t & 0xffffffffL;
276-
long z2 = t >> 32;
269+
unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
270+
long long x1 = x >> 32;
271+
unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
272+
long long y1 = y >> 32;
273+
unsigned long long z0 = x0*y0;
274+
long long t = x1*y0 + (z0 >> 32);
275+
long long z1 = t & 0xffffffffL;
276+
long long z2 = t >> 32;
277277
z1 = x0*y1 + z1;
278278
return x1*y1 + z2 + (z1 >> 32);
279279
}
@@ -300,14 +300,14 @@ __device__ static inline int __umul24(unsigned int x, unsigned int y) {
300300

301301
__device__
302302
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
303-
ulong x0 = x & 0xffffffffUL;
304-
ulong x1 = x >> 32;
305-
ulong y0 = y & 0xffffffffUL;
306-
ulong y1 = y >> 32;
307-
ulong z0 = x0*y0;
308-
ulong t = x1*y0 + (z0 >> 32);
309-
ulong z1 = t & 0xffffffffUL;
310-
ulong z2 = t >> 32;
303+
unsigned long long x0 = x & 0xffffffffUL;
304+
unsigned long long x1 = x >> 32;
305+
unsigned long long y0 = y & 0xffffffffUL;
306+
unsigned long long y1 = y >> 32;
307+
unsigned long long z0 = x0*y0;
308+
unsigned long long t = x1*y0 + (z0 >> 32);
309+
unsigned long long z1 = t & 0xffffffffUL;
310+
unsigned long long z2 = t >> 32;
311311
z1 = x0*y1 + z1;
312312
return x1*y1 + z2 + (z1 >> 32);
313313
}
@@ -322,11 +322,6 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
322322
return __ockl_sadd_u32(x, y, z);
323323
}
324324

325-
__device__ static inline unsigned int __lane_id() {
326-
return __builtin_amdgcn_mbcnt_hi(
327-
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
328-
}
329-
330325
__device__
331326
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
332327

@@ -339,6 +334,7 @@ HIP specific device functions
339334

340335
#if !defined(__HIPCC_RTC__)
341336
#include "amd_warp_functions.h"
337+
#include "amd_warp_sync_functions.h"
342338
#endif
343339

344340
#define MASK1 0x00ff00ff
@@ -687,34 +683,6 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); }
687683

688684
#endif // __HIP_DEVICE_COMPILE__
689685

690-
// warp vote function __all __any __ballot
691-
__device__
692-
inline
693-
int __all(int predicate) {
694-
return __ockl_wfall_i32(predicate);
695-
}
696-
697-
__device__
698-
inline
699-
int __any(int predicate) {
700-
return __ockl_wfany_i32(predicate);
701-
}
702-
703-
// XXX from llvm/include/llvm/IR/InstrTypes.h
704-
#define ICMP_NE 33
705-
706-
__device__
707-
inline
708-
unsigned long long int __ballot(int predicate) {
709-
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
710-
}
711-
712-
__device__
713-
inline
714-
unsigned long long int __ballot64(int predicate) {
715-
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
716-
}
717-
718686
// hip.amdgcn.bc - lanemask
719687
__device__
720688
inline
@@ -877,6 +845,10 @@ int __syncthreads_or(int predicate)
877845
#if (defined(__GFX10__) || defined(__GFX11__))
878846
#define HW_ID_WGP_ID_SIZE 4
879847
#define HW_ID_WGP_ID_OFFSET 10
848+
#if (defined(__AMDGCN_CUMODE__))
849+
#define HW_ID_CU_ID_SIZE 1
850+
#define HW_ID_CU_ID_OFFSET 8
851+
#endif
880852
#else
881853
#define HW_ID_CU_ID_SIZE 4
882854
#define HW_ID_CU_ID_OFFSET 8
@@ -933,6 +905,10 @@ unsigned __smid(void)
933905
GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
934906
unsigned sa_id = __builtin_amdgcn_s_getreg(
935907
GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
908+
#if (defined(__AMDGCN_CUMODE__))
909+
unsigned cu_id = __builtin_amdgcn_s_getreg(
910+
GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
911+
#endif
936912
#else
937913
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
938914
unsigned xcc_id = __builtin_amdgcn_s_getreg(
@@ -945,6 +921,9 @@ unsigned __smid(void)
945921
unsigned temp = se_id;
946922
temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
947923
temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
924+
#if (defined(__AMDGCN_CUMODE__))
925+
temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
926+
#endif
948927
return temp;
949928
//TODO : CU Mode impl
950929
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

third_party/amd/backend/include/hip/amd_detail/amd_hip_atomic.h

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) {
612612
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
613613
return unsafeAtomicMin(addr, val);
614614
#else
615+
typedef union u_hold {
616+
float a;
617+
unsigned int b;
618+
} u_hold_t;
619+
u_hold_t u{val};
620+
bool neg_zero = 0x80000000U == u.b;
615621
#if __has_builtin(__hip_atomic_load) && \
616622
__has_builtin(__hip_atomic_compare_exchange_strong)
617623
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
618624
bool done = false;
619-
while (!done && value > val) {
625+
while (!done && (value > val || (neg_zero && value == 0.0f))) {
620626
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
621627
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
622628
}
@@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) {
625631
unsigned int *uaddr = (unsigned int *)addr;
626632
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
627633
bool done = false;
628-
while (!done && __uint_as_float(value) > val) {
634+
while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
629635
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
630636
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
631637
}
@@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) {
658664
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
659665
return unsafeAtomicMin(addr, val);
660666
#else
667+
typedef union u_hold {
668+
double a;
669+
unsigned long long b;
670+
} u_hold_t;
671+
u_hold_t u{val};
672+
bool neg_zero = 0x8000000000000000ULL == u.b;
661673
#if __has_builtin(__hip_atomic_load) && \
662674
__has_builtin(__hip_atomic_compare_exchange_strong)
663675
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
664676
bool done = false;
665-
while (!done && value > val) {
677+
while (!done && (value > val || (neg_zero && value == 0.0))) {
666678
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
667679
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
668680
}
@@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) {
671683
unsigned long long *uaddr = (unsigned long long *)addr;
672684
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
673685
bool done = false;
674-
while (!done && __longlong_as_double(value) > val) {
686+
while (!done &&
687+
(__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
675688
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
676689
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
677690
}
@@ -856,11 +869,17 @@ float atomicMax(float* addr, float val) {
856869
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
857870
return unsafeAtomicMax(addr, val);
858871
#else
872+
typedef union u_hold {
873+
float a;
874+
unsigned int b;
875+
} u_hold_t;
876+
u_hold_t u{val};
877+
bool neg_zero = 0x80000000U == u.b;
859878
#if __has_builtin(__hip_atomic_load) && \
860879
__has_builtin(__hip_atomic_compare_exchange_strong)
861880
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
862881
bool done = false;
863-
while (!done && value < val) {
882+
while (!done && (value < val || (neg_zero && value == 0.0f))) {
864883
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
865884
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
866885
}
@@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) {
869888
unsigned int *uaddr = (unsigned int *)addr;
870889
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
871890
bool done = false;
872-
while (!done && __uint_as_float(value) < val) {
891+
while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) {
873892
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
874893
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
875894
}
@@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) {
902921
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
903922
return unsafeAtomicMax(addr, val);
904923
#else
924+
typedef union u_hold {
925+
double a;
926+
unsigned long long b;
927+
} u_hold_t;
928+
u_hold_t u{val};
929+
bool neg_zero = 0x8000000000000000ULL == u.b;
905930
#if __has_builtin(__hip_atomic_load) && \
906931
__has_builtin(__hip_atomic_compare_exchange_strong)
907932
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
908933
bool done = false;
909-
while (!done && value < val) {
934+
while (!done && (value < val || (neg_zero && value == 0.0))) {
910935
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
911936
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
912937
}
@@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) {
915940
unsigned long long *uaddr = (unsigned long long *)addr;
916941
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
917942
bool done = false;
918-
while (!done && __longlong_as_double(value) < val) {
943+
while (!done &&
944+
(__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) {
919945
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
920946
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
921947
}
@@ -977,7 +1003,7 @@ unsigned int atomicDec(unsigned int* address, unsigned int val)
9771003
#else
9781004
return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
9791005
#endif // __gfx941__
980-
1006+
9811007
}
9821008

9831009
__device__

0 commit comments

Comments
 (0)