Skip to content

Commit 9cbb62c

Browse files
committed
Promote till commit '6c755a411648c892d917d2415c5e646ab1e37fe3'
Change-Id: Ic387a6a90265d581239ed00d45baccd4da5cdd63
2 parents 03366da + 6c755a4 commit 9cbb62c

31 files changed

+540
-473
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
77
### Changed
88
* Added new environment variable
99
- `DEBUG_HIP_7_PREVIEW` This is used for enabling the backward incompatible changes before the next major ROCm release 7.0. By default this is set to 0. Users can set this variable to 0x1, to match the behavior of hipGetLastError with its corresponding CUDA API.
10+
* New HIP APIs
11+
- The `_sync()` version of crosslane builtins such as `shfl_sync()`,
12+
`__all_sync()` and `__any_sync()`, are enabled by default. These can be
13+
disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`.
1014

1115
## HIP 6.3 for ROCm 6.3
1216

hipamd/include/hip/amd_detail/amd_hip_bf16.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
681681
return u.bf16;
682682
}
683683

684-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
684+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
685685
/**
686686
* \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
687687
* \brief shfl down warp intrinsic for bfloat16
@@ -789,7 +789,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
789789
u.ui = __shfl_xor_sync(mask, u.ui, delta, width);
790790
return u.bf162;
791791
}
792-
#endif
792+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
793793

794794
/**
795795
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH

hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ class coalesced_group : public thread_group {
473473

474474
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
475475
}
476-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
476+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
477477
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
478478
return internal::helper::adjust_mask(
479479
coalesced_info.member_mask,
@@ -500,7 +500,7 @@ class coalesced_group : public thread_group {
500500
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
501501
&pred));
502502
}
503-
#endif
503+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
504504
};
505505

506506
/** \brief User exposed API to create coalesced groups.
@@ -665,12 +665,12 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
665665
friend __CG_QUALIFIER__ coalesced_group
666666
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
667667

668-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
668+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
669669
__CG_QUALIFIER__ unsigned long long build_mask() const {
670670
unsigned long long mask = ~0ull >> (64 - numThreads);
671671
return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
672672
}
673-
#endif
673+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
674674

675675
public:
676676
__CG_STATIC_QUALIFIER__ void sync() {
@@ -697,7 +697,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
697697
return (__shfl_xor(var, laneMask, numThreads));
698698
}
699699

700-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
700+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
701701
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
702702
const auto mask = build_mask();
703703
return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
@@ -716,7 +716,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
716716
const auto mask = build_mask();
717717
return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
718718
}
719-
#endif
719+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
720720
};
721721

722722
/** \brief User exposed API that captures the state of the parent group pre-partition
@@ -901,7 +901,7 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
901901
return impl::tiled_partition_internal<size, ParentCGTy>(g);
902902
}
903903

904-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
904+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
905905
/** \brief Binary partition
906906
*
907907
* \details This splits the input thread group into two partitions determined by predicate
@@ -927,7 +927,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size,
927927
return coalesced_group(tgrp.build_mask() ^ mask);
928928
}
929929
}
930-
#endif
930+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
931931
} // namespace cooperative_groups
932932

933933
#endif // __cplusplus

hipamd/include/hip/amd_detail/amd_warp_functions.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,15 @@ unsigned long long int __ballot64(int predicate) {
112112
}
113113

114114
// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
115-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
115+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
116116
// Since threads in a wave do not make independent progress, __activemask()
117117
// always returns the exact active mask, i.e, all active threads in the wave.
118118
__device__
119119
inline
120120
unsigned long long __activemask() {
121121
return __ballot(true);
122122
}
123-
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
123+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
124124

125125
__device__ static inline unsigned int __lane_id() {
126126
return __builtin_amdgcn_mbcnt_hi(

hipamd/include/hip/amd_detail/amd_warp_sync_functions.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,10 @@ THE SOFTWARE.
2323
#pragma once
2424

2525
// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a
26-
// preview to allow end-users to adapt to the new interface involving 64-bit
27-
// masks. These are disabled by default, and can be enabled by setting the macro
28-
// "HIP_ENABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
29-
// __activemask() builtin defined in amd_warp_functions.h.
30-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
26+
// preview and enabled by default in ROCm 6.4. These can be disabled, by setting
27+
// the macro "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to
28+
// the __activemask() builtin defined in amd_warp_functions.h.
29+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
3130

3231
#if !defined(__HIPCC_RTC__)
3332
#include "amd_warp_functions.h"
@@ -283,4 +282,4 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask,
283282
#undef __hip_check_mask
284283
#undef __hip_adjust_mask_for_wave32
285284

286-
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
285+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS

hipamd/src/amd_hsa_elf.hpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ enum {
3232
ELFABIVERSION_AMDGPU_HSA_V2 = 0,
3333
ELFABIVERSION_AMDGPU_HSA_V3 = 1,
3434
ELFABIVERSION_AMDGPU_HSA_V4 = 2,
35-
ELFABIVERSION_AMDGPU_HSA_V5 = 3
35+
ELFABIVERSION_AMDGPU_HSA_V5 = 3,
36+
ELFABIVERSION_AMDGPU_HSA_V6 = 4,
3637
};
3738

3839
// AMDGPU specific e_flags
@@ -109,10 +110,21 @@ enum : unsigned {
109110
EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
110111
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
111112
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
113+
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f,
114+
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
115+
EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
116+
EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
117+
EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
118+
EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
119+
EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055,
120+
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
121+
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
122+
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X58 = 0x058,
123+
EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC = 0x059,
112124

113125
// First/last AMDGCN-based processors.
114126
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
115-
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1201,
127+
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX12_GENERIC,
116128

117129
// Indicates if the "xnack" target feature is enabled for all code contained
118130
// in the object.
@@ -125,18 +137,26 @@ enum : unsigned {
125137
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
126138
EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
127139

128-
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
140+
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4,
141+
// ELFABIVERSION_AMDGPU_HSA_V5 and ELFABIVERSION_AMDGPU_HSA_V6.
129142
EF_AMDGPU_FEATURE_XNACK_V4 = 0x300,
130143
EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
131144
EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
132145
EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
133146
EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
134147

135148
// SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
136-
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
149+
// Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4,
150+
// ELFABIVERSION_AMDGPU_HSA_V5 and ELFABIVERSION_AMDGPU_HSA_V6.
137151
EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00,
138152
EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
139153
EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
140154
EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
141155
EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
156+
157+
// Generic target versioning. This is contained in the list byte of EFLAGS.
158+
EF_AMDGPU_GENERIC_VERSION = 0xff000000,
159+
EF_AMDGPU_GENERIC_VERSION_OFFSET = 24,
160+
EF_AMDGPU_GENERIC_VERSION_MIN = 1,
161+
EF_AMDGPU_GENERIC_VERSION_MAX = 0xff,
142162
};

0 commit comments

Comments
 (0)