Skip to content

Commit 4a9154c

Browse files
committed
Promote till commit 'f1c05e902656c15d7a6c229e0b400a1656dbcb5f'
Change-Id: I1cf4c3b8c671c69dcca9b5f3a453415b48e511ad
2 parents 9c3f9f8 + f1c05e9 commit 4a9154c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1557
-927
lines changed

CHANGELOG.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
77
### Changed
88
* Added new environment variable
99
- `DEBUG_HIP_7_PREVIEW` This is used for enabling the backward incompatible changes before the next major ROCm release 7.0. By default this is set to 0. Users can set this variable to 0x1, to match the behavior of hipGetLastError with its corresponding CUDA API.
10-
* New HIP APIs
11-
- The `_sync()` version of crosslane builtins such as `shfl_sync()`,
12-
`__all_sync()` and `__any_sync()`, are enabled by default. These can be
13-
disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`.
1410

1511
## HIP 6.3 for ROCm 6.3
1612

hipamd/hip-config-amd.cmake.in

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,16 @@ if(NOT WIN32)
6666
find_dependency(AMDDeviceLibs HINTS ${ROCM_PATH} PATHS "/opt/rocm")
6767
endif()
6868

69-
# If AMDGPU_TARGETS is not defined by the app, amdgpu-arch is run to find the gpu archs
69+
if(DEFINED AMDGPU_TARGETS AND NOT DEFINED GPU_TARGETS)
70+
message(AUTHOR_WARNING "AMDGPU_TARGETS is deprecated. Please use GPU_TARGETS instead.")
71+
72+
# Set GPU_TARGETS to the value of AMDGPU_TARGETS
73+
set(GPU_TARGETS "${AMDGPU_TARGETS}")
74+
endif()
75+
76+
# If GPU_TARGETS is not defined by the app, amdgpu-arch is run to find the gpu archs
7077
# of all the devices present in the machine
71-
if(NOT AMDGPU_TARGETS)
78+
if(NOT GPU_TARGETS)
7279
if(@BUILD_SHARED_LIBS@)
7380
if (WIN32)
7481
set(AMDGPU_ARCH "${HIP_CLANG_ROOT}/bin/amdgpu-arch.exe")
@@ -89,7 +96,7 @@ if(NOT AMDGPU_TARGETS)
8996

9097
if(AMDGPU_ARCH_ERROR)
9198
message(AUTHOR_WARNING
92-
" AMDGPU_TARGETS was not set, and system GPU detection was unsuccsesful.\n \n"
99+
" GPU_TARGETS was not set, and system GPU detection was unsuccsesful.\n \n"
93100
" The amdgpu-arch tool failed:\n"
94101
" Error: '${AMDGPU_ARCH_ERROR}'\n"
95102
" Output: '${AMDGPU_ARCH_OUTPUT}'\n \n"
@@ -102,17 +109,17 @@ if(NOT AMDGPU_TARGETS)
102109
string(REPLACE "gfx000\n" "" AMDGPU_ARCH_OUTPUT "${AMDGPU_ARCH_OUTPUT}")
103110
if (NOT AMDGPU_ARCH_OUTPUT STREQUAL "")
104111
string(REPLACE "\n" ";" AMDGPU_ARCH_OUTPUT ${AMDGPU_ARCH_OUTPUT})
105-
set(AMDGPU_TARGETS ${AMDGPU_ARCH_OUTPUT} CACHE STRING "AMD GPU targets to compile for")
112+
set(GPU_TARGETS ${AMDGPU_ARCH_OUTPUT} CACHE STRING "AMD GPU targets to compile for")
106113
endif()
107114
endif()
108115
endif()
109116

110-
if (NOT AMDGPU_TARGETS AND NOT @BUILD_SHARED_LIBS@)
117+
if (NOT GPU_TARGETS AND NOT @BUILD_SHARED_LIBS@)
111118
# The default architecture is gfx942 for static build
112-
set(AMDGPU_TARGETS "gfx942" CACHE STRING "AMD GPU targets to compile for")
119+
set(GPU_TARGETS "gfx942" CACHE STRING "AMD GPU targets to compile for")
113120
endif()
114121

115-
set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU targets to compile for")
122+
set(GPU_BUILD_TARGETS "${GPU_TARGETS}" CACHE STRING "GPU targets to compile for")
116123
if(NOT WIN32)
117124
find_dependency(amd_comgr HINTS ${ROCM_PATH} PATHS "/opt/rocm")
118125
endif()
@@ -153,7 +160,7 @@ endif()
153160

154161
hip_add_interface_link_flags(hip::device --hip-link)
155162

156-
foreach(GPU_TARGET ${GPU_TARGETS})
163+
foreach(GPU_TARGET ${GPU_BUILD_TARGETS})
157164
if (NOT compilePropIsSet)
158165
hip_add_interface_compile_flags(hip::device --offload-arch=${GPU_TARGET})
159166
endif()

hipamd/include/hip/amd_detail/amd_hip_bf16.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
681681
return u.bf16;
682682
}
683683

684-
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
684+
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
685685
/**
686686
* \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
687687
* \brief shfl down warp intrinsic for bfloat16
@@ -789,7 +789,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
789789
u.ui = __shfl_xor_sync(mask, u.ui, delta, width);
790790
return u.bf162;
791791
}
792-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
792+
#endif
793793

794794
/**
795795
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH

hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ class coalesced_group : public thread_group {
473473

474474
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
475475
}
476-
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
476+
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
477477
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
478478
return internal::helper::adjust_mask(
479479
coalesced_info.member_mask,
@@ -500,7 +500,7 @@ class coalesced_group : public thread_group {
500500
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
501501
&pred));
502502
}
503-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
503+
#endif
504504
};
505505

506506
/** \brief User exposed API to create coalesced groups.
@@ -665,12 +665,12 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
665665
friend __CG_QUALIFIER__ coalesced_group
666666
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
667667

668-
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
668+
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
669669
__CG_QUALIFIER__ unsigned long long build_mask() const {
670670
unsigned long long mask = ~0ull >> (64 - numThreads);
671671
return mask << ((internal::workgroup::thread_rank() / numThreads) * numThreads);
672672
}
673-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
673+
#endif
674674

675675
public:
676676
__CG_STATIC_QUALIFIER__ void sync() {
@@ -697,7 +697,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
697697
return (__shfl_xor(var, laneMask, numThreads));
698698
}
699699

700-
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
700+
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
701701
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
702702
const auto mask = build_mask();
703703
return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
@@ -716,7 +716,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
716716
const auto mask = build_mask();
717717
return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
718718
}
719-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
719+
#endif
720720
};
721721

722722
/** \brief User exposed API that captures the state of the parent group pre-partition
@@ -901,7 +901,7 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
901901
return impl::tiled_partition_internal<size, ParentCGTy>(g);
902902
}
903903

904-
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
904+
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
905905
/** \brief Binary partition
906906
*
907907
* \details This splits the input thread group into two partitions determined by predicate
@@ -927,7 +927,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size,
927927
return coalesced_group(tgrp.build_mask() ^ mask);
928928
}
929929
}
930-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
930+
#endif
931931
} // namespace cooperative_groups
932932

933933
#endif // __cplusplus

0 commit comments

Comments
 (0)