
Commit a62940d

Merge branch 'release/2.8' into add_gfx115x_to_hipblaslt_list_release_2.8

2 parents: 39aaf53 + 7a52036


45 files changed: +2862, -1127 lines
Lines changed: 31 additions & 54 deletions

@@ -1,60 +1,37 @@
-#!/bin/bash
-# Script used in CI and CD pipeline
+#!/usr/bin/env bash
+# Script used only in CD pipeline
 
-set -ex
+set -eou pipefail
 
-ver() {
-  printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
-}
-
-# Magma build scripts need `python`
-ln -sf /usr/bin/python3 /usr/bin/python
-
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  almalinux)
-    yum install -y gcc-gfortran
-    ;;
-  *)
-    echo "No preinstalls to build magma..."
-    ;;
-esac
+function do_install() {
+  rocm_version=$1
+  if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+    # chop off any patch version
+    rocm_version="${rocm_version%.*}"
+  fi
 
-MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+  rocm_version_nodot=${rocm_version//./}
 
-# "install" hipMAGMA into /opt/rocm/magma by copying after build
-if [[ $(ver $ROCM_VERSION) -ge $(ver 7.0) ]]; then
-  git clone https://github.com/ROCm/utk-magma.git -b release/2.9.0_rocm70 magma
-  pushd magma
-  # version 2.9 + ROCm 7.0 related updates
-  git checkout 91c4f720a17e842b364e9de41edeef76995eb9ad
-else
-  git clone https://bitbucket.org/icl/magma.git
-  pushd magma
   # Version 2.7.2 + ROCm related updates
-  git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
-fi
+  MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
+  magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+  rocm_dir="/opt/rocm"
+  (
+    set -x
+    tmp_dir=$(mktemp -d)
+    pushd ${tmp_dir}
+    curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
+    if tar -xvf "${magma_archive}"
+    then
+      mkdir -p "${rocm_dir}/magma"
+      mv include "${rocm_dir}/magma/include"
+      mv lib "${rocm_dir}/magma/lib"
+    else
+      echo "${magma_archive} not found, skipping magma install"
+    fi
+    popd
+  )
+}
 
-cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
-echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
-  echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
-fi
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
-echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
-export PATH="${PATH}:/opt/rocm/bin"
-if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
-  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
-else
-  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
-fi
-for arch in $amdgpu_targets; do
-  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
-done
-# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
-sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
-make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
-make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
-popd
-mv magma /opt/rocm
+do_install $1
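
For illustration, the new installer's archive name is a pure function of the ROCm version string: any patch level is chopped off, the remaining dot is stripped, and the pinned MAGMA commit is appended. A minimal sketch of that derivation in C++ (the archive_name helper is hypothetical, written only to mirror the shell logic above):

#include <iostream>
#include <regex>
#include <string>

// Hypothetical helper mirroring do_install's naming logic above.
std::string archive_name(std::string rocm_version, const std::string& magma_commit) {
  // Chop off any patch version, mirroring ${rocm_version%.*}: "6.4.2" -> "6.4".
  if (std::regex_match(rocm_version, std::regex(R"(\d+\.\d+\.\d+)"))) {
    rocm_version.erase(rocm_version.rfind('.'));
  }
  // Strip dots, mirroring ${rocm_version//./}: "6.4" -> "64".
  std::string nodot;
  for (char c : rocm_version) {
    if (c != '.') nodot.push_back(c);
  }
  return "magma-rocm" + nodot + "-" + magma_commit + "-1.tar.bz2";
}

int main() {
  // Prints: magma-rocm64-a1625ff4d9bc362906bd01f805dbbe12612953f6-1.tar.bz2
  std::cout << archive_name("6.4.2", "a1625ff4d9bc362906bd01f805dbbe12612953f6") << '\n';
}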

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -867,7 +867,7 @@ cmake_dependent_option(
   "Whether to build the flash_attention kernel for scaled dot product attention.\
   Will be disabled if not supported by the platform"
   ON
-  "USE_CUDA OR USE_ROCM;NOT MSVC"
+  "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
   OFF)
 
 # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
@@ -883,7 +883,7 @@ cmake_dependent_option(
 # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
 #
 if(USE_ROCM)
-  if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
+  if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
     include(cmake/External/aotriton.cmake)
   endif()
 endif()
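
A note on the first hunk: in cmake_dependent_option the semicolon-separated depends string is a conjunction, so the old form required NOT MSVC even for ROCm builds; the rewrite ties the MSVC exclusion to CUDA only. A sketch of the two conditions as plain C++ booleans (variable names mirror the CMake flags; this is illustration, not the build logic itself):

#include <cassert>

// Old gate: "USE_CUDA OR USE_ROCM;NOT MSVC" -- semicolon-joined clauses are ANDed.
bool old_gate(bool use_cuda, bool use_rocm, bool msvc) {
  return (use_cuda || use_rocm) && !msvc;
}

// New gate: "(USE_CUDA AND NOT MSVC) OR USE_ROCM".
bool new_gate(bool use_cuda, bool use_rocm, bool msvc) {
  return (use_cuda && !msvc) || use_rocm;
}

int main() {
  // The only divergence is ROCm with MSVC: previously forced OFF, now allowed ON.
  assert(!old_gate(false, true, true));
  assert(new_gate(false, true, true));
  return 0;
}

The second hunk drops the UNIX guard in the same spirit, letting aotriton.cmake be included on non-UNIX ROCm builds.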

aten/src/ATen/cuda/detail/OffsetCalculator.cuh

Lines changed: 18 additions & 0 deletions
@@ -45,6 +45,24 @@ struct OffsetCalculator {
 
   C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
     offset_type offsets;
+
+#if defined(USE_ROCM)
+    if ((dims > 0) && (dims <= 2)) {
+      auto divmod = sizes_[0].divmod(linear_idx);
+#pragma unroll
+      for (int arg = 0; arg < NARGS; arg++)
+        offsets[arg] = divmod.mod * strides_[0][arg];
+      if (dims >= 2) {
+        divmod = sizes_[1].divmod(divmod.div);
+#pragma unroll
+        for (int arg = 0; arg < NARGS; arg++)
+          offsets[arg] += divmod.mod * strides_[1][arg];
+      }
+      // [...]
+      return offsets;
+    }
+#endif
+
 #pragma unroll
     for (int arg = 0; arg < NARGS; arg++) {
       offsets[arg] = 0;
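
The new ROCm branch above specializes the 1-D and 2-D cases: the linear index is peeled one dimension at a time with an integer div/mod, and each remainder is scaled by that dimension's per-argument stride. A self-contained sketch of the same decomposition for a single argument (plain arithmetic in place of the fused divmod helper; names are illustrative):

#include <array>
#include <cstdint>
#include <iostream>

// Decompose a linear index into a byte offset for a strided 2-D tensor,
// mirroring the div/mod loop in OffsetCalculator::get above.
uint32_t offset_2d(uint32_t linear_idx,
                   const std::array<uint32_t, 2>& sizes,     // fastest-moving dim first
                   const std::array<uint32_t, 2>& strides) { // strides in bytes
  uint32_t mod0 = linear_idx % sizes[0];
  uint32_t div0 = linear_idx / sizes[0];
  uint32_t offset = mod0 * strides[0];
  uint32_t mod1 = div0 % sizes[1];
  offset += mod1 * strides[1];
  return offset;
}

int main() {
  // A contiguous 3x4 float tensor: sizes {4, 3}, strides {4, 16} bytes.
  // Linear index 7 -> row 1, col 3 -> byte offset 1*16 + 3*4 = 28.
  std::cout << offset_2d(7, {4, 3}, {4, 16}) << '\n'; // prints 28
}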

aten/src/ATen/cudnn/Descriptors.h

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType)
   }
 }
 
+// NOTE [ cudnn fixSizeOneDimStride ]
 // The stride for a size-1 dimensions is not uniquely determined; in
 // fact, it can be anything you want, because the fact that the
 // tensor is size 1 at this dimension means that you will never actually
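
To make the note concrete: in a strided layout the address of an element is the dot product of indices and strides, and the index along a size-1 dimension is always 0, so that dimension's stride is multiplied by zero and may hold any value. A small illustrative sketch (toy helper, not the cudnn API):

#include <cassert>
#include <cstddef>

// Byte/element offset of element (i0, i1, i2) in a strided layout.
std::size_t element_offset(std::size_t i0, std::size_t i1, std::size_t i2,
                           std::size_t s0, std::size_t s1, std::size_t s2) {
  return i0 * s0 + i1 * s1 + i2 * s2;
}

int main() {
  // Shape (2, 1, 3): i1 is always 0, so s1 never contributes and the two
  // layouts below address every element identically despite s1 differing.
  for (std::size_t i0 = 0; i0 < 2; ++i0)
    for (std::size_t i2 = 0; i2 < 3; ++i2)
      assert(element_offset(i0, 0, i2, 3, 3, 1) ==
             element_offset(i0, 0, i2, 3, 999, 1));
  return 0;
}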
aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h

Lines changed: 202 additions & 5 deletions

@@ -1,7 +1,6 @@
 #pragma once
 
-#include <c10/core/Allocator.h>
-#include <c10/core/DeviceType.h>
+#include <c10/hip/HIPCachingAllocator.h>
 
 // Use of c10::hip namespace here makes hipification easier, because
 // I don't have to also fix namespaces. Sorry!
@@ -10,22 +9,220 @@ namespace c10::hip {
 // Takes a valid HIPAllocator (of any sort) and turns it into
 // an allocator pretending to be a CUDA allocator. See
 // Note [Masquerading as CUDA]
-class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
-  Allocator* allocator_;
+class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator {
+  HIPCachingAllocator::HIPAllocator* allocator_;
 public:
-  explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
+  explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator)
     : allocator_(allocator) {}
+
+  virtual ~HIPAllocatorMasqueradingAsCUDA() = default;
+
+  // From c10::Allocator
+
   DataPtr allocate(size_t size) override {
     DataPtr r = allocator_->allocate(size);
     r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));
     return r;
   }
+
+  bool is_simple_data_ptr(const DataPtr& data_ptr) const override {
+    return allocator_->is_simple_data_ptr(data_ptr);
+  }
+
   DeleterFnPtr raw_deleter() const override {
     return allocator_->raw_deleter();
   }
+
   void copy_data(void* dest, const void* src, std::size_t count) const final {
     allocator_->copy_data(dest, src, count);
   }
+
+  // From CUDAAllocator
+
+  void* raw_alloc(size_t nbytes) override {
+    return allocator_->raw_alloc(nbytes);
+  }
+
+  void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override {
+    return allocator_->raw_alloc_with_stream(nbytes, stream);
+  }
+
+  void raw_delete(void* ptr) override {
+    allocator_->raw_delete(ptr);
+  }
+
+  void init(int device_count) override {
+    allocator_->init(device_count);
+  }
+
+  bool initialized() override {
+    return allocator_->initialized();
+  }
+
+  double getMemoryFraction(c10::DeviceIndex device) override {
+    return allocator_->getMemoryFraction(device);
+  }
+
+  void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
+    allocator_->setMemoryFraction(fraction, device);
+  }
+
+  void emptyCache(MempoolId_t mempool_id = {0, 0}) override {
+    allocator_->emptyCache(mempool_id);
+  }
+
+  void enable(bool value) override {
+    allocator_->enable(value);
+  }
+
+  bool isEnabled() const override {
+    return allocator_->isEnabled();
+  }
+
+  void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
+    allocator_->cacheInfo(device, largestBlock);
+  }
+
+  void* getBaseAllocation(void* ptr, size_t* size) override {
+    return allocator_->getBaseAllocation(ptr, size);
+  }
+
+  void recordStream(const DataPtr& ptr, HIPStream stream) override {
+    allocator_->recordStream(ptr, stream);
+  }
+
+  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override {
+    return allocator_->getDeviceStats(device);
+  }
+
+  void resetAccumulatedStats(c10::DeviceIndex device) override {
+    allocator_->resetAccumulatedStats(device);
+  }
+
+  void resetPeakStats(c10::DeviceIndex device) override {
+    allocator_->resetPeakStats(device);
+  }
+
+  HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override {
+    return allocator_->snapshot(mempool_id);
+  }
+
+  void beginAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      std::function<bool(hipStream_t)> filter) override {
+    allocator_->beginAllocateToPool(device, mempool_id, filter);
+  }
+
+  void endAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id) override {
+    allocator_->endAllocateToPool(device, mempool_id);
+  }
+
+  void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->releasePool(device, mempool_id);
+  }
+
+  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    return allocator_->getPoolUseCount(device, mempool_id);
+  }
+
+  void createOrIncrefPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      HIPAllocator* allocator = nullptr) override {
+    allocator_->createOrIncrefPool(device, mempool_id, allocator);
+  }
+
+  void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->setUseOnOOM(device, mempool_id);
+  }
+
+  bool checkPoolLiveAllocations(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      const std::unordered_set<void*>& expected_live_allocations) override {
+    return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations);
+  }
+
+  HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override {
+    return allocator_->shareIpcHandle(ptr);
+  }
+
+  std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
+    return allocator_->getIpcDevPtr(handle);
+  }
+
+  bool isHistoryEnabled() override {
+    return allocator_->isHistoryEnabled();
+  }
+
+  void recordHistory(
+      bool enabled,
+      HIPCachingAllocator::CreateContextFn context_recorder,
+      size_t alloc_trace_max_entries,
+      HIPCachingAllocator::RecordContext when,
+      bool clearHistory) override {
+    allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
+  }
+
+  void recordAnnotation(
+      const std::vector<std::pair<std::string, std::string>>& md) override {
+    allocator_->recordAnnotation(md);
+  }
+
+  void pushCompileContext(std::string& md) override {
+    allocator_->pushCompileContext(md);
+  }
+
+  void popCompileContext() override {
+    allocator_->popCompileContext();
+  }
+
+  void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override {
+    allocator_->attachOutOfMemoryObserver(observer);
+  }
+
+  void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override {
+    allocator_->attachAllocatorTraceTracker(tracker);
+  }
+
+  void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override {
+    allocator_->enablePeerAccess(dev, dev_to_access);
+  }
+
+  hipError_t memcpyAsync(
+      void* dst,
+      int dstDevice,
+      const void* src,
+      int srcDevice,
+      size_t count,
+      hipStream_t stream,
+      bool p2p_enabled) override {
+    return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
+  }
+
+  std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
+      c10::DeviceIndex device,
+      MempoolId_t id) override {
+    return allocator_->getCheckpointState(device, id);
+  }
+
+  HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
+      c10::DeviceIndex device,
+      std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) override {
+    auto cpd = allocator_->setCheckpointPoolState(device, pps);
+    for (auto& ptr : cpd.dataptrs_allocd) {
+      ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index()));
+    }
+    return cpd;
+  }
+
+  std::string name() override {
+    return allocator_->name();
+  }
+
 };
 
 } // namespace c10::hip
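
The class is a pure forwarding decorator: every virtual method delegates to the wrapped HIP caching allocator, and the only behavioral change is that the DataPtrs returned by allocate and setCheckpointPoolState are relabeled as CUDA devices, keeping the rest of the device bookkeeping consistent. A minimal sketch of the pattern with toy types (not the real c10 interfaces):

#include <cstddef>

enum class DeviceType { CUDA, HIP };

// Toy stand-ins for c10::DataPtr / c10::Allocator, for illustration only.
struct DataPtr {
  void* ptr;
  DeviceType device;
};

struct Allocator {
  virtual ~Allocator() = default;
  virtual DataPtr allocate(std::size_t size) = 0;
};

// Forwarding decorator: delegates the allocation and relabels the returned
// pointer's device, mirroring HIPAllocatorMasqueradingAsCUDA::allocate above.
class MasqueradingAllocator final : public Allocator {
 public:
  explicit MasqueradingAllocator(Allocator* real) : real_(real) {}
  DataPtr allocate(std::size_t size) override {
    DataPtr r = real_->allocate(size);
    r.device = DeviceType::CUDA;  // still HIP memory; only the label changes
    return r;
  }
 private:
  Allocator* real_;
};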

aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp

Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,11 @@
-#include <c10/core/Allocator.h>
+#include <c10/hip/HIPCachingAllocator.h>
+#include <ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h>
 #include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
 
 namespace c10 { namespace hip {
 namespace HIPCachingAllocatorMasqueradingAsCUDA {
 
-Allocator* get() {
+HIPCachingAllocator::HIPAllocator* get() {
   static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
   return &allocator;
 }
