
Commit a62940d

Merge branch 'release/2.8' into add_gfx115x_to_hipblaslt_list_release_2.8

2 parents: 39aaf53 + 7a52036


45 files changed: +2862, -1127 lines
Lines changed: 31 additions & 54 deletions

@@ -1,60 +1,37 @@
-#!/bin/bash
-# Script used in CI and CD pipeline
+#!/usr/bin/env bash
+# Script used only in CD pipeline
 
-set -ex
+set -eou pipefail
 
-ver() {
-  printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
-}
-
-# Magma build scripts need `python`
-ln -sf /usr/bin/python3 /usr/bin/python
-
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  almalinux)
-    yum install -y gcc-gfortran
-    ;;
-  *)
-    echo "No preinstalls to build magma..."
-    ;;
-esac
+function do_install() {
+  rocm_version=$1
+  if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+    # chop off any patch version
+    rocm_version="${rocm_version%.*}"
+  fi
 
-MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+  rocm_version_nodot=${rocm_version//./}
 
-# "install" hipMAGMA into /opt/rocm/magma by copying after build
-if [[ $(ver $ROCM_VERSION) -ge $(ver 7.0) ]]; then
-  git clone https://github.com/ROCm/utk-magma.git -b release/2.9.0_rocm70 magma
-  pushd magma
-  # version 2.9 + ROCm 7.0 related updates
-  git checkout 91c4f720a17e842b364e9de41edeef76995eb9ad
-else
-  git clone https://bitbucket.org/icl/magma.git
-  pushd magma
   # Version 2.7.2 + ROCm related updates
-  git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
-fi
+  MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
+  magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+  rocm_dir="/opt/rocm"
+  (
+    set -x
+    tmp_dir=$(mktemp -d)
+    pushd ${tmp_dir}
+    curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
+    if tar -xvf "${magma_archive}"
+    then
+      mkdir -p "${rocm_dir}/magma"
+      mv include "${rocm_dir}/magma/include"
+      mv lib "${rocm_dir}/magma/lib"
+    else
+      echo "${magma_archive} not found, skipping magma install"
+    fi
+    popd
+  )
+}
 
-cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
-echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
-  echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
-fi
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
-echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
-export PATH="${PATH}:/opt/rocm/bin"
-if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
-  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
-else
-  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
-fi
-for arch in $amdgpu_targets; do
-  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
-done
-# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
-sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
-make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
-make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
-popd
-mv magma /opt/rocm
+do_install $1
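
For illustration, the new installer's archive name is a pure function of the ROCm version string: any patch level is chopped off, the remaining dot is stripped, and the pinned MAGMA commit is appended. A minimal sketch of that derivation in C++ (the archive_name helper is hypothetical, written only to mirror the shell logic above):

#include <iostream>
#include <regex>
#include <string>

// Hypothetical helper mirroring do_install's naming logic above.
std::string archive_name(std::string rocm_version, const std::string& magma_commit) {
  // Chop off any patch version, mirroring ${rocm_version%.*}: "6.4.2" -> "6.4".
  if (std::regex_match(rocm_version, std::regex(R"(\d+\.\d+\.\d+)"))) {
    rocm_version.erase(rocm_version.rfind('.'));
  }
  // Strip dots, mirroring ${rocm_version//./}: "6.4" -> "64".
  std::string nodot;
  for (char c : rocm_version) {
    if (c != '.') nodot.push_back(c);
  }
  return "magma-rocm" + nodot + "-" + magma_commit + "-1.tar.bz2";
}

int main() {
  // Prints: magma-rocm64-a1625ff4d9bc362906bd01f805dbbe12612953f6-1.tar.bz2
  std::cout << archive_name("6.4.2", "a1625ff4d9bc362906bd01f805dbbe12612953f6") << '\n';
}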

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -867,7 +867,7 @@ cmake_dependent_option(
   "Whether to build the flash_attention kernel for scaled dot product attention.\
   Will be disabled if not supported by the platform"
   ON
-  "USE_CUDA OR USE_ROCM;NOT MSVC"
+  "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
   OFF)
 
 # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
@@ -883,7 +883,7 @@ cmake_dependent_option(
 # USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
 #
 if(USE_ROCM)
-  if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
+  if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
     include(cmake/External/aotriton.cmake)
   endif()
 endif()
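
A note on the first hunk: in cmake_dependent_option the semicolon-separated depends string is a conjunction, so the old form required NOT MSVC even for ROCm builds; the rewrite ties the MSVC exclusion to CUDA only. A sketch of the two conditions as plain C++ booleans (variable names mirror the CMake flags; this is illustration, not the build logic itself):

#include <cassert>

// Old gate: "USE_CUDA OR USE_ROCM;NOT MSVC" -- semicolon-joined clauses are ANDed.
bool old_gate(bool use_cuda, bool use_rocm, bool msvc) {
  return (use_cuda || use_rocm) && !msvc;
}

// New gate: "(USE_CUDA AND NOT MSVC) OR USE_ROCM".
bool new_gate(bool use_cuda, bool use_rocm, bool msvc) {
  return (use_cuda && !msvc) || use_rocm;
}

int main() {
  // The only divergence is ROCm with MSVC: previously forced OFF, now allowed ON.
  assert(!old_gate(false, true, true));
  assert(new_gate(false, true, true));
  return 0;
}

The second hunk drops the UNIX guard in the same spirit, letting aotriton.cmake be included on non-UNIX ROCm builds.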

aten/src/ATen/cuda/detail/OffsetCalculator.cuh

Lines changed: 18 additions & 0 deletions
@@ -45,6 +45,24 @@ struct OffsetCalculator {
 
   C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
     offset_type offsets;
+
+#if defined(USE_ROCM)
+    if ((dims > 0) && (dims <= 2)) {
+      auto divmod = sizes_[0].divmod(linear_idx);
+#pragma unroll
+      for (int arg = 0; arg < NARGS; arg++)
+        offsets[arg] = divmod.mod * strides_[0][arg];
+      if (dims >= 2) {
+        divmod = sizes_[1].divmod(divmod.div);
+#pragma unroll
+        for (int arg = 0; arg < NARGS; arg++)
+          offsets[arg] += divmod.mod * strides_[1][arg];
+      }
+      // [...]
+      return offsets;
+    }
+#endif
+
 #pragma unroll
     for (int arg = 0; arg < NARGS; arg++) {
       offsets[arg] = 0;
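
The new ROCm branch above specializes the 1-D and 2-D cases: the linear index is peeled one dimension at a time with an integer div/mod, and each remainder is scaled by that dimension's per-argument stride. A self-contained sketch of the same decomposition for a single argument (plain arithmetic in place of the fused divmod helper; names are illustrative):

#include <array>
#include <cstdint>
#include <iostream>

// Decompose a linear index into a byte offset for a strided 2-D tensor,
// mirroring the div/mod loop in OffsetCalculator::get above.
uint32_t offset_2d(uint32_t linear_idx,
                   const std::array<uint32_t, 2>& sizes,     // fastest-moving dim first
                   const std::array<uint32_t, 2>& strides) { // strides in bytes
  uint32_t mod0 = linear_idx % sizes[0];
  uint32_t div0 = linear_idx / sizes[0];
  uint32_t offset = mod0 * strides[0];
  uint32_t mod1 = div0 % sizes[1];
  offset += mod1 * strides[1];
  return offset;
}

int main() {
  // A contiguous 3x4 float tensor: sizes {4, 3}, strides {4, 16} bytes.
  // Linear index 7 -> row 1, col 3 -> byte offset 1*16 + 3*4 = 28.
  std::cout << offset_2d(7, {4, 3}, {4, 16}) << '\n'; // prints 28
}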

aten/src/ATen/cudnn/Descriptors.h

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType)
   }
 }
 
+// NOTE [ cudnn fixSizeOneDimStride ]
 // The stride for a size-1 dimensions is not uniquely determined; in
 // fact, it can be anything you want, because the fact that the
 // tensor is size 1 at this dimension means that you will never actually
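
To make the note concrete: in a strided layout the address of an element is the dot product of indices and strides, and the index along a size-1 dimension is always 0, so that dimension's stride is multiplied by zero and may hold any value. A small illustrative sketch (toy helper, not the cudnn API):

#include <cassert>
#include <cstddef>

// Byte/element offset of element (i0, i1, i2) in a strided layout.
std::size_t element_offset(std::size_t i0, std::size_t i1, std::size_t i2,
                           std::size_t s0, std::size_t s1, std::size_t s2) {
  return i0 * s0 + i1 * s1 + i2 * s2;
}

int main() {
  // Shape (2, 1, 3): i1 is always 0, so s1 never contributes and the two
  // layouts below address every element identically despite s1 differing.
  for (std::size_t i0 = 0; i0 < 2; ++i0)
    for (std::size_t i2 = 0; i2 < 3; ++i2)
      assert(element_offset(i0, 0, i2, 3, 3, 1) ==
             element_offset(i0, 0, i2, 3, 999, 1));
  return 0;
}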
aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h

Lines changed: 202 additions & 5 deletions

@@ -1,7 +1,6 @@
 #pragma once
 
-#include <c10/core/Allocator.h>
-#include <c10/core/DeviceType.h>
+#include <c10/hip/HIPCachingAllocator.h>
 
 // Use of c10::hip namespace here makes hipification easier, because
 // I don't have to also fix namespaces. Sorry!
@@ -10,22 +9,220 @@ namespace c10::hip {
 // Takes a valid HIPAllocator (of any sort) and turns it into
 // an allocator pretending to be a CUDA allocator. See
 // Note [Masquerading as CUDA]
-class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
-  Allocator* allocator_;
+class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator {
+  HIPCachingAllocator::HIPAllocator* allocator_;
 public:
-  explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
+  explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator)
     : allocator_(allocator) {}
+
+  virtual ~HIPAllocatorMasqueradingAsCUDA() = default;
+
+  // From c10::Allocator
+
   DataPtr allocate(size_t size) override {
     DataPtr r = allocator_->allocate(size);
     r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));
     return r;
   }
+
+  bool is_simple_data_ptr(const DataPtr& data_ptr) const override {
+    return allocator_->is_simple_data_ptr(data_ptr);
+  }
+
   DeleterFnPtr raw_deleter() const override {
     return allocator_->raw_deleter();
   }
+
   void copy_data(void* dest, const void* src, std::size_t count) const final {
     allocator_->copy_data(dest, src, count);
   }
+
+  // From CUDAAllocator
+
+  void* raw_alloc(size_t nbytes) override {
+    return allocator_->raw_alloc(nbytes);
+  }
+
+  void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override {
+    return allocator_->raw_alloc_with_stream(nbytes, stream);
+  }
+
+  void raw_delete(void* ptr) override {
+    allocator_->raw_delete(ptr);
+  }
+
+  void init(int device_count) override {
+    allocator_->init(device_count);
+  }
+
+  bool initialized() override {
+    return allocator_->initialized();
+  }
+
+  double getMemoryFraction(c10::DeviceIndex device) override {
+    return allocator_->getMemoryFraction(device);
+  }
+
+  void setMemoryFraction(double fraction, c10::DeviceIndex device) override {
+    allocator_->setMemoryFraction(fraction, device);
+  }
+
+  void emptyCache(MempoolId_t mempool_id = {0, 0}) override {
+    allocator_->emptyCache(mempool_id);
+  }
+
+  void enable(bool value) override {
+    allocator_->enable(value);
+  }
+
+  bool isEnabled() const override {
+    return allocator_->isEnabled();
+  }
+
+  void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override {
+    allocator_->cacheInfo(device, largestBlock);
+  }
+
+  void* getBaseAllocation(void* ptr, size_t* size) override {
+    return allocator_->getBaseAllocation(ptr, size);
+  }
+
+  void recordStream(const DataPtr& ptr, HIPStream stream) override {
+    allocator_->recordStream(ptr, stream);
+  }
+
+  CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override {
+    return allocator_->getDeviceStats(device);
+  }
+
+  void resetAccumulatedStats(c10::DeviceIndex device) override {
+    allocator_->resetAccumulatedStats(device);
+  }
+
+  void resetPeakStats(c10::DeviceIndex device) override {
+    allocator_->resetPeakStats(device);
+  }
+
+  HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override {
+    return allocator_->snapshot(mempool_id);
+  }
+
+  void beginAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      std::function<bool(hipStream_t)> filter) override {
+    allocator_->beginAllocateToPool(device, mempool_id, filter);
+  }
+
+  void endAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id) override {
+    allocator_->endAllocateToPool(device, mempool_id);
+  }
+
+  void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->releasePool(device, mempool_id);
+  }
+
+  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    return allocator_->getPoolUseCount(device, mempool_id);
+  }
+
+  void createOrIncrefPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      HIPAllocator* allocator = nullptr) override {
+    allocator_->createOrIncrefPool(device, mempool_id, allocator);
+  }
+
+  void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override {
+    allocator_->setUseOnOOM(device, mempool_id);
+  }
+
+  bool checkPoolLiveAllocations(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      const std::unordered_set<void*>& expected_live_allocations) override {
+    return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations);
+  }
+
+  HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override {
+    return allocator_->shareIpcHandle(ptr);
+  }
+
+  std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
+    return allocator_->getIpcDevPtr(handle);
+  }
+
+  bool isHistoryEnabled() override {
+    return allocator_->isHistoryEnabled();
+  }
+
+  void recordHistory(
+      bool enabled,
+      HIPCachingAllocator::CreateContextFn context_recorder,
+      size_t alloc_trace_max_entries,
+      HIPCachingAllocator::RecordContext when,
+      bool clearHistory) override {
+    allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
+  }
+
+  void recordAnnotation(
+      const std::vector<std::pair<std::string, std::string>>& md) override {
+    allocator_->recordAnnotation(md);
+  }
+
+  void pushCompileContext(std::string& md) override {
+    allocator_->pushCompileContext(md);
+  }
+
+  void popCompileContext() override {
+    allocator_->popCompileContext();
+  }
+
+  void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override {
+    allocator_->attachOutOfMemoryObserver(observer);
+  }
+
+  void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override {
+    allocator_->attachAllocatorTraceTracker(tracker);
+  }
+
+  void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override {
+    allocator_->enablePeerAccess(dev, dev_to_access);
+  }
+
+  hipError_t memcpyAsync(
+      void* dst,
+      int dstDevice,
+      const void* src,
+      int srcDevice,
+      size_t count,
+      hipStream_t stream,
+      bool p2p_enabled) override {
+    return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
+  }
+
+  std::shared_ptr<HIPCachingAllocator::AllocatorState> getCheckpointState(
+      c10::DeviceIndex device,
+      MempoolId_t id) override {
+    return allocator_->getCheckpointState(device, id);
+  }
+
+  HIPCachingAllocator::CheckpointDelta setCheckpointPoolState(
+      c10::DeviceIndex device,
+      std::shared_ptr<HIPCachingAllocator::AllocatorState> pps) override {
+    auto cpd = allocator_->setCheckpointPoolState(device, pps);
+    for (auto& ptr : cpd.dataptrs_allocd) {
+      ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index()));
+    }
+    return cpd;
+  }
+
+  std::string name() override {
+    return allocator_->name();
+  }
+
 };
 
 } // namespace c10::hip
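
The class is a pure forwarding decorator: every virtual method delegates to the wrapped HIP caching allocator, and the only behavioral change is that the DataPtrs returned by allocate and setCheckpointPoolState are relabeled as CUDA devices, keeping the rest of the device bookkeeping consistent. A minimal sketch of the pattern with toy types (not the real c10 interfaces):

#include <cstddef>

enum class DeviceType { CUDA, HIP };

// Toy stand-ins for c10::DataPtr / c10::Allocator, for illustration only.
struct DataPtr {
  void* ptr;
  DeviceType device;
};

struct Allocator {
  virtual ~Allocator() = default;
  virtual DataPtr allocate(std::size_t size) = 0;
};

// Forwarding decorator: delegates the allocation and relabels the returned
// pointer's device, mirroring HIPAllocatorMasqueradingAsCUDA::allocate above.
class MasqueradingAllocator final : public Allocator {
 public:
  explicit MasqueradingAllocator(Allocator* real) : real_(real) {}
  DataPtr allocate(std::size_t size) override {
    DataPtr r = real_->allocate(size);
    r.device = DeviceType::CUDA;  // still HIP memory; only the label changes
    return r;
  }
 private:
  Allocator* real_;
};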

aten/src/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.cpp

Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,11 @@
-#include <c10/core/Allocator.h>
+#include <c10/hip/HIPCachingAllocator.h>
+#include <ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h>
 #include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
 
 namespace c10 { namespace hip {
 namespace HIPCachingAllocatorMasqueradingAsCUDA {
 
-Allocator* get() {
+HIPCachingAllocator::HIPAllocator* get() {
   static HIPAllocatorMasqueradingAsCUDA allocator(HIPCachingAllocator::get());
   return &allocator;
 }
