diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h index 8aaa416663..96f491d46e 100644 --- a/bin/RegisterTritonDialects.h +++ b/bin/RegisterTritonDialects.h @@ -87,6 +87,7 @@ inline void registerTritonDialects(mlir::DialectRegistry ®istry) { mlir::registerTritonAMDGPUReorderInstructions(); mlir::registerTritonAMDGPUStreamPipelineV2(); mlir::registerTritonAMDGPUCanonicalizePointers(); + mlir::registerTritonAMDGPUConvertToBufferOps(); // TODO: register Triton & TritonGPU passes registry.insert CACHE_INVALIDATING_ENV_VARS = { // clang-format off "AMDGCN_ENABLE_DUMP", + "AMDGCN_USE_BUFFER_OPS", "DISABLE_FAST_REDUCTION", "DISABLE_LLVM_OPT", "DISABLE_MMA_V3", diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp index 0ccd97970a..1a0c115a9e 100644 --- a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp @@ -109,6 +109,19 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern { : ConvertOpToLLVMPattern(typeConverter, benefit), targetInfo(targetInfo) { } + // FIXME [Dot LL] + // Do for all DotOperandEncodingAttr once we have LLs for all of them + static bool isSupportedDotOpLayout(Attribute layout) { + if (auto dot = dyn_cast(layout)) { + if (auto mma = dyn_cast(dot.getParent())) { + return mma.isAmpere() && dot.getKWidth() == 8; + } + if (isa(dot.getParent())) + return true; + } + return false; + }; + LogicalResult matchAndRewrite(LocalLoadOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -116,20 +129,10 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern { RankedTensorType dstTy = op.getType(); Attribute srcLayout = srcTy.getEncoding(); Attribute dstLayout = dstTy.getEncoding(); - // FIXME [Dot LL] - // Do for all DotOperandEncodingAttr once we have LLs for all of them - auto isAmpereLargeKWidth = [](Attribute layout) { - if (auto dot = dyn_cast(layout)) { - if (auto mma = dyn_cast(dot.getParent())) { - return mma.isAmpere() && dot.getKWidth() == 8; - } - } - return false; - }; if (isa(srcLayout) && (isa( dstLayout) || - isAmpereLargeKWidth(dstLayout))) { + isSupportedDotOpLayout(dstLayout))) { return lowerSharedToDistributed(op, adaptor, getTypeConverter(), rewriter); } @@ -167,10 +170,10 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern { auto srcTy = op.getSrc().getType(); auto dstTy = op.getResult().getType(); auto dstShape = dstTy.getShape(); - assert(dstShape.size() <= 2 && - "Unexpected rank of ConvertLayout(shared->blocked)"); auto srcSharedLayout = cast(srcTy.getEncoding()); auto dstLayout = dstTy.getEncoding(); + assert((dstShape.size() <= 2 || isSupportedDotOpLayout(dstLayout)) && + "Unexpected rank of ConvertLayout(shared->distributed)"); auto inOrd = getOrder(srcSharedLayout); auto smemObj = LLVM::getSharedMemoryObjectFromStruct( @@ -184,31 +187,36 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern { // FIXME [Dot LL] // Ampere case // In this case, we need to pack the outputs into i32 - if (isa(dstTy.getEncoding())) { - if (elemLlvmTy.isInteger(8)) { - auto concat = [&](Value a1, Value a2, Value a3, Value a4) { - return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))), - or_(shl(zext(i32_ty, a3), i32_val(16)), - shl(zext(i32_ty, a4), i32_val(24)))); - }; - SmallVector outVals32(outVals.size() / 4); - for (int i = 0; i < outVals32.size(); ++i) { - outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1], - outVals[4 * i + 2], outVals[4 * i + 3]); - } - 
outVals = outVals32; - } else { - assert(elemLlvmTy.isBF16() && "Unexpected element type"); - auto concat = [&](Value a, Value b) { - return or_(zext(i32_ty, bitcast(a, i16_ty)), - shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16))); - }; + if (auto dotOp = dyn_cast(dstTy.getEncoding())) { + if (auto parent = dyn_cast(dotOp.getParent())) { + if (parent.isAmpere()) { + if (elemLlvmTy.isInteger(8)) { + auto concat = [&](Value a1, Value a2, Value a3, Value a4) { + return or_( + or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))), + or_(shl(zext(i32_ty, a3), i32_val(16)), + shl(zext(i32_ty, a4), i32_val(24)))); + }; + SmallVector outVals32(outVals.size() / 4); + for (int i = 0; i < outVals32.size(); ++i) { + outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1], + outVals[4 * i + 2], outVals[4 * i + 3]); + } + outVals = outVals32; + } else { + assert(elemLlvmTy.isBF16() && "Unexpected element type"); + auto concat = [&](Value a, Value b) { + return or_(zext(i32_ty, bitcast(a, i16_ty)), + shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16))); + }; - SmallVector outVals32(outVals.size() / 2); - for (int i = 0; i < outVals32.size(); ++i) { - outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]); + SmallVector outVals32(outVals.size() / 2); + for (int i = 0; i < outVals32.size(); ++i) { + outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]); + } + outVals = outVals32; + } } - outVals = outVals32; } } diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 2a3606b581..a7bdb3603c 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -4026,10 +4026,11 @@ def _kernel(dst, src, CACHE: tl.constexpr): amdgcn = pgm.asm['amdgcn'] cg_cache_modifier_str = 'nt' cv_cache_modifier_str = 'sc0 sc1' + buffer_load_line = [line for line in amdgcn.splitlines() if "buffer_load" in line] global_load_line = [line for line in amdgcn.splitlines() if "global_load" in line] flat_load_line = [line for line in amdgcn.splitlines() if "flat_load" in line] if cache == '' or cache == '.ca': - assert cg_cache_modifier_str not in global_load_line[0] + assert cg_cache_modifier_str not in (global_load_line[0] if global_load_line else buffer_load_line[0]) if cache == '.cg': assert cg_cache_modifier_str in global_load_line[0] if cache == '.cv': diff --git a/test/Conversion/amd/builtin_func_to_llvm.mlir b/test/Conversion/amd/builtin_func_to_llvm.mlir new file mode 100644 index 0000000000..06ef06c542 --- /dev/null +++ b/test/Conversion/amd/builtin_func_to_llvm.mlir @@ -0,0 +1,12 @@ +// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=True" --convert-builtin-func-to-llvm="ftz=True" | FileCheck %s --check-prefix=LLVM_FTZ +// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942 ftz=False" --convert-builtin-func-to-llvm="ftz=False" | FileCheck %s --check-prefix=LLVM_NO_FTZ + +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { + tt.func public @test_fast_expf(%arg0: tensor<64xf32, #blocked>) attributes {noinline = false} { + // LLVM_FTZ: llvm.amdgcn.exp2.f32 + // LLVM_NO_FTZ: llvm.exp2.f32 + %0 = tt.extern_elementwise %arg0 {libname = "libdevice", libpath = "", pure = true, symbol = "__triton_hip_fast_expf"} : (tensor<64xf32, #blocked>) -> tensor<64xf32, 
#blocked> + tt.return + } +} diff --git a/test/Conversion/amd/compute-base-ptr.mlir b/test/Conversion/amd/compute-base-ptr.mlir index e8376b1d8b..809e5a8699 100644 --- a/test/Conversion/amd/compute-base-ptr.mlir +++ b/test/Conversion/amd/compute-base-ptr.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s +// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 --mlir-print-debuginfo --mlir-pretty-debuginfo| FileCheck %s #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = false}> @@ -6,13 +6,14 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.shared = 544 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: @local_load_offset tt.func @local_load_offset(%arg0: tensor<16x16xf16, #mma>) { - %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> - %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> loc(#loc1) + %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2) // This catches base ptr calculation in the computeBasePtr, checks if the gep has correct element type. - // CHECK: llvm.sub - // CHECK-NEXT: llvm.getelementptr - // CHECK-SAME: (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 - %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> + // CHECK: llvm.getelementptr {{.*}} (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 local_load:3:0 + %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3) tt.return } } +#loc1 = loc("conert_layout":1:0) +#loc2 = loc("local_alloc":2:0) +#loc3 = loc("local_load":3:0) diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index 876dc0d769..ef67338457 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -34,3 +34,31 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +// Smoke test to check that mfma 32 and dot operand layouts can work with small tensors, for example with shape 16x16 +#mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = true}> +#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=4}> +#dotop1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth=4}> +#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: small_mfma_tensor_conversions + tt.func public @small_mfma_tensor_conversions(%arg0: tensor<16x16xf16, #mfma>, 
%arg1: tensor<16x16x!tt.ptr, #mfma>) { + // CHECK-NOT: triton_gpu.convert_layout + %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> + // CHECK-4: store {{.*}} vector<4xf16> + %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0> + // CHECK-2: load {{.*}} vector<4xf16> + %2 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1> + // CHECK-8: load {{.*}} vector<1xf16> + %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma> + // CHECK-4: load {{.*}} vector<4xf16> + %4 = tt.fp_to_fp %3 : tensor<16x16xf16, #mfma> -> tensor<16x16xf32, #mfma> + + %5 = tt.dot %1, %2, %4 : tensor<16x16xf16, #dotop0> * tensor<16x16xf16, #dotop1> -> tensor<16x16xf32, #mfma> + // Store result to prevent DCE from removing all conversion related code + %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + tt.return + } +} diff --git a/test/Conversion/tritongpu_to_llvm_hopper.mlir b/test/Conversion/tritongpu_to_llvm_hopper.mlir index 83653d57b6..113ec3cf66 100644 --- a/test/Conversion/tritongpu_to_llvm_hopper.mlir +++ b/test/Conversion/tritongpu_to_llvm_hopper.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm=compute-capability=90 2>&1 | FileCheck %s +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm='compute-capability=90 ptx-version=81' 2>&1 | FileCheck %s #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0], instrShape = [16, 256, 32]}> #shared = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 2, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], hasLeadingOffset = true}> diff --git a/test/Conversion/tritongpu_to_llvm_hopper_ptx80.mlir b/test/Conversion/tritongpu_to_llvm_hopper_ptx80.mlir new file mode 100644 index 0000000000..906c610023 --- /dev/null +++ b/test/Conversion/tritongpu_to_llvm_hopper_ptx80.mlir @@ -0,0 +1,44 @@ +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-gpu-to-llvm='compute-capability=90 ptx-version=80' 2>&1 | FileCheck %s + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @atomic_add_f32_nomask(%dest_ptrs: tensor<256x!tt.ptr, #blocked> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32}, %data: tensor<256xf32, #blocked>) attributes {noinline = false} { + // CHECK-LABEL: atomic_add_f32_nomask + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + %0 = tt.atomic_rmw fadd, acq_rel, gpu, %dest_ptrs, %data : (tensor<256x!tt.ptr, #blocked>, tensor<256xf32, #blocked>) -> tensor<256xf32, #blocked> + tt.return + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> 
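The new tritongpu_to_llvm_hopper_ptx80.mlir tests pin `ptx-version=80`, so the `tt.atomic_rmw fadd` ops above lower to scalar `atom.global.gpu.acq_rel.add` instructions rather than the vectorized forms allowed from PTX 8.1 on. A user-level kernel that reaches this code path could look like the hypothetical sketch below (kernel and tensor names are illustrative, not part of this change).

```python
# Hypothetical Triton kernel exercising the tt.atomic_rmw fadd path covered by
# the PTX 8.0 tests above; names and launch parameters are illustrative only.
import torch
import triton
import triton.language as tl


@triton.jit
def atomic_add_f32_kernel(dest_ptr, src_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    vals = tl.load(src_ptr + offsets, mask=mask, other=0.0)
    # Lowers to tt.atomic_rmw fadd; with compute capability 90 and PTX >= 81 the
    # backend may vectorize these atomics, while ptx-version=80 keeps them scalar.
    tl.atomic_add(dest_ptr + offsets, vals, mask=mask)


def run():
    n = 1024
    src = torch.randn(n, device="cuda", dtype=torch.float32)
    dest = torch.zeros(n, device="cuda", dtype=torch.float32)
    atomic_add_f32_kernel[(triton.cdiv(n, 256),)](dest, src, n, BLOCK=256)
    torch.testing.assert_close(dest, src)
```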
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @atomic_add_f32_withmask(%dest_ptrs: tensor<256x!tt.ptr, #blocked> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32}, %data: tensor<256xf32, #blocked>, %mask: tensor<256xi1, #blocked> {tt.constancy = 2 : i32}) attributes {noinline = false} { + // CHECK-LABEL: atomic_add_f32_withmask + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + // CHECK: atom.global.gpu.acq_rel.add.f32 + %0 = tt.atomic_rmw fadd, acq_rel, gpu, %dest_ptrs, %data, %mask : (tensor<256x!tt.ptr, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked> + tt.return + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @atomic_add_f16_withmask(%dest_ptrs: tensor<256x!tt.ptr, #blocked> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32}, %data: tensor<256xf16, #blocked>, %mask: tensor<256xi1, #blocked> {tt.constancy = 4 : i32}) attributes {noinline = false} { + // CHECK-LABEL: atomic_add_f16_withmask + // CHECK: atom.global.gpu.acq_rel.add.noftz.f16x2 + // CHECK: atom.global.gpu.acq_rel.add.noftz.f16x2 + // CHECK: atom.global.gpu.acq_rel.add.noftz.f16x2 + // CHECK: atom.global.gpu.acq_rel.add.noftz.f16x2 + %0 = tt.atomic_rmw fadd, acq_rel, gpu, %dest_ptrs, %data, %mask : (tensor<256x!tt.ptr, #blocked>, tensor<256xf16, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf16, #blocked> + tt.return + } +} diff --git a/test/TritonGPU/amd/amd-canonicalize-pointers.mlir b/test/TritonGPU/amd/amd-canonicalize-pointers.mlir index eda2dd8d99..6c3e2ac42f 100644 --- a/test/TritonGPU/amd/amd-canonicalize-pointers.mlir +++ b/test/TritonGPU/amd/amd-canonicalize-pointers.mlir @@ -89,6 +89,46 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // + // This is the same as conversion3, but now the `arith.extsi` operations + // disappeared and all the offsets are 32 bits. 
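The chained `tt.addptr` pattern exercised by the `conversion4` test below (a splatted base pointer plus two 32-bit offset additions, with no `arith.extsi`) roughly corresponds to a kernel like the hedged sketch here; the `tt.pointer_range = 32` attribute in the test is attached at the IR level rather than written by user code.

```python
# Hypothetical kernel shape behind conversion4: one splatted base pointer and
# two chained 32-bit offset additions, so no sign-extension to i64 is needed.
import triton
import triton.language as tl


@triton.jit
def conversion4_like(src_ptr, dst_ptr, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)  # 32-bit tensor offsets
    p = src_ptr + offs                        # first tt.addptr
    p = p + offs                              # second tt.addptr on the same offsets
    x = tl.load(p)                            # src must cover the doubled offset
    tl.store(dst_ptr + offs, x)
```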
+ // + // CHECK-LABEL: tt.func @conversion4 + tt.func @conversion4(%arg0: !tt.ptr{tt.pointer_range = 32 : i32})-> tensor<1024xf32, #blocked>{ + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + + //CHECK: %0 = tt.get_program_id x : i32 + //CHECK: %[[pid:.*]] = arith.muli %0, {{.*}} : i32 + //CHECK: %[[makerange:.*]] = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + //CHECK: %[[uniformOffset1:.*]] = arith.addi %[[pid]], {{.*}} : i32 + //CHECK: %[[tensorOffset1:.*]] = arith.addi %{{.*}}, %[[makerange]] : tensor<1024xi32, #blocked> + //CHECK: %[[uniformOffset0:.*]] = arith.addi %[[pid:.*]], %{{.*}} : i32 + //CHECK: %[[tensorOffset3:.*]] = arith.addi %{{.*}}, %[[makerange]] : tensor<1024xi32, #blocked> + //CHECK: %[[zero:.*]] = tt.splat %{{.*}} : i32 -> tensor<1024xi32, #blocked> + //CHECK: %[[uniformPtr0:.*]] = tt.addptr %arg0, %[[uniformOffset0:.*]] : !tt.ptr, i32 + //CHECK: %[[tensorOffset0:.*]]= arith.addi %[[tensorOffset3]], %[[zero]] : tensor<1024xi32, #blocked> + //CHECK: %[[uniformPtr1:.*]] = tt.addptr %[[uniformPtr0]], %[[uniformOffset1]] : !tt.ptr, i32 + //CHECK: %[[tensorOffset2:.*]] = arith.addi %[[tensorOffset1]], %[[tensorOffset0]]: tensor<1024xi32, #blocked> + //CHECK: %[[scalarPtr:.*]] = tt.splat %[[uniformPtr1]] : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + //CHECK: %[[newPtr:.*]] = tt.addptr %[[scalarPtr]], %[[tensorOffset2]] : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + //CHECK: tt.load %[[newPtr]] + %5 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %8 = tt.load %7 : tensor<1024x!tt.ptr, #blocked> + tt.return %8 : tensor<1024xf32, #blocked> + } +} + +// ----- + #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @forOp diff --git a/test/TritonGPU/amd/amd-convert-buffer-ops.mlir b/test/TritonGPU/amd/amd-convert-buffer-ops.mlir new file mode 100644 index 0000000000..4fb418e381 --- /dev/null +++ b/test/TritonGPU/amd/amd-convert-buffer-ops.mlir @@ -0,0 +1,124 @@ +// RUN: triton-opt %s -split-input-file --tritonamdgpu-convert-buffer-ops | FileCheck %s + +#blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} { + // CHECK-LABEL: simple + tt.func @simple(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 :i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32) { + %c256_i32 = arith.constant 256 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c256_i32 : i32 + %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0> + %3 = tt.splat %1 : i32 -> tensor<256xi32, #blocked0> + // CHECK: %[[offset:.*]] = arith.addi + %4 = arith.addi %3, %2 : tensor<256xi32, #blocked0> + %5 = tt.splat %arg0 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + 
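The `simple` function being added here is essentially the canonical element-wise add kernel: both loads and the store use a splatted base pointer plus a provably non-negative 32-bit offset, which is exactly what the new pass needs in order to rewrite them into `amdgpu.buffer_load`/`buffer_store`. A Triton-level sketch of the same pattern (not part of this change):

```python
# Triton-level counterpart of the `simple` test (a sketch; the real test is
# written directly in MLIR). Both loads and the store use a splatted base
# pointer plus a non-negative 32-bit offset, so they qualify for buffer ops.
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)  # provably non-negative, 32-bit
    x = tl.load(x_ptr + offsets)                 # -> amdgpu.buffer_load
    y = tl.load(y_ptr + offsets)                 # -> amdgpu.buffer_load
    tl.store(out_ptr + offsets, x + y)           # -> amdgpu.buffer_store
```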
%6 = tt.addptr %5, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + %7 = tt.splat %arg1 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + %8 = tt.addptr %7, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + // CHECK: buffer_load %arg0[%[[offset]]] + %9 = tt.load %6 : tensor<256x!tt.ptr, #blocked0> + // CHECK: buffer_load %arg1[%[[offset]]] + %10 = tt.load %8 : tensor<256x!tt.ptr, #blocked0> + // CHECK: %[[data:.*]] = arith.addf + %11 = arith.addf %9, %10 : tensor<256xf32, #blocked0> + %12 = tt.splat %arg2 : !tt.ptr -> tensor<256x!tt.ptr, #blocked0> + %13 = tt.addptr %12, %4 : tensor<256x!tt.ptr, #blocked0>, tensor<256xi32, #blocked0> + // CHECK: buffer_store %[[data]], %arg2[%[[offset]]] + tt.store %13, %11 : tensor<256x!tt.ptr, #blocked0> + tt.return + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: assume_positive_offset + tt.func @assume_positive_offset(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked>{ + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %sub = arith.subi %1, %c128_i32 : i32 + %cmp = arith.cmpi sgt, %sub, %c0_i32 : i32 + "llvm.intr.assume"(%cmp) : (i1) -> () + %2 = tt.splat %sub : i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + // CHECK: %[[offset:.*]] = arith.addi + %4 = arith.addi %2, %3 : tensor<1024xi32, #blocked> + // CHECK: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // CHECK: buffer_load %[[scalar_ptr]][%[[offset]]] + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: offset_64_bits + tt.func @offset_64_bits(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked> { + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %sub = arith.subi %1, %c128_i32 : i32 + %2 = tt.splat %sub : i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %ext2 = arith.extsi %2 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %ext3 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %4 = arith.addi %ext2, %ext3 : tensor<1024xi64, #blocked> + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi64, #blocked> + // CHECK: tt.load + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // 
CHECK-LABEL: offset_64_bits_narrow + tt.func public @offset_64_bits_narrow(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) -> tensor<1024xf32, #blocked> { + %c1024_i32 = arith.constant 1024 : i32 + %c128_i32 = arith.constant 128 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.splat %1: i32 -> tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %ext2 = arith.extsi %2 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %ext3 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked> + %4 = arith.addi %ext2, %ext3 : tensor<1024xi64, #blocked> + // CHECK: %[[scalar_ptr:.*]] = tt.addptr %arg0 + %5 = tt.addptr %arg0, %1 : !tt.ptr, i32 + %8 = tt.splat %5 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + // CHECK: %[[offset_32_bit:.*]] = arith.trunci + %narrow4 = arith.trunci %4 : tensor<1024xi64, #blocked> to tensor <1024xi32, #blocked> + %9 = tt.addptr %8, %narrow4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // CHECK: buffer_load %[[scalar_ptr]][%[[offset_32_bit]]] + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + // CHECK-LABEL: non_canonical_ptr + tt.func @non_canonical_ptr(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: tensor<1024xi32, #blocked>) -> tensor<1024xf32, #blocked>{ + %8 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %arg1: tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + // CHECK: tt.load + %10 = tt.load %9 : tensor<1024x!tt.ptr, #blocked> + tt.return %10 : tensor<1024xf32, #blocked> + } +} diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir index 682c1cb301..3b727b4e95 100644 --- a/test/TritonGPU/combine.mlir +++ b/test/TritonGPU/combine.mlir @@ -2649,3 +2649,39 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : tt.return } } + +// ----- + +// Minimized reproducer for compiler crash during remove layouts conversions pass: +// If dot result transformed into tensor with shape smaller than one MFMA instruction size, it triggers various asserts. +// This is a smoke test that checks that compiler do not crash. 
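The reproducer that follows feeds `tt.dot` results whose shapes are smaller than one MFMA tile through the remove-layout-conversions pass. At the user level, that situation can arise from a kernel along the lines of this hedged sketch (shapes and names are purely illustrative):

```python
# Hypothetical user kernel that yields a dot result smaller than one 32x32 MFMA
# tile (here 32x16), the situation the smoke test below guards against.
import triton
import triton.language as tl


@triton.jit
def small_dot_kernel(a_ptr, b_ptr, c_ptr,
                     M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])   # (M, K)
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])   # (K, N)
    c = tl.dot(a, b)                                             # (M, N) = (32, 16)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], c)


# e.g. small_dot_kernel[(1,)](a, b, c, M=32, N=16, K=32) with fp16 a, b and fp32 c
```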
+// +// CHECK-LABEL: small_tensor_mfma + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 1], order = [0, 1]}> +#mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = true}> +#mma1 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}> +module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + tt.func public @small_tensor_mfma(%arg0: !tt.ptr) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %cst_1 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %cst_2 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> + %cst_3 = arith.constant dense<1.230000e+02> : tensor<32x16xf32, #mma1> + %0 = tt.dot %cst_0, %cst_1, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma> + %1 = triton_gpu.convert_layout %0 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + %2 = "tt.reduce" (%1) ({ + ^bb0(%arg1: f32, %arg2: f32): + %3 = arith.addf %arg1, %arg2 : f32 + tt.reduce.return %3 : f32 + }) {axis = 1 : i32} : (tensor<32x32xf32, #blocked>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xf32, #blocked> + %5 = tt.broadcast %4 : tensor<32x1xf32, #blocked> -> tensor<32x16xf32, #blocked> + %6 = triton_gpu.convert_layout %5 : tensor<32x16xf32, #blocked> -> tensor<32x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> + %7 = tt.dot %cst_2, %6, %cst_3 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<32x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<32x16xf32, #mma1> + %addr = tt.splat %arg0 : !tt.ptr -> tensor<32x16x!tt.ptr, #blocked> + %8 = triton_gpu.convert_layout %7 : tensor<32x16xf32, #mma1> -> tensor<32x16xf32, #blocked> + tt.store %addr, %8 : tensor<32x16x!tt.ptr, #blocked> + tt.return + } +} diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir index 7fa7812c5a..3abcc581b9 100644 --- a/test/TritonGPU/loop-pipeline-hip.mlir +++ b/test/TritonGPU/loop-pipeline-hip.mlir @@ -233,3 +233,33 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +// Check that the stream pipeliner updates the resulting memory layout of transpose ops to mutable if immutable local buffers are replaced +// CHECK-LABEL: loop_with_dot_and_transpose +// CHECK: triton_gpu.local_alloc {{.*}}, mutable> +// CHECK: tt.trans {{.*}}, mutable> -> {{.*}}, mutable> + +#blocked = #triton_gpu.blocked<{sizePerThread = [2, 2], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1], hasLeadingOffset = false}> +#shared1 = 
#triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx1201", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @loop_with_dot_and_transpose(%arg0: !tt.ptr {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: i32, %arg4: tensor<32x32x!tt.ptr, #blocked1>, %arg5: tensor<32x32x!tt.ptr, #blocked>) attributes {noinline = false} { + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> + %0 = scf.for %arg2 = %c0_i32 to %arg1 step %c1_i32 iter_args(%arg3 = %cst) -> (tensor<32x32xf32, #blocked>) : i32 { + %2 = tt.load %arg4 : tensor<32x32x!tt.ptr, #blocked1> + %3 = triton_gpu.local_alloc %2 : (tensor<32x32xf32, #blocked1>) -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> + %4 = tt.trans %3 {order = array} : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> + %5 = triton_gpu.local_load %4 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> + %6 = triton_gpu.convert_layout %2 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> + %7 = tt.dot %6, %5, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<32x32xf32, #blocked> + scf.yield %7 : tensor<32x32xf32, #blocked> + } + tt.store %arg5, %0 : tensor<32x32x!tt.ptr, #blocked> + tt.return + } +} diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index bcee13b46e..162695c2d9 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -184,7 +184,10 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_reduce_data_duplication(pm) if amd.has_matrix_core_feature(options.arch): amd.passes.ttgpuir.add_reorder_instructions(pm) - amd.passes.ttgpuir.add_canonicalize_pointers(pm) + if os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1": + amd.passes.ttgpuir.add_canonicalize_pointers(pm) + passes.common.add_canonicalizer(pm) + amd.passes.ttgpuir.add_convert_to_buffer_ops(pm) passes.common.add_canonicalizer(pm) passes.common.add_cse(pm) passes.common.add_symbol_dce(pm) @@ -229,12 +232,7 @@ def make_llir(src, metadata, options): amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.instruction_sched_variant) if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0": passes.llvmir.add_di_scope(pm) - # This pass (`add_builtin_func_to_llvmir`) serves as a temporary workaround to address the issue of excessive basic block - # count caused by predicated loads/stores. In certain kernels, the addition of these blocks can cause the MLIR - # canonicalizer to never finish when attempting to merge blocks. The permanent solution under consideration - # involves using MUBUF instructions that have built-in out-of-bounds checks, which would eliminate the need - # for conditional branching around memory accesses. 
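With the pipeline change above, pointer canonicalization and the buffer-op conversion only run when `AMDGCN_USE_BUFFER_OPS` is set; the same variable is added to `CACHE_INVALIDATING_ENV_VARS`, so toggling it changes the compilation cache key. Opting in from user code is just an environment variable set before kernels are compiled:

```python
# Enable the new buffer-op path (a sketch; the default "0" keeps the plain
# tt.load/tt.store lowering). Set it before the first kernel compilation; since
# it is cache-invalidating, flipping it will not reuse binaries that were
# compiled with the other setting.
import os

os.environ["AMDGCN_USE_BUFFER_OPS"] = "1"

import triton  # import (and compile kernels) only after the variable is set
```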
- amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm) + amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ) pm.run(mod) # LLVM-IR (MLIR) -> LLVM-IR (LLVM) diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h index 67ff40d5b9..bd726bd845 100644 --- a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h +++ b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.h @@ -33,7 +33,8 @@ createOptimizeLDSUsagePass(StringRef arch, int32_t customLDSLimit = 0); std::unique_ptr> createConvertTritonAMDGPUToLLVMPass(StringRef targetArch, bool ftz); -std::unique_ptr> createConvertBuiltinFuncToLLVMPass(); +std::unique_ptr> +createConvertBuiltinFuncToLLVMPass(bool ftz); std::unique_ptr> createInsertInstructionSchedHintsPass(); std::unique_ptr> diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td index ccb2b1898f..9f4665aef2 100644 --- a/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td +++ b/third_party/amd/include/TritonAMDGPUToLLVM/Passes.td @@ -49,10 +49,14 @@ def ConvertTritonAMDGPUToLLVM : Pass<"convert-triton-amdgpu-to-llvm", "mlir::Mod def ConvertBuiltinFuncToLLVM : Pass<"convert-builtin-func-to-llvm", "mlir::ModuleOp"> { let summary = "Convert Builtin Func to LLVM"; - let constructor = "mlir::triton::createConvertBuiltinFuncToLLVMPass()"; + let constructor = "mlir::triton::createConvertBuiltinFuncToLLVMPass(/*ftz=*/true)"; let dependentDialects = ["mlir::LLVM::LLVMDialect"]; + let options = [ + Option<"ftz", "ftz", "bool", /*default*/"true", + "flush denorms for math functions">, + ]; } def InsertInstructionSchedHints : Pass<"insert-instruction-sched-hints", "mlir::ModuleOp"> { diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h index 841137887b..d0ffdae28e 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h @@ -2,6 +2,7 @@ #define TRITON_DIALECT_TRITONAMDGPU_TRANSFORMS_PASSES_H_ #include "mlir/Pass/Pass.h" +#include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h" #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" namespace mlir { @@ -23,6 +24,8 @@ std::unique_ptr createTritonAMDGPUOptimizeEpiloguePass(); std::unique_ptr createTritonAMDGPUCanonicalizePointersPass(); +std::unique_ptr createTritonAMDGPUConvertToBufferOpsPass(); + /// Generate the code for registering passes. 
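For standalone experiments, the pass ordering that `make_ttgir` uses when the flag is on can be reproduced through the Python bindings. A minimal helper, assuming the binding layout used by the AMD backend's compiler.py (`ir`, `passes`, `amd` from `triton._C.libtriton`):

```python
# Minimal sketch of the buffer-op portion of the TTGIR pipeline in this PR.
from triton._C.libtriton import ir, passes, amd  # assumed binding layout


def add_buffer_op_passes(pm: "ir.pass_manager"):
    # Same ordering as make_ttgir when AMDGCN_USE_BUFFER_OPS=1: canonicalize
    # pointers first, clean up, then convert eligible tt.load/tt.store.
    amd.passes.ttgpuir.add_canonicalize_pointers(pm)
    passes.common.add_canonicalizer(pm)
    amd.passes.ttgpuir.add_convert_to_buffer_ops(pm)
```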
#define GEN_PASS_REGISTRATION #include "TritonAMDGPUTransforms/Passes.h.inc" diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index d59935e796..433e60be67 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -111,4 +111,14 @@ def TritonAMDGPUReorderInstructions: Pass<"tritonamdgpu-reorder-instructions", " let dependentDialects = []; } +def TritonAMDGPUConvertToBufferOps : Pass<"tritonamdgpu-convert-buffer-ops", "mlir::ModuleOp"> { + let summary = "Convert memory operations to buffer operations"; + + let description = "This pass converts memory operations (e.g., tt.load/tt.store) to amdgpu buffer operations, if possible"; + + let constructor = "mlir::createTritonAMDGPUConvertToBufferOpsPass()"; + + let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"]; +} + #endif diff --git a/third_party/amd/language/hip/libdevice.py b/third_party/amd/language/hip/libdevice.py index 6b40a40c9c..a69d4406cc 100644 --- a/third_party/amd/language/hip/libdevice.py +++ b/third_party/amd/language/hip/libdevice.py @@ -66,6 +66,13 @@ def exp(arg0, _builder=None): }, is_pure=True, _builder=_builder) +@core.extern +def fast_expf(arg0, _builder=None): + return core.extern_elementwise("", "", [arg0], { + (core.dtype("fp32"), ): ("__triton_hip_fast_expf", core.dtype("fp32")), + }, is_pure=True, _builder=_builder) + + @core.extern def fast_dividef(arg0, arg1, _builder=None): return core.extern_elementwise("", "", [arg0, arg1], { diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp index be009af4d1..37bdb8fe99 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/BufferOpsEmitter.cpp @@ -133,7 +133,7 @@ Type BufferEmitter::getBufferOpType(Type type) { // will be bitcast-able to the original type. 
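The new `fast_expf` wrapper can be called from a kernel like any other libdevice function; the import path below assumes the HIP libdevice module is packaged under `triton.language.extra.hip`, which may differ between Triton versions. The lowering added later in this diff multiplies the argument by log2(e) = 1.4426950408889634 and calls an exp2 intrinsic, using the identity exp(x) = 2**(x * log2(e)).

```python
# Sketch of using the new fast_expf wrapper from a kernel. The import path is
# an assumption; adjust it to your Triton version.
import triton
import triton.language as tl
from triton.language.extra.hip import libdevice


@triton.jit
def fast_exp_kernel(x_ptr, y_ptr, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs)
    # Lowered to __triton_hip_fast_expf, i.e. exp2(x * log2(e)), using
    # llvm.amdgcn.exp2.f32 when ftz is on and llvm.exp2.f32 otherwise.
    y = libdevice.fast_expf(x)
    tl.store(y_ptr + offs, y)
```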
So if the types // ended up different, we simply have to emit a `bitcastOp` to convert Type bufferType = type; - if (bufferVecSize != vecSize) + if (bufferVecSize != vecSize || bufferElementType != elementType) bufferType = VectorType::get(bufferVecSize, bufferElementType); if (bufferVecSize == 1) bufferType = getElementTypeOrSelf(bufferType); diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp index 18364b67e1..409d14774f 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp @@ -17,15 +17,14 @@ using namespace mlir; namespace { -class CallOpConversion : public mlir::RewritePattern { +class CallOpConversion : public OpRewritePattern { public: - CallOpConversion(mlir::MLIRContext *context) - : mlir::RewritePattern(LLVM::CallOp::getOperationName(), 1, context) {} + CallOpConversion(mlir::MLIRContext *context, bool ftz) + : OpRewritePattern(context, 1), ftz(ftz) {} LogicalResult - matchAndRewrite(mlir::Operation *op, + matchAndRewrite(LLVM::CallOp callOp, mlir::PatternRewriter &rewriter) const override { - auto callOp = cast(op); if (isPredicatedLoad(callOp)) { return convertPredicatedLoad(callOp, rewriter); } else if (isPredicatedStore(callOp)) { @@ -195,6 +194,18 @@ class CallOpConversion : public mlir::RewritePattern { LLVM::FastmathFlagsAttr defaultFlags{}; replacementOp = rewriter.create( loc, returnType, operands[0], rcpOp->getResult(0), defaultFlags); + } else if (calleeName == "__triton_hip_fast_expf") { + assert(operands.size() == 1); + assert(operands[0].getType().getIntOrFloatBitWidth() == 32); + const double log2e = 1.4426950408889634; + LLVM::FastmathFlagsAttr defaultFlags{}; + auto mulOp = rewriter.create( + loc, rewriter.getF32Type(), operands[0], + LLVM::createConstantF32(loc, rewriter, log2e), defaultFlags); + const char *intrinsic = ftz ? 
"llvm.amdgcn.exp2.f32" : "llvm.exp2.f32"; + + replacementOp = LLVM::createLLVMIntrinsicCallOp( + rewriter, loc, intrinsic, returnType, mulOp->getResult(0)); } if (replacementOp) { @@ -204,11 +215,16 @@ class CallOpConversion : public mlir::RewritePattern { return mlir::failure(); } + +private: + bool ftz; }; struct ConvertBuiltinFuncToLLVM : public triton::impl::ConvertBuiltinFuncToLLVMBase< ConvertBuiltinFuncToLLVM> { + explicit ConvertBuiltinFuncToLLVM(bool ftz) { this->ftz = ftz; } + void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); @@ -217,7 +233,7 @@ struct ConvertBuiltinFuncToLLVM config.enableRegionSimplification = GreedySimplifyRegionLevel::Aggressive; RewritePatternSet patterns(context); - patterns.add(context); + patterns.add(context, this->ftz); if (mlir::applyPatternsAndFoldGreedily(mod, std::move(patterns), config) .failed()) { @@ -231,8 +247,9 @@ struct ConvertBuiltinFuncToLLVM namespace mlir { namespace triton { -std::unique_ptr> createConvertBuiltinFuncToLLVMPass() { - return std::make_unique(); +std::unique_ptr> +createConvertBuiltinFuncToLLVMPass(bool ftz) { + return std::make_unique(ftz); } } // namespace triton diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp index 953b01dab0..b7ee4efc72 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -50,7 +50,11 @@ struct LocalLoadOpConversion } private: - // shared -> dot_operand if the result layout is mfma + /// Lower ttg.local_load in dot operand layout if the operand parent layout is + /// MFMA or WMMA. + /// + /// \returns value with packed loaded values or empty value if this local_load + /// is not supproted. Value lowerSharedToDotOperandMMA( triton::gpu::LocalLoadOp op, triton::gpu::LocalLoadOpAdaptor adaptor, const LLVMTypeConverter *typeConverter, @@ -104,6 +108,8 @@ struct LocalLoadOpConversion isOuter = K == 1; Value res = lowerSharedToDotOperandMMA(op, adaptor, typeConverter, rewriter, dotOperandLayout, isOuter); + if (!res) + return failure(); rewriter.replaceOp(op, res); return success(); } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp index f7dc8755fa..a45efd4a79 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -165,7 +165,7 @@ struct LoadStoreConversionBase { // Get alignment from the pointer. 
Since this is a scalar pointer // we should not take the pointer contiguity to consider alignment auto *axisInfo = axisAnalysisPass.getAxisInfo(ptr); - auto maxMultipleBytes = axisInfo->getDivisibility(order[0]); + auto maxMultipleBytes = axisInfo->getDivisibility(0); auto elemNumBits = triton::getPointeeBitWidth(tensorTy); auto elemNumBytes = std::max(elemNumBits / 8, 1); auto align = std::max(maxMultipleBytes / elemNumBytes, 1); diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp index c96ddbbe89..3a40d73c2a 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp @@ -245,4 +245,10 @@ void TargetInfo::assertFail(RewriterBase &rewriter, Location loc, int TargetInfo::getSharedAddressSpace() const { return 3; } +bool TargetInfo::supportVectorizedAtomics() const { + // Note: not currently tested or used, but AMD generally supports vectorized + // atomics. + return true; +} + } // namespace mlir::triton::AMD diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h index eabb5d6715..0ce38d4d76 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h @@ -58,6 +58,8 @@ class TargetInfo : public mlir::triton::TargetInfoBase { StringRef file, StringRef func, int line) const override; int getSharedAddressSpace() const override; + bool supportVectorizedAtomics() const override; + private: void printfImpl(Value formatStrStart, int formatStrByteCount, ValueRange args, RewriterBase &rewriter, bool useStdErr) const; diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt b/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt index 414e4a329f..7da8083cfb 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt +++ b/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_triton_library(TritonAMDGPUTransforms AccelerateAMDMatmul.cpp CanonicalizePointers.cpp + ConvertToBufferOps.cpp OptimizeEpilogue.cpp ReorderInstructions.cpp StreamPipelineV2.cpp diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp index f8c4979682..a5b32abfef 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/CanonicalizePointers.cpp @@ -73,6 +73,17 @@ using namespace mlir; // `%fat_ptr = tt.addptr(%t_ptr, %fatPointers[ptr].offset)` // `%data = tt.load(%fat_ptr)` // +// Please note that `%offset` might be a 32bit or 64bit integer. If +// we can, we would like to use 32 bit integers. This can happen under +// certain conditions: +// +// a) We can determine that the offset cannot overflow. In this case, we can +// downcast the pointer just before emitting the load +// b) We know that the underlying memory size can be expressed as a 32 bit +// value. 
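As background for the comment above: the pass cannot blindly keep 32-bit offsets because a byte offset past 2**31 - 1 wraps around in i32 arithmetic, so one of conditions (a) or (b) has to hold before narrowing. A small illustration:

```python
# Why narrowing offsets needs a proof: i32 byte offsets wrap past 2**31 - 1.
def wrap_i32(v: int) -> int:
    """Two's-complement wrap of an integer to 32 bits."""
    v &= 0xFFFFFFFF
    return v - 0x1_0000_0000 if v >= 0x8000_0000 else v


elem_index = 600_000_000            # index into a float32 buffer (4 bytes/elem)
print(wrap_i32(elem_index * 4))     # -1894967296: no longer a valid i32 offset
print(elem_index * 4)               # 2400000000 is representable with 64-bit offsets
```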
In this case we can simply start with a 32bit offset and downcast +// if we ever meet 64 bit operations (because we know that the offset can be +// contained in 32 bits) +// class PointerCanonicalizer { public: explicit PointerCanonicalizer(ModuleOp moduleOp) @@ -571,12 +582,16 @@ LogicalResult PointerCanonicalizer::rewriteAddPtrOp(triton::AddPtrOp addPtrOp, bool propagateAtrs = true; if (!isZeroConst(nonUniformOffset)) { Type addPtrOffsetType = getElementTypeOrSelf(nonUniformOffset); + Type fatPtrOffsetType = getElementTypeOrSelf(fatPtrOffset); canNarrow = canNarrow && canNarrowOffset(fatPtrOffset, nonUniformOffset); - // If we the incoming offset is 32 bits, then we have to cast to 64 - if (addPtrOffsetType.isInteger(32)) + // Upcast or downcast the offset accordingly + if (addPtrOffsetType.isInteger(32) && fatPtrOffsetType.isInteger(64)) nonUniformOffset = extend32bitOffsetTo64Bits(rewriter, curLoc, nonUniformOffset); + else if (addPtrOffsetType.isInteger(64) && fatPtrOffsetType.isInteger(32)) + nonUniformOffset = + narrow64bitOffsetTo32bits(rewriter, curLoc, nonUniformOffset); newOffset = rewriter.create(curLoc, nonUniformOffset, fatPtrOffset); @@ -958,14 +973,18 @@ LogicalResult PointerCanonicalizer::rewritePointer(Value argPtr) { LogicalResult PointerCanonicalizer::rewriteFunction(triton::FuncOp funcOp) { Region ®ion = funcOp.getRegion(); - for (Value arg : region.getArguments()) { + for (auto [idx, arg] : llvm::enumerate(region.getArguments())) { // The pointer argument needs to be a scalar if (!isa(arg.getType())) continue; + int64_t bitness = 64; + if (IntegerAttr pointerRangeAttr = + funcOp.getArgAttrOfType(idx, "tt.pointer_range")) + bitness = pointerRangeAttr.getInt(); rewriter.setInsertionPointToStart(®ion.front()); Value zeroOffset = - rewriter.create(region.getLoc(), 0, 64); + rewriter.create(region.getLoc(), 0, bitness); // Start the rewrite clearFunctionState(); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp new file mode 100644 index 0000000000..f1d922041f --- /dev/null +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp @@ -0,0 +1,260 @@ +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "third_party/amd/include/Dialect/TritonAMDGPU/IR/Dialect.h" +#include "triton/Analysis/Utility.h" +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#include "llvm/ADT/TypeSwitch.h" +#include +#include + +#define GEN_PASS_CLASSES +#include "TritonAMDGPUTransforms/Passes.h" + +#define DEBUG_TYPE "tritonamdgpu-convert-buffer-ops" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +using namespace mlir; +namespace ttg = mlir::triton::gpu; +namespace tt = mlir::triton; + +namespace { +bool verifyNonNegativeByAssumption(Value expr, + const DenseSet &assumptions) { + for (Value assume : assumptions) { + LDBG("Assumption:" << assume); + if (auto cmpOp = 
assume.getDefiningOp()) { + bool isGreaterThan = (cmpOp.getPredicate() == arith::CmpIPredicate::sge || + cmpOp.getPredicate() == arith::CmpIPredicate::sgt); + APInt cst; + if (isGreaterThan && (cmpOp.getLhs() == expr) && + matchPattern(cmpOp.getRhs(), m_ConstantInt(&cst))) { + return cst.isNonNegative(); + } + } + } + return false; +} + +bool verifyNonNegativeExpr(Value expr, const DenseSet &assumptions) { + + // Check if the expression is contained in any assumption + if (verifyNonNegativeByAssumption(expr, assumptions)) { + LDBG("Non negative by assumption"); + return true; + } + + // Recurse if the operation is defined + Operation *op = expr.getDefiningOp(); + if (!op) + return false; + + bool nonNegative = + llvm::TypeSwitch(expr.getDefiningOp()) + .Case([&](auto broadcastOp) { + return verifyNonNegativeExpr(broadcastOp.getSrc(), assumptions); + }) + .Case([&](auto expandOp) { + return verifyNonNegativeExpr(expandOp.getSrc(), assumptions); + }) + .Case([&](auto splatOp) { + return verifyNonNegativeExpr(splatOp.getSrc(), assumptions); + }) + .Case([&](auto makeRangeOp) { + return makeRangeOp.getStart() >= 0 && makeRangeOp.getEnd() >= 0; + }) + .Case( + [&](auto constIntOp) { return constIntOp.value() >= 0; }) + .Case([&](arith::ConstantOp constOp) { + Value val = constOp.getResult(); + DenseIntElementsAttr constVal; + if (matchPattern(val, m_Constant(&constVal)) && constVal.isSplat()) + return constVal.getSplatValue().isNonNegative(); + return false; + }) + .Case([&](auto pidOp) { return true; }) + .Case([&](auto maxOp) { + // max(a,b) >= 0 iff a>=0 || b>=0 + bool nnLhs = verifyNonNegativeExpr(maxOp.getLhs(), assumptions); + bool nnRhs = verifyNonNegativeExpr(maxOp.getRhs(), assumptions); + return nnLhs || nnRhs; + }) + .Case([&](auto remsiOp) { + // a % b >= 0 iff a>=0 + return verifyNonNegativeExpr(remsiOp.getLhs(), assumptions); + }) + .Case([&](Operation *unaryOp) { + // a = OP b >= 0 iff b >= 0 + return verifyNonNegativeExpr(unaryOp->getOperand(0), assumptions); + }) + .Case( + // Generally speaking, a OP b >= 0 iff a >= 0 && b >= 0 when + // OP != sub + [&](Operation *binOp) { + bool nnLhs = + verifyNonNegativeExpr(binOp->getOperand(0), assumptions); + bool nnRhs = + verifyNonNegativeExpr(binOp->getOperand(1), assumptions); + return nnLhs && nnRhs; + }) + .Default([&](Operation *op) { + // Conservatively assume that the expression is negative + return false; + }); + return nonNegative; +} + +// Quick analysis on the Triton IR to decide if we can safely use +// buffer operations +bool canUseBufferOps(Value ptr, const DenseSet &assumptions) { + // 1. Check if the pointer is uniform: i.e., if it comes from a uniform + // pointer(splatted) and non-uniform offset addition + + LDBG("Buffer op checks for: " << ptr); + auto addPtrOp = ptr.getDefiningOp(); + if (!addPtrOp) + return false; + + auto maybeSplatOp = addPtrOp.getPtr().getDefiningOp(); + if (!maybeSplatOp) + return false; + LDBG("Pattern matched"); + + // 2. Check if the offset is a 32-bit tensor + Value offset = addPtrOp.getOffset(); + if (cast(offset.getType()).getElementTypeBitWidth() != 32) + return false; + LDBG("32 bit offset"); + + // 3. 
Check if the offset is non-negative + if (!verifyNonNegativeExpr(offset, assumptions)) + return false; + + LDBG("Non-negative"); + return true; +} +} // namespace + +struct ConvertTritonLoadToBufferLoad + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + ConvertTritonLoadToBufferLoad(mlir::MLIRContext *context, + DenseSet &assumptions) + : mlir::OpRewritePattern(context), + assumptions(assumptions) {} + + mlir::LogicalResult + matchAndRewrite(triton::LoadOp op, PatternRewriter &rewriter) const override { + LDBG("Try to convert: " << op); + Value ptr = op.getPtr(); + + if (op.getCache() != triton::CacheModifier::NONE) + return failure(); + + if (canUseBufferOps(ptr, assumptions)) { + auto addPtrOp = ptr.getDefiningOp(); + Value tensorPtr = addPtrOp.getPtr(); + Value tensorOffset = addPtrOp.getOffset(); + auto splatOp = tensorPtr.getDefiningOp(); + Value basePtr = splatOp.getSrc(); + Value maybeOther{}; + if (op.getOther() && !isZeroConst(op.getOther())) + maybeOther = op.getOther(); + Value maybeMask{}; + if (op.getMask() && !isZeroConst(op.getMask())) + maybeMask = op.getMask(); + rewriter.replaceOpWithNewOp( + op, op.getType(), basePtr, tensorOffset, maybeMask, maybeOther); + return success(); + } + LDBG("Failed to convert: " << op); + return failure(); + } + +private: + // Assumptions collected through the function + DenseSet assumptions; +}; + +struct ConvertTritonStoreToBufferStore + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + ConvertTritonStoreToBufferStore(mlir::MLIRContext *context, + DenseSet &assumptions) + : mlir::OpRewritePattern(context), + assumptions(assumptions) {} + + mlir::LogicalResult + matchAndRewrite(triton::StoreOp op, + PatternRewriter &rewriter) const override { + LDBG("Try to convert: " << op); + Value ptr = op.getPtr(); + + if (op.getCache() != triton::CacheModifier::NONE) + return failure(); + + if (canUseBufferOps(ptr, assumptions)) { + auto addPtrOp = ptr.getDefiningOp(); + Value tensorPtr = addPtrOp.getPtr(); + Value tensorOffset = addPtrOp.getOffset(); + auto splatOp = tensorPtr.getDefiningOp(); + Value basePtr = splatOp.getSrc(); + Value maybeMask{}; + if (op.getMask() && !isZeroConst(op.getMask())) + maybeMask = op.getMask(); + rewriter.replaceOpWithNewOp( + op, op.getValue(), basePtr, tensorOffset, maybeMask); + return success(); + } + LDBG("Failed to convert: " << op); + return failure(); + } + +private: + // Assumptions collected through the function + DenseSet assumptions; +}; + +class TritonAMDGPUConvertToBufferOpsPass + : public TritonAMDGPUConvertToBufferOpsBase< + TritonAMDGPUConvertToBufferOpsPass> { + +public: + TritonAMDGPUConvertToBufferOpsPass() = default; + void runOnOperation() override { + MLIRContext *context = &getContext(); + RewritePatternSet patterns(context); + ModuleOp m = getOperation(); + // Collect assumptions in the function + DenseSet assumptions; + m.walk([&](LLVM::AssumeOp op) { + if (op->getOperand(0).getDefiningOp()) + assumptions.insert(op->getOperand(0)); + }); + LDBG("Number of assumptions found: " << assumptions.size()); + + patterns.add(context, assumptions); + patterns.add(context, assumptions); + if (applyPatternsAndFoldGreedily(m, std::move(patterns)).failed()) + signalPassFailure(); + } +}; + +std::unique_ptr mlir::createTritonAMDGPUConvertToBufferOpsPass() { + return std::make_unique(); +} diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 
027f06652f..deb566a8b1 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -149,7 +149,7 @@ void StreamPipeliner::createStreamCopy( SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { - alloc.replaceAllUsesWith(viewLoad.getResult()); + triton::replaceUsesAndPropagateType(builder, alloc, viewLoad.getResult()); allocsToErase.push_back(alloc); } } diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index a9f3a8ee2f..f97676aafe 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -41,8 +41,8 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { [](mlir::PassManager &pm, const std::string &arch, bool ftz) { pm.addPass(createConvertTritonAMDGPUToLLVMPass(arch, ftz)); }); - m.def("add_builtin_func_to_llvmir", [](mlir::PassManager &pm) { - pm.addPass(createConvertBuiltinFuncToLLVMPass()); + m.def("add_builtin_func_to_llvmir", [](mlir::PassManager &pm, bool ftz) { + pm.addPass(createConvertBuiltinFuncToLLVMPass(ftz)); }); m.def("insert_instruction_sched_hints", [](mlir::PassManager &pm) { pm.addPass(createInsertInstructionSchedHintsPass()); @@ -66,6 +66,8 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { mlir::createTritonAMDGPUOptimizeEpiloguePass); ADD_PASS_WRAPPER_0("add_canonicalize_pointers", mlir::createTritonAMDGPUCanonicalizePointersPass); + ADD_PASS_WRAPPER_0("add_convert_to_buffer_ops", + mlir::createTritonAMDGPUConvertToBufferOpsPass); ADD_PASS_WRAPPER_0("add_reorder_instructions", mlir::createTritonAMDGPUReorderInstructionsPass); ADD_PASS_WRAPPER_1("add_stream_pipelinev2", diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp index 0d90047a81..0aff097a44 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp @@ -248,4 +248,10 @@ int TargetInfo::getSharedAddressSpace() const { return TritonGEN::TritonGENMemorySpace::kWorkgroup; } +bool TargetInfo::supportVectorizedAtomics() const { + // Note: not currently tested or used, but AMD generally supports vectorized + // atomics. 
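Stepping back to the non-negativity analysis in ConvertToBufferOps.cpp: the recursion in `verifyNonNegativeExpr` can be pictured with the small model below. It is a toy over tuples, not the C++ implementation; the rules mirror the TypeSwitch cases, and real IR values plus `llvm.intr.assume` facts take the place of the tuples.

```python
# Toy model of the non-negativity walk performed by verifyNonNegativeExpr:
# expressions are (op, *operands) tuples, rules mirror the TypeSwitch cases.
def non_negative(expr) -> bool:
    op, *args = expr
    if op == "program_id":
        return True                                # block ids are never negative
    if op == "make_range":
        start, end = args
        return start >= 0 and end >= 0
    if op == "const":
        return args[0] >= 0
    if op in ("splat", "broadcast", "expand_dims", "trunc", "ext"):
        return non_negative(args[0])               # shape/width ops preserve sign
    if op == "max":
        return non_negative(args[0]) or non_negative(args[1])
    if op == "rem":
        return non_negative(args[0])               # a % b >= 0 iff a >= 0
    if op in ("add", "mul"):
        return all(non_negative(a) for a in args)  # conservative: both sides
    return False                                   # anything else (e.g. sub): give up


# pid * 1024 + make_range(0, 1024) is accepted; pid * 1024 - 128 is not
# (unless an assumption proves it non-negative, as in assume_positive_offset).
offs_ok = ("add", ("mul", ("program_id",), ("const", 1024)), ("make_range", 0, 1024))
offs_bad = ("sub", ("mul", ("program_id",), ("const", 1024)), ("const", 128))
assert non_negative(offs_ok) and not non_negative(offs_bad)
```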
+ return true; +} + } // namespace mlir::triton::intel diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h index 76cd8106fb..92f4477a3a 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.h @@ -60,6 +60,8 @@ class TargetInfo : public mlir::triton::TargetInfoBase { StringRef file, StringRef func, int line) const override; int getSharedAddressSpace() const override; + bool supportVectorizedAtomics() const override; + private: }; } // namespace mlir::triton::intel diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py index 36e73d6b88..f8f0486d8f 100644 --- a/third_party/nvidia/backend/compiler.py +++ b/third_party/nvidia/backend/compiler.py @@ -60,12 +60,17 @@ def ptx_get_version(cuda_version) -> int: raise RuntimeError("Triton only support CUDA 10.0 or higher, but got CUDA version: " + cuda_version) -@functools.lru_cache() -def get_features(options): +def get_ptx_version_from_options(options): ptx_version = options.ptx_version if ptx_version is None: _, cuda_version = _path_to_binary("ptxas") ptx_version = ptx_get_version(cuda_version) + return ptx_version + + +@functools.lru_cache() +def get_features(options): + ptx_version = get_ptx_version_from_options(options) # PTX 8.3 is the max version supported by llvm 3a83162168. # @@ -240,6 +245,8 @@ def make_ttgir(mod, metadata, opt, capability): @staticmethod def make_llir(src, metadata, options, capability): + ptx_version = get_ptx_version_from_options(options) + # warp-specialization mutates num_warps num_warp_groups = src.get_int_attr("triton_gpu.num-warp-groups-per-cta") if num_warp_groups is not None: @@ -258,7 +265,7 @@ def make_llir(src, metadata, options, capability): passes.convert.add_scf_to_cf(pm) passes.convert.add_index_to_llvmir(pm) passes.ttgpuir.add_allocate_shared_memory(pm) - nvidia.passes.ttgpuir.add_to_llvmir(pm, capability) + nvidia.passes.ttgpuir.add_to_llvmir(pm, capability, ptx_version) nvidia.passes.ttnvgpuir.add_nvgpu_to_llvm(pm) passes.convert.add_arith_to_llvmir(pm) passes.common.add_canonicalizer(pm) @@ -299,10 +306,7 @@ def make_llir(src, metadata, options, capability): @staticmethod def make_ptx(src, metadata, opt, capability): - ptx_version = opt.ptx_version - if ptx_version is None: - _, cuda_version = _path_to_binary("ptxas") - ptx_version = ptx_get_version(cuda_version) + ptx_version = get_ptx_version_from_options(opt) triple = 'nvptx64-nvidia-cuda' proc = 'sm_90a' if capability == 90 else f'sm_{capability}' diff --git a/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h b/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h index 30bfaea7d9..8cd8a180ca 100644 --- a/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h +++ b/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h @@ -26,6 +26,8 @@ createDecomposeUnsupportedConversionsPass(); std::unique_ptr> createConvertTritonGPUToLLVMPass(); std::unique_ptr> createConvertTritonGPUToLLVMPass(int32_t computeCapability); +std::unique_ptr> +createConvertTritonGPUToLLVMPass(int32_t computeCapability, int32_t ptxVersion); #define GEN_PASS_REGISTRATION #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h.inc" diff --git a/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td b/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td index 07624c72d7..9f942dd536 100644 --- a/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td +++ 
b/third_party/nvidia/include/TritonNVIDIAGPUToLLVM/Passes.td @@ -30,6 +30,9 @@ def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp" Option<"computeCapability", "compute-capability", "int32_t", /*default*/"80", "device compute capability">, + Option<"ptxVersion", "ptx-version", + "int32_t", /*default*/"80", + "PTX version">, ]; } diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp index 760ba75d98..945fb092eb 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -649,13 +649,11 @@ struct AtomicRMWOpConversion : ConvertOpToLLVMPattern(converter, benefit), LoadStoreConversionBase(targetInfo, axisAnalysisPass) {} - bool supportsVectorized(Operation *moduleOp, RMWOp opType, - Type elementType) const { + bool supportsVectorized(RMWOp opType, Type elementType) const { // vectorized atomics are only supported on hopper, // and only for specific atomic ops (add, min, max). // Note that "packed types" like f16x2 are supported sm60+. - auto computeCapability = getNVIDIAComputeCapability(moduleOp); - if (computeCapability < 90) { + if (!targetInfo.supportVectorizedAtomics()) { return false; } @@ -707,8 +705,7 @@ struct AtomicRMWOpConversion vecOrig = vec; packed = 1; auto valTy = cast(val.getType()); - if (!supportsVectorized(moduleOp, atomicRmwAttr, - valTy.getElementType())) { + if (!supportsVectorized(atomicRmwAttr, valTy.getElementType())) { packed = std::min(vecOrig, valTy.getElementType().isF16() ? 2 : 1); vec = 1; diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp index d6537ecb11..75f9354104 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp @@ -583,4 +583,8 @@ void TargetInfo::assertFail(RewriterBase &rewriter, Location loc, int TargetInfo::getSharedAddressSpace() const { return 3; } +bool TargetInfo::supportVectorizedAtomics() const { + return computeCapability >= 90 && ptxVersion >= 81; +} + } // namespace mlir::triton::NVIDIA diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h index 7a1b909cc4..ed9bd91a8d 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h @@ -7,7 +7,8 @@ namespace mlir::triton::NVIDIA { class TargetInfo : public mlir::triton::TargetInfoBase { public: - TargetInfo(int computeCapability) : computeCapability(computeCapability) {} + TargetInfo(int computeCapability, int ptxVersion) + : computeCapability(computeCapability), ptxVersion(ptxVersion) {} bool supportMaximumMinimum() const override; @@ -53,8 +54,11 @@ class TargetInfo : public mlir::triton::TargetInfoBase { StringRef file, StringRef func, int line) const override; int getSharedAddressSpace() const override; + bool supportVectorizedAtomics() const override; + private: int computeCapability; + int ptxVersion; }; } // namespace mlir::triton::NVIDIA diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp index 21f5b70632..6674c9a810 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp @@ 
-79,13 +79,16 @@ struct ConvertTritonGPUToLLVM ConvertTritonGPUToLLVM(int32_t computeCapability) : ConvertTritonGPUToLLVMBase({computeCapability}) {} + ConvertTritonGPUToLLVM(int32_t computeCapability, int32_t ptxVersion) + : ConvertTritonGPUToLLVMBase({computeCapability, ptxVersion}) {} + void runOnOperation() override { MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); - TargetInfo targetInfo(computeCapability); + TargetInfo targetInfo(computeCapability, ptxVersion); TritonGPUToLLVMTypeConverter typeConverter(context, option, targetInfo); TritonLLVMConversionTarget convTarget(*context); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); @@ -227,6 +230,12 @@ std::unique_ptr> createConvertTritonGPUToLLVMPass(int32_t computeCapability) { return std::make_unique(computeCapability); } +std::unique_ptr> +createConvertTritonGPUToLLVMPass(int32_t computeCapability, + int32_t ptxVersion) { + return std::make_unique(computeCapability, + ptxVersion); +} bool NVIDIA::canSkipBarSync(Operation *before, Operation *after) { // Multiple init barriers on the same allocation would usually not happen but diff --git a/third_party/nvidia/triton_nvidia.cc b/third_party/nvidia/triton_nvidia.cc index 1269dcda00..a7a0364013 100644 --- a/third_party/nvidia/triton_nvidia.cc +++ b/third_party/nvidia/triton_nvidia.cc @@ -18,9 +18,11 @@ void init_triton_nvidia_passes_ttgpuir(py::module &&m) { using namespace mlir::triton; // TODO: it is weird to pass mlir::triton::NVVM here since the conversion is // nvidia-specific - m.def("add_to_llvmir", [](mlir::PassManager &pm, int32_t capability) { - pm.addPass(mlir::triton::createConvertTritonGPUToLLVMPass(capability)); - }); + m.def("add_to_llvmir", + [](mlir::PassManager &pm, int32_t capability, int32_t ptxVersion) { + pm.addPass(mlir::triton::createConvertTritonGPUToLLVMPass( + capability, ptxVersion)); + }); m.def("add_decompose_unsupported_conversions", [](mlir::PassManager &pm) { pm.addPass(NVIDIA::createDecomposeUnsupportedConversionsPass()); }); diff --git a/third_party/proton/README.md b/third_party/proton/README.md index 674540b8af..fede11cedb 100644 --- a/third_party/proton/README.md +++ b/third_party/proton/README.md @@ -209,3 +209,7 @@ If you encounter permission related problems when using instruction sampling, yo The overhead of instruction sampling on NVIDIA GPUs is about 20x using Proton because we haven't enabled continuous sampling yet. Continuous sampling can allow for more runtime optimizations, but it makes it more challenging to attribute performance data back to the GPU kernels because: (1) it enables profiling of concurrent kernels, (2) it doesn't allow profiling of time and instruction samples simultaneously, and (3) it works best if we have a separate thread dedicated to attributing instruction samples to the GPU kernels + +- Visible devices on AMD GPUs + +Proton does not support `HIP_VISIBLE_DEVICES` or `CUDA_VISIBLE_DEVICES` on AMD GPUs: once either is set, we cannot find a valid mapping between the device ID returned by RocTracer and the physical device ID. Use `ROCR_VISIBLE_DEVICES` instead.
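To make the visible-devices note above concrete, here is a minimal sketch of the intended profiling workflow on AMD GPUs. It assumes the usual triton.profiler session API (start/finalize), the roctracer backend, and placeholder values for the session name and device index; the _check_env helper added to profile.py in the diff that follows is what enforces this at start().

import os

# Restrict devices through the ROCm runtime. HIP_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES
# would leave RocTracer device IDs with no valid mapping to physical devices.
os.environ.pop("HIP_VISIBLE_DEVICES", None)
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
os.environ["ROCR_VISIBLE_DEVICES"] = "0"  # placeholder: expose only device 0

import triton.profiler as proton  # assumed Proton entry point

proton.start("example_session", backend="roctracer")
# ... launch the Triton kernels to be profiled here ...
proton.finalize()

If HIP_VISIBLE_DEVICES or CUDA_VISIBLE_DEVICES were still set, start() would now raise a ValueError before any profiling begins.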
diff --git a/third_party/proton/proton/profile.py b/third_party/proton/proton/profile.py index 2dd7a6f53e..808a1742a5 100644 --- a/third_party/proton/proton/profile.py +++ b/third_party/proton/proton/profile.py @@ -1,5 +1,6 @@ import functools import triton +import os from triton._C.libproton import proton as libproton from .hook import register_triton_hook, unregister_triton_hook @@ -19,6 +20,16 @@ def _select_backend() -> str: raise ValueError("No backend is available for the current target.") +def _check_env(backend: str) -> None: + if backend == "roctracer": + hip_device_envs = ["HIP_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"] + for env in hip_device_envs: + if os.getenv(env, None) is not None: + raise ValueError( + f"Proton does not work when the environment variable {env} is set on AMD GPUs. Please unset it and use `ROCR_VISIBLE_DEVICES` instead" + ) + + def start( name: Optional[str] = None, *, @@ -66,6 +77,8 @@ def start( if backend is None: backend = _select_backend() + _check_env(backend) + set_profiling_on() if hook and hook == "triton": register_triton_hook()
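Stepping back to the NVIDIA-side plumbing in this patch: the PTX version is now threaded from the compiler options all the way into TargetInfo because vectorized atomics depend on both the hardware and the PTX ISA, as encoded in the new supportVectorizedAtomics. A rough Python restatement of that gate (the function name here is illustrative, not part of any Triton API):

def supports_vectorized_atomics(compute_capability: int, ptx_version: int) -> bool:
    # Mirrors NVIDIA::TargetInfo::supportVectorizedAtomics in the diff above:
    # Hopper-class hardware (sm_90+) and PTX 8.1 or newer are both required,
    # which is why make_llir derives ptx_version via get_ptx_version_from_options
    # and passes it into the ConvertTritonGPUToLLVM pass.
    return compute_capability >= 90 and ptx_version >= 81

print(supports_vectorized_atomics(90, 83))  # True: Hopper with a recent ptxas
print(supports_vectorized_atomics(80, 83))  # False: Ampere at most packs f16x2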
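Returning to the buffer-ops conversion at the start of this section: the patterns only fire on loads and stores with no cache modifier whose pointer is built as tt.addptr(tt.splat(scalar_base), tensor_offset) and whose offset can be proven non-negative, either directly or from llvm.intr.assume facts collected over the function. The Triton kernel below is an illustrative sketch of code with that shape (kernel and argument names are made up); whether a given access is actually rewritten to a buffer op still depends on the non-negativity analysis succeeding for offs.

import triton
import triton.language as tl

@triton.jit
def add_one_kernel(x_ptr, y_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    # 32-bit tensor offset; adding it to a scalar pointer lowers to
    # tt.addptr(tt.splat(x_ptr), offs), the shape canUseBufferOps matches.
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)      # candidate for a buffer load
    tl.store(y_ptr + offs, x + 1, mask=mask)  # candidate for a buffer store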