From 0b4feb77ce54a65897526861333ef8dcd78ef27f Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 27 Sep 2024 10:02:48 -0700
Subject: [PATCH 01/12] [testing] moved `di = torch._dynamo.device_interface`
 into backend (#4818)

---
 python/triton/testing.py             | 3 ++-
 third_party/amd/backend/driver.py    | 4 ++++
 third_party/nvidia/backend/driver.py | 4 ++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/python/triton/testing.py b/python/triton/testing.py
index 11f449faa4..126d5f6bac 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -5,6 +5,7 @@ from contextlib import contextmanager
 from typing import Any, Dict, List
 
 from . import language as tl
+from . import runtime
 
 
 def nvsmi(attrs):
@@ -114,7 +115,7 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flu
     assert return_mode in ["min", "max", "mean", "median", "all"]
     import torch
 
-    di = torch._dynamo.device_interface.get_interface_for_device(device_type)
+    di = runtime.driver.active.get_device_interface()
 
     fn()
     di.synchronize()
diff --git a/third_party/amd/backend/driver.py b/third_party/amd/backend/driver.py
index c1ff6e1d65..80ba6d2a5c 100644
--- a/third_party/amd/backend/driver.py
+++ b/third_party/amd/backend/driver.py
@@ -484,6 +484,10 @@ def __init__(self):
         self.utils = HIPUtils()
         self.launcher_cls = HIPLauncher
 
+    def get_device_interface(self):
+        import torch
+        return torch.cuda
+
     @staticmethod
     def is_active():
         import torch
diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py
index bf1f066d55..57c8844e1f 100644
--- a/third_party/nvidia/backend/driver.py
+++ b/third_party/nvidia/backend/driver.py
@@ -440,6 +440,10 @@ def get_current_target(self):
         warp_size = 32
         return GPUTarget("cuda", capability, warp_size)
 
+    def get_device_interface(self):
+        import torch
+        return torch.cuda
+
     @staticmethod
     def is_active():
         import torch

From fe47f98bc73a8c630c87cdde57ce11be26a09b60 Mon Sep 17 00:00:00 2001
From: Elliot Gorokhovsky
Date: Fri, 27 Sep 2024 17:26:08 -0400
Subject: [PATCH 02/12] 09-persistent-matmul.py bugfix (#4820)

Currently we sleep between each rep for Triton kernels, but not for the
cuBLAS kernel. Adding the sleep for the cuBLAS kernel as well may improve
its measured fp8 performance, since resting between reps mitigates thermal
throttling.
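
As an illustration, a minimal sketch of the benchmarking pattern this fix
restores is below; `bench_with_cooldown` and its parameters are hypothetical
names for this note, not the tutorial's actual API:

```python
import time

def bench_with_cooldown(kernels, reps=10, cooldown_s=0.01):
    # Sleep after every rep for *every* implementation, so that thermal
    # throttling penalizes all candidates equally instead of only the
    # ones that happen to rest between reps.
    for kernel in kernels:
        for _ in range(reps):
            kernel()
            time.sleep(cooldown_s)
```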
--- python/tutorials/09-persistent-matmul.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tutorials/09-persistent-matmul.py b/python/tutorials/09-persistent-matmul.py index 699c4aa4f9..1464d489bc 100644 --- a/python/tutorials/09-persistent-matmul.py +++ b/python/tutorials/09-persistent-matmul.py @@ -554,7 +554,7 @@ def bench(K, dtype, tiles_per_update, reps=10): if cublas is not None: for _ in range(reps): cublas_matmul(a, b) - time.sleep(0.01) + time.sleep(0.01) if dtype == torch.float16: for _ in range(reps): torch_matmul(a, b) From 6af74b2f4535682abfc0b08958bc2c6831036d29 Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Fri, 27 Sep 2024 17:54:34 -0700 Subject: [PATCH 03/12] [FRONTEND] Support passing dtype as constexpr for tma load (#4821) Fixing an compile error like below when passing dtype through kernel arg for `tl._experimental_descriptor_load`: AttributeError: 'constexpr' object has no attribute 'to_ir' --- python/test/unit/hopper/test_experimental_tma.py | 10 +++++----- python/triton/language/core.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/test/unit/hopper/test_experimental_tma.py b/python/test/unit/hopper/test_experimental_tma.py index c026132143..9695a5e47e 100644 --- a/python/test/unit/hopper/test_experimental_tma.py +++ b/python/test/unit/hopper/test_experimental_tma.py @@ -57,7 +57,7 @@ def kernel(Z, desc, SIZE: tl.constexpr, BYVAL_TMA: tl.constexpr): @triton.jit def matmul_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr, # M, N, K, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - BYVAL_TMA: tl.constexpr): + BYVAL_TMA: tl.constexpr, dtype: tl.constexpr): if not BYVAL_TMA: tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr) tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr) @@ -72,11 +72,11 @@ def matmul_kernel_tma(a_desc_ptr, b_desc_ptr, c_desc_ptr, # offs_k = 0 accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [BLOCK_SIZE_M, BLOCK_SIZE_K], tl.float16) - b = tl._experimental_descriptor_load(b_desc_ptr, [offs_k, offs_bn], [BLOCK_SIZE_K, BLOCK_SIZE_N], tl.float16) + a = tl._experimental_descriptor_load(a_desc_ptr, [offs_am, offs_k], [BLOCK_SIZE_M, BLOCK_SIZE_K], dtype) + b = tl._experimental_descriptor_load(b_desc_ptr, [offs_k, offs_bn], [BLOCK_SIZE_K, BLOCK_SIZE_N], dtype) accumulator = tl.dot(a, b, acc=accumulator) offs_k += BLOCK_SIZE_K - accumulator = accumulator.to(tl.float16) + accumulator = accumulator.to(dtype) tl._experimental_descriptor_store(c_desc_ptr, accumulator, [offs_am, offs_bn]) @@ -101,7 +101,7 @@ def test_experimental_tma_matmul(num_stages, BLOCK_M, BLOCK_N, BLOCK_K, byval_tm desc_c = create_tma_desc_gmem_ptr(C.data_ptr(), [M, N], [BLOCK_M, BLOCK_N], C.element_size()) kernel = matmul_kernel_tma[(triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1, 1)](desc_a, desc_b, desc_c, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, BYVAL_TMA=byval_tma, - num_warps=8, num_stages=num_stages) + num_warps=8, num_stages=num_stages, dtype=tl.float16) ref_out = torch.matmul(A.to(torch.float32), B.to(torch.float32)).to(torch.float16) torch.testing.assert_close(ref_out, C, rtol=1e-3, atol=1e-3) if BLOCK_M >= 64 and BLOCK_N >= 64: diff --git a/python/triton/language/core.py b/python/triton/language/core.py index 9dd0b5f6f4..2a0ad6a105 100644 --- a/python/triton/language/core.py +++ b/python/triton/language/core.py @@ -1613,7 
+1613,7 @@ def _experimental_descriptor_load(desc_pointer, offsets, shape, dtype, _builder= This loads a tensor of data based on the descriptor and offsets. """ - type = block_type(dtype, shape) + type = block_type(_constexpr_to_value(dtype), shape) return semantic.descriptor_load(desc_pointer, offsets, "", "", type, _builder) From 755077cde95db74da41f3769293f83521699cabe Mon Sep 17 00:00:00 2001 From: Lixun Zhang Date: Sun, 29 Sep 2024 23:07:37 -0500 Subject: [PATCH 04/12] [AMD] Always swap operands of mfma and use mfma.transposed layout (#4767) This helps to improve writeout to use `global_store_dwordx2`. Along the way this PR - Fixed the issue with getOrder for mfma layout - Fixed the issue with reduceOp when dealing with mfma.transposed layout In general, getOrder and getThreadOrder can return different values, and this is the case for mfma.transposed layout. Therefore, we shouldn't assume order and threadOrder are always the same. --- include/triton/Dialect/TritonGPU/IR/Dialect.h | 25 ++++++++++++++++++- lib/Analysis/Utility.cpp | 4 +-- .../TritonGPUToLLVM/ReduceOpToLLVM.cpp | 3 ++- lib/Dialect/TritonGPU/IR/Dialect.cpp | 21 +++++++++------- .../TritonGPU/IR/LinearLayoutConversions.cpp | 14 +++++++++++ .../AccelerateAMDMatmul.cpp | 22 +++------------- unittest/Dialect/TritonGPU/DialectTest.cpp | 8 +++--- .../TritonGPU/LinearLayoutConversionsTest.cpp | 4 +-- 8 files changed, 63 insertions(+), 38 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h index 3b012a6305..74ea99b588 100644 --- a/include/triton/Dialect/TritonGPU/IR/Dialect.h +++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h @@ -75,9 +75,32 @@ getThreadsPerWarpWithUniqueData(Attribute layout, SmallVector getWarpsPerCTAWithUniqueData(Attribute layout, ArrayRef tensorShape); +// Returns the dimensions of the tensor from minor (fast-varying) to +// major (slow-varying). For blocked, mma, and dotOperand layouts, +// though the elements are in registers, the order refers to memory +// layout of the original tensor in global memory. +// For shared Layout, the order refers to which dimension of the original tensor +// is contiguous in shared memory. +SmallVector getOrder(Attribute layout); + +// Returns the dimensions along which warpId's are distributed. +// warpsPerCTA only tells the warp layout in the CTA, e.g. warpsPerCTA = [2, 4] +// tells there are 2 warps along dim0 and 4 warps along dim1. +// warpOrder tells the specific order when distributing warp IDs. +// E.g. warpOrder = [0, 1] means the warp IDs are distributed as follows +// [warp0 warp2 warp4 warp6] +// [warp1 warp3 warp5 warp7] +// Note that in most cases, getWarpOrder and getOrder return the same results. +// But this is not guaranteed. SmallVector getWarpOrder(Attribute layout); -SmallVector getOrder(Attribute layout); +// Returns the dimensions along which threadId's are distributed. +// Similar to warpOrder, threadOrder is necessary to tell the specific thread +// distribution in the warp. +// Note that, in most cases, getThreadOrder and getOrder return the same +// results. But this is not guaranteed. One exception is mfma.transposed layout, +// in which getOrder returns [1, 0] but getThreadOrder returns [0, 1]. 
+SmallVector getThreadOrder(Attribute layout); CTALayoutAttr getCTALayout(Attribute layout); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 56630c7318..b9468aa3e3 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -36,7 +36,7 @@ SmallVector getParentOrder(Attribute layout) { if (auto sliceEncoding = mlir::dyn_cast(layout)) { return getParentOrder(sliceEncoding.getParent()); } - return getOrder(layout); + return getThreadOrder(layout); } } // namespace @@ -75,7 +75,7 @@ unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() { threadOffset = threadsPerWarp[sliceLayout.getDim()]; } else { auto threadsPerWarp = getThreadsPerWarp(srcLayout); - auto order = getOrder(srcLayout); + auto order = getThreadOrder(srcLayout); for (unsigned i = 0; i < order.size(); i++) { if (order[i] == axis) break; diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp index 16c9991a17..414328be50 100644 --- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp @@ -9,6 +9,7 @@ using namespace mlir::triton; using ::mlir::LLVM::delinearize; using ::mlir::LLVM::linearize; using ::mlir::triton::gpu::getOrder; +using ::mlir::triton::gpu::getThreadOrder; using ::mlir::triton::gpu::getTotalElemsPerThread; namespace { @@ -271,7 +272,7 @@ struct ReduceOpConversion auto threadsPerWarp = triton::gpu::getThreadsPerWarpWithUniqueData(srcLayout, srcShape); - auto order = getOrder(srcLayout); + auto order = getThreadOrder(srcLayout); SmallVector multiDimLaneId = delinearize(rewriter, loc, laneId, threadsPerWarp, order); Value laneIdAxis = multiDimLaneId[axis]; diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index a454fef566..48f31bdf2a 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -256,14 +256,6 @@ SmallVector getOrder(Attribute layout) { auto rank = distributedLayout.getWarpsPerCTA().size(); SmallVector order(rank); std::iota(order.rbegin(), order.rend(), 0); - auto mfmaLayout = dyn_cast(layout); - if (!mfmaLayout) - return order; - // For transposed MFMA layouts, we swap M and N dimensions, which is - // always the first two in order; as we can have an optional batch - // dimension following them. 
- if (mfmaLayout.getIsTransposed()) - std::swap(order[0], order[1]); return order; } if (auto dotLayout = dyn_cast(layout)) { @@ -290,6 +282,14 @@ SmallVector getOrder(Attribute layout) { return {}; }; +SmallVector getThreadOrder(Attribute layout) { + if (auto distributedLayout = mlir::dyn_cast(layout)) + return distributedLayout.getThreadOrder(); + else + llvm::report_fatal_error("Unimplemented usage of getThreadOrder"); + return {}; +}; + CTALayoutAttr getCTALayout(Attribute layout) { if (auto distributedLayout = mlir::dyn_cast(layout)) { @@ -1536,7 +1536,10 @@ SmallVector AMDMfmaEncodingAttr::getWarpOrder() const { return ::getWarpOrder(*this); } SmallVector AMDMfmaEncodingAttr::getThreadOrder() const { - return ::getOrder(*this); + auto order = ::getOrder(*this); + if (getIsTransposed()) + std::swap(order[0], order[1]); + return order; } SmallVector AMDMfmaEncodingAttr::getThreadsPerWarp() const { unsigned rows, cols; diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 286b1eac51..f576ce2154 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -507,6 +507,13 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef shape) const { {{kRegister, {{0, 1}, {0, 2}, {0, 8}, /*gap*/ {0, 16}}}, {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}, /*gap*/ {0, 4}}}}, {outDimNames[order[0]], outDimNames[order[1]]}); + // For mfma.transposed layout, the element ownership among threads are + // "transposed" within each warp. + if (getIsTransposed()) + tileLayout = LinearLayout( + {{kRegister, {{1, 0}, {2, 0}, {8, 0}, /*gap*/ {16, 0}}}, + {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, /*gap*/ {4, 0}}}}, + {outDimNames[order[0]], outDimNames[order[1]]}); } else { assert(getMDim() == 16); // For mfma with 16x16 output, each of the 64 threads holds 4 elements. @@ -521,6 +528,13 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef shape) const { {{kRegister, {{0, 1}, {0, 2}}}, {kLane, {{1, 0}, {2, 0}, {4, 0}, {8, 0}, /*gap*/ {0, 4}, {0, 8}}}}, {outDimNames[order[0]], outDimNames[order[1]]}); + // For mfma.transposed layout, the element ownership among threads are + // "transposed" within each warp. 
+ if (getIsTransposed()) + tileLayout = LinearLayout( + {{kRegister, {{1, 0}, {2, 0}}}, + {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, /*gap*/ {4, 0}, {8, 0}}}}, + {outDimNames[order[0]], outDimNames[order[1]]}); } if (hasBatchDim) { assert(order[2] == 0); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp index bf976a8138..21b74ecf99 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp @@ -269,23 +269,6 @@ class BlockedToMFMA : public RewritePattern { : RewritePattern(tt::DotOp::getOperationName(), 2, context), mfmaVersion(mfmaVersion), enforcedNonKDim(nonKDim), kPack(kPack) {} - bool isChainDot(tt::DotOp &dotOp) const { - auto filter = [&dotOp](Operation *op) { - return op->getParentRegion() == dotOp->getParentRegion(); - }; - ForwardSliceOptions fwdOpt; - fwdOpt.filter = filter; - BackwardSliceOptions bwdOpt; - bwdOpt.omitBlockArguments = true; - bwdOpt.filter = filter; - auto slices = getSlice(dotOp, bwdOpt, fwdOpt); - for (Operation *op : slices) { - if (isa(op) && (op != dotOp)) - return true; - } - return false; - } - bool isSecondDot(tt::DotOp &dotOp) const { auto filter = [&dotOp](Operation *op) { return op->getParentRegion() == dotOp->getParentRegion(); @@ -400,11 +383,12 @@ class BlockedToMFMA : public RewritePattern { auto warpsPerTile = warpsPerTileMFMA(dotOp, retShape, numWarps, {mDim, nDim}); - bool isTransposed = isChainDot(dotOp); + // Always use transposed mfma layout. This enables larger vectorization + // for global store instructions mfmaEnc = ttg::AMDMfmaEncodingAttr::get( oldRetType.getContext(), /*versionMajor*/ mfmaVersion, /*versionMinor*/ 0, warpsPerTile, - /*instrShape*/ mDim, nDim, isTransposed, CTALayout); + /*instrShape*/ mDim, nDim, /*isTransposed*/ true, CTALayout); Type mfmaAccType; if (oldRetType.getElementType().isIntOrIndex()) diff --git a/unittest/Dialect/TritonGPU/DialectTest.cpp b/unittest/Dialect/TritonGPU/DialectTest.cpp index 67b1d9e9bc..e3f521f1b3 100644 --- a/unittest/Dialect/TritonGPU/DialectTest.cpp +++ b/unittest/Dialect/TritonGPU/DialectTest.cpp @@ -559,7 +559,7 @@ TEST_F(AMDMfmaLayoutTest, mfma32) { auto tmfma2d = createTransposedMFMA(32, 32, {2, 4}); ASSERT_THAT(tmfma2d.getThreadOrder(), testing::ElementsAre(0u, 1u)); - ASSERT_THAT(tmfma2d.getWarpOrder(), testing::ElementsAre(0u, 1u)); + ASSERT_THAT(tmfma2d.getWarpOrder(), testing::ElementsAre(1u, 0u)); auto mfma3d = createMFMA(32, 32, {2, 4, 1}); ASSERT_THAT(mfma3d.getThreadOrder(), testing::ElementsAre(2u, 1u, 0u)); @@ -567,7 +567,7 @@ TEST_F(AMDMfmaLayoutTest, mfma32) { auto tmfma3d = createTransposedMFMA(32, 32, {2, 4, 1}); ASSERT_THAT(tmfma3d.getThreadOrder(), testing::ElementsAre(1u, 2u, 0u)); - ASSERT_THAT(tmfma3d.getWarpOrder(), testing::ElementsAre(1u, 2u, 0u)); + ASSERT_THAT(tmfma3d.getWarpOrder(), testing::ElementsAre(2u, 1u, 0u)); } TEST_F(AMDMfmaLayoutTest, mfma16) { @@ -577,7 +577,7 @@ TEST_F(AMDMfmaLayoutTest, mfma16) { auto tmfma2d = createTransposedMFMA(16, 16, {2, 4}); ASSERT_THAT(tmfma2d.getThreadOrder(), testing::ElementsAre(0u, 1u)); - ASSERT_THAT(tmfma2d.getWarpOrder(), testing::ElementsAre(0u, 1u)); + ASSERT_THAT(tmfma2d.getWarpOrder(), testing::ElementsAre(1u, 0u)); auto mfma3d = createMFMA(16, 16, {2, 4, 1}); ASSERT_THAT(mfma3d.getThreadOrder(), testing::ElementsAre(2u, 1u, 0u)); @@ -585,7 +585,7 @@ TEST_F(AMDMfmaLayoutTest, mfma16) { auto tmfma3d = createTransposedMFMA(16, 
16, {2, 4, 1}); ASSERT_THAT(tmfma3d.getThreadOrder(), testing::ElementsAre(1u, 2u, 0u)); - ASSERT_THAT(tmfma3d.getWarpOrder(), testing::ElementsAre(1u, 2u, 0u)); + ASSERT_THAT(tmfma3d.getWarpOrder(), testing::ElementsAre(2u, 1u, 0u)); } } // anonymous namespace diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index 0b7a0f7821..7d918602a7 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -529,14 +529,14 @@ TEST_F(LinearLayoutConversionsTest, MFMA32_2x4Warps) { LinearLayout( {{S("register"), {{0, 1}, {0, 2}, {0, 8}, {0, 16}}}, {S("lane"), {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}, {0, 4}}}, - {S("warp"), {{32, 0}, {0, 0}, {0, 0}}}, + {S("warp"), {{0, 0}, {0, 0}, {32, 0}}}, {S("block"), {}}}, {S("dim0"), S("dim1")})); EXPECT_EQ(toLinearLayout({128, 128}, mfmaT), LinearLayout( {{S("register"), {{0, 1}, {0, 2}, {0, 8}, {0, 16}, {64, 0}}}, {S("lane"), {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}, {0, 4}}}, - {S("warp"), {{32, 0}, {0, 32}, {0, 64}}}, + {S("warp"), {{0, 32}, {0, 64}, {32, 0}}}, {S("block"), {}}}, {S("dim0"), S("dim1")})); } From 1e8844121eebbdb92e517fdd52c6014e91a7ef4c Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 30 Sep 2024 05:01:33 -0700 Subject: [PATCH 05/12] [BUILD] Avoid using lld as the linker on macOS (#4827) LLD is not supported on macOS. This addresses failures like > clang: error: invalid linker name in argument '-fuse-ld=lld' See https://github.com/triton-lang/triton/actions/runs/11099205977/job/30833066194#step:10:61 --- python/setup.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/setup.py b/python/setup.py index 57ac73899b..036d5f88a1 100644 --- a/python/setup.py +++ b/python/setup.py @@ -429,11 +429,14 @@ def build_extension(self, ext): cmake_args += [ "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++", - "-DCMAKE_LINKER=lld", - "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld", ] + if platform.system() != "Darwin": + cmake_args += [ + "-DCMAKE_LINKER=lld", + "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", + "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", + "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld", + ] # Note that asan doesn't work with binaries that use the GPU, so this is # only useful for tools like triton-opt that don't run code on the GPU. 
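
Condensed, the platform gate added above looks like this; a standalone sketch
mirroring the setup.py hunk, where the flag list is copied from the patch and
`cmake_args` is assumed to feed CMake's command line:

```python
import platform

cmake_args = ["-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"]
if platform.system() != "Darwin":
    # lld is unsupported on macOS, so only ask for it on other platforms.
    cmake_args += [
        "-DCMAKE_LINKER=lld",
        "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld",
        "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld",
        "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld",
    ]
```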
From 80947a2b68588b76368254a367f5fcc22a94b664 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 30 Sep 2024 08:44:42 -0400 Subject: [PATCH 06/12] [tools/triton-tensor-layout] Allow parsing ttgir files with triton_nvidia_gpu ops (#4686) If you want to dump layouts read from an MLIR file, and that file contains ops like `triton_nvidia_gpu.warp_group_dot`, this tool needs to know about the `triton_nvidia_gpu` dialect, or else it will throw an error about not finding the dialect --- bin/CMakeLists.txt | 4 ++++ bin/triton-tensor-layout.cpp | 12 +++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index 3dd9d916a5..363a406315 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -95,5 +95,9 @@ export_executable_symbols_for_plugins(triton-llvm-opt) add_llvm_executable(triton-tensor-layout triton-tensor-layout.cpp PARTIAL_SOURCES_INTENDED) target_link_libraries(triton-tensor-layout PRIVATE TritonGPUIR + TritonNvidiaGPUIR ${triton_libs} + ${conversion_libs} + ${dialect_libs} + TritonTestAnalysis ) diff --git a/bin/triton-tensor-layout.cpp b/bin/triton-tensor-layout.cpp index e0e02b49b9..4087ac1350 100644 --- a/bin/triton-tensor-layout.cpp +++ b/bin/triton-tensor-layout.cpp @@ -1,8 +1,11 @@ +#include "RegisterTritonDialects.h" + #include "mlir/AsmParser/AsmParser.h" #include "mlir/AsmParser/AsmParserState.h" #include "mlir/IR/MLIRContext.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorOr.h" @@ -114,7 +117,7 @@ LogicalResult printLayoutFromFile(MLIRContext *context, StringRef filename, return failure(); } - auto printLambda = [&](StringRef name, Attribute attr) { + auto printLambda = [&](StringRef name, mlir::Attribute attr) { ss << "Print layout attribute: #" << name << " = " << attr << "\n"; auto rankedTensorTy = RankedTensorType::get( @@ -155,7 +158,7 @@ LogicalResult printLayoutFromString(MLIRContext *context, if (layoutAttrStr.empty()) return success(); - Attribute layout = parseAttribute(layoutAttrStr, context); + mlir::Attribute layout = parseAttribute(layoutAttrStr, context); if (!layout) { llvm::errs() << "Invalid layout attribute: " << layoutAttrStr << "\n"; return failure(); @@ -178,8 +181,7 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "tensor layout printer\n"); DialectRegistry registry; - // Register all dialects that can print tensor layout. - registry.insert(); + registerTritonDialects(registry); MLIRContext ctx(registry); ctx.loadAllAvailableDialects(); @@ -189,7 +191,7 @@ int main(int argc, char **argv) { return 1; } - Type parsedTy = parseType(TensorStr, &ctx); + mlir::Type parsedTy = parseType(TensorStr, &ctx); if (!parsedTy) { llvm::errs() << "Fail to parse the tensor type argument: " << TensorStr << "\n"; From 1df64d1aaf9ecd74124ccb503d5fe1016a8f92cf Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Mon, 30 Sep 2024 18:19:57 +0100 Subject: [PATCH 07/12] [AMD] Add alignment information to maskedLoad/maskedStore (#4816) I think we should always set the right alignment to the `maskedload`/`maskedstore` instructions. 
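
A small sketch of the alignment arithmetic this patch introduces; the helper
name and standalone form are hypothetical, and in the patch this is computed
inline from the axis analysis results:

```python
def masked_op_alignment_bytes(ptr_alignment_elems: int, elem_bits: int) -> int:
    # Axis analysis reports pointer alignment in elements; the LLVM masked
    # load/store intrinsics want an alignment in bytes, so scale by the
    # element size (sub-byte element types are rounded up to one byte).
    elem_bytes = max(8, elem_bits) // 8
    return ptr_alignment_elems * elem_bytes

# e.g. a pointer aligned to 4 fp16 elements yields an 8-byte alignment
assert masked_op_alignment_bytes(4, 16) == 8
```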
--- .../TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp | 19 +++++++++++++------ .../amd/lib/TritonAMDGPUToLLVM/Utility.cpp | 16 +++++++++------- .../amd/lib/TritonAMDGPUToLLVM/Utility.h | 4 ++-- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp index 73b0f775cb..5a2815e5fc 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -119,6 +119,10 @@ struct LoadStoreConversionBase { return axisAnalysisPass.getMaskAlignment(mask); } + unsigned getPtrAlignment(Value ptr) const { + return axisAnalysisPass.getPtrAlignment(ptr); + } + protected: const AMD::TargetInfo &targetInfo; ModuleAxisInfoAnalysis &axisAnalysisPass; @@ -193,7 +197,9 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern, // vectorized iteration through all the pointer/mask/other elements const int valueElemNBits = std::max(8u, valueElemTy.getIntOrFloatBitWidth()); + const size_t valueElemNBytes = valueElemNBits / 8; const int numVecs = numElems / vec; + int64_t ptrAlignmentBytes = getPtrAlignment(ptr) * valueElemNBytes; auto cacheMod = op.getCache(); SmallVector loadedVals; @@ -230,8 +236,8 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern, falseVal = v; } - auto loadVal = - llLoad(rewriter, loc, ptr, vecTy, pred, falseVal, cacheMod); + Value loadVal = llLoad(rewriter, loc, ptr, vecTy, pred, falseVal, + ptrAlignmentBytes, cacheMod); for (size_t ii = 0; ii < vec; ++ii) { Value vecIdx = createIndexAttrConstant( rewriter, loc, this->getTypeConverter()->getIndexType(), ii % vec); @@ -294,9 +300,10 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, vec = std::min(vec, maskAlign); } - const size_t dtsize = - std::max(1, valueElemTy.getIntOrFloatBitWidth() / 8); - const size_t valueElemNBits = dtsize * 8; + const size_t valueElemNBits = + std::max(8, valueElemTy.getIntOrFloatBitWidth()); + const size_t valueElemNBytes = valueElemNBits / 8; + int64_t ptrAlignmentBytes = getPtrAlignment(ptr) * valueElemNBytes; auto cacheMod = op.getCache(); const int numVecs = elemsPerThread / vec; @@ -328,7 +335,7 @@ struct StoreOpConversion : public ConvertOpToLLVMPattern, rewriter, loc, this->getTypeConverter()->getIndexType(), s); storeVal = insert_element(vecTy, storeVal, otherElem, indexVal); } - llStore(rewriter, loc, ptr, storeVal, pred, cacheMod); + llStore(rewriter, loc, ptr, storeVal, pred, ptrAlignmentBytes, cacheMod); } // end vec rewriter.eraseOp(op); return success(); diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp index 262055d645..2e114c898f 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp @@ -189,12 +189,14 @@ Value llGetPid(Location loc, RewriterBase &rewriter, ModuleOp moduleOp, } Value llLoad(RewriterBase &rewriter, Location loc, Value ptr, Type elemTy, - Value pred, Value falseVal, triton::CacheModifier cm) { + Value pred, Value falseVal, int64_t alignmentBytes, + triton::CacheModifier cm) { // Try to emit llvm.intr.masked.load if we can. In theory the backend should // be happier because we emit less branchy code to optimize. The backend will // lower it down however it wants at some point. 
- if (cm == triton::CacheModifier::CG || cm == triton::CacheModifier::NONE) { + if (alignmentBytes && + (cm == triton::CacheModifier::CG || cm == triton::CacheModifier::NONE)) { // `llvm.intr.masked.load` only accepts vectors. If we see a scalar we need // to bitcast to `vector<1xelemTy>` (and back) int64_t vecSize = getNumElements(elemTy); @@ -203,7 +205,7 @@ Value llLoad(RewriterBase &rewriter, Location loc, Value ptr, Type elemTy, Value maskVal = createVectorMaskFromPredicate(rewriter, loc, pred, vecSize); bool nt = (cm == triton::CacheModifier::CG); Value vecData = rewriter.create( - loc, vecType, ptr, maskVal, falseVal, vecSize, nt); + loc, vecType, ptr, maskVal, falseVal, alignmentBytes, nt); // If it is not a vector, remember to bitcast back to a scalar vecData = bitcast(vecData, elemTy); return vecData; @@ -237,11 +239,11 @@ Value llLoad(RewriterBase &rewriter, Location loc, Value ptr, Type elemTy, } void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val, - Value pred, triton::CacheModifier cm) { + Value pred, int64_t alignmentBytes, triton::CacheModifier cm) { // Try to emit llvm.intr.masked.store if we can. In theory the backend should // be happier because we emit less branchy code to optimize. The backend will // lower it down however it wants at some point. - if (cm == triton::CacheModifier::NONE) { + if (alignmentBytes && cm == triton::CacheModifier::NONE) { // `llvm.intr.masked.store` only accepts vectors. If we see a scalar we need // to bitcast to `vector<1xelemTy>` Type elemTy = val.getType(); @@ -249,8 +251,8 @@ void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val, Type vecType = castToVectorType(elemTy); val = bitcast(val, vecType); Value maskVal = createVectorMaskFromPredicate(rewriter, loc, pred, vecSize); - auto op = - rewriter.create(loc, val, ptr, maskVal, vecSize); + auto op = rewriter.create(loc, val, ptr, maskVal, + alignmentBytes); return; } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h index 631f9bcce4..123234fd48 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h @@ -30,12 +30,12 @@ Value llGetPid(Location loc, RewriterBase &rewriter, ModuleOp moduleOp, // Loads from shared or global memory with predication. // `otherElems` is used to mask out the elements that are not loaded Value llLoad(RewriterBase &rewriter, Location loc, Value ptr, Type elemTy, - Value pred, Value falseVal, + Value pred, Value falseVal, int64_t alignmentBytes = 0, triton::CacheModifier cm = triton::CacheModifier::NONE); // Stores to shared or global memory with predication. void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val, - Value pred, + Value pred, int64_t alignmentBytes = 0, triton::CacheModifier cm = triton::CacheModifier::NONE); } // namespace mlir::LLVM::AMD From 256ef34ca707a0c9675bafbbad2d89ecca3c8e94 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 30 Sep 2024 13:53:45 -0700 Subject: [PATCH 08/12] [CI][macOS] Pin LLVM version and install lld (#4831) Pinned LLVM to v19; cannot do the same for LLD though. This allows us to revert #4827. 
--- .github/workflows/integration-tests.yml | 2 +- .github/workflows/integration-tests.yml.in | 2 +- python/setup.py | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 06e2077657..7f65e92bdb 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -460,7 +460,7 @@ jobs: - name: Install brew dependencies run: | brew update - brew install ccache llvm + brew install ccache llvm@19 lld - name: Compute cache keys id: cache-key run: | diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 2cfc2de824..0ff34e5833 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -439,7 +439,7 @@ jobs: - name: Install brew dependencies run: | brew update - brew install ccache llvm + brew install ccache llvm@19 lld - *compute-cache-keys-step - *cache-build-dependencies-step diff --git a/python/setup.py b/python/setup.py index 036d5f88a1..57ac73899b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -429,14 +429,11 @@ def build_extension(self, ext): cmake_args += [ "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++", + "-DCMAKE_LINKER=lld", + "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", + "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", + "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld", ] - if platform.system() != "Darwin": - cmake_args += [ - "-DCMAKE_LINKER=lld", - "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld", - "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld", - ] # Note that asan doesn't work with binaries that use the GPU, so this is # only useful for tools like triton-opt that don't run code on the GPU. From a6ecc7553f457f3fe35c5451f6e017b7a483ecff Mon Sep 17 00:00:00 2001 From: David Berard Date: Mon, 30 Sep 2024 17:28:51 -0700 Subject: [PATCH 09/12] [AMD] StreamPipeline V1: fix depArg return mapping (#4832) Previously, if an arg inside the loop was marked as a depArg, then a new iter_arg would be added to the for loop to handle the arg; but any usages of these variables _after_ the for loop would not be updated; those usages would get the wrong value. This PR fixes this by updating the return mapping. See the comment added in StreamPipeline.cpp for an example. 
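
To make the fix concrete here as well, a small Python sketch of the remapping;
all names are hypothetical, and the real code operates on MLIR loop results
and iter_args rather than plain lists:

```python
def remap_results(old_results, new_results, dep_arg_new_idx):
    # For every loop-carried arg that pipelining re-homed to a new iter_arg,
    # redirect downstream uses of the old loop's result to the new loop's
    # result at that arg's *new* index.
    mapping = {}
    for old_idx, arg in enumerate(dep_arg_new_idx):
        if dep_arg_new_idx[arg] is not None:
            mapping[old_results[old_idx]] = new_results[dep_arg_new_idx[arg]]
    return mapping
```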
Co-authored-by: Hongtao Yu --- test/TritonGPU/amd/amd-loop-pipeline-v1.mlir | 31 +++++++++++++ .../TritonAMDGPUTransforms/StreamPipeline.cpp | 45 ++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 test/TritonGPU/amd/amd-loop-pipeline-v1.mlir diff --git a/test/TritonGPU/amd/amd-loop-pipeline-v1.mlir b/test/TritonGPU/amd/amd-loop-pipeline-v1.mlir new file mode 100644 index 0000000000..45eae93880 --- /dev/null +++ b/test/TritonGPU/amd/amd-loop-pipeline-v1.mlir @@ -0,0 +1,31 @@ +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline | FileCheck %s + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#loc = loc("/data/users/dberard/triton-env/scripts/matmul.py":6:0) +#mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = false}> +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + // CHECK-LABEL: tt.func @use_dep_args + tt.func @use_dep_args(%a_ptrs: tensor<64x32x!tt.ptr, #blocked>, %b_ptrs: tensor<32x64x!tt.ptr, #blocked1>, %loop_range: i32) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr, #blocked>, tensor<32x64x!tt.ptr, #blocked1>) { + %cst = arith.constant dense<32> : tensor<64x32xi32, #blocked> + %cst2 = arith.constant dense<2048> : tensor<32x64xi32, #blocked1> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma> + %c0_i32 = arith.constant 0 : i32 + %c8_i32 = arith.constant 8 : i32 + %c32_i32 = arith.constant 32 : i32 + // CHECK: tt.load + // CHECK: [[FOR_OUT:%[a-z0-9_]+]]:{{[0-9]+}} = scf.for + %for:3 = scf.for %arg6 = %c0_i32 to %loop_range step %c32_i32 iter_args(%arg7 = %cst_0, %arg8 = %a_ptrs, %arg9 = %b_ptrs) -> (tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr, #blocked>, tensor<32x64x!tt.ptr, #blocked1>) : i32 { + %63 = tt.load %arg8 : tensor<64x32x!tt.ptr, #blocked> + %64 = tt.load %arg9 : tensor<32x64x!tt.ptr, #blocked1> + %65 = triton_gpu.convert_layout %63 : tensor<64x32xbf16, #blocked> -> tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> + %66 = triton_gpu.convert_layout %64 : tensor<32x64xbf16, #blocked1> -> tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> + %67 = tt.dot %65, %66, %arg7 : tensor<64x32xbf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<32x64xbf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<64x64xf32, #mma> + %68 = tt.addptr %arg8, %cst : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi32, #blocked> + %69 = tt.addptr %arg9, %cst2 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi32, #blocked1> + scf.yield %67, %68, %69 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr, #blocked>, tensor<32x64x!tt.ptr, #blocked1> + } + // CHECK: tt.return {{[^,]+}}, [[FOR_OUT]]#3, [[FOR_OUT]]#4 + tt.return %for#0, %for#1, %for#2 : tensor<64x64xf32, #mma>, tensor<64x32x!tt.ptr, #blocked>, tensor<32x64x!tt.ptr, #blocked1> + } +} diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index 8bdf9d1175..224df90283 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp 
@@ -71,7 +71,7 @@ class LoopPipeliner {
   /// shared mem and a next buffer stored in regs.
   int numStages = 2;
 
-  /// Arg indicies
+  /// Arg indices in pplForOp
   size_t depArgsBeginIdx;
   DenseMap<BlockArgument, size_t> depArgsIdx;
 
@@ -165,6 +165,9 @@ class LoopPipeliner {
   /// Collect loads to pipeline. Return success if we can pipeline this loop
   LogicalResult initialize();
 
+  // Update mapping from old forOp results to new pplForOp results
+  void setResultMapping(DenseMap<Value, Value> &newResults);
+
   /// Emit pipelined loads (before loop body)
   void emitPrologue();
 
@@ -548,6 +551,45 @@ void LoopPipeliner::emitPrologue() {
   } // for (Operation *op : orderedDeps)
 }
 
+void LoopPipeliner::setResultMapping(DenseMap<Value, Value> &newResults) {
+  // After pipelining, some of the depArgs have been mapped to new args.
+  // We need to remap these.
+  //
+  // For example, if we have
+  //
+  // ptr = ...
+  // c = [zeros]
+  // ret = scf.for iter_args(a_ptr=ptr, c=c)
+  //   a = load(a_ptr)
+  //   c += dot(a, ...)
+  //   a_ptr_new = a_ptr + N
+  //   scf.yield %a_ptr_new, %c
+  //
+  // then the ptr arg should be mapped to a new arg in the for loop.
+  //
+  // ptr = ...
+  // c = [zeros]
+  // load_pre = load(ptr)
+  // ptr_new = ptr + N
+  // ret = scf.for iter_args(a_ptr=ptr, c=c, ld=load_pre, A_ptr_1=ptr_new)
+  //   a_next = load(A_ptr_1)
+  //   c += dot(ld, ...)
+  //   A_ptr_new = A_ptr_1 + N
+  //   scf.yield a_ptr, c, a_next, A_ptr_new
+  //
+  // After this, if there are downstream users of a_ptr, they should reference
+  // ret#3 instead of ret#0
+  for (const auto &origArg : llvm::enumerate(forOp.getRegionIterArgs())) {
+    if (depArgs.contains(origArg.value())) {
+      auto oldIdx = origArg.index();
+      auto newIdx = depArgsIdx[origArg.value()];
+      auto oldResult = forOp->getResult(oldIdx);
+      auto newResult = pplForOp->getResult(newIdx);
+      newResults[oldResult] = newResult;
+    }
+  }
+}
+
 void LoopPipeliner::emitEpilogue(DenseMap<Value, Value> &newResults) {
   if (!peelLastIter)
     return;
@@ -846,6 +888,7 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase<PipelinePass> {
     DenseMap<Value, Value> newResults;
     for (unsigned i = 0; i < forOp->getNumResults(); ++i)
       newResults[forOp->getResult(i)] = pplForOp->getResult(i);
+    pipeliner.setResultMapping(newResults);
     pipeliner.emitEpilogue(newResults);
 
     // Replace the original loop

From 80a5cfbdd174db17ca60b28abca78b63dbf24d1a Mon Sep 17 00:00:00 2001
From: Keren Zhou
Date: Mon, 30 Sep 2024 17:46:25 -0700
Subject: [PATCH 10/12] [PROTON][Experimental] Initialize instruction
 sampling support for NVIDIA GPUs (#4674)

This PR adds the "cupti_pcsampling" backend for collecting and attributing
instruction samples to the corresponding GPU code, including the file path,
function name, and line number. It currently serializes kernel execution so
that kernel runtime and GPU samples can be collected in the same pass.
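
For reference, the intended usage from the README added in this patch, as a
minimal sketch; the kernel launches between start and teardown are elided,
and a CUDA-capable setup is assumed:

```python
import triton.profiler as proton

# Select the experimental instruction-sampling backend for this session.
proton.start(name="profile_name", context="shadow", backend="cupti_pcsampling")
# ... run the Triton kernels to be profiled ...
proton.finalize()  # flush and write out the collected profile
```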
--- .github/workflows/integration-tests.yml | 1 + .github/workflows/integration-tests.yml.in | 2 +- third_party/proton/README.md | 35 +- third_party/proton/csrc/include/Data/Metric.h | 76 ++- .../proton/csrc/include/Driver/Dispatch.h | 10 +- .../proton/csrc/include/Driver/GPU/CuptiApi.h | 40 ++ .../include/Profiler/Cupti/CuptiPCSampling.h | 141 ++++++ .../Profiler/{ => Cupti}/CuptiProfiler.h | 2 +- .../csrc/include/Profiler/GPUProfiler.h | 12 + .../{ => Roctracer}/RoctracerProfiler.h | 2 +- .../proton/csrc/include/Utility/Atomic.h | 20 + .../proton/csrc/include/Utility/Errors.h | 5 + .../proton/csrc/include/Utility/String.h | 12 + third_party/proton/csrc/lib/Data/TreeData.cpp | 122 ++--- .../proton/csrc/lib/Driver/GPU/CuptiApi.cpp | 39 ++ .../lib/Profiler/Cupti/CuptiPCSampling.cpp | 444 ++++++++++++++++++ .../Profiler/{ => Cupti}/CuptiProfiler.cpp | 135 ++++-- .../{ => RocTracer}/RoctracerProfiler.cpp | 2 +- .../proton/csrc/lib/Session/Session.cpp | 7 +- third_party/proton/proton/profile.py | 2 +- third_party/proton/proton/proton.py | 3 +- third_party/proton/proton/scope.py | 44 +- third_party/proton/proton/viewer.py | 8 +- third_party/proton/test/example_cuda.json | 20 +- third_party/proton/test/example_frame.json | 19 +- third_party/proton/test/example_hip.json | 20 +- third_party/proton/test/test_profile.py | 44 +- third_party/proton/test/test_viewer.py | 16 +- third_party/proton/tutorials/dynamic_net.py | 3 +- 29 files changed, 1115 insertions(+), 171 deletions(-) create mode 100644 third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h rename third_party/proton/csrc/include/Profiler/{ => Cupti}/CuptiProfiler.h (90%) rename third_party/proton/csrc/include/Profiler/{ => Roctracer}/RoctracerProfiler.h (91%) create mode 100644 third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp rename third_party/proton/csrc/lib/Profiler/{ => Cupti}/CuptiProfiler.cpp (72%) rename third_party/proton/csrc/lib/Profiler/{ => RocTracer}/RoctracerProfiler.cpp (99%) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 7f65e92bdb..e9eca9fc74 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -25,6 +25,7 @@ env: TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 + PROTON_SKIP_PC_SAMPLING_TEST: 1 jobs: Runner-Preparation: runs-on: ubuntu-latest diff --git a/.github/workflows/integration-tests.yml.in b/.github/workflows/integration-tests.yml.in index 0ff34e5833..81c95ac681 100644 --- a/.github/workflows/integration-tests.yml.in +++ b/.github/workflows/integration-tests.yml.in @@ -27,7 +27,7 @@ env: TRITON_BUILD_WITH_CLANG_LLD: "TRUE" TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE" TRITON_DISABLE_LINE_INFO: 1 - + PROTON_SKIP_PC_SAMPLING_TEST: 1 jobs: Runner-Preparation: diff --git a/third_party/proton/README.md b/third_party/proton/README.md index 8b94f180c3..ccd79e7212 100644 --- a/third_party/proton/README.md +++ b/third_party/proton/README.md @@ -119,7 +119,7 @@ flops64: float # The number of 64-bit floating-point operations bytes: int # The number of bytes expected to be transferred ``` -### Command Line +### Command line Proton can be used as a command-line tool to profile Python scripts and Pytest tests. The following examples demonstrate how to use Proton command-line. @@ -149,6 +149,22 @@ More options can be found by running the following command. 
proton-viewer -h
 ```
 
+### Instruction sampling (experimental)
+
+Proton supports instruction sampling on NVIDIA GPUs.
+Please note that this is an experimental feature and may not work on all GPUs.
+You may experience ~20x end-to-end overhead when using instruction sampling, although the overhead for each individual GPU kernel is negligible.
+The overhead is mostly caused by data transfer and processing on the CPU.
+Additionally, the proton-viewer options `-i -d -t ` can be helpful for filtering out GPU kernels that are not of interest.
+The following example demonstrates how to use instruction sampling:
+
+```python
+import triton.profiler as proton
+
+
+proton.start(name="profile_name", context="shadow", backend="cupti_pcsampling")
+```
+
 ## Proton *vs* nsys
 
 - Runtime overhead (up to 1.5x)
@@ -173,11 +189,24 @@ Proton is designed to be portable and can be used on AMD GPUs. nsys only support
 Proton can register hooks to analyze the metadata of triton kernels, while nsys cannot. **Note** that the hooks do add additional overhead to proton.
 
-## Known Issues
+## Proton *vs* ncu
+
+Similar to the comparison between Proton and Nsight Systems (Nsys), Proton has a lower profiling overhead than Nsight Compute (NCU). We also plan to support instruction sampling on AMD GPUs.
+However, Nsight Compute supports the collection of more detailed metrics than Proton, such as memory access patterns, memory transactions, and other instruction-level metrics.
+In contrast, Proton only supports instruction sampling and is designed to be lightweight and portable.
+
+## Known issues
 
-- CUDA Graph
+- CUDA graph
 
 `hooks` cannot be used to accurately accumulate the number of FLOPs in CUDA graph mode profiling because kernels are captured and launched separately; metrics are not accumulated when kernels are launched in graph mode. This issue can be circumvented by using `scope` to supply FLOPs.
 
 If profiling is initiated after CUDA graph capturing, there may be minor memory leak issues. This is because the number of kernels in a graph instance (i.e., `cuGraphExec`) is unknown, preventing the deletion of mappings between the kernel ID and the graph ID.
+
+- Instruction sampling
+
+If you encounter permission-related problems when using instruction sampling, you can look up this [page](https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters) for help.
+
+The overhead of instruction sampling on NVIDIA GPUs is about 20x using Proton because we haven't enabled continuous sampling yet.
+Continuous sampling can allow for more runtime optimizations, but it makes it more challenging to attribute performance data back to the GPU kernels because: (1) it enables profiling of concurrent kernels, (2) it doesn't allow profiling of time and instruction samples simultaneously, and (3) it works best if we have a separate thread dedicated to attributing instruction samples to the GPU kernels diff --git a/third_party/proton/csrc/include/Data/Metric.h b/third_party/proton/csrc/include/Data/Metric.h index 0e22f7a050..a75692877c 100644 --- a/third_party/proton/csrc/include/Data/Metric.h +++ b/third_party/proton/csrc/include/Data/Metric.h @@ -7,7 +7,7 @@ namespace proton { -enum class MetricKind { Flexible, Kernel, Count }; +enum class MetricKind { Flexible, Kernel, PCSampling, Count }; using MetricValueType = std::variant; @@ -143,8 +143,78 @@ class KernelMetric : public Metric { const static inline bool AGGREGABLE[kernelMetricKind::Count] = { false, false, true, true, false, false}; const static inline std::string VALUE_NAMES[kernelMetricKind::Count] = { - "StartTime (ns)", "EndTime (ns)", "Count", - "Time (ns)", "DeviceId", "DeviceType", + "start_time (ns)", "end_time (ns)", "count", + "time (ns)", "device_id", "device_type", + }; +}; + +class PCSamplingMetric : public Metric { +public: + enum PCSamplingMetricKind : int { + NumSamples, + NumStalledSamples, + StalledBranchResolving, + StalledNoInstruction, + StalledShortScoreboard, + StalledWait, + StalledLongScoreboard, + StalledTexThrottle, + StalledBarrier, + StalledMembar, + StalledIMCMiss, + StalledMIOThrottle, + StalledMathPipeThrottle, + StalledDrain, + StalledLGThrottle, + StalledNotSelected, + StalledMisc, + StalledDispatchStall, + StalledSleeping, + StalledSelected, + Count, + }; + + PCSamplingMetric() + : Metric(MetricKind::PCSampling, PCSamplingMetricKind::Count) {} + + PCSamplingMetric(PCSamplingMetricKind kind, uint64_t samples, + uint64_t stalledSamples) + : PCSamplingMetric() { + this->values[kind] = stalledSamples; + this->values[PCSamplingMetricKind::NumSamples] = samples; + this->values[PCSamplingMetricKind::NumStalledSamples] = stalledSamples; + } + + virtual const std::string getName() const { return "PCSamplingMetric"; } + + virtual const std::string getValueName(int valueId) const { + return VALUE_NAMES[valueId]; + } + + virtual bool isAggregable(int valueId) const { return true; } + +private: + const static inline std::string VALUE_NAMES[PCSamplingMetricKind::Count] = { + "num_samples", + "num_stalled_samples", + "stalled_branch_resolving", + "stalled_no_instruction", + "stalled_short_scoreboard", + "stalled_wait", + "stalled_long_scoreboard", + "stalled_tex_throttle", + "stalled_barrier", + "stalled_membar", + "stalled_imc_miss", + "stalled_mio_throttle", + "stalled_math_pipe_throttle", + "stalled_drain", + "stalled_lg_throttle", + "stalled_not_Selected", + "stalled_misc", + "stalled_dispatch_stall", + "stalled_sleeping", + "stalled_selected", }; }; diff --git a/third_party/proton/csrc/include/Driver/Dispatch.h b/third_party/proton/csrc/include/Driver/Dispatch.h index 6fe2d75942..1d8ec017cd 100644 --- a/third_party/proton/csrc/include/Driver/Dispatch.h +++ b/third_party/proton/csrc/include/Driver/Dispatch.h @@ -63,17 +63,17 @@ template class Dispatch { *lib = dlopen(name, RTLD_NOLOAD); } if (*lib == nullptr) { - // If not found, try to load it from the default path + // If not found, try to load it from LD_LIBRARY_PATH + *lib = dlopen(name, RTLD_LOCAL | RTLD_LAZY); + } + if (*lib == nullptr) { + // If still not 
found, try to load it from the default path auto dir = std::string(ExternLib::defaultDir); if (dir.length() > 0) { auto fullPath = dir + "/" + name; *lib = dlopen(fullPath.c_str(), RTLD_LOCAL | RTLD_LAZY); } } - if (*lib == nullptr) { - // If still not found, try to load it from LD_LIBRARY_PATH - *lib = dlopen(name, RTLD_LOCAL | RTLD_LAZY); - } if (*lib == nullptr) { throw std::runtime_error("Could not find `" + std::string(name) + "`. Make sure it is in your " diff --git a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h index 845b415bd5..495964923e 100644 --- a/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h +++ b/third_party/proton/csrc/include/Driver/GPU/CuptiApi.h @@ -2,11 +2,17 @@ #define PROTON_DRIVER_GPU_CUPTI_H_ #include "cupti.h" +#include "cupti_pcsampling.h" namespace proton { namespace cupti { +template CUptiResult getVersion(uint32_t *version); + +template +CUptiResult getContextId(CUcontext context, uint32_t *pCtxId); + template CUptiResult activityRegisterCallbacks( CUpti_BuffersCallbackRequestFunc funcBufferRequested, @@ -66,6 +72,40 @@ CUptiResult getGraphExecId(CUgraphExec graph, uint32_t *pId); template CUptiResult getGraphId(CUgraph graph, uint32_t *pId); +template +CUptiResult getCubinCrc(CUpti_GetCubinCrcParams *pParams); + +template +CUptiResult +getSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams); + +template +CUptiResult +pcSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams); + +template +CUptiResult +pcSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams); + +template +CUptiResult pcSamplingSetConfigurationAttribute( + CUpti_PCSamplingConfigurationInfoParams *pParams); + +template +CUptiResult pcSamplingEnable(CUpti_PCSamplingEnableParams *pParams); + +template +CUptiResult pcSamplingDisable(CUpti_PCSamplingDisableParams *pParams); + +template +CUptiResult pcSamplingGetData(CUpti_PCSamplingGetDataParams *pParams); + +template +CUptiResult pcSamplingStart(CUpti_PCSamplingStartParams *pParams); + +template +CUptiResult pcSamplingStop(CUpti_PCSamplingStopParams *pParams); + } // namespace cupti } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h new file mode 100644 index 0000000000..58b6e2be81 --- /dev/null +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiPCSampling.h @@ -0,0 +1,141 @@ +#ifndef PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ +#define PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ + +#include "CuptiProfiler.h" +#include "Driver/GPU/CudaApi.h" +#include "Driver/GPU/CuptiApi.h" +#include "Utility/Map.h" +#include "Utility/Singleton.h" +#include +#include + +namespace proton { + +struct CubinData { + size_t cubinCrc; + const char *cubin; + size_t cubinSize; + + struct LineInfoKey { + uint32_t functionIndex; + uint64_t pcOffset; + + bool operator<(const LineInfoKey &other) const { + return functionIndex < other.functionIndex || + (functionIndex == other.functionIndex && + pcOffset < other.pcOffset); + } + }; + + struct LineInfoValue { + uint32_t lineNumber{}; + const std::string functionName{}; + const std::string dirName{}; + const std::string fileName{}; + + LineInfoValue() = default; + + LineInfoValue(uint32_t lineNumber, const std::string &functionName, + const std::string &dirName, const std::string &fileName) + : lineNumber(lineNumber), functionName(functionName), dirName(dirName), + fileName(fileName) {} + }; + + 
std::map lineInfo; +}; + +struct ConfigureData { + ConfigureData() = default; + + ~ConfigureData() { + if (stallReasonNames) { + for (size_t i = 0; i < numStallReasons; i++) { + if (stallReasonNames[i]) + std::free(stallReasonNames[i]); + } + std::free(stallReasonNames); + } + if (stallReasonIndices) + std::free(stallReasonIndices); + if (pcSamplingData.pPcData) { + for (size_t i = 0; i < numValidStallReasons; ++i) { + std::free(pcSamplingData.pPcData[i].stallReason); + } + std::free(pcSamplingData.pPcData); + } + } + + void initialize(CUcontext context); + + CUpti_PCSamplingConfigurationInfo configureStallReasons(); + CUpti_PCSamplingConfigurationInfo configureSamplingPeriod(); + CUpti_PCSamplingConfigurationInfo configureSamplingBuffer(); + CUpti_PCSamplingConfigurationInfo configureScratchBuffer(); + CUpti_PCSamplingConfigurationInfo configureHardwareBufferSize(); + CUpti_PCSamplingConfigurationInfo configureStartStopControl(); + CUpti_PCSamplingConfigurationInfo configureCollectionMode(); + + // The amount of data reserved on the GPU + static constexpr size_t HardwareBufferSize = 128 * 1024 * 1024; + // The amount of data copied from the hardware buffer each time + static constexpr size_t ScratchBufferSize = 16 * 1024 * 1024; + // The number of PCs copied from the scratch buffer each time + static constexpr size_t DataBufferPCCount = 1024; + // The sampling period in cycles = 2^frequency + static constexpr uint32_t DefaultFrequency = 10; + + CUcontext context{}; + uint32_t contextId; + uint32_t numStallReasons{}; + uint32_t numValidStallReasons{}; + char **stallReasonNames{}; + uint32_t *stallReasonIndices{}; + std::map stallReasonIndexToMetricIndex{}; + std::set notIssuedStallReasonIndices{}; + CUpti_PCSamplingData pcSamplingData{}; + // The memory storing configuration information has to be kept alive during + // the profiling session + std::vector configurationInfos; +}; + +class CuptiPCSampling : public Singleton { + +public: + CuptiPCSampling() = default; + virtual ~CuptiPCSampling() = default; + + void initialize(CUcontext context); + + void start(CUcontext context); + + void stop(CUcontext context, uint64_t externId, bool isAPI); + + void finalize(CUcontext context); + + void loadModule(const char *cubin, size_t cubinSize); + + void unloadModule(const char *cubin, size_t cubinSize); + +private: + ConfigureData *getConfigureData(uint32_t contextId); + + CubinData *getCubinData(uint64_t cubinCrc); + + void processPCSamplingData(ConfigureData *configureData, uint64_t externId, + bool isAPI); + + ThreadSafeMap contextIdToConfigureData; + // In case the same cubin is loaded multiple times, we need to keep track of + // all of them + ThreadSafeMap> + cubinCrcToCubinData; + ThreadSafeSet contextInitialized; + + std::atomic pcSamplingStarted{false}; + std::mutex pcSamplingMutex{}; + std::mutex contextMutex{}; +}; + +} // namespace proton + +#endif // PROTON_PROFILER_CUPTI_PC_SAMPLING_H_ diff --git a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h similarity index 90% rename from third_party/proton/csrc/include/Profiler/CuptiProfiler.h rename to third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h index 344d0fd4b9..c443ec2e39 100644 --- a/third_party/proton/csrc/include/Profiler/CuptiProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Cupti/CuptiProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_CUPTI_PROFILER_H_ #define PROTON_PROFILER_CUPTI_PROFILER_H_ -#include "GPUProfiler.h" +#include 
"Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Profiler/GPUProfiler.h b/third_party/proton/csrc/include/Profiler/GPUProfiler.h index 26c6d10b5d..d5033b06aa 100644 --- a/third_party/proton/csrc/include/Profiler/GPUProfiler.h +++ b/third_party/proton/csrc/include/Profiler/GPUProfiler.h @@ -31,6 +31,16 @@ class GPUProfiler : public Profiler, std::unordered_map>>; using ApiExternIdSet = ThreadSafeSet>; + ConcreteProfilerT &enablePCSampling() { + pcSamplingEnabled = true; + return dynamic_cast(*this); + } + ConcreteProfilerT &disablePCSampling() { + pcSamplingEnabled = false; + return dynamic_cast(*this); + } + bool isPCSamplingEnabled() const { return pcSamplingEnabled; } + protected: // OpInterface void startOp(const Scope &scope) override { @@ -140,6 +150,8 @@ class GPUProfiler : public Profiler, ConcreteProfilerT &profiler; }; std::unique_ptr pImpl; + + bool pcSamplingEnabled{false}; }; } // namespace proton diff --git a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h similarity index 91% rename from third_party/proton/csrc/include/Profiler/RoctracerProfiler.h rename to third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h index 2f1791dcb5..b9bc08de8e 100644 --- a/third_party/proton/csrc/include/Profiler/RoctracerProfiler.h +++ b/third_party/proton/csrc/include/Profiler/Roctracer/RoctracerProfiler.h @@ -1,7 +1,7 @@ #ifndef PROTON_PROFILER_ROCTRACER_PROFILER_H_ #define PROTON_PROFILER_ROCTRACER_PROFILER_H_ -#include "GPUProfiler.h" +#include "Profiler/GPUProfiler.h" namespace proton { diff --git a/third_party/proton/csrc/include/Utility/Atomic.h b/third_party/proton/csrc/include/Utility/Atomic.h index d7e40e73cd..0f759e0d61 100644 --- a/third_party/proton/csrc/include/Utility/Atomic.h +++ b/third_party/proton/csrc/include/Utility/Atomic.h @@ -1,4 +1,8 @@ +#ifndef PROTON_UTILITY_ATOMIC_H_ +#define PROTON_UTILITY_ATOMIC_H_ + #include +#include namespace proton { @@ -16,4 +20,20 @@ template T atomicMin(std::atomic &target, T value) { return current; } +template +void doubleCheckedLock(Condition enterCondition, std::mutex &lock, + Function function) { + if (!enterCondition()) + return; + + std::unique_lock guard(lock); + + if (!enterCondition()) + return; + + function(); +} + } // namespace proton + +#endif // PROTON_UTILITY_ATOMIC_H_ diff --git a/third_party/proton/csrc/include/Utility/Errors.h b/third_party/proton/csrc/include/Utility/Errors.h index 62d4f3f665..094723d6f7 100644 --- a/third_party/proton/csrc/include/Utility/Errors.h +++ b/third_party/proton/csrc/include/Utility/Errors.h @@ -1,3 +1,6 @@ +#ifndef PROTON_UTILITY_ERRORS_H_ +#define PROTON_UTILITY_ERRORS_H_ + #include namespace proton { @@ -8,3 +11,5 @@ class NotImplemented : public std::logic_error { }; } // namespace proton + +#endif // PROTON_UTILITY_ERRORS_H_ diff --git a/third_party/proton/csrc/include/Utility/String.h b/third_party/proton/csrc/include/Utility/String.h index b7d45ae1f7..b4a1d3ff91 100644 --- a/third_party/proton/csrc/include/Utility/String.h +++ b/third_party/proton/csrc/include/Utility/String.h @@ -13,6 +13,18 @@ inline std::string toLower(const std::string &str) { return lower; } +inline std::string replace(const std::string &str, const std::string &src, + const std::string &dst) { + std::string replaced = str; + size_t pos = replaced.find(src, pos); + while (pos != std::string::npos) { + replaced.replace(pos, src.length(), dst); + pos += dst.length(); + pos = 
replaced.find(src, pos); + } + return replaced; +} + } // namespace proton #endif // PROTON_UTILITY_STRING_H_ diff --git a/third_party/proton/csrc/lib/Data/TreeData.cpp b/third_party/proton/csrc/lib/Data/TreeData.cpp index b12427f777..ec6ea1c784 100644 --- a/third_party/proton/csrc/lib/Data/TreeData.cpp +++ b/third_party/proton/csrc/lib/Data/TreeData.cpp @@ -180,66 +180,76 @@ void TreeData::dumpHatchet(std::ostream &os) const { jsonNodes[Tree::TreeNode::RootId] = &(output.back()); std::set valueNames; std::map> deviceIds; - this->tree->template walk( - [&](Tree::TreeNode &treeNode) { - const auto contextName = treeNode.name; - auto contextId = treeNode.id; - json *jsonNode = jsonNodes[contextId]; - (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; - (*jsonNode)["metrics"] = json::object(); - for (auto [metricKind, metric] : treeNode.metrics) { - if (metricKind == MetricKind::Kernel) { - auto kernelMetric = std::dynamic_pointer_cast(metric); - auto duration = std::get( - kernelMetric->getValue(KernelMetric::Duration)); - auto invocations = std::get( - kernelMetric->getValue(KernelMetric::Invocations)); - auto deviceId = std::get( - kernelMetric->getValue(KernelMetric::DeviceId)); - auto deviceType = std::get( - kernelMetric->getValue(KernelMetric::DeviceType)); - auto deviceTypeName = - getDeviceTypeString(static_cast(deviceType)); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Duration)] = - duration; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::Invocations)] = - invocations; - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceId)] = - std::to_string(deviceId); - (*jsonNode)["metrics"] - [kernelMetric->getValueName(KernelMetric::DeviceType)] = - deviceTypeName; - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Duration)); - valueNames.insert( - kernelMetric->getValueName(KernelMetric::Invocations)); - deviceIds.insert({deviceType, {deviceId}}); - } else { - throw std::runtime_error("MetricKind not supported"); - } - } - for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { - auto valueName = flexibleMetric.getValueName(0); + this->tree->template walk([&](Tree::TreeNode + &treeNode) { + const auto contextName = treeNode.name; + auto contextId = treeNode.id; + json *jsonNode = jsonNodes[contextId]; + (*jsonNode)["frame"] = {{"name", contextName}, {"type", "function"}}; + (*jsonNode)["metrics"] = json::object(); + for (auto [metricKind, metric] : treeNode.metrics) { + if (metricKind == MetricKind::Kernel) { + std::shared_ptr kernelMetric = + std::dynamic_pointer_cast(metric); + uint64_t duration = + std::get(kernelMetric->getValue(KernelMetric::Duration)); + uint64_t invocations = std::get( + kernelMetric->getValue(KernelMetric::Invocations)); + uint64_t deviceId = + std::get(kernelMetric->getValue(KernelMetric::DeviceId)); + uint64_t deviceType = std::get( + kernelMetric->getValue(KernelMetric::DeviceType)); + std::string deviceTypeName = + getDeviceTypeString(static_cast(deviceType)); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Duration)] = + duration; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::Invocations)] = + invocations; + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceId)] = + std::to_string(deviceId); + (*jsonNode)["metrics"] + [kernelMetric->getValueName(KernelMetric::DeviceType)] = + deviceTypeName; + valueNames.insert(kernelMetric->getValueName(KernelMetric::Duration)); + valueNames.insert( + 
kernelMetric->getValueName(KernelMetric::Invocations)); + deviceIds.insert({deviceType, {deviceId}}); + } else if (metricKind == MetricKind::PCSampling) { + auto pcSamplingMetric = + std::dynamic_pointer_cast(metric); + for (size_t i = 0; i < PCSamplingMetric::Count; i++) { + auto valueName = pcSamplingMetric->getValueName(i); valueNames.insert(valueName); std::visit( [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, - flexibleMetric.getValues()[0]); - } - (*jsonNode)["children"] = json::array(); - auto children = treeNode.children; - for (auto _ : children) { - (*jsonNode)["children"].push_back(json::object()); + pcSamplingMetric->getValues()[i]); } - auto idx = 0; - for (auto child : children) { - auto [index, childId] = child; - jsonNodes[childId] = &(*jsonNode)["children"][idx]; - idx++; - } - }); + } else { + throw std::runtime_error("MetricKind not supported"); + } + } + for (auto [_, flexibleMetric] : treeNode.flexibleMetrics) { + auto valueName = flexibleMetric.getValueName(0); + valueNames.insert(valueName); + std::visit( + [&](auto &&value) { (*jsonNode)["metrics"][valueName] = value; }, + flexibleMetric.getValues()[0]); + } + (*jsonNode)["children"] = json::array(); + auto children = treeNode.children; + for (auto _ : children) { + (*jsonNode)["children"].push_back(json::object()); + } + auto idx = 0; + for (auto child : children) { + auto [index, childId] = child; + jsonNodes[childId] = &(*jsonNode)["children"][idx]; + idx++; + } + }); // Hints for all available metrics for (auto valueName : valueNames) { output[Tree::TreeNode::RootId]["metrics"][valueName] = 0; diff --git a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp index 1d7e97314a..2c399d31c7 100644 --- a/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp +++ b/third_party/proton/csrc/lib/Driver/GPU/CuptiApi.cpp @@ -22,6 +22,11 @@ struct ExternLibCupti : public ExternLibBase { void *ExternLibCupti::lib = nullptr; +DEFINE_DISPATCH(ExternLibCupti, getVersion, cuptiGetVersion, uint32_t *); + +DEFINE_DISPATCH(ExternLibCupti, getContextId, cuptiGetContextId, CUcontext, + uint32_t *); + DEFINE_DISPATCH(ExternLibCupti, activityRegisterCallbacks, cuptiActivityRegisterCallbacks, CUpti_BuffersCallbackRequestFunc, @@ -77,6 +82,40 @@ DEFINE_DISPATCH(ExternLibCupti, getGraphExecId, cuptiGetGraphExecId, DEFINE_DISPATCH(ExternLibCupti, getGraphId, cuptiGetGraphId, CUgraph, uint32_t *); +DEFINE_DISPATCH(ExternLibCupti, getCubinCrc, cuptiGetCubinCrc, + CUpti_GetCubinCrcParams *); + +DEFINE_DISPATCH(ExternLibCupti, getSassToSourceCorrelation, + cuptiGetSassToSourceCorrelation, + CUpti_GetSassToSourceCorrelationParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetNumStallReasons, + cuptiPCSamplingGetNumStallReasons, + CUpti_PCSamplingGetNumStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetStallReasons, + cuptiPCSamplingGetStallReasons, + CUpti_PCSamplingGetStallReasonsParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingSetConfigurationAttribute, + cuptiPCSamplingSetConfigurationAttribute, + CUpti_PCSamplingConfigurationInfoParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingEnable, cuptiPCSamplingEnable, + CUpti_PCSamplingEnableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingDisable, cuptiPCSamplingDisable, + CUpti_PCSamplingDisableParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingGetData, cuptiPCSamplingGetData, + CUpti_PCSamplingGetDataParams *); + +DEFINE_DISPATCH(ExternLibCupti, pcSamplingStart, 
cuptiPCSamplingStart,
+                CUpti_PCSamplingStartParams *);
+
+DEFINE_DISPATCH(ExternLibCupti, pcSamplingStop, cuptiPCSamplingStop,
+                CUpti_PCSamplingStopParams *);
+
 } // namespace cupti
 
 } // namespace proton
diff --git a/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp
new file mode 100644
index 0000000000..f8fb2537a0
--- /dev/null
+++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiPCSampling.cpp
@@ -0,0 +1,444 @@
+#include "Profiler/Cupti/CuptiPCSampling.h"
+#include "Data/Metric.h"
+#include "Driver/GPU/CudaApi.h"
+#include "Driver/GPU/CuptiApi.h"
+#include "Utility/Atomic.h"
+#include "Utility/Map.h"
+#include "Utility/String.h"
+#include <memory>
+#include <tuple>
+
+namespace proton {
+
+namespace {
+
+uint64_t getCubinCrc(const char *cubin, size_t size) {
+  CUpti_GetCubinCrcParams cubinCrcParams = {
+      .size = CUpti_GetCubinCrcParamsSize,
+      .cubinSize = size,
+      .cubin = cubin,
+      .cubinCrc = 0,
+  };
+  cupti::getCubinCrc(&cubinCrcParams);
+  return cubinCrcParams.cubinCrc;
+}
+
+size_t getNumStallReasons(CUcontext context) {
+  size_t numStallReasons = 0;
+  CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = {
+      .size = CUpti_PCSamplingGetNumStallReasonsParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+      .numStallReasons = &numStallReasons};
+  cupti::pcSamplingGetNumStallReasons(&numStallReasonsParams);
+  return numStallReasons;
+}
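Both helpers above follow CUPTI's versioned-struct convention: the caller writes a `*ParamsSize` constant into `.size` so the library can tell which revision of the struct the caller was compiled against; the CUPTI 12.4 size adjustment in `allocPCSamplingData` further down exists for the same reason. A self-contained sketch of that convention (the struct and field names here are illustrative, not CUPTI's):

    #include <cstddef>
    #include <cstdio>

    // Illustrative versioned parameter struct, in the style of CUPTI params.
    struct ParamsV2 {
      size_t size;  // caller writes sizeof of *its* view of the struct
      int oldField; // present since v1
      int newField; // added in v2
    };

    void apiCall(ParamsV2 *p) {
      // The library touches newField only when the caller's struct is large
      // enough to contain it, so binaries built against v1 keep working.
      p->oldField = 1;
      if (p->size >= offsetof(ParamsV2, newField) + sizeof(p->newField))
        p->newField = 2;
    }

    int main() {
      ParamsV2 params = {};
      params.size = sizeof(ParamsV2); // a v1 caller would write the v1 size
      apiCall(&params);
      std::printf("%d %d\n", params.oldField, params.newField); // prints: 1 2
    }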
+std::tuple<uint32_t, std::string, std::string>
+getSassToSourceCorrelation(const char *functionName, uint64_t pcOffset,
+                           const char *cubin, size_t cubinSize) {
+  CUpti_GetSassToSourceCorrelationParams sassToSourceParams = {
+      .size = CUpti_GetSassToSourceCorrelationParamsSize,
+      .cubin = cubin,
+      .functionName = functionName,
+      .cubinSize = cubinSize,
+      .lineNumber = 0,
+      .pcOffset = pcOffset,
+      .fileName = NULL,
+      .dirName = NULL,
+  };
+  // Getting the source can fail if the line mapping is not available in the
+  // cubin, so we don't check the return value
+  cupti::getSassToSourceCorrelation(&sassToSourceParams);
+  auto fileNameStr = sassToSourceParams.fileName
+                         ? std::string(sassToSourceParams.fileName)
+                         : "";
+  auto dirNameStr =
+      sassToSourceParams.dirName ? std::string(sassToSourceParams.dirName) : "";
+  // It's the caller's responsibility to free the memory allocated by CUPTI
+  if (sassToSourceParams.fileName)
+    std::free(sassToSourceParams.fileName);
+  if (sassToSourceParams.dirName)
+    std::free(sassToSourceParams.dirName);
+  return std::make_tuple(sassToSourceParams.lineNumber, fileNameStr,
+                         dirNameStr);
+}
+
+std::pair<char **, uint32_t *>
+getStallReasonNamesAndIndices(CUcontext context, size_t numStallReasons) {
+  char **stallReasonNames =
+      static_cast<char **>(std::calloc(numStallReasons, sizeof(char *)));
+  for (size_t i = 0; i < numStallReasons; i++) {
+    // Initialize each name with CUPTI_STALL_REASON_STRING_SIZE (128)
+    // characters to avoid buffer overflow
+    stallReasonNames[i] = static_cast<char *>(
+        std::calloc(CUPTI_STALL_REASON_STRING_SIZE, sizeof(char)));
+  }
+  uint32_t *stallReasonIndices =
+      static_cast<uint32_t *>(std::calloc(numStallReasons, sizeof(uint32_t)));
+  CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = {
+      .size = CUpti_PCSamplingGetStallReasonsParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+      .numStallReasons = numStallReasons,
+      .stallReasonIndex = stallReasonIndices,
+      .stallReasons = stallReasonNames,
+  };
+  cupti::pcSamplingGetStallReasons(&stallReasonsParams);
+  return std::make_pair(stallReasonNames, stallReasonIndices);
+}
+
+size_t matchStallReasonsToIndices(
+    size_t numStallReasons, char **stallReasonNames,
+    uint32_t *stallReasonIndices,
+    std::map<size_t, size_t> &stallReasonIndexToMetricIndex,
+    std::set<size_t> &notIssuedStallReasonIndices) {
+  // In case there are any invalid stall reasons, we only collect valid ones.
+  // Invalid ones are swapped to the end of the list
+  std::vector<bool> validIndex(numStallReasons, false);
+  size_t numValidStalls = 0;
+  for (size_t i = 0; i < numStallReasons; i++) {
+    bool notIssued = std::string(stallReasonNames[i]).find("not_issued") !=
+                     std::string::npos;
+    std::string cuptiStallName = std::string(stallReasonNames[i]);
+    for (size_t j = 0; j < PCSamplingMetric::PCSamplingMetricKind::Count; j++) {
+      auto metricName = PCSamplingMetric().getValueName(j);
+      if (cuptiStallName.find(metricName) != std::string::npos) {
+        if (notIssued)
+          notIssuedStallReasonIndices.insert(stallReasonIndices[i]);
+        stallReasonIndexToMetricIndex[stallReasonIndices[i]] = j;
+        validIndex[i] = true;
+        numValidStalls++;
+        break;
+      }
+    }
+  }
+  int invalidIndex = -1;
+  for (size_t i = 0; i < numStallReasons; i++) {
+    if (invalidIndex == -1 && !validIndex[i]) {
+      invalidIndex = i;
+    } else if (invalidIndex != -1 && validIndex[i]) {
+      std::swap(stallReasonIndices[invalidIndex], stallReasonIndices[i]);
+      std::swap(stallReasonNames[invalidIndex], stallReasonNames[i]);
+      validIndex[invalidIndex] = true;
+      invalidIndex++;
+    }
+  }
+  return numValidStalls;
+}
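The matching loop above pairs CUPTI's reported stall-reason names with Proton's metric names by substring search, then compacts the valid entries to the front. A self-contained toy version of the matching step (the stall-reason strings are made up; real CUPTI names vary by GPU and driver version):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      // Names as a CUPTI-like library might report them (hypothetical).
      std::vector<std::string> cuptiNames = {
          "stalled_barrier", "stalled_not_issued", "vendor_internal_reason"};
      // Names our metric table understands (hypothetical).
      std::vector<std::string> metricNames = {"barrier", "not_issued"};
      for (size_t i = 0; i < cuptiNames.size(); i++) {
        for (size_t j = 0; j < metricNames.size(); j++) {
          if (cuptiNames[i].find(metricNames[j]) != std::string::npos) {
            // "vendor_internal_reason" matches nothing and would be swapped
            // to the end of the list, exactly like the second loop above.
            std::printf("cupti reason %zu -> metric %zu\n", i, j);
            break;
          }
        }
      }
    }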
+
+#define CUPTI_CUDA12_4_VERSION 22
+#define CUPTI_CUDA12_4_PC_DATA_PADDING_SIZE sizeof(uint32_t)
+
+CUpti_PCSamplingData allocPCSamplingData(size_t collectNumPCs,
+                                         size_t numValidStallReasons) {
+  uint32_t libVersion = 0;
+  cupti::getVersion(&libVersion);
+  size_t pcDataSize = sizeof(CUpti_PCSamplingPCData);
+  // Check if the cupti api version < 12.4 but the cupti header version >=
+  // 12.4. If so, we subtract 4 bytes from the size of CUpti_PCSamplingPCData
+  // because CUPTI 12.4 introduces a new field (i.e., correlationId) at the
+  // end of the struct, which is not compatible with the previous versions.
+  if (libVersion < CUPTI_CUDA12_4_VERSION &&
+      CUPTI_API_VERSION >= CUPTI_CUDA12_4_VERSION)
+    pcDataSize -= CUPTI_CUDA12_4_PC_DATA_PADDING_SIZE;
+  CUpti_PCSamplingData pcSamplingData{
+      .size = pcDataSize,
+      .collectNumPcs = collectNumPCs,
+      .pPcData = static_cast<CUpti_PCSamplingPCData *>(
+          std::calloc(collectNumPCs, sizeof(CUpti_PCSamplingPCData)))};
+  for (size_t i = 0; i < collectNumPCs; ++i) {
+    pcSamplingData.pPcData[i].stallReason =
+        static_cast<CUpti_PCSamplingStallReason *>(std::calloc(
+            numValidStallReasons, sizeof(CUpti_PCSamplingStallReason)));
+  }
+  return pcSamplingData;
+}
+
+void enablePCSampling(CUcontext context) {
+  CUpti_PCSamplingEnableParams params = {
+      .size = CUpti_PCSamplingEnableParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+  };
+  cupti::pcSamplingEnable(&params);
+}
+
+void disablePCSampling(CUcontext context) {
+  CUpti_PCSamplingDisableParams params = {
+      .size = CUpti_PCSamplingDisableParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+  };
+  cupti::pcSamplingDisable(&params);
+}
+
+void startPCSampling(CUcontext context) {
+  CUpti_PCSamplingStartParams params = {
+      .size = CUpti_PCSamplingStartParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+  };
+  cupti::pcSamplingStart(&params);
+}
+
+void stopPCSampling(CUcontext context) {
+  CUpti_PCSamplingStopParams params = {
+      .size = CUpti_PCSamplingStopParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+  };
+  cupti::pcSamplingStop(&params);
+}
+
+void getPCSamplingData(CUcontext context,
+                       CUpti_PCSamplingData *pcSamplingData) {
+  CUpti_PCSamplingGetDataParams params = {
+      .size = CUpti_PCSamplingGetDataParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+      .pcSamplingData = pcSamplingData,
+  };
+  cupti::pcSamplingGetData(&params);
+}
+
+void setConfigurationAttribute(
+    CUcontext context,
+    std::vector<CUpti_PCSamplingConfigurationInfo> &configurationInfos) {
+  CUpti_PCSamplingConfigurationInfoParams infoParams = {
+      .size = CUpti_PCSamplingConfigurationInfoParamsSize,
+      .pPriv = NULL,
+      .ctx = context,
+      .numAttributes = configurationInfos.size(),
+      .pPCSamplingConfigurationInfo = configurationInfos.data(),
+  };
+  cupti::pcSamplingSetConfigurationAttribute(&infoParams);
+}
+
+} // namespace
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureStallReasons() {
+  numStallReasons = getNumStallReasons(context);
+  std::tie(this->stallReasonNames, this->stallReasonIndices) =
+      getStallReasonNamesAndIndices(context, numStallReasons);
+  numValidStallReasons = matchStallReasonsToIndices(
+      numStallReasons, stallReasonNames, stallReasonIndices,
+      stallReasonIndexToMetricIndex, notIssuedStallReasonIndices);
+  CUpti_PCSamplingConfigurationInfo stallReasonInfo{};
+  stallReasonInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON;
+  stallReasonInfo.attributeData.stallReasonData.stallReasonCount =
+      numValidStallReasons;
+  stallReasonInfo.attributeData.stallReasonData.pStallReasonIndex =
+      stallReasonIndices;
+  return stallReasonInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingPeriod() {
+  CUpti_PCSamplingConfigurationInfo samplingPeriodInfo{};
+  samplingPeriodInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
+  samplingPeriodInfo.attributeData.samplingPeriodData.samplingPeriod =
+      DefaultFrequency;
+  return samplingPeriodInfo;
+}
+
+CUpti_PCSamplingConfigurationInfo ConfigureData::configureSamplingBuffer() {
+  CUpti_PCSamplingConfigurationInfo samplingBufferInfo{};
+  samplingBufferInfo.attributeType =
+      CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER;
+  this->pcSamplingData =
+      allocPCSamplingData(DataBufferPCCount, numValidStallReasons);
+  
samplingBufferInfo.attributeData.samplingDataBufferData.samplingDataBuffer = + &this->pcSamplingData; + return samplingBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() { + CUpti_PCSamplingConfigurationInfo scratchBufferInfo{}; + scratchBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE; + scratchBufferInfo.attributeData.scratchBufferSizeData.scratchBufferSize = + ScratchBufferSize; + return scratchBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() { + CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{}; + hardwareBufferInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE; + hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize = + HardwareBufferSize; + return hardwareBufferInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureStartStopControl() { + CUpti_PCSamplingConfigurationInfo startStopControlInfo{}; + startStopControlInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL; + startStopControlInfo.attributeData.enableStartStopControlData + .enableStartStopControl = true; + return startStopControlInfo; +} + +CUpti_PCSamplingConfigurationInfo ConfigureData::configureCollectionMode() { + CUpti_PCSamplingConfigurationInfo collectionModeInfo{}; + collectionModeInfo.attributeType = + CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE; + collectionModeInfo.attributeData.collectionModeData.collectionMode = + CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS; + return collectionModeInfo; +} + +void ConfigureData::initialize(CUcontext context) { + this->context = context; + cupti::getContextId(context, &contextId); + configurationInfos.emplace_back(configureStallReasons()); + configurationInfos.emplace_back(configureSamplingPeriod()); + configurationInfos.emplace_back(configureHardwareBufferSize()); + configurationInfos.emplace_back(configureScratchBuffer()); + configurationInfos.emplace_back(configureSamplingBuffer()); + configurationInfos.emplace_back(configureStartStopControl()); + configurationInfos.emplace_back(configureCollectionMode()); + setConfigurationAttribute(context, configurationInfos); +} + +ConfigureData *CuptiPCSampling::getConfigureData(uint32_t contextId) { + return &contextIdToConfigureData[contextId]; +} + +CubinData *CuptiPCSampling::getCubinData(uint64_t cubinCrc) { + return &(cubinCrcToCubinData[cubinCrc].first); +} + +void CuptiPCSampling::initialize(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + doubleCheckedLock([&]() { return !contextInitialized.contain(contextId); }, + contextMutex, + [&]() { + enablePCSampling(context); + getConfigureData(contextId)->initialize(context); + contextInitialized.insert(contextId); + }); +} + +void CuptiPCSampling::start(CUcontext context) { + uint32_t contextId = 0; + cupti::getContextId(context, &contextId); + doubleCheckedLock([&]() -> bool { return !pcSamplingStarted; }, + pcSamplingMutex, + [&]() { + initialize(context); + // Ensure all previous operations are completed + cuda::ctxSynchronize(); + startPCSampling(context); + pcSamplingStarted = true; + }); +} + +void CuptiPCSampling::processPCSamplingData(ConfigureData *configureData, + uint64_t externId, bool isAPI) { + auto *pcSamplingData = &configureData->pcSamplingData; + auto &profiler = CuptiProfiler::instance(); + auto dataSet = profiler.getDataSet(); + // In the first round, we need to call 
getPCSamplingData to get the unsynced
+  // data from the hardware buffer
+  bool firstRound = true;
+  while (pcSamplingData->totalNumPcs > 0 ||
+         pcSamplingData->remainingNumPcs > 0 || firstRound) {
+    // Handle data
+    for (size_t i = 0; i < pcSamplingData->totalNumPcs; ++i) {
+      auto *pcData = pcSamplingData->pPcData + i;
+      auto *cubinData = getCubinData(pcData->cubinCrc);
+      auto key =
+          CubinData::LineInfoKey{pcData->functionIndex, pcData->pcOffset};
+      if (cubinData->lineInfo.find(key) == cubinData->lineInfo.end()) {
+        auto [lineNumber, fileName, dirName] =
+            getSassToSourceCorrelation(pcData->functionName, pcData->pcOffset,
+                                       cubinData->cubin, cubinData->cubinSize);
+        cubinData->lineInfo.try_emplace(key, lineNumber,
+                                        std::string(pcData->functionName),
+                                        dirName, fileName);
+      }
+      auto &lineInfo = cubinData->lineInfo[key];
+      for (size_t j = 0; j < pcData->stallReasonCount; ++j) {
+        auto *stallReason = &pcData->stallReason[j];
+        if (!configureData->stallReasonIndexToMetricIndex.count(
+                stallReason->pcSamplingStallReasonIndex))
+          throw std::runtime_error("Invalid stall reason index");
+        for (auto *data : dataSet) {
+          auto scopeId = externId;
+          if (isAPI)
+            scopeId = data->addScope(externId, lineInfo.functionName);
+          if (lineInfo.fileName.size())
+            scopeId = data->addScope(
+                scopeId, lineInfo.dirName + "/" + lineInfo.fileName + ":" +
+                             lineInfo.functionName + "@" +
+                             std::to_string(lineInfo.lineNumber));
+          auto metricKind =
+              static_cast<PCSamplingMetric::PCSamplingMetricKind>(
+                  configureData->stallReasonIndexToMetricIndex
+                      [stallReason->pcSamplingStallReasonIndex]);
+          auto samples = stallReason->samples;
+          auto stalledSamples =
+              configureData->notIssuedStallReasonIndices.count(
+                  stallReason->pcSamplingStallReasonIndex)
+                  ? 0
+                  : samples;
+          auto metric = std::make_shared<PCSamplingMetric>(metricKind, samples,
+                                                           stalledSamples);
+          data->addMetric(scopeId, metric);
+        }
+      }
+    }
+    if (pcSamplingData->remainingNumPcs > 0 || firstRound) {
+      getPCSamplingData(configureData->context, pcSamplingData);
+      firstRound = false;
+    } else
+      break;
+  }
+}
+
+void CuptiPCSampling::stop(CUcontext context, uint64_t externId, bool isAPI) {
+  uint32_t contextId = 0;
+  cupti::getContextId(context, &contextId);
+  doubleCheckedLock([&]() -> bool { return pcSamplingStarted; },
+                    pcSamplingMutex,
+                    [&]() {
+                      auto *configureData = getConfigureData(contextId);
+                      stopPCSampling(context);
+                      pcSamplingStarted = false;
+                      processPCSamplingData(configureData, externId, isAPI);
+                    });
+}
+
+void CuptiPCSampling::finalize(CUcontext context) {
+  uint32_t contextId = 0;
+  cupti::getContextId(context, &contextId);
+  if (!contextInitialized.contain(contextId))
+    return;
+  auto *configureData = getConfigureData(contextId);
+  contextIdToConfigureData.erase(contextId);
+  contextInitialized.erase(contextId);
+  disablePCSampling(context);
+}
+
+void CuptiPCSampling::loadModule(const char *cubin, size_t cubinSize) {
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
+  auto *cubinData = getCubinData(cubinCrc);
+  cubinData->cubinCrc = cubinCrc;
+  cubinData->cubinSize = cubinSize;
+  cubinData->cubin = cubin;
+}
+
+void CuptiPCSampling::unloadModule(const char *cubin, size_t cubinSize) {
+  // XXX: unloadModule is expected to be called in a thread-safe manner, i.e.,
+  // no two threads call it for the same cubin at the same time
+  auto cubinCrc = getCubinCrc(cubin, cubinSize);
+  auto count = cubinCrcToCubinData[cubinCrc].second;
+  if (count > 1)
+    cubinCrcToCubinData[cubinCrc].second = count - 1;
+  else
+    cubinCrcToCubinData.erase(cubinCrc);
+}
+
+} // namespace proton
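`initialize`, `start`, and `stop` above all funnel through the `doubleCheckedLock` helper this patch adds to `Utility/Atomic.h`: check a cheap condition, take the lock, and re-check before doing the work, so concurrent launches cannot start sampling twice. A self-contained sketch of the pattern (the flag, mutex, and body are illustrative stand-ins for `pcSamplingStarted`, `pcSamplingMutex`, and the start-up lambda):

    #include <atomic>
    #include <cstdio>
    #include <mutex>

    template <typename Condition, typename Function>
    void doubleCheckedLock(Condition enterCondition, std::mutex &lock,
                           Function function) {
      if (!enterCondition()) // fast path: no lock taken
        return;
      std::unique_lock<std::mutex> guard(lock);
      if (!enterCondition()) // re-check: another thread may have won the race
        return;
      function();
    }

    std::atomic<bool> started{false};
    std::mutex startMutex;

    int main() {
      doubleCheckedLock([] { return !started.load(); }, startMutex, [] {
        std::printf("started once\n"); // only the first caller gets here
        started = true;
      });
      doubleCheckedLock([] { return !started.load(); }, startMutex,
                        [] { std::printf("never printed\n"); });
    }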
diff --git a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp
similarity index 72%
rename from third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp
rename to third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp
index 573840fc6c..9ddbd7a715 100644
--- a/third_party/proton/csrc/lib/Profiler/CuptiProfiler.cpp
+++ b/third_party/proton/csrc/lib/Profiler/Cupti/CuptiProfiler.cpp
@@ -1,9 +1,10 @@
-#include "Profiler/CuptiProfiler.h"
+#include "Profiler/Cupti/CuptiProfiler.h"
 #include "Context/Context.h"
 #include "Data/Metric.h"
 #include "Driver/Device.h"
 #include "Driver/GPU/CudaApi.h"
 #include "Driver/GPU/CuptiApi.h"
+#include "Profiler/Cupti/CuptiPCSampling.h"
 #include "Utility/Map.h"
 
 #include <stdexcept>
@@ -162,6 +163,33 @@ void setGraphCallbacks(CUpti_SubscriberHandle subscriber, bool enable) {
 #undef CALLBACK_ENABLE
 }
 
+void setResourceCallbacks(CUpti_SubscriberHandle subscriber, bool enable) {
+#define CALLBACK_ENABLE(id)                                                    \
+  cupti::enableCallback(static_cast<uint32_t>(enable), subscriber,             \
+                        CUPTI_CB_DOMAIN_RESOURCE, id)
+
+  CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_LOADED);
+  CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING);
+  CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_CREATED);
+  CALLBACK_ENABLE(CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING);
+#undef CALLBACK_ENABLE
+}
+
+bool isDriverAPILaunch(CUpti_CallbackId cbId) {
+  return cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunch ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
+         cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz;
+}
+
 } // namespace
 
 struct CuptiProfiler::CuptiProfilerPimpl
@@ -186,6 +214,7 @@
   static constexpr size_t AttributeSize = sizeof(size_t);
 
   CUpti_SubscriberHandle subscriber{};
+  CuptiPCSampling pcSampling;
 
   ThreadSafeMap<uint32_t, size_t> graphIdToNumInstances;
@@ -241,33 +270,58 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData,
   if (domain == CUPTI_CB_DOMAIN_RESOURCE) {
     auto *resourceData =
         static_cast<CUpti_ResourceData *>(const_cast<void *>(cbData));
-    auto *graphData =
-        static_cast<CUpti_GraphData *>(resourceData->resourceDescriptor);
     auto *pImpl = dynamic_cast<CuptiProfilerPimpl *>(profiler.pImpl.get());
-    uint32_t graphId = 0;
-    uint32_t graphExecId = 0;
-    if (graphData->graph)
-      cupti::getGraphId(graphData->graph, &graphId);
-    if (graphData->graphExec)
-      cupti::getGraphExecId(graphData->graphExec, &graphExecId);
-    if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED ||
-        cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) {
-      if (!pImpl->graphIdToNumInstances.contain(graphId))
-        pImpl->graphIdToNumInstances[graphId] = 1;
-      else
-        pImpl->graphIdToNumInstances[graphId]++;
-    } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) {
-      pImpl->graphIdToNumInstances[graphId]--;
-    } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) {
-      pImpl->graphExecIdToGraphId[graphExecId] = graphId;
-    } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) {
-      pImpl->graphExecIdToGraphId.erase(graphExecId);
-    } else if (cbId ==
CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) {
-      pImpl->graphIdToNumInstances.erase(graphId);
+    if (cbId == CUPTI_CBID_RESOURCE_MODULE_LOADED) {
+      auto *moduleResource = static_cast<CUpti_ModuleResourceData *>(
+          resourceData->resourceDescriptor);
+      if (profiler.isPCSamplingEnabled()) {
+        pImpl->pcSampling.loadModule(moduleResource->pCubin,
+                                     moduleResource->cubinSize);
+      }
+    } else if (cbId == CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING) {
+      auto *moduleResource = static_cast<CUpti_ModuleResourceData *>(
+          resourceData->resourceDescriptor);
+      if (profiler.isPCSamplingEnabled()) {
+        pImpl->pcSampling.unloadModule(moduleResource->pCubin,
+                                       moduleResource->cubinSize);
+      }
+    } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) {
+      if (profiler.isPCSamplingEnabled()) {
+        pImpl->pcSampling.initialize(resourceData->context);
+      }
+    } else if (cbId == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) {
+      if (profiler.isPCSamplingEnabled()) {
+        pImpl->pcSampling.finalize(resourceData->context);
+      }
+    } else {
+      auto *graphData =
+          static_cast<CUpti_GraphData *>(resourceData->resourceDescriptor);
+      uint32_t graphId = 0;
+      uint32_t graphExecId = 0;
+      if (graphData->graph)
+        cupti::getGraphId(graphData->graph, &graphId);
+      if (graphData->graphExec)
+        cupti::getGraphExecId(graphData->graphExec, &graphExecId);
+      if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED ||
+          cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED) {
+        if (!pImpl->graphIdToNumInstances.contain(graphId))
+          pImpl->graphIdToNumInstances[graphId] = 1;
+        else
+          pImpl->graphIdToNumInstances[graphId]++;
+      } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING) {
+        pImpl->graphIdToNumInstances[graphId]--;
+      } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED) {
+        pImpl->graphExecIdToGraphId[graphExecId] = graphId;
+      } else if (cbId == CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING) {
+        pImpl->graphExecIdToGraphId.erase(graphExecId);
+      } else if (cbId == CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING) {
+        pImpl->graphIdToNumInstances.erase(graphId);
+      }
+    }
   } else {
     const CUpti_CallbackData *callbackData =
         static_cast<const CUpti_CallbackData *>(cbData);
+    auto *pImpl = dynamic_cast<CuptiProfilerPimpl *>(profiler.pImpl.get());
     if (callbackData->callbackSite == CUPTI_API_ENTER) {
       auto scopeId = Scope::getNewScopeId();
       threadState.record(scopeId);
@@ -275,7 +329,6 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData,
       size_t numInstances = 1;
       if (cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch ||
           cbId == CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz) {
-        auto *pImpl = dynamic_cast<CuptiProfilerPimpl *>(profiler.pImpl.get());
         auto graphExec = static_cast<const cuGraphLaunch_params *>(
                              callbackData->functionParams)
                              ->hGraph;
@@ -298,7 +351,17 @@ void CuptiProfiler::CuptiProfilerPimpl::callbackFn(void *userData,
                   << std::endl;
       }
       profiler.correlation.correlate(callbackData->correlationId, numInstances);
+      if (profiler.isPCSamplingEnabled() && isDriverAPILaunch(cbId)) {
+        pImpl->pcSampling.start(callbackData->context);
+      }
     } else if (callbackData->callbackSite == CUPTI_API_EXIT) {
+      if (profiler.isPCSamplingEnabled() && isDriverAPILaunch(cbId)) {
+        // XXX: Conservatively stop every GPU kernel for now
+        auto scopeId = profiler.correlation.externIdQueue.back();
+        pImpl->pcSampling.stop(
+            callbackData->context, scopeId,
+            profiler.correlation.apiExternIds.contain(scopeId));
+      }
       threadState.exitOp();
       profiler.correlation.submit(callbackData->correlationId);
     }
@@ -306,10 +369,15 @@
 }
 
 void CuptiProfiler::CuptiProfilerPimpl::doStart() {
-  cupti::activityRegisterCallbacks(allocBuffer, completeBuffer);
-  
cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); - // TODO: switch to directly subscribe the APIs and measure overhead cupti::subscribe(&subscriber, callbackFn, nullptr); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/true); + // Continuous PC sampling is not compatible with concurrent kernel profiling + cupti::activityEnable(CUPTI_ACTIVITY_KIND_KERNEL); + } else { + cupti::activityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } + cupti::activityRegisterCallbacks(allocBuffer, completeBuffer); setGraphCallbacks(subscriber, /*enable=*/true); setRuntimeCallbacks(subscriber, /*enable=*/true); setDriverCallbacks(subscriber, /*enable=*/true); @@ -326,8 +394,12 @@ void CuptiProfiler::CuptiProfilerPimpl::doFlush() { // If the current context is not set, we don't do any synchronization. CUcontext cuContext = nullptr; cuda::ctxGetCurrent(&cuContext); - if (cuContext) + if (cuContext) { cuda::ctxSynchronize(); + } + if (profiler.isPCSamplingEnabled()) { + pcSampling.finalize(cuContext); + } profiler.correlation.flush( /*maxRetries=*/100, /*sleepMs=*/10, /*flush=*/[]() { @@ -341,7 +413,12 @@ void CuptiProfiler::CuptiProfilerPimpl::doFlush() { } void CuptiProfiler::CuptiProfilerPimpl::doStop() { - cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + if (profiler.isPCSamplingEnabled()) { + setResourceCallbacks(subscriber, /*enable=*/false); + cupti::activityDisable(CUPTI_ACTIVITY_KIND_KERNEL); + } else { + cupti::activityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } setGraphCallbacks(subscriber, /*enable=*/false); setRuntimeCallbacks(subscriber, /*enable=*/false); setDriverCallbacks(subscriber, /*enable=*/false); diff --git a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp similarity index 99% rename from third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp rename to third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp index 55af9eb714..68f3f0beac 100644 --- a/third_party/proton/csrc/lib/Profiler/RoctracerProfiler.cpp +++ b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp @@ -1,4 +1,4 @@ -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Context/Context.h" #include "Data/Metric.h" #include "Driver/GPU/HipApi.h" diff --git a/third_party/proton/csrc/lib/Session/Session.cpp b/third_party/proton/csrc/lib/Session/Session.cpp index 1db512d075..9b0ef10d37 100644 --- a/third_party/proton/csrc/lib/Session/Session.cpp +++ b/third_party/proton/csrc/lib/Session/Session.cpp @@ -2,8 +2,8 @@ #include "Context/Python.h" #include "Context/Shadow.h" #include "Data/TreeData.h" -#include "Profiler/CuptiProfiler.h" -#include "Profiler/RoctracerProfiler.h" +#include "Profiler/Cupti/CuptiProfiler.h" +#include "Profiler/Roctracer/RoctracerProfiler.h" #include "Utility/String.h" namespace proton { @@ -13,6 +13,9 @@ Profiler *getProfiler(const std::string &profilerName) { if (proton::toLower(profilerName) == "cupti") { return &CuptiProfiler::instance(); } + if (proton::toLower(profilerName) == "cupti_pcsampling") { + return &CuptiProfiler::instance().enablePCSampling(); + } if (proton::toLower(profilerName) == "roctracer") { return &RoctracerProfiler::instance(); } diff --git a/third_party/proton/proton/profile.py b/third_party/proton/proton/profile.py index 01d5a1947e..2dd7a6f53e 100644 --- a/third_party/proton/proton/profile.py +++ b/third_party/proton/proton/profile.py @@ -42,7 
+42,7 @@ def start( name (str, optional): The name (with path) of the profiling session. If not provided, the default name is "~/proton.hatchet". backend (str, optional): The backend to use for profiling. - Available options are [None, "cupti", "roctracer"]. + Available options are [None, "cupti", "cupti_pcsampling", "roctracer"]. Defaults to None, which automatically selects the backend matching the current active runtime. context (str, optional): The context to use for profiling. Available options are ["shadow", "python"]. diff --git a/third_party/proton/proton/proton.py b/third_party/proton/proton/proton.py index 7ea6413ac5..cbb7a0b6f9 100644 --- a/third_party/proton/proton/proton.py +++ b/third_party/proton/proton/proton.py @@ -13,7 +13,8 @@ def parse_arguments(): python -m triton.profiler.proton [options] script.py [script_args] [script_options] """, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("-n", "--name", type=str, help="Name of the profiling session") - parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, choices=["cupti"]) + parser.add_argument("-b", "--backend", type=str, help="Profiling backend", default=None, + choices=["cupti", "cupti_pcsampling", "roctracer"]) parser.add_argument("-c", "--context", type=str, help="Profiling context", default="shadow", choices=["shadow", "python"]) parser.add_argument("-d", "--data", type=str, help="Profiling data", default="tree", choices=["tree"]) diff --git a/third_party/proton/proton/scope.py b/third_party/proton/proton/scope.py index 5695b88075..26d946a8c1 100644 --- a/third_party/proton/proton/scope.py +++ b/third_party/proton/proton/scope.py @@ -5,7 +5,7 @@ from .flags import get_profiling_on from triton._C.libproton import proton as libproton -_local = threading.local() +thread_local_scopes = threading.local() MetricValueType = Union[float, int] PropertyValueType = Union[float, int, str] @@ -22,7 +22,7 @@ class scope: foo[1,](x, y) ``` - decoarator: + decorator: ```python @proton.scope("test0", {metric_name: metric_value}) def foo(x, y): @@ -36,25 +36,25 @@ def foo(x, y): def __init__(self, name: str, metrics: Optional[dict[str, MetricValueType]] = None, properties: Optional[dict[str, PropertyValueType]] = None) -> None: - self._name = name - self._metrics = metrics - self._properties = properties + self.name = name + self.metrics = metrics + self.properties = properties def __enter__(self): if not get_profiling_on(): return self - self._id = libproton.record_scope() - libproton.enter_scope(self._id, self._name) - if self._metrics: - libproton.add_metrics(self._id, self._metrics) - if self._properties: - libproton.set_properties(self._id, self._properties) + self.id = libproton.record_scope() + libproton.enter_scope(self.id, self.name) + if self.metrics: + libproton.add_metrics(self.id, self.metrics) + if self.properties: + libproton.set_properties(self.id, self.properties) return self def __exit__(self, exc_type, exc_value, traceback) -> None: if not get_profiling_on(): return - libproton.exit_scope(self._id, self._name) + libproton.exit_scope(self.id, self.name) def __call__(self, func): @@ -62,14 +62,14 @@ def __call__(self, func): def wrapper(*args, **kwargs): if get_profiling_on(): id = libproton.record_scope() - libproton.enter_scope(id, self._name) - if self._metrics: - libproton.add_metrics(id, self._metrics) - if self._properties: - libproton.set_properties(id, self._properties) + libproton.enter_scope(id, self.name) + if self.metrics: + libproton.add_metrics(id, 
self.metrics) + if self.properties: + libproton.set_properties(id, self.properties) ret = func(*args, **kwargs) if get_profiling_on(): - libproton.exit_scope(id, self._name) + libproton.exit_scope(id, self.name) return ret return wrapper @@ -80,9 +80,9 @@ def enter_scope(name: str, *, triton_op: bool = False, metrics: Optional[dict[st if not get_profiling_on(): return -1 id = libproton.record_scope() - if not hasattr(_local, "scopes"): - _local.scopes = [] - _local.scopes.append((id, name)) + if not hasattr(thread_local_scopes, "scopes"): + thread_local_scopes.scopes = [] + thread_local_scopes.scopes.append((id, name)) if triton_op: libproton.enter_op(id, name) else: @@ -97,7 +97,7 @@ def enter_scope(name: str, *, triton_op: bool = False, metrics: Optional[dict[st def exit_scope(triton_op: bool = False) -> int: if not get_profiling_on(): return -1 - id, name = _local.scopes.pop() + id, name = thread_local_scopes.scopes.pop() if triton_op: libproton.exit_op(id, name) else: diff --git a/third_party/proton/proton/viewer.py b/third_party/proton/proton/viewer.py index 2067466c94..9fe0e7e67d 100644 --- a/third_party/proton/proton/viewer.py +++ b/third_party/proton/proton/viewer.py @@ -43,7 +43,7 @@ def get_min_time_flops(df, device_info): num_sms = device_info[device_type][device_index]["num_sms"] clock_rate = device_info[device_type][device_index]["clock_rate"] for width in TritonHook.flops_width: - idx = df["DeviceId"] == device_index + idx = df["device_id"] == device_index device_frames = df[idx] if f"flops{width}" not in device_frames.columns: continue @@ -72,7 +72,7 @@ def get_min_time_bytes(df, device_info): min_time_bytes = pd.DataFrame(0.0, index=df.index, columns=["min_time"]) for device_type in device_info: for device_index in device_info[device_type]: - idx = df["DeviceId"] == device_index + idx = df["device_id"] == device_index device_frames = df[idx] memory_clock_rate = device_info[device_type][device_index]["memory_clock_rate"] # in khz bus_width = device_info[device_type][device_index]["bus_width"] # in bits @@ -105,7 +105,7 @@ def get_min_time_bytes(df, device_info): def derive_metrics(gf, metrics, raw_metrics, device_info): derived_metrics = [] original_metrics = [] - internal_frame_indices = gf.dataframe["DeviceId"].isna() + internal_frame_indices = gf.dataframe["device_id"].isna() def get_time_seconds(df): time_metric_name = match_available_metrics([time_factor_dict.name], raw_metrics)[0] @@ -135,7 +135,7 @@ def get_time_seconds(df): derived_metrics.append(f"{metric} (inc)") elif metric in avg_time_factor_dict.factor: metric_time_unit = avg_time_factor_dict.name + "/" + metric.split("/")[1] - gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['Count'] / + gf.dataframe[f"{metric} (inc)"] = (get_time_seconds(gf.dataframe) / gf.dataframe['count'] / avg_time_factor_dict.factor[metric_time_unit]) gf.dataframe.loc[internal_frame_indices, f"{metric} (inc)"] = np.nan derived_metrics.append(f"{metric} (inc)") diff --git a/third_party/proton/test/example_cuda.json b/third_party/proton/test/example_cuda.json index 0db9ace447..445f0e224c 100644 --- a/third_party/proton/test/example_cuda.json +++ b/third_party/proton/test/example_cuda.json @@ -8,10 +8,10 @@ "type": "function" }, "metrics": { - "Count": 10, - "DeviceId": "1", - "DeviceType": "CUDA", - "Time (ns)": 204800, + "count": 10, + "device_id": "1", + "device_type": "CUDA", + "time (ns)": 204800, "flops8": 1e11, "bytes": 1e8 } @@ -23,10 +23,10 @@ "type": "function" }, "metrics": { - "Count": 1, - 
"DeviceId": "0", - "DeviceType": "CUDA", - "Time (ns)": 204800, + "count": 1, + "device_id": "0", + "device_type": "CUDA", + "time (ns)": 204800, "flops8": 1e10, "bytes": 1e7 } @@ -37,8 +37,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0, + "count": 0, + "time (ns)": 0, "flops8": 0, "bytes": 0 } diff --git a/third_party/proton/test/example_frame.json b/third_party/proton/test/example_frame.json index 64789a3b74..0069476fbc 100644 --- a/third_party/proton/test/example_frame.json +++ b/third_party/proton/test/example_frame.json @@ -10,10 +10,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "0", - "DeviceType": "HIP", - "Time (ns)": 204800 + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800 } } ], @@ -27,7 +27,12 @@ "frame": { "name": "test1" }, - "metrics": {} + "metrics": { + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800 + } } ], "frame": { @@ -35,8 +40,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0 + "count": 0, + "time (ns)": 0 } }, { diff --git a/third_party/proton/test/example_hip.json b/third_party/proton/test/example_hip.json index 2fcfad3c5d..68538706cf 100644 --- a/third_party/proton/test/example_hip.json +++ b/third_party/proton/test/example_hip.json @@ -8,10 +8,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "1", - "DeviceType": "HIP", - "Time (ns)": 204800, + "count": 1, + "device_id": "1", + "device_type": "HIP", + "time (ns)": 204800, "flops8": 1e11, "bytes": 1e8 } @@ -23,10 +23,10 @@ "type": "function" }, "metrics": { - "Count": 1, - "DeviceId": "0", - "DeviceType": "HIP", - "Time (ns)": 204800, + "count": 1, + "device_id": "0", + "device_type": "HIP", + "time (ns)": 204800, "flops8": 1e10, "bytes": 1e7 } @@ -37,8 +37,8 @@ "type": "function" }, "metrics": { - "Count": 0, - "Time (ns)": 0, + "count": 0, + "time (ns)": 0, "flops8": 0, "bytes": 0 } diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py index 1a69608a26..13cb9bd99c 100644 --- a/third_party/proton/test/test_profile.py +++ b/third_party/proton/test/test_profile.py @@ -25,7 +25,7 @@ def test_torch(context): if context == "shadow": assert len(data[0]["children"]) == 1 assert data[0]["children"][0]["frame"]["name"] == "test" - assert data[0]["children"][0]["children"][0]["metrics"]["Time (ns)"] > 0 + assert data[0]["children"][0]["children"][0]["metrics"]["time (ns)"] > 0 elif context == "python": assert len(data[0]["children"]) == 1 # The last frame is the torch kernel @@ -111,7 +111,7 @@ def fn(): assert len(test_frame["children"]) >= 2 else: assert len(test_frame["children"]) >= 3 - assert test_frame["children"][0]["metrics"]["Time (ns)"] > 0 + assert test_frame["children"][0]["metrics"]["time (ns)"] > 0 def test_metrics(): @@ -197,7 +197,41 @@ def foo(x, size: tl.constexpr, y): assert data[0]["children"][0]["frame"]["name"] == "test0" assert data[0]["children"][0]["children"][0]["frame"]["name"] == "foo_test_1ctas_1elems" assert data[0]["children"][0]["children"][0]["metrics"]["flops32"] == 1.0 - assert data[0]["children"][0]["children"][0]["metrics"]["Time (ns)"] > 0 + assert data[0]["children"][0]["children"][0]["metrics"]["time (ns)"] > 0 + + +def test_pcsampling(): + if is_hip(): + pytest.skip("HIP backend does not support pc sampling") + + import os + if os.environ.get("PROTON_SKIP_PC_SAMPLING_TEST", "0") == "1": + pytest.skip("PC sampling test is disabled") + + @triton.jit + def foo(x, y, size: tl.constexpr): + offs = 
tl.arange(0, size) + for _ in range(1000): + tl.store(y + offs, tl.load(x + offs)) + + with tempfile.NamedTemporaryFile(delete=True, suffix=".hatchet") as f: + proton.start(f.name.split(".")[0], hook="triton", backend="cupti_pcsampling") + with proton.scope("init"): + x = torch.ones((1024, ), device="cuda", dtype=torch.float32) + y = torch.zeros_like(x) + with proton.scope("test"): + foo[(1, )](x, y, x.size()[0], num_warps=4) + proton.finalize() + data = json.load(f) + init_frame = data[0]["children"][0] + test_frame = data[0]["children"][1] + # With line mapping + assert "foo" in test_frame["children"][0]["frame"]["name"] + assert test_frame["children"][0]["children"][0]["metrics"]["num_samples"] > 0 + assert "@" in test_frame["children"][0]["children"][0]["frame"]["name"] + # Without line mapping + assert "elementwise" in init_frame["children"][0]["frame"]["name"] + assert init_frame["children"][0]["metrics"]["num_samples"] > 0 def test_deactivate(): @@ -211,6 +245,6 @@ def test_deactivate(): proton.finalize() data = json.load(f) # Root shouldn't have device id - assert "DeviceId" not in data[0]["metrics"] + assert "device_id" not in data[0]["metrics"] assert len(data[0]["children"]) == 1 - assert "DeviceId" in data[0]["children"][0]["metrics"] + assert "device_id" in data[0]["children"][0]["metrics"] diff --git a/third_party/proton/test/test_viewer.py b/third_party/proton/test/test_viewer.py index c8343e1267..998825bbc8 100644 --- a/third_party/proton/test/test_viewer.py +++ b/third_party/proton/test/test_viewer.py @@ -52,8 +52,8 @@ def test_min_time_flops(): with open(cuda_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_flops(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # sm89 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000025]], atol=1e-5) # sm90 @@ -61,8 +61,8 @@ def test_min_time_flops(): with open(hip_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_flops(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # MI200 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[0.000026]], atol=1e-5) # MI300 @@ -73,8 +73,8 @@ def test_min_time_bytes(): with open(cuda_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_bytes(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # sm89 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[9.91969e-06]], atol=1e-6) # sm90 @@ -82,8 +82,8 @@ def test_min_time_bytes(): with open(hip_example_file, "r") as f: gf, _, device_info = get_raw_metrics(f) ret = get_min_time_bytes(gf.dataframe, device_info) - device0_idx = gf.dataframe["DeviceId"] == "0" - device1_idx = gf.dataframe["DeviceId"] == "1" + device0_idx = gf.dataframe["device_id"] == "0" + device1_idx = gf.dataframe["device_id"] == "1" # MI200 np.testing.assert_allclose(ret[device0_idx].to_numpy(), [[6.10351e-06]], atol=1e-6) # MI300 diff --git a/third_party/proton/tutorials/dynamic_net.py b/third_party/proton/tutorials/dynamic_net.py index a1a82b53e2..5793bebd09 100644 
--- a/third_party/proton/tutorials/dynamic_net.py +++ b/third_party/proton/tutorials/dynamic_net.py @@ -85,13 +85,14 @@ def run(): argparser.add_argument("--profile", action="store_true") argparser.add_argument("--mode", default="torch", choices=["torch", "torchinductor"]) argparser.add_argument("--context", default="shadow", choices=["shadow", "python"]) +argparser.add_argument("--backend", default=None, choices=["cupti", "roctracer", "cupti_pcsampling"]) args = argparser.parse_args() mode = args.mode if args.profile: - func = proton.profile(run, name="dynamic_net", context=args.context) + func = proton.profile(run, name="dynamic_net", context=args.context, backend=args.backend) else: func = run From 49266aa908d29a7029348ff480a75a3ea4d6e704 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Mon, 30 Sep 2024 17:52:59 -0700 Subject: [PATCH 11/12] [BACKEND] Linear Layout with stmatrix part 2: support stmatrix for `local_alloc` ops (#4763) This PR enables the use of `stmatrix` for `local_alloc` ops through linear layout and removes the legacy code from the `TargetInfo` class. --- .../TritonGPUToLLVM/TargetInfoBase.h | 9 - .../TritonGPU/IR/LinearLayoutConversions.h | 128 +++++++++++++- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp | 18 +- .../TritonGPUToLLVM/MemoryOpToLLVM.cpp | 1 - .../TritonGPU/IR/LinearLayoutConversions.cpp | 103 +++++++++++- .../amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp | 9 - .../amd/lib/TritonAMDGPUToLLVM/TargetInfo.h | 9 - .../ConvertLayoutOpToLLVM.cpp | 57 +++++-- .../lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp | 159 ++---------------- .../lib/TritonNVIDIAGPUToLLVM/TargetInfo.h | 9 - 10 files changed, 287 insertions(+), 215 deletions(-) diff --git a/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h b/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h index 3e3ec2b8d4..6017deb7c7 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h +++ b/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h @@ -57,15 +57,6 @@ class TargetInfoBase { unsigned numLaneToReduce, unsigned interleave) const = 0; - // TODO (Keren): Remove this function once layout conversion using stmatrix is - // handled by Linear Layout. - virtual bool processReplicaUsingStMatrix( - RewriterBase &rewriter, Location loc, Value smemBase, - SmallVector &vals, RankedTensorType srcTy, Type elemTy, - ArrayRef paddedRepShape, ArrayRef origRepShape, - ArrayRef outOrd, unsigned accumNumReplicates, - int swizzleByteWidth = 0) const = 0; - virtual std::string getMulhiFuncName(Type resultElementTy) const = 0; // Emits LLVM code with |rewriter| to print a message following the given // format from the device. |formatStrStart| is the pointer to the start of diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index 7afaf94558..5f4c3b77f0 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -113,12 +113,134 @@ LinearLayout chooseShemLayoutForRegToRegConversion( // row0 reg[0-1] reg[4-5] // row8 reg[2-3] reg[6-7] // +// When `swizzleByteSize` is non-zero, the layout is constructed +// differently due to leading dimension offset and swizzling. +// There are two key concepts to understand: +// +// 1. Chunks: The leading dimension (i.e., the column dimension) is divided +// into chunks, where each chunk's size is determined by `swizzleByteSize`. +// 2. 
Swizzling within tiles: Each tile applies a swizzling pattern to its
+//    rows to optimize memory access.
+//
+// - Concept 1: Chunks
+//
+// In the swizzled layout, the leading dimension is strided by
+// `swizzleByteSize`. This introduces the concept of a "chunk", where each chunk
+// spans a certain number of columns.
+//
+// For a tile size of `stmatrix.x4` (16x16 elements), with each element being 16
+// bits (2 bytes), each tile occupies 16 rows and 32 bytes per row (since 16
+// elements * 2 bytes per element = 32 bytes per row).
+//
+// Given a `swizzleByteSize` of 128 bytes, the number of tiles per chunk can be
+// calculated as:
+//
+//   Number of tiles per chunk = swizzleByteSize / (bytes per row)
+//                             = 128 bytes / 32 bytes = 4 tiles
+//
+// Therefore, each chunk contains 4 tiles horizontally, spanning 64 columns
+// (since each tile is 16 columns):
+//
+//             col0-15  col16-31  col32-47  col48-63
+//   row0-15    tile0     tile1     tile2     tile3
+//
+// For a tensor of size 128x128 elements (#rows x #columns), with each element
+// being 16 bits, the tensor can be divided into multiple chunks both
+// horizontally and vertically. Chunks are stored in memory in a "column-major"
+// order based on chunks, meaning chunk1's address follows chunk0's.
+//
+// Assume we have 8 warps and assign each warp a slice of 16 rows (the rows of
+// one tile) by 128 columns (the width of two chunks); each warp then handles
+// one horizontal slice of the tensor.
+//
+// The overall layout can be visualized as:
+//
+//                    |<- 128 * 128 bytes ->|<- 128 * 128 bytes ->|
+//                         columns 0-63          columns 64-127
+//   warp0 | rows 0-15       chunk0                  chunk8
+//   warp1 | rows 16-31      chunk1                  chunk9
+//   warp2 | rows 32-47      chunk2                  chunk10
+//   warp3 | rows 48-63      chunk3                  chunk11
+//   warp4 | rows 64-79      chunk4                  chunk12
+//   warp5 | rows 80-95      chunk5                  chunk13
+//   warp6 | rows 96-111     chunk6                  chunk14
+//   warp7 | rows 112-127    chunk7                  chunk15
+//
+// - Concept 2: Swizzling within tiles
+//
+// Within each 16x16 tile, rows are swizzled to optimize memory access
+// patterns. This swizzling is similar to what's defined in
+// `TritonGPUAttrDefs.td`, but applied at the level of each 16x16 tile rather
+// than the entire tensor.
+//
+// Key parameters for swizzling:
+//
+// - `perPhase`: The number of rows over which to apply an XOR operation at
+//   each phase.
+// - `maxPhase`: The total number of phases.
+// - `vectorWidth`: The number of elements per vector, which is 8 in this case
+//   because `stmatrix` stores 8 contiguous elements per thread.
+//
+// The offset of each element within a tile is calculated using the formula:
+//
+//   offset = row * swizzleByteSize +
+//            (vectorWidth * ((row / perPhase) % maxPhase)) * elementSize
+//
+// where `elementSize` is the size of each element in bytes (2 bytes for 16-bit
+// elements).
+//
+// For example, consider the element at index `(row=1, col=0)` in chunk0:
+//
+// Without swizzling:
+//
+//   offset = row * swizzleByteSize + col * elementSize
+//          = 1 * 128 bytes + 0 * 2 bytes
+//          = 128 bytes
+//
+// With swizzling (assuming `perPhase=1`, `maxPhase=8`, `vectorWidth=8`):
+//
+//   offset = row * swizzleByteSize +
+//            (vectorWidth * ((row / perPhase) % maxPhase)) * elementSize
+//          = 1 * 128 bytes + (8 * ((1 / 1) % 8)) * 2 bytes
+//          = 128 bytes + (8 * (1 % 8)) * 2 bytes
+//          = 128 bytes + 8 * 2 bytes
+//          = 128 bytes + 16 bytes
+//          = 144 bytes
+//
+// This swizzling ensures that elements are stored in a way that optimizes for
+// memory bandwidth and reduces bank conflicts.
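+//
+// Applying the same formula to the next rows of chunk0 (still col=0,
+// perPhase=1, maxPhase=8, vectorWidth=8, elementSize=2) gives:
+//
+//   row=2: offset = 2 * 128 + (8 * ((2 / 1) % 8)) * 2 = 256 + 32 = 288 bytes
+//   row=3: offset = 3 * 128 + (8 * ((3 / 1) % 8)) * 2 = 384 + 48 = 432 bytes
+//
+// which is consistent with the lane bases listed below: lane=2 maps to
+// offset=144 in elements, and 144 elements * 2 bytes = 288 bytes. Consecutive
+// rows thus land in different shared-memory banks instead of stacking on the
+// same 128-byte column.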
+// +// - Verification through Linear Layout +// +// We can verify the offsets with the following outputs of the corresponding +// linear layout, where each element is 16 bits (2 bytes): +// +// - register=1 -> offset=1 +// register=2 -> offset=2 +// register=4 -> offset=4 +// register=8 -> offset=16 +// register=16 -> offset=32 +// register=32 -> offset=8192 +// - lane=1 -> offset=72 +// lane=2 -> offset=144 +// lane=4 -> offset=288 +// lane=8 -> offset=512 +// lane=16 -> offset=8 +// - warp=1 -> offset=1024 +// warp=2 -> offset=2048 +// warp=4 -> offset=4096 +// +// For index `(row=1, col=0)`, which corresponds to `reg=0` and `lane=1` in +// `warp=0`, the offset is calculated as 72 * 2 bytes = 144 bytes. The result +// matches our earlier calculation. +// // TODO(Keren): We should replace tensorTy with a LinearLayout and the element // bit width of the tensor in the future to support more flexible tensor // encodings -std::optional chooseStMatrixLayoutForRegToRegConversion( - MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef repShape, - ArrayRef paddedRepShape, ArrayRef order); +std::optional +chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy, + ArrayRef repShape, + ArrayRef paddedRepShape, + ArrayRef order, int swizzleByteSize); } // namespace mlir::triton::gpu #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp index 403cac9dec..7bb0f198f9 100644 --- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp @@ -215,15 +215,9 @@ struct ConvertLayoutOpConversion if (repId != 0) { barrier(); } - auto successful = targetInfo.processReplicaUsingStMatrix( - rewriter, loc, smemBase, vals, srcTy, - getTypeConverter()->convertType(srcTy.getElementType()), - paddedRepShape, origRepShape, outOrd, accumNumReplicates); - if (!successful) { - processReplica(loc, rewriter, /*stNotRd*/ true, srcTy, inNumCTAsEachRep, - multiDimRepId, inVec, paddedRepShape, origRepShape, - outOrd, vals, smemBase); - } + processReplica(loc, rewriter, /*stNotRd*/ true, srcTy, inNumCTAsEachRep, + multiDimRepId, inVec, paddedRepShape, origRepShape, outOrd, + vals, smemBase); barrier(); processReplica(loc, rewriter, /*stNotRd*/ false, dstTy, outNumCTAsEachRep, multiDimRepId, outVec, paddedRepShape, origRepShape, @@ -483,9 +477,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion // Input dims: [reg, lane, warp] // Output dims: [offset, iteration] std::optional shmemStoreLayout = - chooseStMatrixLayoutForRegToRegConversion( - ctx, op.getSrc().getType(), scratchConfig.repShape, - scratchConfig.paddedRepShape, scratchConfig.order); + chooseStMatrixLayout(ctx, op.getSrc().getType(), scratchConfig.repShape, + scratchConfig.paddedRepShape, scratchConfig.order, + /*swizzleByteSize=*/0); bool isStMatrix = shmemStoreLayout.has_value(); if (!isStMatrix) { shmemStoreLayout = srcLayout.invertAndCompose(sharedLayout); diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp index 7a9b26f425..f78964dda9 100644 --- a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp @@ -116,7 +116,6 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern { RankedTensorType dstTy = op.getType(); Attribute srcLayout = srcTy.getEncoding(); Attribute dstLayout = dstTy.getEncoding(); - // TODO: do we need to check if 
     if (isa<SharedEncodingAttr>(srcLayout) &&
         isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
             dstLayout)) {
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index f576ce2154..5647f0a0d7 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -820,8 +820,8 @@ namespace {
 // stmatrix. These restrictions are retained from legacy code, and we could
 // relax some of them in the future.
 bool canUseStMatrix(RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
-                    ArrayRef<unsigned> paddedRepShape,
-                    ArrayRef<unsigned> order) {
+                    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order,
+                    int swizzleByteSize) {
   auto mmaLayout =
       mlir::dyn_cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
   if (!mmaLayout || !mmaLayout.isHopper())
     return false;
@@ -840,17 +840,87 @@ bool canUseStMatrix(RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
     return false;
   if (paddedRepShape[1] % 8 != 0)
     return false;
+  if (swizzleByteSize != 0 && swizzleByteSize != 32 && swizzleByteSize != 64 &&
+      swizzleByteSize != 128)
+    return false;
   return true;
 }
-} // anonymous namespace
+std::optional<LinearLayout> chooseStMatrixLayoutLeadingOffset(
+    MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
+    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order,
+    int swizzleByteSize) {
+  StringAttr kReg = S("register");
+  StringAttr kLane = S("lane");
+  StringAttr kWarp = S("warp");
+  StringAttr kCol = S("dim1");
+  StringAttr kRow = S("dim0");
+  StringAttr kOffset = S("offset");
+
+  int perPhase;
+  int maxPhase;
+  if (swizzleByteSize == 32) {
+    perPhase = 4;
+    maxPhase = 2;
+  } else if (swizzleByteSize == 64) {
+    perPhase = 2;
+    maxPhase = 4;
+  } else if (swizzleByteSize == 128) {
+    perPhase = 1;
+    maxPhase = 8;
+  } else {
+    llvm::errs() << "Illegal swizzleByteSize: " << swizzleByteSize << "\n";
+    llvm::report_fatal_error("Illegal swizzleByteSize");
+  }
+
+  // stmatrix only supports 16-bit elements, and each vector has 8 elements
+  int elemBitWidth = 16;
+  int vecSize = 8;
+  int numRows = 16;
+  int numCols = 8 * swizzleByteSize / elemBitWidth;
+
+  // Construct a single stmatrix.x4 (16x16) tile
+  std::vector<std::vector<int>> basesReg = {{1, 0}, {2, 0}, {4, 0}};
+  std::vector<std::vector<int>> basesLane;
+  for (int logRow = 0; logRow < llvm::Log2_32(numRows); logRow++) {
+    int row = 1 << logRow;
+    basesLane.push_back({vecSize * ((row / perPhase) % maxPhase), row});
+  }
+  basesLane.push_back({8, 0});
+
+  // Expand the tile's register dimension to fit swizzleByteSize, which is a
+  // "chunk"
+  for (int logChunk = 0; logChunk < llvm::Log2_32(numCols / 16); logChunk++) {
+    int chunk = 1 << logChunk;
+    basesReg.push_back({16 * chunk, 0});
+  }
+
+  // Construct the layout for a single chunk
+  LinearLayout layout =
+      LinearLayout({{kReg, basesReg}, {kLane, basesLane}}, {kCol, kRow});
 
-std::optional<LinearLayout> chooseStMatrixLayoutForRegToRegConversion(
+  // Expand the `warp` dimension according to warpsPerCTA.
+  auto mma = cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
+  layout *=
+      identityND(kWarp, mma.getWarpsPerCTA(), /*order=*/{0, 1}, {kRow, kCol})
+          .transposeOuts(llvm::to_vector(layout.getOutDimNames()));
+
+  // Expand the `register` dimension so the size of columns matches `n`.
+  int n = mma.getInstrShape()[1];
+  int numWarpRows = layout.getOutDimSize(kRow);
+  layout = (layout.reshapeOuts({{kOffset, layout.getTotalOutDimSize()}}) *
+            LinearLayout::identity1D(n / numCols, kReg, kOffset))
+               .reshapeOuts({{kCol, n}, {kRow, numWarpRows}});
+
+  auto ret =
+      combineCtaCgaWithShape(layout, mma.getCTALayout(), tensorTy.getShape());
+  return ret.transposeOuts(llvm::to_vector(layout.getOutDimNames()))
+      .reshapeOuts({{kOffset, ret.getTotalOutDimSize()}, {S("iteration"), 1}});
+}
+
+std::optional<LinearLayout> chooseStMatrixLayoutNoLeadingOffset(
     MLIRContext *ctx, RankedTensorType tensorTy, ArrayRef<unsigned> repShape,
     ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> order) {
-  if (!canUseStMatrix(tensorTy, repShape, paddedRepShape, order))
-    return std::nullopt;
-
   StringAttr kReg = S("register");
   StringAttr kLane = S("lane");
   StringAttr kWarp = S("warp");
@@ -880,4 +950,23 @@ std::optional<LinearLayout> chooseStMatrixLayoutForRegToRegConversion(
       {{S("offset"), ret.getTotalOutDimSize()}, {S("iteration"), 1}});
 }
 
+} // anonymous namespace
+
+std::optional<LinearLayout>
+chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
+                     ArrayRef<unsigned> repShape,
+                     ArrayRef<unsigned> paddedRepShape,
+                     ArrayRef<unsigned> order, int swizzleByteSize) {
+  if (!canUseStMatrix(tensorTy, repShape, paddedRepShape, order,
+                      swizzleByteSize))
+    return std::nullopt;
+
+  if (swizzleByteSize == 0)
+    return chooseStMatrixLayoutNoLeadingOffset(ctx, tensorTy, repShape,
+                                               paddedRepShape, order);
+  else
+    return chooseStMatrixLayoutLeadingOffset(
+        ctx, tensorTy, repShape, paddedRepShape, order, swizzleByteSize);
+}
+
 } // namespace mlir::triton::gpu
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
index 99ecd423b4..6e196e995c 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -136,15 +136,6 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
   return false;
 }
 
-bool TargetInfo::processReplicaUsingStMatrix(
-    RewriterBase &rewriter, Location loc, Value smemBase,
-    SmallVector<Value> &vals, RankedTensorType srcTy, Type elemTy,
-    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> origRepShape,
-    ArrayRef<unsigned> outOrd, unsigned accumNumReplicates,
-    int swizzleByteWidth) const {
-  return false;
-}
-
 void TargetInfo::printfImpl(Value formatStrStart, int formatStrByteCount,
                             ValueRange args, RewriterBase &rewriter,
                             bool useStdErr) const {
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
index 544abb3e0b..d50f661193 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
@@ -46,15 +46,6 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
                   triton::ReduceOp op, unsigned numLaneToReduce,
                   unsigned interleave) const override;
 
-  bool processReplicaUsingStMatrix(RewriterBase &rewriter, Location loc,
-                                   Value smemBase, SmallVector<Value> &vals,
-                                   RankedTensorType srcTy, Type elemTy,
-                                   ArrayRef<unsigned> paddedRepShape,
-                                   ArrayRef<unsigned> origRepShape,
-                                   ArrayRef<unsigned> outOrd,
-                                   unsigned accumNumReplicates,
-                                   int swizzleByteWidth) const override;
-
   std::string getMulhiFuncName(Type resultElementTy) const override;
 
   void printf(RewriterBase &rewriter, Value formatStrStart,
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 8fb44ce644..2895da1b26 100644
--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -725,25 +725,59 @@ struct LocalAllocOpConversion
     else
       return failure();
 
+    auto *ctx = rewriter.getContext();
     Location loc = op->getLoc();
+    RankedTensorType srcTy = op.getSrc().getType();
 
-    Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, op);
-    auto srcs = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-    SmallVector<unsigned> shape;
-    for (int64_t dim : srcTy.getShape())
-      shape.push_back(dim);
-    bool loweredToStMatrix = targetInfo.processReplicaUsingStMatrix(
-        rewriter, loc, smemBase, srcs, srcTy,
-        getTypeConverter()->convertType(srcTy.getElementType()), shape, shape,
-        sharedLayout.getOrder(), 1, swizzleByteSize);
-    if (!loweredToStMatrix)
+    SmallVector<unsigned> shape =
+        convertType<unsigned, int64_t>(srcTy.getShape());
+    auto order = sharedLayout.getOrder();
+    auto layout = chooseStMatrixLayout(rewriter.getContext(), srcTy, shape,
+                                       shape, order, swizzleByteSize);
+    if (!layout.has_value())
       return failure();
 
+    Value smemBase = LLVM::getSharedMemoryBase(loc, rewriter, op);
+    auto smemPtrTy = ptr_ty(ctx, 3);
+
+    auto kRegister = str_attr("register");
+    auto kLane = str_attr("lane");
+    auto kWarp = str_attr("warp");
+    auto kBlock = str_attr("block");
+
+    Value threadId = getThreadId(rewriter, loc);
+    Value threadsPerWarp = i32_val(layout->getInDimSize(kLane));
+    Value laneId = urem(threadId, threadsPerWarp);
+    Value warpId = udiv(threadId, threadsPerWarp);
+
+    auto regBase = applyLinearLayout(loc, rewriter, *layout,
+                                     {{kRegister, i32_val(0)},
+                                      {kLane, laneId},
+                                      {kWarp, warpId},
+                                      {kBlock, i32_val(0)}})[0]
+                       .second;
+    auto srcVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
+    auto srcVec = layout->getNumConsecutiveInOut();
+    Type llvmElemTy = typeConverter->convertType(srcTy.getElementType());
+    for (int i = 0; i < srcVals.size(); i += srcVec) {
+      auto regIdx =
+          layout
+              ->apply({{kRegister, i}, {kLane, 0}, {kWarp, 0}, {kBlock, 0}})[0]
+              .second;
+      Value offset = xor_(regBase, i32_val(regIdx));
+      auto vecAddr = gep(smemPtrTy, llvmElemTy, smemBase, offset);
+      vecAddr.setInbounds(true);
+      SmallVector<Value> inValsVec;
+      for (int j = 0; j < srcVec; j++)
+        inValsVec.push_back(srcVals[i + j]);
+      Value valsVec = packLLVector(loc, inValsVec, rewriter);
+      targetInfo.storeMatrixShared(rewriter, loc, vecAddr, valsVec);
+    }
+
     auto resultTy = cast<MemDescType>(op.getType());
     // Workaround for 3D tensors
     // TODO: we need to modify the pipeline pass to give a proper shared
     // encoding to 3D tensors
-    auto order = sharedLayout.getOrder();
     SmallVector<unsigned> newOrder;
     if (resultTy.getShape().size() != order.size()) {
       for (auto i = 0; i < order.size(); ++i)
@@ -752,7 +786,6 @@ struct LocalAllocOpConversion
     } else {
       newOrder = SmallVector<unsigned>(order.begin(), order.end());
     }
-    auto llvmElemTy = typeConverter->convertType(resultTy.getElementType());
     auto shapePerCTA = getShapePerCTA(sharedLayout, resultTy.getShape());
     auto smemObj = SharedMemoryObject(smemBase, llvmElemTy, shapePerCTA,
                                       newOrder, loc, rewriter);
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp
index 5813b9679e..1b7f71b8cb 100644
--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp
+++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.cpp
@@ -15,131 +15,6 @@ using ::mlir::LLVM::linearize;
 using ::mlir::triton::gpu::getShapePerCTA;
 using ::mlir::triton::gpu::getShapePerCTATile;
 
 namespace {
-Value computeStMatrixAddr(Value laneId, int matStride, Location loc,
-                          RewriterBase &rewriter, int swizzleByteWidth) {
-  Value rowInMat = urem(laneId, i32_val(8)); // row in the 8x8 matrix
-  // linear index of the matrix in the 2x2 matrices
-  // Decompose matIndex => s_0, s_1, that is the coordinate in 2x2 matrices in
-  // a warp.
-  Value matIndex = udiv(laneId, i32_val(8));
-  Value s0 = urem(matIndex, i32_val(2));
-  Value s1 = udiv(matIndex, i32_val(2));
-  if (swizzleByteWidth >= 32)
-    s1 = xor_(s1, and_(laneId, i32_val(1)));
-  Value mIndex = add(rowInMat, mul(s0, i32_val(8)));
-  int m8n8Stride = 8;
-  Value offset =
-      add(mul(mIndex, i32_val(matStride)), mul(s1, i32_val(m8n8Stride)));
-  return offset;
-}
-
-void stMatrixm8n8x4(Value offset, ArrayRef<Value> vals, int indexOffset,
-                    Value smemBase, Type elemTy, Location loc,
-                    RewriterBase &rewriter) {
-  SmallVector<Value> inputs;
-  auto prTy = ptr_ty(rewriter.getContext(), 3);
-  // Pack the input into 2xf16
-  Type packedTy = vec_ty(vals[0].getType(), 2);
-  for (int i = 0; i < 4; i++) {
-    Value input = undef(packedTy);
-    for (int j = 0; j < 2; j++) {
-      input = insert_element(packedTy, input, vals[indexOffset + i * 2 + j],
-                             i32_val(j));
-    }
-    inputs.push_back(bitcast(input, i32_ty));
-  }
-  Value addr = gep(smemBase.getType(), elemTy, smemBase, offset);
-  rewriter.create<triton::nvgpu::StoreMatrixOp>(loc, addr, inputs);
-}
-
-void storeDistributedToSharedWithStMatrix(
-    RankedTensorType tensorTy, Type elemTy, SmallVector<Value> &inVals,
-    Value smemBase, ArrayRef<unsigned> paddedRepShape,
-    ArrayRef<unsigned> origRepShape, Location loc, RewriterBase &rewriter,
-    int swizzleByteWidth) {
-  auto shapePerCTA = getShapePerCTA(tensorTy);
-  auto mmaLayout = mlir::cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
-  auto order = triton::gpu::getOrder(mmaLayout);
-  auto warpsPerCTA = mmaLayout.getWarpsPerCTA();
-  auto shapePerCTATile = getShapePerCTATile(mmaLayout);
-  ArrayRef<unsigned> mmaShape = mmaLayout.getInstrShape();
-  // 4xm8n8 matches exactly the size of 1 warp of wgmma layout for 16bit type
-  // and has a shape of 16x16.
-  int instrN = mmaShape[1] * warpsPerCTA[1];
-  int instrM = mmaShape[0] * warpsPerCTA[0];
-  std::array<int, 2> numRep = {ceil((int)origRepShape[0], instrM),
-                               ceil((int)origRepShape[1], instrN)};
-  int numBoxes = 1;
-  if (swizzleByteWidth == 128) {
-    int contigDimSizeInByte =
-        origRepShape[1] * elemTy.getIntOrFloatBitWidth() / 8;
-    numBoxes = ceil(contigDimSizeInByte, 128);
-  }
-  SmallVector<unsigned> boxShape = {paddedRepShape[0], paddedRepShape[1]};
-  boxShape[1] = boxShape[1] / numBoxes;
-  Value thread = getThreadId(rewriter, loc);
-  Value warp = udiv(thread, i32_val(32));
-  Value lane = urem(thread, i32_val(32));
-
-  SmallVector<Value> multiDimWarpId =
-      delinearize(rewriter, loc, warp, warpsPerCTA);
-
-  // Compute the relative offset for each lane.
-  Value stMatrixLaneOffset =
-      computeStMatrixAddr(lane, boxShape[1], loc, rewriter, swizzleByteWidth);
-  multiDimWarpId[0] = mul(multiDimWarpId[0], i32_val(mmaShape[0]));
-  multiDimWarpId[1] = mul(multiDimWarpId[1], i32_val(mmaShape[1]));
-  SmallVector<Value> multiDimOffsetWrapped = getWrappedMultiDimOffset(
-      rewriter, loc, multiDimWarpId, boxShape, shapePerCTATile, shapePerCTA);
-  Value relativeOffset =
-      linearize(rewriter, loc, multiDimOffsetWrapped, boxShape, order);
-  relativeOffset = add(relativeOffset, stMatrixLaneOffset);
-  int indexOffset = 0;
-  int m8n8x4Stride = 16;
-  int numNChunk = mmaShape[1] / m8n8x4Stride;
-  unsigned totalNumElements = product(origRepShape);
-  numNChunk = numNChunk / numBoxes;
-  for (int m = 0; m < numRep[0]; m++) {
-    for (int n = 0; n < numRep[1]; n++) {
-      for (int box = 0; box < numBoxes; box++) {
-        for (int k = 0; k < numNChunk; k++) {
-          Value kOffset;
-          if (swizzleByteWidth >= 64) {
-            int swizzleBits = swizzleByteWidth == 128 ? 6 : 2;
-            Value o = lshr(and_(lane, i32_val(swizzleBits)), i32_val(1));
-            Value kV = xor_(o, i32_val(k));
-            kOffset = mul(kV, i32_val(m8n8x4Stride));
-          } else {
-            kOffset = i32_val(k * m8n8x4Stride);
-          }
-          Value addr = add(relativeOffset,
-                           i32_val(n * instrN + m * instrM * boxShape[1] +
-                                   box * (totalNumElements / numBoxes)));
-          addr = add(addr, kOffset);
-
-          stMatrixm8n8x4(addr, inVals, indexOffset, smemBase, elemTy, loc,
-                         rewriter);
-          indexOffset += 8;
-        }
-      }
-    }
-  }
-}
-
-bool isStMatrixCompatible(RankedTensorType tensorTy, int swizzleByteWidth) {
-  auto mmaLayout =
-      mlir::dyn_cast<NvidiaMmaEncodingAttr>(tensorTy.getEncoding());
-  if (!mmaLayout || !mmaLayout.isHopper())
-    return false;
-  if (tensorTy.getElementType().getIntOrFloatBitWidth() != 16)
-    return false;
-  if (swizzleByteWidth > 0 && mmaLayout.getInstrShape()[1] < 64)
-    return false;
-  if (swizzleByteWidth != 0 && swizzleByteWidth != 32 &&
-      swizzleByteWidth != 64 && swizzleByteWidth != 128)
-    return false;
-  return true;
-}
-
 // declare vprintf(i8*, i8*) as external function
 LLVM::LLVMFuncOp getVprintfDeclaration(RewriterBase &rewriter) {
   auto moduleOp = rewriter.getBlock()->getParent()->getParentOfType<ModuleOp>();
@@ -605,26 +480,22 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
 
 void TargetInfo::storeMatrixShared(RewriterBase &rewriter, Location loc,
                                    Value ptr, Value val) const {
-  auto vecTy = cast<VectorType>(val.getType());
-  Type elemTy = vecTy.getElementType();
-  stMatrixm8n8x4(i32_val(0), unpackLLVector(loc, val, rewriter), 0, ptr, elemTy,
-                 loc, rewriter);
-}
-
-bool TargetInfo::processReplicaUsingStMatrix(
-    RewriterBase &rewriter, Location loc, Value smemBase,
-    SmallVector<Value> &vals, RankedTensorType srcTy, Type elemTy,
-    ArrayRef<unsigned> paddedRepShape, ArrayRef<unsigned> origRepShape,
-    ArrayRef<unsigned> outOrd, unsigned accumNumReplicates,
-    int swizzleByteWidth) const {
-  if (isStMatrixCompatible(srcTy, swizzleByteWidth) &&
-      accumNumReplicates == 1 && outOrd[0] == 1 && paddedRepShape[1] % 8 == 0) {
-    storeDistributedToSharedWithStMatrix(srcTy, elemTy, vals, smemBase,
-                                         paddedRepShape, origRepShape, loc,
-                                         rewriter, swizzleByteWidth);
-    return true;
+  auto vals = unpackLLVector(loc, val, rewriter);
+  // Expect 8 16-bit elements, which get packed into 4 32-bit stmatrix operands
+  assert(vals[0].getType().getIntOrFloatBitWidth() == 16 &&
+         "stmatrix requires elements to be 16-bit integers or floats");
+  assert(vals.size() == 8 &&
+         "stmatrix requires exactly 8 elements in the input vector");
+  Type packedTy = vec_ty(vals[0].getType(), 2);
+  SmallVector<Value> inputs;
+  for (int i = 0; i < 4; i++) {
+    Value input = undef(packedTy);
+    for (int j = 0; j < 2; j++) {
+      input = insert_element(packedTy, input, vals[i * 2 + j], i32_val(j));
+    }
+    inputs.push_back(bitcast(input, i32_ty));
   }
-  return false;
+  rewriter.create<triton::nvgpu::StoreMatrixOp>(loc, ptr, inputs);
 }
 
 std::string TargetInfo::getMulhiFuncName(Type resultElementTy) const {
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h
index 011cc37f4b..97df9d840f 100644
--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h
+++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h
@@ -41,15 +41,6 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
                   triton::ReduceOp op, unsigned numLaneToReduce,
                   unsigned interleave) const override;
 
-  bool processReplicaUsingStMatrix(RewriterBase &rewriter, Location loc,
-                                   Value smemBase, SmallVector<Value> &vals,
-                                   RankedTensorType srcTy, Type elemTy,
-                                   ArrayRef<unsigned> paddedRepShape,
-                                   ArrayRef<unsigned> origRepShape,
-                                   ArrayRef<unsigned> outOrd,
-                                   unsigned accumNumReplicates,
-                                   int swizzleByteWidth) const override;
-
   std::string getMulhiFuncName(Type resultElementTy) const override;
 
   void printf(RewriterBase &rewriter, Value formatStrStart,

From 6c3e9535c44774dfd56357acba9c2183b247f58e Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Mon, 30 Sep 2024 20:10:48 -0700
Subject: [PATCH 12/12] Add git commit to the version as a suffix (#4812)

It's very common that we need to figure out the exact commit from which
the currently installed triton package was compiled. Right now it just
shows a version number like `3.0.0`, which isn't quite helpful. With
this commit we have

```
> pip show triton
Name: triton
Version: 3.0.0+git78e4f837
```
---
 python/setup.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/setup.py b/python/setup.py
index 57ac73899b..01de47d52d 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -637,9 +637,17 @@ def get_entry_points():
     return entry_points
 
 
+def get_git_commit_hash(length=8):
+    try:
+        cmd = ['git', 'rev-parse', f'--short={length}', 'HEAD']
+        return "+git{}".format(subprocess.check_output(cmd).strip().decode('utf-8'))
+    except Exception:
+        return ""
+
+
 setup(
     name=os.environ.get("TRITON_WHEEL_NAME", "triton"),
-    version="3.0.0" + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
+    version="3.0.0" + get_git_commit_hash() + os.environ.get("TRITON_WHEEL_VERSION_SUFFIX", ""),
     author="Philippe Tillet",
     author_email="phil@openai.com",
     description="A language and compiler for custom Deep Learning operations",
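For reference, a minimal standalone sketch (not part of the patch) of the suffix
logic above; the only assumption is that any subprocess failure falls through to
the empty-string branch:

```python
# Sketch of the version-suffix behavior: outside a git checkout (or with git
# missing), check_output() raises, the except branch returns "", and the
# version degrades gracefully to plain "3.0.0".
import subprocess

def get_git_commit_hash(length=8):
    try:
        cmd = ['git', 'rev-parse', f'--short={length}', 'HEAD']
        return "+git{}".format(subprocess.check_output(cmd).strip().decode('utf-8'))
    except Exception:
        return ""

version = "3.0.0" + get_git_commit_hash()
print(version)  # e.g. "3.0.0+git78e4f837" in a checkout, "3.0.0" otherwise
```

The suffix is a valid PEP 440 local version label, so tools like pip still
parse and compare the version correctly.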