@@ -1515,32 +1515,133 @@ chooseMfmaLikeStoreLayout(RankedTensorType valType) {
    return {};
  auto mfmaLayout = cast<AMDMfmaEncodingAttr>(valType.getEncoding());
- // We currently only support transposed [B]F16 MFMA32x32 on CDNA4.
+ // We currently only support transposed [B]F16 MFMA32x32 and MFMA16x16 on
+ // CDNA4.
  bool isMfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
+ bool isMfma16 = mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16;
+
+ auto valShape = valType.getShape();
+ // For mfma16x16, to use the in-wavefront swap, we need to make sure that
+ // when there are multiple tiles, they all live in one wavefront, which means
+ // warpsPerCTA = [numWarps, 1] and at least two tiles along the N dim. For
+ // now, this is only possible for FA-like kernels, since during mfma
+ // generation the warpsPerCTA of the head dot in the chain is reshaped to
+ // [numWarps, 1].
+ // TODO: This transformation cannot be applied to GEMM-like kernels yet;
+ // support will be added later.
+ bool validForMfma16 = isMfma16 && valShape.back() >= 16 * 2 &&
+                       mfmaLayout.getWarpsPerCTA().back() == 1;
+
  Type elemType = valType.getElementType();
  if (!(valType.getRank() == 2 && (elemType.isF16() || elemType.isBF16()) &&
        mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
-       isMfma32))
+       (isMfma32 || validForMfma16)))
    return {};
- auto valShape = valType.getShape();
  LinearLayout mfmaLL = mfmaLayout.toLinearLayout(valShape);
  auto mfmaOutDims = llvm::to_vector(mfmaLL.getOutDimNames());
  StringAttr dimM = mfmaOutDims[0];
  StringAttr dimN = mfmaOutDims[1];
-
  auto swapLL = LinearLayout::empty();
  // The rows are kept as-is with an identity linear layout.
  swapLL *= LinearLayout::identity1D(valShape[0], dimM, dimM);
- // In transposed mfma32 layout, each thread holds 4 consecutive values along N
- // dim. We want to exchange column 4-7 (owned by thread 32-63) and column 8-11
- // (owned by thread 0-31) every 16 columns to make each thread holds 8
- // elements. This would mean exchange the 2nd and 3rd basis vector from an
- // identity linear layout.
+ /*
+ clang-format off
+ In the transposed mfma32 layout, each thread holds 4 consecutive values along
+ the N dim. We want to exchange columns 4-7 (owned by threads 32-63, BLK0) and
+ columns 8-11 (owned by threads 0-31, BLK1) every 16 columns so that each
+ thread holds 8 elements. This means exchanging the 2nd and 3rd basis vectors
+ of an identity linear layout on tensor elements.
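+
+ For example, with 16 columns the identity N bases are {1}, {2}, {4}, {8};
+ after swapping the 2nd and 3rd they become {1}, {2}, {8}, {4}, so bits 2 and
+ 3 of the column index are exchanged: columns 4-7 map to 8-11 and vice versa,
+ while columns 0-3 and 12-15 stay in place.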
+
+ Correspondingly, in the transposed mfma16 layout, the output of mfma16x16
+ looks like:
+
+                                        N/register
+ M/Lane        v0       v1       v2       v3       v4       v5       v6       v7
+             -------------------------------------------------------------------------
+ row0:  0-15 | tile-0 | tile-0 | tile-0 | tile-0 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+ row1: 16-31 | tile-0 | tile-0 | tile-0 | tile-0 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+ row2: 32-47 | tile-0 | tile-0 | tile-0 | tile-0 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+ row3: 48-63 | tile-0 | tile-0 | tile-0 | tile-0 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+ which means columns v0 to v3 come from one output of mfma16x16 (tile-0) and
+ columns v4 to v7 come from another output of mfma16x16 (tile-1).
+
+ The following graph is the same as the one above, except the tile numbers are
+ replaced with coordinates in the tensor:
+                            N/register
+         -----------------------------------------------
+ M/lane  |(0, 0)   ... (0, 3)   | (0, 16)  ... (0, 19)  |
+         |....                  | sub-tensor-0          |
+         |(15, 0)  ... (15, 3)  | (15, 16) ... (15, 19) |
+         -----------------------------------------------
+         |(0, 4)   ... (0, 7)   | (0, 20)  ... (0, 23)  |
+         |sub-tensor-1          | ....                  |
+         |(15, 4)  ... (15, 7)  | (15, 20) ... (15, 23) |
+         -----------------------------------------------
+         |(0, 8)   ... (0, 11)  | (0, 24)  ... (0, 27)  |
+         |....                  | sub-tensor-2          |
+         |(15, 8)  ... (15, 11) | (15, 24) ... (15, 27) |
+         -----------------------------------------------
+         |(0, 12)  ... (0, 15)  | (0, 28)  ... (0, 31)  |
+         |sub-tensor-3          | ....                  |
+         |(15, 12) ... (15, 15) | (15, 28) ... (15, 31) |
+         -----------------------------------------------
+ The basis vectors for lane and register are:
+ Register = {{0, 1}, {0, 2}}
+ Lane     = {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {0, 4}, {0, 8}}
+ With this layout, only 4xfp16 values can be packed into the final global
+ store.
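+ (The register bases {0, 1} and {0, 2} give only 4 consecutive columns per
+ thread, i.e. 4 x 16 bits = 64 bits per store.)
+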
+ To use 128-bit global stores, we need to pack 8 elements, which means the
+ layout should look like:
+                                        N/register
+ M/Lane        v0       v1       v2       v3       v4       v5       v6       v7
+             -------------------------------------------------------------------------
+ row0:  0-15 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 |
+             -------------------------------------------------------------------------
+ row1: 16-31 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+ row2: 32-47 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 | tile-0 |
+             -------------------------------------------------------------------------
+ row3: 48-63 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 | tile-1 |
+             -------------------------------------------------------------------------
+
+ The following graph is the same as the one above, except the tile numbers are
+ replaced with coordinates in the tensor:
+                            N/register
+         -----------------------------------------------
+         |(0, 0)   ... (0, 3)   | (0, 4)   ... (0, 7)   |
+         |....                  | sub-tensor-1          |
+         |(15, 0)  ... (15, 3)  | (15, 4)  ... (15, 7)  |
+         -----------------------------------------------
+         |(0, 16)  ... (0, 19)  | (0, 20)  ... (0, 23)  |
+         |sub-tensor-0          | ....                  |
+         |(15, 16) ... (15, 19) | (15, 20) ... (15, 23) |
+         -----------------------------------------------
+         |(0, 8)   ... (0, 11)  | (0, 12)  ... (0, 15)  |
+         |....                  | sub-tensor-3          |
+         |(15, 8)  ... (15, 11) | (15, 12) ... (15, 15) |
+         -----------------------------------------------
+         |(0, 24)  ... (0, 27)  | (0, 28)  ... (0, 31)  |
+         |sub-tensor-2          | ....                  |
+         |(15, 24) ... (15, 27) | (15, 28) ... (15, 31) |
+         -----------------------------------------------
+ which means we need to exchange sub-tensor-0 with sub-tensor-1, and
+ sub-tensor-2 with sub-tensor-3. The basis vectors for lane and register then
+ become:
+ Register = {{0, 1}, {0, 2}, {0, 4}}
+ Lane     = {{1, 0}, {2, 0}, {4, 0}, {8, 0}, {0, 16}, {0, 8}}
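+ (Now the register bases {0, 1}, {0, 2} and {0, 4} give 8 consecutive columns
+ per thread, i.e. 8 x 16 bits = 128 bits per store.)
+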
+ The steps to get this layout are: first, check that the last dim of
+ warpsPerCTA is 1, so that v_permlane16 can be used. Then exchange the 2nd and
+ 4th basis vectors of an identity linear layout and compose it with the
+ original mfma16 LL.
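+ For example, with 32 columns the identity N bases are {1}, {2}, {4}, {8},
+ {16}; after swapping the 2nd and 4th they become {1}, {2}, {16}, {8}, {4},
+ so bits 2 and 4 of the column index are exchanged: columns 4-7 swap with
+ columns 16-19 and columns 12-15 swap with columns 24-27, while columns 0-3,
+ 8-11, 20-23 and 28-31 stay in place.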
+ clang-format on
+ */
+ auto destIdxInBases = isMfma32 ? 3 : 4;
  std::vector<std::vector<int32_t>> dimNBases(mfmaLL.getOutDimSizeLog2(dimN));
  std::generate(dimNBases.begin(), dimNBases.end(),
                [i = 0]() mutable { return std::vector<int32_t>{1 << i++}; });
- std::swap(dimNBases[2], dimNBases[3]);
+ std::swap(dimNBases[2], dimNBases[destIdxInBases]);
  swapLL *= LinearLayout({{dimN, dimNBases}}, {dimN});

  return mfmaLL.compose(swapLL);
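
To sanity-check what the swapped bases do to column indices, here is a minimal
standalone sketch (plain C++, not Triton's LinearLayout implementation; the
XOR-of-selected-bases semantics below mirrors how a linear layout maps an input
index to an output index). It prints the column permutation produced by the
mfma32 swap (2nd and 3rd bases) and the mfma16 swap (2nd and 4th bases):

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // XOR together the bases selected by the set bits of `idx`. For an
    // identity layout (bases are distinct powers of two) this is simply a
    // permutation of the index bits.
    static int32_t applyBases(const std::vector<int32_t> &bases, int32_t idx) {
      int32_t out = 0;
      for (size_t b = 0; b < bases.size(); ++b)
        if (idx & (1 << b))
          out ^= bases[b];
      return out;
    }

    int main() {
      // Identity bases for a 32-column N dim.
      std::vector<int32_t> mfma32Bases = {1, 2, 4, 8, 16};
      std::vector<int32_t> mfma16Bases = {1, 2, 4, 8, 16};
      std::swap(mfma32Bases[2], mfma32Bases[3]); // mfma32: 2nd <-> 3rd basis.
      std::swap(mfma16Bases[2], mfma16Bases[4]); // mfma16: 2nd <-> 4th basis.

      // mfma32: columns 4-7 <-> 8-11 within every 16-column block.
      // mfma16: columns 4-7 <-> 16-19 and 12-15 <-> 24-27.
      for (int32_t col = 0; col < 32; ++col)
        std::printf("col %2d -> mfma32 swap %2d, mfma16 swap %2d\n", col,
                    applyBases(mfma32Bases, col), applyBases(mfma16Bases, col));
      return 0;
    }

Running this reproduces the exchanges described in the comment above: for
mfma32, columns 4-7 and 8-11 trade places in each 16-column block; for mfma16,
columns 4-7 trade with 16-19 and 12-15 with 24-27, matching the sub-tensor
diagrams.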