Commit da26bee

Merge OpenAI Triton commit 7a3247b (#3804)

This PR changes the Triton base from f4e780c to 7a3247b (Mar 31). Pass rate: 90.48% -> 90.38% (#3821)

2 parents 6a8daa1 + 529dff1, commit da26bee

18 files changed: +191 -165 lines changed
.github/workflows/create_release.yml

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+name: Create Release
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # Final Release tags look like: v1.11.0
+      - v[0-9]+.[0-9]+.[0-9]+
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+  release:
+    types: [published]
+  pull_request:
+    paths: [.github/workflows/create_release.yml]
+
+jobs:
+
+  release:
+    if: ${{ github.repository == 'triton-lang/triton' }}
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    outputs:
+      release_name: "${{ steps.release_name.outputs.name }}"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          show-progress: false
+          submodules: 'recursive'
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      - name: Fake name for PRs
+        if: ${{ github.event_name == 'pull_request' }}
+        run: echo "PT_GITHUB_REF=refs/tags/pr-tag" >> "$GITHUB_ENV"
+      - name: Real name for non-PRs
+        if: ${{ github.event_name != 'pull_request' }}
+        run: echo "PT_GITHUB_REF=$GITHUB_REF" >> "$GITHUB_ENV"
+      - name: Set filenames
+        run: |
+          tag_or_branch="${PT_GITHUB_REF#refs/tags/}"
+          tag_or_branch="${tag_or_branch#refs/heads/}"
+          # replace directory separators with _ in branch name
+          tag_or_branch="${tag_or_branch//\//_}"
+          echo "RELEASE_NAME=triton-$tag_or_branch" >> "$GITHUB_ENV"
+          echo "RELEASE_FILE=triton-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
+      - name: Create source distribution
+        run: |
+          # Create new folder with specified name so extracting the archive yields that
+          rm -rf "/tmp/$RELEASE_NAME"
+          cp -r "$PWD" "/tmp/$RELEASE_NAME"
+          mv "/tmp/$RELEASE_NAME" .
+          # Cleanup
+          find "$RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
+          # Create archive
+          tar -czf "$RELEASE_FILE" "$RELEASE_NAME"
+          echo "Created source archive $RELEASE_FILE with content: $(ls -a "$RELEASE_NAME")"
+      - name: Upload source distribution for release
+        if: ${{ github.event_name == 'release' }}
+        uses: softprops/action-gh-release@v2
+        with:
+          files: ${{env.RELEASE_FILE}}
+      - name: Upload source distribution to GHA artifacts for release tags
+        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
+        uses: actions/[email protected]
+        with:
+          name: ${{ env.RELEASE_FILE }}
+          path: ${{ env.RELEASE_FILE }}
+      - name: Set output
+        id: release_name
+        run: echo "name=release_name::${{ env.RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true

cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-71a977d0d611f3e9f6137a6b8a26b730b2886ce9
+1d4801f22ab1fd6205b1cf625b690aefc554cd4c

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 4 additions & 0 deletions
@@ -270,6 +270,10 @@ llvm::SmallVector<T> expandMatrixShapeWithBatch(llvm::ArrayRef<T> s);
 
 llvm::SmallVector<unsigned>
 expandMatrixOrderWithBatch(llvm::ArrayRef<unsigned> o);
+
+// Return true if the two layouts represent the exact same mapping.
+bool areLayoutsEquivalent(ArrayRef<int64_t> shape, Attribute lhs,
+                          Attribute rhs);
 } // namespace mlir::triton::gpu
 
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 0 additions & 1 deletion
@@ -43,7 +43,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "NVPTX_ENABLE_DUMP",
     "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
-    "ENABLE_LHS_TO_TMEM",
     "TRITON_F32_DEFAULT",
     "ENABLE_MMA_V5_ATT_PIPELINE",
     "TRITON_INTEL_ADVANCED_PATH",

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 9 additions & 3 deletions
@@ -2469,10 +2469,9 @@ struct TritonGPUInferLayoutInterface
     }
     if (!expected || !got)
       return failure();
+
     // Check whether the encodings are structurally the same.
-    auto expectedLL = triton::gpu::toLinearLayout(shape, expected);
-    auto gotLL = triton::gpu::toLinearLayout(shape, got);
-    if (expectedLL != gotLL) {
+    if (!areLayoutsEquivalent(shape, expected, got)) {
       return emitOptionalError(loc, "Expected result encoding ", expected,
                                " but was ", got);
     }
@@ -3208,3 +3207,10 @@ int triton::gpu::lookupThreadsPerWarp(OpBuilder &rewriter) {
   assert(op && "cannot create thread ID outside of module");
   return triton::gpu::TritonGPUDialect::getThreadsPerWarp(cast<ModuleOp>(op));
 }
+
+bool triton::gpu::areLayoutsEquivalent(ArrayRef<int64_t> shape, Attribute lhs,
+                                       Attribute rhs) {
+  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs);
+  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs);
+  return lhsLL == rhsLL;
+}
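
A minimal usage sketch of the new helper, assuming a caller that already has a shape and two encoding attributes in hand; the wrapper function below and its name are illustrative only, not part of this diff:

// Illustrative sketch only: `checkEncodingsMatch` is not part of this commit.
#include "mlir/IR/Diagnostics.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

static mlir::LogicalResult checkEncodingsMatch(mlir::Location loc,
                                               llvm::ArrayRef<int64_t> shape,
                                               mlir::Attribute expected,
                                               mlir::Attribute got) {
  // areLayoutsEquivalent lowers both encodings to linear layouts and compares
  // the resulting mappings, so two attributes that spell the same mapping
  // differently still compare equal.
  if (!mlir::triton::gpu::areLayoutsEquivalent(shape, expected, got)) {
    mlir::emitError(loc) << "expected encoding " << expected << " but got "
                         << got;
    return mlir::failure();
  }
  return mlir::success();
}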

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 3 additions & 3 deletions
@@ -35,9 +35,9 @@ void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
-  // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.
-  // FIXME: Re-enable integer range analysis once it is fixed.
-  // pm.addPass(arith::createIntRangeOptimizationsPass());
+  // `int-range-optimizations` is good at cleaning up loop arithmetic involving
+  // circular buffers.
+  pm.addPass(arith::createIntRangeOptimizationsPass());
   pm.addPass(createSCCPPass());
   pm.addPass(createCSEPass());
   pm.addPass(createTritonGPUPartitionLoops());

lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp

Lines changed: 12 additions & 55 deletions
@@ -171,69 +171,26 @@ bool isDistributedLayoutTMemCompatible(Operation *op,
   int numWarps = lookupNumWarps(op);
   assert(numWarps % 4 == 0);
   int numWarpGroups = numWarps / 4;
-
-  int blockM = 0;
-  int blockN = 0;
-  bool scalesEncoding = false;
-  if (auto attr = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
+  if (isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
           memType.getEncoding())) {
-    blockM = attr.getBlockM();
-    blockN = attr.getBlockN();
-  } else {
-    assert(isa<triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
-               memType.getEncoding()) &&
-           "Expecting a tensor memory encoding attribute");
     return tensorType.getEncoding() ==
            triton::gpu::LinearEncodingAttr::get(
                tensorType.getContext(),
                getScaleTMEMStoreLinearLayout(tensorType, numWarps));
   }
+  auto attr =
+      cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(memType.getEncoding());
+  int blockM = attr.getBlockM();
+  int blockN = attr.getBlockN();
   if (isDistributedLayoutSplitMTmemLoadStore(tensorType, memType, numWarps))
     return true;
-  auto shapePerCTA = mlir::triton::gpu::getShapePerCTA(tensorType);
-  int numElements = product(shapePerCTA);
-  int numBlocks = ceil<int>(numElements, blockM * blockN);
-  bool useStridedMessage = blockM == 64;
-
-  int numWarpGroupsPerBlock = ceil<int>(numWarpGroups, numBlocks);
-
-  auto tensorEncoding =
-      cast<triton::gpu::BlockedEncodingAttr>(tensorType.getEncoding());
-  auto sizePerThread = tensorEncoding.getSizePerThread();
-  auto threadsPerWarp = tensorEncoding.getThreadsPerWarp();
-  auto warpsPerCTA = tensorEncoding.getWarpsPerCTA();
-  auto order = tensorEncoding.getOrder();
-
-  if (order.size() != 2)
-    return false;
-
-  if (order[0] != 0 || order[1] != 1)
-    return false;
-
-  if (useStridedMessage) {
-    // For blockM=64 we need to use 16x32bx2 message, meaning the distributed
-    // layout needs to be organized into 16x2 threads per warp and one row
-    // access per thread.
-    if (threadsPerWarp[0] != 16 || threadsPerWarp[1] != 2 ||
-        sizePerThread[0] != 1)
-      return false;
-
-    if (numBlocks == 1) {
-      // with blockM=64 and just single block we cannot split along the M
-      // dimension. Check that if we split, we split along N.
-      if (numWarpGroupsPerBlock > 1) {
-        if (warpsPerCTA[1] == 1)
-          return false;
-      }
-    }
-  } else {
-    // For blockM=128, we need to use a 32x32b message, which requires 32
-    // threads to be sequentially ordered across the M dimension, ensuring
-    // that each thread accesses a single and unique TMEM datapath.
-    if (threadsPerWarp[0] != 32 || sizePerThread[0] != 1)
-      return false;
-  }
-  return true;
+  Attribute layout =
+      nvidia_gpu::getTmemCompatibleLayout(blockM, blockN, tensorType, numWarps);
+  // TODO: Add support for more layouts compatible with tmem load/store. There
+  // will only be a discrete set of layouts possible due to the limitations of
+  // tmem_load/store.
+  return areLayoutsEquivalent(tensorType.getShape(), layout,
+                              tensorType.getEncoding());
 }
 
 } // namespace nvidia_gpu
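
The rewritten check above drops the hand-rolled 16x32bx2 / 32x32b message rules in favor of comparing against the canonical TMEM-compatible layout. A hedged sketch of that pattern, assuming the `getTmemCompatibleLayout` and `areLayoutsEquivalent` helpers referenced in the diff (the standalone wrapper and its exact parameter order are illustrative, not part of this commit):

// Illustrative sketch only: mirrors the comparison pattern used above.
static bool matchesCanonicalTmemLayout(mlir::RankedTensorType tensorType,
                                       int blockM, int blockN, int numWarps) {
  // Ask the dialect for the distributed layout it considers canonical for
  // this TMEM block shape and warp count...
  mlir::Attribute layout = mlir::triton::nvidia_gpu::getTmemCompatibleLayout(
      blockM, blockN, tensorType, numWarps);
  // ...then accept the tensor's encoding only if it describes the exact same
  // element mapping, instead of re-deriving the message-shape rules by hand.
  return mlir::triton::gpu::areLayoutsEquivalent(
      tensorType.getShape(), layout, tensorType.getEncoding());
}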

lib/Dialect/TritonNvidiaGPU/Transforms/PromoteLHSToTMem.cpp

Lines changed: 16 additions & 17 deletions
@@ -52,9 +52,23 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
       return failure();
     Value src = localAllocOp.getSrc();
     auto srcType = cast<RankedTensorType>(src.getType());
-    auto srcLayout = cast<ttg::BlockedEncodingAttr>(srcType.getEncoding());
+    auto srcLayout = srcType.getEncoding();
+    auto accTMemEncoding = dyn_cast<ttng::TensorMemoryEncodingAttr>(
+        tcGen5MMAOp.getD().getType().getEncoding());
+    ArrayRef<unsigned> CTASplitNum =
+        triton::gpu::getCTALayout(srcLayout).getCTASplitNum();
+    // TMem encoding for A operand is the same as for D (Acc), but packed.
+    auto aTMemEncoding = ttng::TensorMemoryEncodingAttr::get(
+        context, accTMemEncoding.getBlockM(), lhs.getType().getShape()[1],
+        /*unpacked=*/false, CTASplitNum[0], CTASplitNum[1]);
+    Attribute tensorMemorySpace =
+        triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
+    ttg::MemDescType lhsMemDescType = ttg::MemDescType::get(
+        lhs.getType().getShape(), lhs.getType().getElementType(), aTMemEncoding,
+        tensorMemorySpace,
+        /*mutableMemory=*/false);
     bool layoutTmemCompatible = ttng::isDistributedLayoutTMemCompatible(
-        tcGen5MMAOp, srcType, tcGen5MMAOp.getD().getType());
+        tcGen5MMAOp, srcType, lhsMemDescType);
     Attribute newLayout = srcLayout;
     if (!layoutTmemCompatible) {
       if (triton::tools::getBoolEnv("ALLOW_LHS_TMEM_LAYOUT_CONVERSION")) {
@@ -70,19 +84,6 @@ template <class MMAOpTy> class LHSToTMem : public OpRewritePattern<MMAOpTy> {
           RankedTensorType::get(ty.getShape(), ty.getElementType(), newLayout);
       src = rewriter.create<ttg::ConvertLayoutOp>(loc, newTy, src);
     }
-    auto accTMemEncoding = dyn_cast<ttng::TensorMemoryEncodingAttr>(
-        tcGen5MMAOp.getD().getType().getEncoding());
-    ArrayRef<unsigned> CTASplitNum = srcLayout.getCTALayout().getCTASplitNum();
-    // TMem encoding for A operand is the same as for D (Acc), but unpacked.
-    auto aTMemEncoding = ttng::TensorMemoryEncodingAttr::get(
-        context, accTMemEncoding.getBlockM(), lhs.getType().getShape()[1],
-        /*unpacked=*/false, CTASplitNum[0], CTASplitNum[1]);
-    Attribute tensorMemorySpace =
-        triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
-    Type lhsMemDescType = triton::gpu::MemDescType::get(
-        lhs.getType().getShape(), lhs.getType().getElementType(), aTMemEncoding,
-        tensorMemorySpace,
-        /*mutableMemory=*/false);
     Value tMemAlloc =
         rewriter.create<ttng::TMEMAllocOp>(loc, lhsMemDescType, src);
     tcGen5MMAOp.getAMutable().assign(tMemAlloc);
@@ -100,8 +101,6 @@ class TritonNvidiaGPUPromoteLHSToTMemPass
       TritonNvidiaGPUPromoteLHSToTMemPassBase;
 
   void runOnOperation() override {
-    if (!triton::tools::getBoolEnv("ENABLE_LHS_TO_TMEM"))
-      return;
     MLIRContext *context = &getContext();
     ModuleOp m = getOperation();
