
Commit 8ef63b9

Merge branch 'main' into buffer_load_base_opt
2 parents 8733bda + de8e715

130 files changed: +7185 −5664 lines

(Large commit: some diffs are hidden by default, so only a subset of the 130 changed files appears below.)

.github/workflows/llvm-build.yml

Lines changed: 15 additions & 8 deletions
@@ -9,6 +9,8 @@ on:
   pull_request:
     paths:
       - .github/workflows/llvm-build.yml
+      - .github/workflows/llvm-build/almalinux.Dockerfile
+      - .github/workflows/llvm-build/centos.Dockerfile
   workflow_dispatch:
 
 env:
@@ -135,6 +137,7 @@ jobs:
           -DLLVM_INSTALL_UTILS=ON
           -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
           -DLLVM_ENABLE_TERMINFO=OFF
+          -DLLVM_ENABLE_ZSTD=OFF
           llvm-project/llvm
 
         ninja -C llvm-project/build check-mlir install
@@ -237,7 +240,11 @@ jobs:
       run: |
         # if this step crashes, it can leave behind a stale docker container
         docker container prune -f
-        docker rmi -f $(docker images -q)
+
+        images=$(docker images -q)
+        if [ -n "$images" ]; then
+          docker rmi -f $images
+        fi
 
         docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
           -f llvm-build/.github/workflows/llvm-build/almalinux.Dockerfile .
@@ -264,16 +271,16 @@ jobs:
         path: |
           ${{ github.workspace }}/llvm-*-${{ matrix.config.target-os }}-${{ matrix.config.arch }}.tar.gz
 
-      - name: Azure Login
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+      - name: Azure login
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         uses: azure/login@v2
         with:
-          client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+          client-id: ${{ secrets.AZURE_CLIENT_ID_LLVM }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID_LLVM }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID_LLVM }}
 
       - name: Upload LLVM Artifacts to Azure
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         shell: bash -el {0}
         run: |
           az storage blob upload --account-name oaitriton --auth-mode login --container-name public --file "${{ env.llvm_install_dir }}.tar.gz" --name "llvm-builds/${{ env.llvm_install_dir }}.tar.gz" --overwrite
@@ -282,7 +289,7 @@ jobs:
           echo "Blob URL: ${URL}"
 
       - name: Azure Logout
-        if: ${{ (github.repository == 'triton-lang/triton') }}
+        if: ${{ (github.repository == 'triton-lang/triton') && github.ref_name == 'llvm-head' }}
         run: |
           az logout
           az cache purge
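Two notes on the behavior captured by this diff: `docker rmi -f $(docker images -q)` fails outright when `docker images -q` prints nothing, because `docker rmi` requires at least one argument, so capturing the list and guarding it with `[ -n "$images" ]` turns the cleanup into a no-op on an already-clean runner. The `github.ref_name == 'llvm-head'` conjunct added to the Azure login, upload, and logout steps restricts uploads to builds of the `llvm-head` branch, and the credentials move to the dedicated `*_LLVM` secrets.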

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,5 @@
-FROM almalinux:8
+# https://github.com/AlmaLinux/container-images/blob/9f9b3c8c8cf4a57fd42f362570ff47c75788031f/default/amd64/Dockerfile
+FROM almalinux:8.10-20250411
 ARG llvm_dir=llvm-project
 # Add the cache artifacts and the LLVM source tree to the container
 ADD sccache /sccache
@@ -8,6 +9,7 @@ ENV SCCACHE_CACHE_SIZE="2G"
 
 RUN dnf install --assumeyes llvm-toolset
 RUN dnf install --assumeyes python38-pip python38-devel git
+RUN alternatives --set python3 /usr/bin/python3.8
 
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --upgrade cmake ninja sccache lit
@@ -26,6 +28,8 @@ RUN cmake -GNinja -Bbuild \
     -DCMAKE_CXX_FLAGS="-Wno-everything" \
     -DCMAKE_LINKER=lld \
     -DCMAKE_INSTALL_PREFIX="/install" \
+    -DPython3_EXECUTABLE="/usr/bin/python3.8" \
+    -DPython_EXECUTABLE="/usr/bin/python3.8" \
     -DLLVM_BUILD_UTILS=ON \
     -DLLVM_BUILD_TOOLS=ON \
     -DLLVM_ENABLE_ASSERTIONS=ON \
@@ -34,6 +38,7 @@ RUN cmake -GNinja -Bbuild \
     -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_INSTALL_UTILS=ON \
     -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
+    -DLLVM_ENABLE_ZSTD=OFF \
     /source/llvm-project/llvm
 
 RUN ninja -C build install
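The base image is pinned from the floating `almalinux:8` tag to the dated `almalinux:8.10-20250411`, with the added comment linking the exact upstream Dockerfile, presumably to keep rebuilds reproducible. `alternatives --set python3 /usr/bin/python3.8` makes the bare `python3` command resolve to the python38 packages installed above, and the new `Python3_EXECUTABLE`/`Python_EXECUTABLE` CMake variables point LLVM's build at that same interpreter. `-DLLVM_ENABLE_ZSTD=OFF` mirrors the flag added in llvm-build.yml above.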

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 13 deletions
@@ -268,19 +268,6 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);
 
-/// Multi-root DAG topological sort.
-/// Performs a topological sort of the Operation in the `toSort` SetVector.
-/// Returns a topologically sorted SetVector.
-/// It is faster than mlir::topologicalSort because it prunes nodes that have
-/// been visited before.
-SetVector<Operation *>
-multiRootTopologicalSort(const SetVector<Operation *> &toSort);
-
-/// This uses the toplogicalSort above
-SetVector<Operation *>
-multiRootGetSlice(Operation *op, TransitiveFilter backwardFilter = nullptr,
-                  TransitiveFilter forwardFilter = nullptr);
-
 /// Create a basic DataFlowSolver with constant and dead code analysis included.
 std::unique_ptr<DataFlowSolver> createDataFlowSolver();

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 0 deletions
@@ -732,6 +732,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     `lhs` `=` $a_elem_type `rhs` `=` $b_elem_type attr-dict
     `:` type($a) (`,` type($a_scale)^)? `*` type($b) (`,` type($b_scale)^)? `->` type($d)
   }];
+  let hasVerifier = 1;
 }
 
 //
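In ODS, `let hasVerifier = 1;` makes TableGen declare a `verify()` hook on the generated `DotScaledOp` class, which the dialect must then implement in C++. A minimal sketch of that hook's shape, with an illustrative check only (this page does not show the verifier body the commit actually adds, and the `getA()`/`getD()` accessors are assumed from the op's `$a`/`$d` names):

// Sketch only: the real tt.dot_scaled verifier is not shown in this diff.
// Assumes ODS-generated accessors getA()/getD() for the $a operand and
// the $d result, both ranked tensors.
mlir::LogicalResult mlir::triton::DotScaledOp::verify() {
  auto aTy = llvm::cast<mlir::RankedTensorType>(getA().getType());
  auto dTy = llvm::cast<mlir::RankedTensorType>(getD().getType());
  // Example invariant: the result must agree with the left-hand operand
  // on the M dimension.
  if (aTy.getDimSize(0) != dTy.getDimSize(0))
    return emitOpError("expected a and d to agree on the M dimension");
  return mlir::success();
}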

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 9 additions & 0 deletions
@@ -45,6 +45,11 @@ constexpr static char AttrNumWarpsName[] = "ttg.num-warps";
 constexpr static char AttrNumCTAsName[] = "ttg.num-ctas";
 constexpr static char AttrTargetName[] = "ttg.target";
 constexpr static char AttrNumThreadsPerWarp[] = "ttg.threads-per-warp";
+// FIXME: rename to match above
+constexpr static char kPartitionAttrName[] = "ttg.partition";
+constexpr static char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
+constexpr static char kPartitionStagesAttrName[] = "ttg.partition.stages";
+constexpr static char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";
 
 // Find the contextual number of warps on which this operation is executed.
 int lookupNumWarps(Operation *op);
@@ -293,6 +298,10 @@ LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
                                   ShapedType dstTy);
 // Verify a memory allocation operation.
 LogicalResult verifyAllocOp(Operation *op, Value src, MemDescType dstTy);
+
+std::optional<SetVector<int>> getPartitionIds(Operation *op);
+std::optional<int> getNumOutputPartitionIds(Operation *op);
+std::optional<SetVector<int>> getOutputPartitionIds(Operation *op, int idx);
 } // namespace mlir::triton::gpu
 
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
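The implementations of these accessors are not shown on this page. As a minimal sketch, assuming the `ttg.partition` attribute is stored as a `DenseI32ArrayAttr` (an assumption about the encoding, not something this diff shows), a reader like `getPartitionIds` could look like:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SetVector.h"
#include <optional>

// Sketch only: assumes "ttg.partition" holds a DenseI32ArrayAttr of
// partition ids; the real storage format may differ.
std::optional<llvm::SetVector<int>>
readPartitionIds(mlir::Operation *op) {
  auto attr = op->getAttrOfType<mlir::DenseI32ArrayAttr>("ttg.partition");
  if (!attr)
    return std::nullopt; // op does not belong to any partition
  llvm::SetVector<int> ids;
  for (int32_t id : attr.asArrayRef())
    ids.insert(id); // SetVector deduplicates while preserving order
  return ids;
}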

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 10 additions & 13 deletions
@@ -117,19 +117,6 @@ chooseDsReadTrLayout(Attribute enc, ArrayRef<int64_t> shape,
                      int32_t elemBitWidth, unsigned instBitWidth,
                      unsigned numLanesInShuffleGroup);
 
-LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
-                                           int numWarps);
-
-std::optional<LinearLayout>
-getTmemLoadStoreLayout16x256(int M, int N, RankedTensorType oldType,
-                             int numWarps);
-
-// Return a layout valid for TMemLoad op for a tmem layout of block MxN that
-// distribute the data long M for the warp groups. This doesn't affect the TMem
-// layout it just returns a distributed layout compatible for tmem_load.
-LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
-                                         int numWarps);
-
 // Create LinearLayout for scale in scaled mfma.
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
@@ -161,5 +148,15 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle);
 
+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. groups).
+//
+// See the nomenclature note at the top of the LinearLayoutConversions.cpp file
+// for an explanation of why this is called makeCgaLayout when it accepts a
+// CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout);
+
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
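For orientation (general Triton background, not stated in this diff): a `LinearLayout` maps hardware index spaces such as register, lane, warp, and block to tensor coordinates, and `makeCgaLayout` supplies only the block dimension of such a map, derived from the `CTALayoutAttr` describing how CTAs tile the CGA. The TMEM helpers `getScaleTMEMStoreLinearLayout`, `getTmemLoadStoreLayout16x256`, and `getTmemLoadLayoutSplitLongM` are removed from the public header by this commit.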

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 4 additions & 7 deletions
@@ -16,11 +16,6 @@ class ForOp;
 } // namespace scf
 } // namespace mlir
 
-static constexpr char kPartitionAttrName[] = "ttg.partition";
-static constexpr char kPartitionOutputsAttrName[] = "ttg.partition.outputs";
-static constexpr char kPartitionStagesAttrName[] = "ttg.partition.stages";
-static constexpr char kWarpSpecializeTagAttrName[] = "ttg.warp_specialize.tag";
-
 //===----------------------------------------------------------------------===//
 // PartitionSet
 //===----------------------------------------------------------------------===//
@@ -40,6 +35,7 @@ class Partition {
   ArrayRef<Operation *> getOps() const { return ops; }
   void addOp(Operation *op) { ops.push_back(op); }
   bool hasOp(Operation *op) const;
+  bool empty() const { return ops.empty(); }
 
   // Iterate the inputs of the partition. Input values are those that originate
   // from a different partition or a previous iteration of the current
@@ -127,8 +123,9 @@ void setPartition(Operation *op, const SetVector<Partition *> &partitions);
 // which does not work with Partition instances and iterate* functions, since
 // it does not keep the op attributes and the op list of a partition in sync.
 void setPartition(Operation *op, const SetVector<int> &partitionIds);
-
-std::optional<SetVector<int>> getPartitionIds(Operation *op);
+void setPartitionOutputs(Operation *op,
+                         ArrayRef<SetVector<int>> partitionOutputsIds);
+SmallVector<SetVector<int>, 4> getPartitionOutputs(Operation *op);
 
 } // namespace mlir::triton::gpu
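Read together with the Dialect.h hunk above: the `ttg.partition*` attribute-name constants and the `getPartitionIds` declaration move out of Transforms/Partition.h into the TritonGPU dialect header, presumably so code that only queries partition attributes need not depend on the transforms header, while Partition.h gains the `setPartitionOutputs`/`getPartitionOutputs` pair and `Partition::empty()`.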
