Commit 2a7a540

Merge commit 'b39c1e14b8f2029bc6a8798e4914d2692edf97d8'
2 parents: 2c900de + b39c1e1

119 files changed: 10989 additions, 610 deletions


.github/workflows/integration-tests.yml

Lines changed: 3 additions & 3 deletions
@@ -262,13 +262,13 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         cd python/test/unit
-        python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
+        python3 -m pytest -s -n 8 --ignore=cuda/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
         python3 -m pytest -s -n 8 language/test_subprocess.py
         python3 -m pytest -s -n 8 test_debug.py --forked
         # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
         TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
-        # Run hopper/test_flashattention.py separately to avoid out of gpu memory
-        python3 -m pytest -s hopper/test_flashattention.py
+        # Run cuda/test_flashattention.py separately to avoid out of gpu memory
+        python3 -m pytest -s cuda/test_flashattention.py
         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
           python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
     - name: Run interpreter tests

.github/workflows/integration-tests.yml.in

Lines changed: 3 additions & 3 deletions
@@ -300,13 +300,13 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         cd python/test/unit
-        python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
+        python3 -m pytest -s -n 8 --ignore=cuda/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
         python3 -m pytest -s -n 8 language/test_subprocess.py
         python3 -m pytest -s -n 8 test_debug.py --forked
         # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
         TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
-        # Run hopper/test_flashattention.py separately to avoid out of gpu memory
-        python3 -m pytest -s hopper/test_flashattention.py
+        # Run cuda/test_flashattention.py separately to avoid out of gpu memory
+        python3 -m pytest -s cuda/test_flashattention.py
         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
           python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
     - name: Run interpreter tests

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ cmake-build-*
 cuobjdump
 nvdisasm
 ptxas
+ptxas-blackwell

 # Third-party include
 third_party/nvidia/backend/include

bin/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -13,6 +13,7 @@ target_link_libraries(triton-opt PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIROptLib
   MLIRPass
@@ -31,6 +32,7 @@ target_link_libraries(triton-reduce PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIRReduceLib
   MLIRPass
@@ -48,6 +50,7 @@ target_link_libraries(triton-lsp PRIVATE
   ${triton_libs}
   # tests
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
   # MLIR core
   MLIRLspServerLib
   MLIRPass
@@ -85,4 +88,5 @@ target_link_libraries(triton-tensor-layout PRIVATE
   ${conversion_libs}
   ${dialect_libs}
   TritonTestAnalysis
+  TritonTestDialectTritonGPU
 )

Lines changed: 6 additions & 5 deletions
@@ -1,8 +1,9 @@
 {
+  "ptxas-blackwell": "12.8.61",
   "ptxas": "12.4.99",
-  "cuobjdump": "12.4.99",
-  "nvdisasm": "12.4.99",
-  "cudacrt": "12.4.99",
-  "cudart": "12.4.99",
-  "cupti": "12.4.99"
+  "cuobjdump": "12.8.55",
+  "nvdisasm": "12.8.55",
+  "cudacrt": "12.8.61",
+  "cudart": "12.8.57",
+  "cupti": "12.8.57"
 }

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 2 additions & 0 deletions
@@ -25,6 +25,8 @@ namespace triton {
 constexpr int patternBenefitDefault = 1;
 constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
 constexpr int patternBenefitClampOptimizedPattern = 20;
+constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
+constexpr int patternBenefitNvidiaTensorCoreSubviewPattern = 20;

 struct BackendCallbacks {
   /**
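
These constants plug into MLIR's pattern-benefit mechanism: when several rewrite patterns match the same op, the one registered with the higher benefit is tried first, so an optimized lowering can outrank the generic one. A minimal C++ sketch of how such a constant is typically consumed when registering a pattern; the pattern class, target op, and rewrite body below are illustrative, not from this commit:

#include "mlir/IR/PatternMatch.h"
#include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

// Hypothetical optimized pattern: registered with benefit 20 so it is
// attempted before any default-benefit (1) pattern on the same op.
struct ConvertLayoutOptimizedPattern
    : public mlir::OpRewritePattern<mlir::triton::gpu::ConvertLayoutOp> {
  ConvertLayoutOptimizedPattern(mlir::MLIRContext *ctx)
      : mlir::OpRewritePattern<mlir::triton::gpu::ConvertLayoutOp>(
            ctx, mlir::triton::patternBenefitConvertLayoutOptimizedPattern) {}

  mlir::LogicalResult
  matchAndRewrite(mlir::triton::gpu::ConvertLayoutOp op,
                  mlir::PatternRewriter &rewriter) const override {
    // Emit the optimized lowering here, or return failure() so the
    // lower-benefit generic pattern handles the op instead.
    return mlir::failure();
  }
};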

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 58 additions & 8 deletions
@@ -669,14 +669,14 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 // DotScaled Op
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
-                           AttrSizedOperandSegments,
-                           DotLike,
-                           TypesMatchWith<"result's type matches accumulator's type",
-                                          "d", "c", "$_self">]> {
+                  AttrSizedOperandSegments,
+                  DotLike,
+                  TypesMatchWith<"result's type matches accumulator's type",
+                                 "d", "c", "$_self">]> {
   let summary = "dot_scaled";

   let description = [{
-    $d = matrix_multiply(scale($lhs, $lhs_scale), scale($rhs, $rhs_scale)) + $c.
+    $d = matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale)) + $c.
     Where scale(x, s) is a function that applies the scale per block following microscaling spec.
   }];

@@ -687,16 +687,15 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     RankedTensorOf<[TT_Float,I8]>:$lhs,
     RankedTensorOf<[TT_Float,I8]>:$rhs,
     TT_FloatTensor:$c,
-    Optional<RankedTensorOf<[I8]>>:$lhs_scale,
-    Optional<RankedTensorOf<[I8]>>:$rhs_scale,
+    Optional<RankedTensorOf<[TT_Float, I8]>>:$lhs_scale,
+    Optional<RankedTensorOf<[TT_Float, I8]>>:$rhs_scale,
     TT_ScaleDotElemTypeAttr:$lhs_type,
     TT_ScaleDotElemTypeAttr:$rhs_type,
     BoolAttr:$fastMath
   );

   let results = (outs TT_FloatTensor:$d);

-  // Not sure why I need to fully specify the optional group, but otherwise it complains when loading the mlir file
   let assemblyFormat = [{
     $lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
     `:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
@@ -1297,6 +1296,57 @@ def TT_ExperimentalDescriptorStoreOp : TT_Op<"experimental_descriptor_store", [
   let hasVerifier = 1;
 }

+def TT_ExperimentalDescriptorGatherOp : TT_Op<"experimental_descriptor_gather", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
+  let summary = "gather multiple rows from a descriptor into a single tensor";
+  let description = [{
+    The `tt.experimental_descriptor_gather` op will be lowered to NVIDIA TMA
+    load operations on targets that support it.
+
+    `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
+    The descriptor block must have 1 row and the indices must be a 1D tensor.
+    Accordingly, the result is a 2D tensor with multiple rows.
+
+    This is an escape hatch and is only there for testing/experimenting. This
+    op will be removed in the future.
+  }];
+
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    RankedTensorOf<[I32]>:$x_offsets,
+    I32:$y_offset
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $desc `[` $x_offsets `,` $y_offset `]`
+    attr-dict `:` functional-type(operands, results)
+  }];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    // TMA gathers have restrictions on the minimum size of the gather result.
+    // This function verifies the result type.
+    static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
+  }];
+}
+
+def TT_ExperimentalDescriptorScatterOp : TT_Op<"experimental_descriptor_scatter", [
+  MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>,
+]> {
+  let arguments = (ins
+    TT_TensorDescType:$desc,
+    RankedTensorOf<[I32]>:$x_offsets,
+    I32:$y_offset,
+    TT_Tensor:$src
+  );
+
+  let assemblyFormat = [{
+    $desc `[` $x_offsets `,` $y_offset `]` `,` $src
+    attr-dict `:` type(operands)
+  }];
+}
+
 def TT_ExperimentalTensormapCreateOp: TT_Op<
   "experimental_tensormap_create",
   [
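
Since the gather op is defined via ODS, tablegen emits default builders derived from its `arguments`/`results` lists. A hypothetical C++ sketch of constructing the op; the helper name and the assumed generated-builder signature are illustrative, not quoted from the commit:

#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

// Hypothetical helper: gather the rows selected by the 1D i32 tensor
// `xOffsets`, at column `yOffset`, from descriptor `desc` into a 2D tensor
// of type `resultTy`.
static mlir::Value emitDescriptorGather(mlir::OpBuilder &b, mlir::Location loc,
                                        mlir::Type resultTy, mlir::Value desc,
                                        mlir::Value xOffsets,
                                        mlir::Value yOffset) {
  return b.create<mlir::triton::ExperimentalDescriptorGatherOp>(
      loc, resultTy, desc, xOffsets, yOffset);
}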

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 6 additions & 0 deletions
@@ -167,6 +167,12 @@ template <typename VecT> bool isConsecutive(const VecT &vec) {
   return isConsecutive(ArrayRef(vec));
 }

+template <typename T> auto seq(T start, T end, T step) {
+  auto len = ceil<T>(end - start, step);
+  return llvm::map_range(llvm::seq<T>(0, len),
+                         [=](T i) { return start + i * step; });
+}
+
 } // namespace triton
 } // namespace mlir
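
The new `seq(start, end, step)` overload builds a lazy strided half-open range on top of `llvm::seq` and this header's `ceil` helper. A minimal usage sketch (the wrapper function is illustrative, not part of the commit): with `start=0`, `end=10`, `step=3`, the length is `ceil(10, 3) = 4`, so the range yields 0, 3, 6, 9.

#include "triton/Dialect/Triton/IR/Utility.h"
#include "llvm/Support/raw_ostream.h"

void printStridedRange() {
  // Iterate the half-open strided range [0, 10) with step 3.
  for (int i : mlir::triton::seq(0, 10, 3))
    llvm::outs() << i << ' '; // prints: 0 3 6 9
}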

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 11 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include "triton/Tools/LinearLayout.h"

 namespace mlir::triton::gpu {
+class SharedEncodingAttr;

 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -41,6 +42,16 @@ namespace mlir::triton::gpu {
 LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
                             std::optional<int32_t> elemBitWidth = std::nullopt);

+// Convert the shared encoding of a tensor with `hasLeadingOffset=true` to a
+// LinearLayout that maps from a linear shared memory offset to tensor index.
+//
+// If `disableSwizzle` is set, then the resulting layout does not include
+// swizzling.
+LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,
+                                               SharedEncodingAttr shared,
+                                               int32_t elemBitWidth,
+                                               bool disableSwizzle = false);
+
 // Given a linear layout where the input dimensions contain a "block" dimension,
 // this method sets the "block" dimension to 0 and removes the corresponding
 // output dimensions.
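
A minimal call sketch for the new declaration, assuming `enc` is a `SharedEncodingAttr` with `hasLeadingOffset = true`; the wrapper function, shape, and bit width are illustrative, not from the commit:

#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

// Build the smem-offset -> tensor-index layout for a 128x64 tensor of
// 16-bit elements; pass disableSwizzle=true to drop the swizzle component.
mlir::triton::LinearLayout
buildSmemLayout(mlir::triton::gpu::SharedEncodingAttr enc) {
  return mlir::triton::gpu::sharedToLinearLayoutLeadingOffset(
      /*shape=*/{128, 64}, /*shared=*/enc, /*elemBitWidth=*/16);
}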

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 18 additions & 0 deletions
@@ -23,6 +23,24 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
   ];
 }

+def TritonGPUTC05MMAPipeline : Pass<"tritongpu-tc05mma-pipeline", "mlir::ModuleOp"> {
+  let summary = "Test pass calling TC05MMA pipeline";
+
+  let description = [{
+    This pass is used to test the TC05MMA pipelining under LIT. Internally it calls
+    `getTC05MMASchedule` to get the schedule and then applies the pipelining.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::scf::SCFDialect",
+                           "mlir::arith::ArithDialect"];
+
+  let options = [
+    Option<"disableExpander", "disable-expander", "bool", /*default*/"false", "Run only loop pre-process">
+  ];
+}
+
 def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
   let summary = "test assigning latencies to interesting ops ahead of pipelining";
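
Given standard MLIR pass registration, a LIT test would presumably drive this pass as `triton-opt -tritongpu-tc05mma-pipeline input.mlir`, or `triton-opt -tritongpu-tc05mma-pipeline=disable-expander=true input.mlir` to stop after the loop pre-process step; the flag spellings are inferred from the pass argument and option declared above, not quoted from the commit.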
