@@ -98,7 +98,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     # -- benchmark --
     fpath = Path(f"logs/{name}/{batch}-{dim1}-{dim2}-{n_expts_tot}-{n_expts_act}-{x_dtype}-{w_dtype}.hatchet")
     fpath.parent.mkdir(parents=True, exist_ok=True)
-    x_dtype = {"bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
+    x_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
     # special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
     if x_dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
         x_dtype = torch.float8_e4m3fnuz
@@ -140,17 +140,29 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
         min_time = max(min_time_flops, min_time_bytes)
         util = min_time / tot_time
     else:
-        util = "N/A"
+        util = 0.0
     tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
     tbps = tot_bytes / tot_time * 1e-3
+    print(f"Utilization: {util:.0%}; {tflops:>6.1f} TFLOPs, {tbps:.1f} TB/s")

     return util, tflops, tbps


 if __name__ == "__main__":
     has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or get_cdna_version() == 4
-    qxdtype = "fp8" if has_native_mx4 else "bf16"
-    print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
-    print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
+    if SPECS is None:
+        print("Current GPU has no specs provided, utilization is N/A")
+    if has_native_mx4:
+        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
+        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "mx4", TP=1, EP=1, name="dense")
+        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
+        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "mx4", TP=4, EP=1, name="llama4")
+    else:
+        # bf16/fp16 x fp8 is skipped because matmul_ogs requires x and w to have
+        # the same type when not doing mxfp operations
+        bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense")
+        bench_mlp(8192, 8192, 8192, 1, 1, "fp16", "mx4", TP=1, EP=1, name="dense")
+        bench_mlp(8192, 8192, 8192, 1, 1, "bf16", "mx4", TP=1, EP=1, name="dense")
+        bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4")
+        bench_mlp(2048, 5120, 8192, 128, 4, "bf16", "mx4", TP=4, EP=1, name="llama4")
+        bench_mlp(2048, 5120, 8192, 128, 4, "fp16", "mx4", TP=4, EP=1, name="llama4")