
Commit e488dea

Merge branch 'main' of https://github.com/intel/intel-xpu-backend-for-triton into amyachev/device

2 parents: 6c4df59 + 6588f0d

File tree: 52 files changed, +1167 −1140 lines


.github/actions/setup-pytorch/action.yml

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ runs:
        cd pytorch
        pip install wheel
        pip install -r requirements.txt
-       USE_STATIC_MKL=1 python setup.py bdist_wheel
+       USE_STATIC_MKL=1 CFLAGS="-Wno-error=maybe-uninitialized" python setup.py bdist_wheel

   - name: Install PyTorch (built from source)
     if: ${{ inputs.mode == 'source' }}

.github/workflows/integration-tests.yml

Lines changed: 2 additions & 2 deletions

@@ -236,7 +236,7 @@ jobs:
       - name: Install pip dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja lit
       - name: Install Triton
         env:
           CUDA_HOME: "/usr/local/cuda"
@@ -569,7 +569,7 @@ jobs:
           python3 -m venv ~/.venv
           source ~/.venv/bin/activate
           python3 -m pip install --upgrade pip
-          python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11
+          python3 -m pip install cython setuptools wheel cmake==3.24 ninja lit pybind11
       - name: Install Triton
         env:
           TRITON_BUILD_WITH_O1: "true"

.github/workflows/integration-tests.yml.in

Lines changed: 2 additions & 2 deletions

@@ -268,7 +268,7 @@ jobs:
      - name: Install pip dependencies
        run: |
          python3 -m pip install --upgrade pip
-         python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
+         python3 -m pip install cython setuptools wheel cmake==3.24 ninja lit

      - name: Install Triton
        env:
@@ -481,7 +481,7 @@ jobs:
          python3 -m venv ~/.venv
          source ~/.venv/bin/activate
          python3 -m pip install --upgrade pip
-         python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit pybind11
+         python3 -m pip install cython setuptools wheel cmake==3.24 ninja lit pybind11
      - name: Install Triton
        env:
          TRITON_BUILD_WITH_O1: "true"

.github/workflows/triton-benchmarks.yml

Lines changed: 0 additions & 28 deletions

@@ -158,21 +158,6 @@ jobs:
           python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
           python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

-      - name: Run Triton GEMM kernel benchmark - default path
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }}
-        run: |
-          cd benchmarks/triton_kernels_benchmark
-          # Default path:
-          TRITON_INTEL_ADVANCED_PATH=0 \
-          IGC_VISAOptions=" -enableBCR -nolocalra" \
-          IGC_DisableLoopUnroll=1 \
-          python gemm_benchmark.py --reports $REPORTS
-          mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-default-path.csv
-
-          source ../../scripts/capture-hw-details.sh
-          TAG="${TAG}-dflt"
-          python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-
       - name: Run Triton GEMM kernel benchmark - advanced path
         if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }}
         run: |
@@ -260,19 +245,6 @@ jobs:
           python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
           python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG

-      - name: Run Triton FA kernel benchmark - default path
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmark || '[]'), 'flash_attention_fwd_benchmark.py_default') }}
-        run: |
-          cd benchmarks/triton_kernels_benchmark
-          TRITON_INTEL_ADVANCED_PATH=0 \
-          TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT=1 \
-          IGC_VISAOptions=" -enableBCR" \
-          python flash_attention_fwd_benchmark.py --reports $REPORTS
-
-          TAG="${TAG}-dflt"
-          source ../../scripts/capture-hw-details.sh
-          python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-
       - name: Run Triton FA kernel benchmark - advanced path
         if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') }}
         run: |

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -185,7 +185,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
   include_directories(${PYTHON_SRC_PATH})

   # Python Interpreter is used to run lit tests
-  find_package(Python3 REQUIRED COMPONENTS Development Interpreter)
+  find_package(Python3 REQUIRED COMPONENTS Development.Module Interpreter)
   find_package(pybind11 CONFIG REQUIRED HINTS "${Python3_SITELIB}")

   if (DEFINED TRITON_PLUGIN_DIRS)

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 20 additions & 19 deletions

@@ -227,28 +227,28 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'M', 'K', 'N'],
+        x_names=['B', 'M', 'N', 'K'],
         # different possible values for `x_name`
         x_vals=[[1, 1024 * i, 1024 * i, 1024 * i] for i in [1, 2, 4, 8]] +  #
         [  #
-            [1, 1, 5120, 13824],  #
-            [1, 4, 4096, 12288],  #
+            [1, 1, 13824, 5120],  #
+            [1, 4, 12288, 4096],  #
             [1, 512, 8192, 8192],  #
             [1, 512, 8192, 32768],  #
             [1, 512, 32768, 8192],  #
-            [1, 1024, 16384, 8192],  #
-            [1, 1024, 28672, 8192],  #
-            [1, 3072, 4096, 3072],  # FIXME: Remove this case when gemm_streamk_benchmark can get better performance
-            [1, 4096, 16384, 8192],  #
-            [1, 8192, 16384, 1024],  #
-            [1, 8192, 16384, 4096],  #
+            [1, 1024, 8192, 16384],  #
+            [1, 1024, 8192, 28672],  #
+            [1, 3072, 3072, 4096],  # FIXME: Remove this case when gemm_streamk_benchmark can get better performance
+            [1, 4096, 8192, 16384],  #
+            [1, 8192, 1024, 16384],  #
+            [1, 8192, 4096, 16384],  #
             [1, 16384, 1024, 8192],  #
             [1, 16384, 4096, 8192],  #
             [1, 16384, 8192, 1024],  #
             [1, 16384, 8192, 4096],  #
             [4, 32768, 128, 4096],  #
             [4, 32768, 4096, 128],  #
-            [32, 4096, 4096, 128],  #
+            [32, 4096, 128, 4096],  #
             [4096, 8, 128, 16384],  #
             [4096, 8, 16384, 128]
         ],
@@ -268,6 +268,7 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
 def benchmark(B, M, N, K, provider):
     a_shape, b_shape = get_shapes(B, M, N, K, transpose_a=TRANSPOSE_A, transpose_b=TRANSPOSE_B)

+    torch.manual_seed(0)
     a = torch.rand(a_shape, device='xpu', dtype=torch.bfloat16)
     b = torch.rand(b_shape, device='xpu', dtype=torch.bfloat16)

@@ -291,10 +292,10 @@ def benchmark(B, M, N, K, provider):
     elif provider == 'triton':
         assert len(a.shape) == len(b.shape), 'Incompatible sizes'
         if len(a.shape) == 3:
-            c = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
+            c = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
         else:
             assert len(a.shape) == 2, 'Expecting shape of length 2'
-            c = torch.empty((M, N), device='xpu', dtype=torch.float32)
+            c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
         triton_fn = lambda: matmul(a, b, c, transpose_a=TRANSPOSE_A, transpose_b=TRANSPOSE_B)
         torch_fn = lambda: torch.matmul(torch_a, torch_b).to(torch.float32)
         rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
@@ -304,17 +305,17 @@ def benchmark(B, M, N, K, provider):
                                          kernel_name='matmul_kernel_with_block_pointers')
     elif provider == 'xetla':
         if B == 1:
-            c = torch.empty((M, N), device='xpu', dtype=torch.float32)
-            acc = torch.empty((M, N), device='xpu', dtype=torch.float32)
-            cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)
+            c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+            acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+            cnt = torch.zeros((M, N), device='xpu', dtype=torch.int32)
         else:
-            c = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
-            acc = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
-            cnt = torch.empty((B, M, N), device='xpu', dtype=torch.int32)
+            c = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
+            acc = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
+            cnt = torch.zeros((B, M, N), device='xpu', dtype=torch.int32)
         name = f'gemm_shape_{B}_{M}_{K}_{N}'
         # FIXME: Use gemm_streamk_benchmark.py when Triton streamk can get
         # better performance.
-        if (B, M, N, K) == (1, 3072, 4096, 3072):
+        if (B, M, N, K) == (1, 3072, 3072, 4096):
             name = 'gemm_streamk_shape_3072_4096_3072'
         func = getattr(xetla_kernel, name)
         xetla_fn = lambda: func(a, b, c, acc, cnt)
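Two of these changes are worth spelling out. First, x_names now reads ['B', 'M', 'N', 'K'], so each row of x_vals is consumed positionally as (B, M, N, K); the shape list is rewritten only to keep the same problem sizes under the new ordering, which also matches the benchmark(B, M, N, K, provider) signature. Second, seeding the RNG and switching the output buffers from torch.empty to torch.zeros makes the accuracy comparison deterministic, since results no longer depend on whatever memory the buffers happened to contain. A minimal sketch of that pattern, using hypothetical small CPU shapes instead of the benchmark's xpu tensors:

import torch

torch.manual_seed(0)  # reproducible inputs on every run

# Hypothetical shapes for illustration; the real benchmark takes them from x_vals.
M, N, K = 128, 256, 64
a = torch.rand((M, K), dtype=torch.bfloat16)
b = torch.rand((K, N), dtype=torch.bfloat16)

# Zero-initialized output: a kernel that accumulates into `c` or skips tiles
# starts from a known value rather than from uninitialized memory.
c = torch.zeros((M, N), dtype=torch.float32)
c += torch.matmul(a.float(), b.float())  # stand-in for the Triton/XeTLA kernel

ref = torch.matmul(a.float(), b.float())
torch.testing.assert_close(c, ref, rtol=1e-2, atol=1e-3)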

bin/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -13,6 +13,7 @@ target_link_libraries(triton-opt PRIVATE
   TritonTransforms
   TritonGPUTransforms
   TritonNvidiaGPUTransforms
+  TritonIntelLLVMIR
   MLIRGPUToROCDLTransforms
   ${dialect_libs}
   ${conversion_libs}
@@ -88,6 +89,7 @@ target_link_libraries(triton-llvm-opt PRIVATE
   LLVMSupport
   LLVMOption
   LLVMCodeGen
+  TritonIntelLLVMIR
   TritonIntelGPUIR
   )
 export_executable_symbols_for_plugins(triton-llvm-opt)

bin/triton-llvm-opt.cpp

Lines changed: 8 additions & 0 deletions

@@ -1,6 +1,7 @@
 /// Trimmed down clone of llvm opt to be able to test triton custom llvm ir
 /// passes.
 #include "lib/Target/LLVMIR/LLVMPasses.h"
+#include "third_party/intel/lib/LLVMIR/LLVMPasses.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -42,6 +43,11 @@
                    llvm::cl::desc("run pass to break phi struct"),
                    cl::init(false));

+static cl::opt<bool> FreezeMaskedDivRem(
+    "freeze-masked-div-rem",
+    llvm::cl::desc("run pass to insert freeze between masked load and div/rem"),
+    cl::init(false));
+
 namespace {
 static std::function<Error(Module *)> makeOptimizingPipeline() {
   return [](Module *m) -> Error {
@@ -62,6 +68,8 @@ static std::function<Error(Module *)> makeOptimizingPipeline() {
     llvm::FunctionPassManager fpm;
     if (BreakStructPhiNodes)
       fpm.addPass(BreakStructPhiNodesPass());
+    if (FreezeMaskedDivRem)
+      fpm.addPass(FreezeMaskedDivRemPass());
     mpm.addPass(createModuleToFunctionPassAdaptor(std::move(fpm)));
     mpm.run(*m, mam);
     return Error::success();

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
 // Return the minClusterId and maxClusterId for the given ForOp.
 std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
 std::pair<int, int> getStageCluster(Operation *op);
-void setStageCluster(scf::ForOp &forOp, Operation *op, int stage, int cluster);
+void setStageCluster(Operation *op, int stage, int cluster);
 } // namespace triton
 } // namespace mlir

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 3 additions & 8 deletions

@@ -64,11 +64,7 @@ class OpBuilderWithStage : public OpBuilder {
   OpTy createWithStage(Location location, int stage, int cluster,
                        Args &&...args) {
     OpTy op = OpBuilder::create<OpTy>(location, std::forward<Args>(args)...);
-    auto ctx = getContext();
-    op->setAttr(mlir::triton::kLoopStageAttrName,
-                IntegerAttr::get(IntegerType::get(ctx, 32), stage));
-    op->setAttr(mlir::triton::kLoopClusterAttrName,
-                IntegerAttr::get(IntegerType::get(ctx, 32), cluster));
+    tt::setStageCluster(op, stage, cluster);
     return op;
   }
   using OpBuilder::create;
@@ -204,9 +200,8 @@ static int createAsyncCopy(scf::ForOp forOp, tt::LoadOp loadOp, Value alloc,
     // Prefetch load if is not MMAV3 and is used by the dot.
     if (loadToInfo[loadOp].usedByDot) {
       assert(stageForFirstUse >= 1);
-      tt::setStageCluster(forOp, wait, stageForFirstUse - 1, maxClusterId + 1);
-      tt::setStageCluster(forOp, viewLoad, stageForFirstUse - 1,
-                          maxClusterId + 1);
+      tt::setStageCluster(wait, stageForFirstUse - 1, maxClusterId + 1);
+      tt::setStageCluster(viewLoad, stageForFirstUse - 1, maxClusterId + 1);
       retCode = stageForFirstUse - 1;
     }
   }
