
Commit 052234d

Merge branch 'main' into lesh/conda-oct
2 parents: 5719ed9 + 4c5296d

41 files changed: +1332 / −274 lines


.github/workflows/integration-tests.yml

Lines changed: 8 additions & 8 deletions
@@ -245,9 +245,9 @@ jobs:
          lit -v "${LIT_TEST_DIR}"
      - name: Run python tests on CUDA
        run: |
-         SHARED_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/_C"
-         if [ ! -d "${SHARED_LIB_DIR}" ]; then
-           echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
+         INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
+         if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
+           echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
          fi
          cd python/test/unit
          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -257,7 +257,7 @@ jobs:
          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
          # Run hopper/test_flashattention.py separately to avoid out of gpu memory
          python3 -m pytest -s hopper/test_flashattention.py
-         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
+         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
            python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py
      - name: Run interpreter tests
        if: ${{ matrix.runner[0] == 'h100-runner-set' }}
@@ -401,9 +401,9 @@ jobs:
          lit -v "${LIT_TEST_DIR}"
      - name: Run python tests on HIP
        run: |
-         SHARED_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/_C"
-         if [ ! -d "${SHARED_LIB_DIR}" ]; then
-           echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
+         INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
+         if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
+           echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
          fi
          pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
          cd python/test/unit
@@ -412,7 +412,7 @@ jobs:
            --ignore=test_debug.py
          # TODO: uncomment
          # pytest --capture=tee-sys -rfs test_debug.py
-         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
+         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
            pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

          # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0

.github/workflows/integration-tests.yml.in

Lines changed: 8 additions & 8 deletions
@@ -279,9 +279,9 @@ jobs:

      - name: Run python tests on CUDA
        run: |
-         SHARED_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/_C"
-         if [ ! -d "${SHARED_LIB_DIR}" ]; then
-           echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
+         INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
+         if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
+           echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
          fi
          cd python/test/unit
          python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
@@ -291,7 +291,7 @@ jobs:
          TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
          # Run hopper/test_flashattention.py separately to avoid out of gpu memory
          python3 -m pytest -s hopper/test_flashattention.py
-         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
+         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
            python3 -m pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

      - name: Run interpreter tests
@@ -397,9 +397,9 @@ jobs:

      - name: Run python tests on HIP
        run: |
-         SHARED_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/_C"
-         if [ ! -d "${SHARED_LIB_DIR}" ]; then
-           echo "Coult not find '${SHARED_LIB_DIR}'" ; exit -1
+         INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
+         if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
+           echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
          fi
          pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
          cd python/test/unit
@@ -408,7 +408,7 @@ jobs:
            --ignore=test_debug.py
          # TODO: uncomment
          # pytest --capture=tee-sys -rfs test_debug.py
-         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
+         TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${INSTRUMENTATION_LIB_DIR}/libGPUInstrumentationTestLib.so \
            pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

          # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0

benchmarks/triton_kernels_benchmark/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
-from .benchmark_testing import do_bench, assert_close, perf_report, Benchmark, USE_IPEX_OPTION  # type: ignore # noqa: F401
+from .benchmark_testing import do_bench, assert_close, perf_report, Benchmark, USE_IPEX_OPTION, BENCHMARKING_METHOD  # type: ignore # noqa: F401

-if USE_IPEX_OPTION:
+if USE_IPEX_OPTION or BENCHMARKING_METHOD == "UPSTREAM_PYTORCH_PROFILER":
     from triton.runtime import driver
     from . import benchmark_driver
     # replace the launcher with the profilier hook.

benchmarks/triton_kernels_benchmark/benchmark_driver.py

Lines changed: 23 additions & 12 deletions
@@ -10,15 +10,15 @@
 from triton.runtime.build import _build, quiet

 import torch
-import intel_extension_for_pytorch
+
+from .benchmark_testing import USE_IPEX_OPTION

 _dirname = os.getenv("ZE_PATH", default="/usr/local")

 include_dir = [
     os.path.join(_dirname, "include"),
     os.path.join(torch.utils.cmake_prefix_path, "../../include"),
-    os.path.join(torch.utils.cmake_prefix_path, "../../include/torch/csrc/api/include"),
-    os.path.join(intel_extension_for_pytorch.cmake_prefix_path, "../../include")
+    os.path.join(torch.utils.cmake_prefix_path, "../../include/torch/csrc/api/include")
 ]

 oneapi_root = os.getenv("ONEAPI_ROOT")
@@ -28,12 +28,15 @@
     os.path.join(oneapi_root, "compiler/latest/include/sycl")
 ]

-library_dir = [
-    os.path.join(_dirname, "lib"),
-    os.path.join(torch.utils.cmake_prefix_path, "../../lib"),
-    os.path.join(intel_extension_for_pytorch.cmake_prefix_path, "../../lib")
-]
-libraries = ["ze_loader", "sycl", "torch", "intel-ext-pt-gpu"]
+library_dir = [os.path.join(_dirname, "lib"), os.path.join(torch.utils.cmake_prefix_path, "../../lib")]
+libraries = ["ze_loader", "sycl", "torch"]
+
+if USE_IPEX_OPTION:
+    import intel_extension_for_pytorch
+
+    include_dir.append(os.path.join(intel_extension_for_pytorch.cmake_prefix_path, "../../include"))
+    library_dir.append(os.path.join(intel_extension_for_pytorch.cmake_prefix_path, "../../lib"))
+    libraries.append("intel-ext-pt-gpu")


 def compile_module_from_src(src, name):
@@ -141,6 +144,14 @@ def format_of(ty):
     fmt = "iiiOOOOOO" + args_format
     args_list = ", " + ", ".join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ""

+    record_function_header = "#include <ATen/record_function.h>"
+    ipex_header = ""
+    xpu_profiler_record = ""
+    if USE_IPEX_OPTION:
+        record_function_header = "#include <torch/extension.h>"
+        ipex_header = "#include <ipex.h>"
+        xpu_profiler_record = "xpu::profiler_record(kernel_name, event);"
+
     # generate glue code
     src = f"""
 #include <cstddef>
@@ -149,8 +160,8 @@ def format_of(ty):
 #include <iomanip>
 #include <level_zero/ze_api.h>
 #include <sycl/sycl.hpp>
-#include <torch/extension.h>
-#include <ipex.h>
+{record_function_header}
+{ipex_header}

 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #include <Python.h>
@@ -291,7 +302,7 @@ def format_of(ty):
     }}
   }};
   auto event = stream.submit(cgf);
-  xpu::profiler_record(kernel_name, event);
+  {xpu_profiler_record}
 }}
 // end sycl
 static PyObject* launch(PyObject* self, PyObject* args) {{
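For context on the two profiling paths: with IPEX the generated SYCL glue code reports kernel events through `xpu::profiler_record`, while the non-IPEX path pulls in `<ATen/record_function.h>` instead, presumably so kernel launches stay visible to the upstream PyTorch profiler. A minimal hedged sketch of that ATen mechanism follows; the helper name and arguments are illustrative assumptions, not code from this commit.

// Illustrative sketch only: RECORD_FUNCTION is the profiler guard provided by
// <ATen/record_function.h>. Wrapping a launch in it makes the kernel show up
// in torch.profiler traces, which do_bench_upstream_pytorch_profiler later
// reads back via prof.events(). The surrounding function is hypothetical.
#include <ATen/record_function.h>
#include <string>
#include <vector>

void launch_with_profiling(const std::string &kernel_name) {
  RECORD_FUNCTION(kernel_name, std::vector<c10::IValue>());
  // ... submit the SYCL kernel here; the guard ends when the scope exits ...
}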

benchmarks/triton_kernels_benchmark/benchmark_testing.py

Lines changed: 6 additions & 4 deletions
@@ -213,16 +213,18 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no

     function_events = prof.events()

-    functions = []
+    all_functions = []
     if isinstance(kernel_name, str):
         kernel_name = [kernel_name]
     for ker_name in kernel_name:
-        functions.extend(list(filter(lambda x: x.name.startswith(ker_name), function_events)))  # pylint: disable=cell-var-from-loop
+        functions = list(filter(lambda x: x.name.startswith(ker_name), function_events))  # pylint: disable=cell-var-from-loop
+        assert len(functions) == n_repeat, f"the profiling number for kernel: '{ker_name}' not match, {len(functions)}"
+        all_functions.append(functions)
     # profiling_func_filter = filter(lambda x: x.name.startswith("__profile_kernel_of_func"), function_events)

-    assert len(functions) == n_repeat, f"the profiling number not match, {len(functions)}"
     # Make the time to the milliseconds.
-    times = torch.tensor([f.self_device_time_total * 1e-3 for f in functions], dtype=torch.float)
+    times = torch.tensor([sum(map(lambda elem: elem.self_device_time_total, f)) * 1e-3 for f in zip(*all_functions)],
+                         dtype=torch.float)
     return _summarize_statistics(times, quantiles, return_mode)

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 5 additions & 0 deletions
@@ -309,6 +309,10 @@ def benchmark(B, M, N, K, provider):
         acc = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
         cnt = torch.empty((B, M, N), device='xpu', dtype=torch.int32)
         name = f'gemm_shape_{B}_{M}_{K}_{N}'
+        # FIXME: Use gemm_streamk_benchmark.py when Triton streamk can get
+        # better performance.
+        if (B, M, N, K) == (1, 3072, 4096, 3072):
+            name = 'gemm_streamk_shape_3072_4096_3072'
         func = getattr(xetla_kernel, name)
         xetla_fn = lambda: func(a, b, c, acc, cnt)
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
@@ -338,6 +342,7 @@ def benchmark(B, M, N, K, provider):
             'gemm_shape_32_4096_4096_128': 'Test_32x4096x4096x128_row_row',
             'gemm_shape_4096_8_128_16384': 'Test_4096x8x128x16384_row_row',
             'gemm_shape_4096_8_16384_128': 'Test_4096x8x16384x128_row_row',
+            'gemm_streamk_shape_3072_4096_3072': 'stream_k_gemm_run',
         }

         # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')

benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py

Lines changed: 2 additions & 3 deletions
@@ -293,9 +293,8 @@ def benchmark(M, N, K, provider):
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)

         # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
-        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(
-            xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
-            kernel_name='gpu::xetla::kernel::gemm_universal_t<dispatch_stream_k')
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10,
+                                                                 quantiles=quantiles, kernel_name='stream_k_gemm_run')
     else:
         raise NotImplementedError(f'Unsupported provider {provider}')

bin/RegisterTritonDialects.h

Lines changed: 2 additions & 0 deletions
@@ -88,6 +88,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
+  mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
+  mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();

   // TODO: register Triton & TritonGPU passes
   registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 22 additions & 4 deletions
@@ -27,15 +27,33 @@ constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
 constexpr int patternBenefitClampOptimizedPattern = 20;
 constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;

+struct BackendCallbacks {
+  /**
+   * A backend-specific callback for appending auxiliary data during
+   * `LocalStoreOp` conversion.
+   *
+   * @param[in] op The reference to the re-written `LocalStoreOp`.
+   * @param[in] count The number of issued LLVM instructions.
+   * @param[in] type The input type of issued LLVM instructions.
+   */
+  std::function<void(triton::gpu::LocalStoreOp op, size_t llvmOpCount,
+                     Type llvmOpType)>
+      localStoreOpConversion = nullptr;
+};
+
 void populateElementwiseOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
     ModuleAxisInfoAnalysis &axisInfoAnalysis, const TargetInfoBase &targetInfo,
     PatternBenefit benefit);

-void populateMemoryOpToLLVMPattern(LLVMTypeConverter &typeConverter,
-                                   const TargetInfoBase &targetInfo,
-                                   RewritePatternSet &patterns,
-                                   PatternBenefit benefit);
+// The given callback is invoked at the end of a successful rewrite. The
+// callback receives 1) the current source op, 2) the number of issued LLVM
+// instructions and 3) their input types. Each MLIR backend can provide a
+// callback and, thus, handle backend-specific behaviors.
+void populateMemoryOpToLLVMPattern(
+    LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
+    RewritePatternSet &patterns, PatternBenefit benefit,
+    std::optional<BackendCallbacks> backendCallbacks = std::nullopt);

 void populateAssertOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                    RewritePatternSet &patterns,
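To make the new hook concrete, here is a minimal sketch of how a backend might fill in `BackendCallbacks` and pass it to `populateMemoryOpToLLVMPattern`. Only the struct and the function signature come from this commit; the wrapper function, the lambda body, and its purpose are hypothetical.

// Hypothetical backend-side usage of the optional callback. The callback fires
// after each successful LocalStoreOp rewrite with the number and input type of
// the LLVM instructions that were emitted for it.
void populateBackendMemoryPatterns(LLVMTypeConverter &typeConverter,
                                   const TargetInfoBase &targetInfo,
                                   RewritePatternSet &patterns,
                                   PatternBenefit benefit) {
  BackendCallbacks callbacks;
  callbacks.localStoreOpConversion = [](triton::gpu::LocalStoreOp op,
                                        size_t llvmOpCount, Type llvmOpType) {
    // Backend-specific bookkeeping, e.g. recording how many shared-memory
    // stores were emitted so instruction-scheduling hints can be attached.
  };
  populateMemoryOpToLLVMPattern(typeConverter, targetInfo, patterns, benefit,
                                callbacks);
}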

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 5 additions & 5 deletions
@@ -1366,11 +1366,11 @@ SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target);

-void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
-                              Type elemLlvmTy, ArrayRef<Value> srcVals,
-                              Value smemBase, ArrayRef<Value> dstStrides,
-                              Location loc, RewriterBase &rewriter,
-                              const TargetInfoBase &target);
+void storeDistributedToShared(
+    MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy,
+    ArrayRef<Value> srcVals, Value smemBase, ArrayRef<Value> dstStrides,
+    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
+    std::pair<size_t, Type> *const llvmOpCount = nullptr);

 inline Value getStructFromSharedMemoryObject(Location loc,
                                              const SharedMemoryObject &smemObj,
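The new out-parameter is presumably what feeds the `localStoreOpConversion` callback above. A rough sketch of how a caller could thread it through follows; only the `storeDistributedToShared` signature is taken from this commit, and the surrounding variables (`dstTy`, `srcVals`, `op`, `backendCallbacks`, and so on) are assumed to exist in the lowering pattern.

// Assumed call-site wiring inside a LocalStoreOp lowering: collect the count
// and element type of the LLVM ops emitted by the store, then hand them to
// the optional backend callback if one was registered.
std::pair<size_t, Type> llvmOpCount{0, Type()};
storeDistributedToShared(dstTy, srcTy, elemLlvmTy, srcVals, smemBase,
                         dstStrides, loc, rewriter, targetInfo, &llvmOpCount);
if (backendCallbacks && backendCallbacks->localStoreOpConversion)
  backendCallbacks->localStoreOpConversion(op, llvmOpCount.first,
                                           llvmOpCount.second);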
