
Commit 835497a

Merge branch 'main' into gregory/windows-support
2 parents 97d2441 + c05fe4f

File tree

74 files changed, +4502 -914 lines changed


.github/pins/pytorch-upstream.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-487873f7cafeb0fd390eaefe40496b804bceabbd
+0efa590d435d2b4aefcbad9014dd5fa75dcf8405

.github/workflows/auto-update-translator-cid.yml

Lines changed: 0 additions & 1 deletion

@@ -86,7 +86,6 @@ jobs:
       - name: Search the latest valid Translator cid
         if: ${{ env.TARGET_PRID == null }}
         run: |
-          env
           ./scripts/check-update-translator-cid.sh $CID_LATEST $CID_CURRENT
           if git status --porcelain ./lib/Target/SPIRV/spirv-llvm-translator.conf | grep '^ M'; then
             echo "MODIFIED=true" >> $GITHUB_ENV

.github/workflows/integration-tests.yml

Lines changed: 14 additions & 7 deletions

@@ -239,14 +239,14 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"
       - name: Run python tests on CUDA
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
           python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py

@@ -268,14 +268,16 @@ jobs:
           language/test_random.py language/test_block_pointer.py language/test_subprocess.py language/test_line_info.py \
           runtime/test_autotuner.py::test_kwargs[False]\
           ../../tutorials/06-fused-attention.py::test_op --device cpu
+      - name: Run regression tests
+        run: |
+          cd python/test/regression
+          python3 -m pytest -s -n 8 .
       - name: Run C++ unittests
         run: |
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest -j32
       - name: Run Proton tests
-        env:
-          LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
         run: |
           cd third_party/proton
           python3 -m pytest -s test

@@ -395,14 +397,14 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"
       - name: Run python tests on HIP
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit

@@ -416,10 +418,15 @@ jobs:

           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py
+      - name: Run regression tests
+        run: |
+          # Reenable test_functional_regression.py once it's fixed
+          cd python/test/regression
+          python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
           cd third_party/proton
-          python3 -m pytest test
+          python3 -m pytest -s test
       - name: Run C++ unittests
         run: |
           cd python

.github/workflows/integration-tests.yml.in

Lines changed: 15 additions & 9 deletions

@@ -272,15 +272,15 @@ jobs:
           cd python
           LIT_TEST_DIR="build/$(ls build | grep -i cmake)/test"
           if [ ! -d "${LIT_TEST_DIR}" ]; then
-            echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
+            echo "Could not find '${LIT_TEST_DIR}'" ; exit -1
           fi
           lit -v "${LIT_TEST_DIR}"

       - name: Run python tests on CUDA
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/build/$(ls python/build | grep -i lib)/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           cd python/test/unit
           python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py

@@ -304,16 +304,20 @@ jobs:
           runtime/test_autotuner.py::test_kwargs[False]\
           ../../tutorials/06-fused-attention.py::test_op --device cpu

+      - name: Run regression tests
+        run: |
+          cd python/test/regression
+          python3 -m pytest -s -n 8 .
+
       - &run-cpp-unittests-step
         name: Run C++ unittests
         run: |
           cd python
           cd "build/$(ls build | grep -i cmake)"
           ctest -j32

-      - name: Run Proton tests
-        env:
-          LD_LIBRARY_PATH: "/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+      - &run-proton-tests-step
+        name: Run Proton tests
         run: |
           cd third_party/proton
           python3 -m pytest -s test

@@ -398,7 +402,7 @@ jobs:
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
-            echo "Coult not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
+            echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           cd python/test/unit

@@ -413,11 +417,13 @@ jobs:
           # Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
           TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s -n 8 language/test_line_info.py

-      - name: Run Proton tests
+      - name: Run regression tests
         run: |
-          cd third_party/proton
-          python3 -m pytest test
+          # Reenable test_functional_regression.py once it's fixed
+          cd python/test/regression
+          python3 -m pytest -s -n 8 ./test_cast_matmul.py

+      - *run-proton-tests-step
       - *run-cpp-unittests-step
       - *save-build-artifacts-step
       - *inspect-cache-directories-step

benchmarks/setup.py

Lines changed: 34 additions & 8 deletions

@@ -125,11 +125,37 @@ def run(self):
         super().run()


-setup(name="triton-kernels-benchmark", packages=[
-    "triton_kernels_benchmark",
-], package_dir={
-    "triton_kernels_benchmark": "triton_kernels_benchmark",
-}, package_data={"triton_kernels_benchmark": ["xetla_kernel.cpython-*.so"]}, cmdclass={
-    "build_ext": build_ext,
-    "clean": clean,
-}, ext_modules=[CMakeExtension("triton_kernels_benchmark")])
+def get_git_commit_hash(length=8):
+    try:
+        cmd = ["git", "rev-parse", f"--short={length}", "HEAD"]
+        return f"+git{subprocess.check_output(cmd).strip().decode('utf-8')}"
+    except (
+            FileNotFoundError,
+            subprocess.CalledProcessError,
+            subprocess.TimeoutExpired,
+    ):
+        return ""
+
+
+setup(
+    name="triton-kernels-benchmark",
+    version="3.1.0" + get_git_commit_hash(),
+    packages=["triton_kernels_benchmark"],
+    install_requires=[
+        "torch",
+        "pandas",
+        "tabulate",
+        "matplotlib",
+    ],
+    package_dir={"triton_kernels_benchmark": "triton_kernels_benchmark"},
+    package_data={"triton_kernels_benchmark": ["xetla_kernel.cpython-*.so"]},
+    cmdclass={
+        "build_ext": build_ext,
+        "clean": clean,
+    },
+    ext_modules=[CMakeExtension("triton_kernels_benchmark")],
+    extra_require={
+        "ipex": ["numpy<=2.0", "intel-extension-for-pytorch=2.1.10"],
+        "pytorch": ["torch>=2.6"],
+    },
+)
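
For reference, a minimal sketch (illustrative, not part of the commit) of what the new version logic produces. It assumes git is on PATH and the working directory is inside a checkout; outside one, the helper falls back to an empty suffix.

# Illustrative sketch of the versioning behaviour added above.
import subprocess

def get_git_commit_hash(length=8):
    try:
        cmd = ["git", "rev-parse", f"--short={length}", "HEAD"]
        return f"+git{subprocess.check_output(cmd).strip().decode('utf-8')}"
    except (FileNotFoundError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
        return ""

# In a git checkout this prints something like "3.1.0+git<8-char-sha>";
# in a source export without git metadata it prints plain "3.1.0".
print("3.1.0" + get_git_commit_hash())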

benchmarks/triton_kernels_benchmark/benchmark_driver.py

Lines changed: 1 addition & 0 deletions

@@ -405,6 +405,7 @@ def serialize_kernel_metadata(arg, args_dict):
     args_dict["shared_memory"] = arg.shared
     args_dict["kernel_name"] = arg.name
     args_dict["spv_name"] = f"{arg.name}.spv"
+    args_dict["build_flags"] = arg.build_flags


 def serialize_args(args, constants, signature):
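
A rough sketch of the per-kernel metadata dict this function now emits; the values below are hypothetical placeholders, only the keys come from the code above.

# Hypothetical example of serialized kernel metadata after this change.
args_dict = {
    "shared_memory": 0,                 # arg.shared
    "kernel_name": "example_kernel",    # arg.name
    "spv_name": "example_kernel.spv",   # f"{arg.name}.spv"
    "build_flags": "",                  # arg.build_flags, newly recorded
}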

benchmarks/triton_kernels_benchmark/benchmark_testing.py

Lines changed: 1 addition & 2 deletions

@@ -153,8 +153,7 @@ def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quan
     warmup_time = n_warmup * estimate_ms
     rep_time = n_repeat * estimate_ms

-    times = triton_do_bench(fn, warmup=warmup_time, rep=rep_time, grad_to_none=grad_to_none, return_mode="all",
-                            device_type=device)
+    times = triton_do_bench(fn, warmup=warmup_time, rep=rep_time, grad_to_none=grad_to_none, return_mode="all")
     times = torch.tensor(times, dtype=torch.float)
     return _summarize_statistics(times, quantiles, return_mode)

benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py

Lines changed: 21 additions & 7 deletions

@@ -3,6 +3,7 @@
 import triton.language as tl

 import triton_kernels_benchmark as benchmark_suit
+import xetla_kernel

 if benchmark_suit.USE_IPEX_OPTION:
     import intel_extension_for_pytorch  # type: ignore  # noqa: F401

@@ -131,9 +132,9 @@ def forward(ctx, a, b, c, acc_dtype=None):
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
         # possible values for `line_arg``
-        line_vals=['triton'],
+        line_vals=['triton', 'xetla'],
         # label name for the lines
-        line_names=['Triton'],
+        line_names=['Triton', 'XeTLA'],
         # line styles
         styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
         ylabel=['GB/s', 'TFlops'],  # label name for the y-axis

@@ -148,23 +149,36 @@ def benchmark(M, N, K, provider):
     quantiles = [0.5, 0.0, 1.0]

     if provider == 'onednn':
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b), n_warmup=10, n_repeat=10,
-                                                              quantiles=quantiles)
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b), n_warmup=10, n_repeat=10,
+                                                                 quantiles=quantiles)
     elif provider == 'triton':
         c = torch.empty((M, N), device='xpu', dtype=torch.float32)
         triton_fn = lambda: matmul(a, b, c)
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
         rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
         benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
-                                                              kernel_name='_kernel')
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
+                                                                 quantiles=quantiles, kernel_name='_kernel')
+    elif provider == 'xetla':
+        c = torch.empty((M, N), device='xpu', dtype=torch.float32)
+        acc = torch.empty((M, N), device='xpu', dtype=torch.float32)
+        cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)
+
+        name = f'gemm_splitk_shape_{M}_{K}_{N}'
+        func = getattr(xetla_kernel, name)
+        xetla_fn = lambda: func(a, b, c, acc, cnt)
+        torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
+
+        # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(xetla_fn, n_warmup=10, n_repeat=10,
+                                                                 quantiles=quantiles, kernel_name='split_k_gemm_run')
     else:
         raise NotImplementedError(f'Unsupported provider {provider}')

     tflops = lambda mean: 2 * M * N * K * (1e-12) / (mean * 1e-3)
     gbps = lambda mean: 2 * (M * K + K * N) + 4.0 * (M * N) * (1e-9) / (mean * 1e-3)

-    return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
+    return (gbps(mean_ms), gbps(max_ms), gbps(min_ms)), (tflops(mean_ms), tflops(max_ms), tflops(min_ms)), cv


 if __name__ == '__main__':
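
The new 'xetla' provider resolves a shape-specialized binding by name, so only the (M, K, N) combinations registered in python_main.cpp are callable. A rough usage sketch follows, assuming an XPU device and the xetla_kernel extension built from this commit; the bf16 input dtypes are an assumption based on the bf16_split_k_gemm binding name.

# Illustrative only: look up and run one of the registered split-k GEMM shapes.
import torch
import xetla_kernel  # the extension module built by benchmarks/setup.py

M, K, N = 512, 32768, 8192  # one of the shapes bound in python_main.cpp
a = torch.randn((M, K), device='xpu', dtype=torch.bfloat16)
b = torch.randn((K, N), device='xpu', dtype=torch.bfloat16)
c = torch.empty((M, N), device='xpu', dtype=torch.float32)    # output
acc = torch.empty((M, N), device='xpu', dtype=torch.float32)  # split-k accumulator
cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)    # split-k counters

# Unregistered shapes raise AttributeError here, which is why the benchmark
# keys the lookup on the exact (M, K, N) being measured.
func = getattr(xetla_kernel, f'gemm_splitk_shape_{M}_{K}_{N}')
result = func(a, b, c, acc, cnt)  # returns the accumulator, per bf16_split_k_gemm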

benchmarks/xetla_kernel/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -45,6 +45,7 @@ endif()
 add_subdirectory(softmax)
 add_subdirectory(gemm)
 add_subdirectory(stream_k_gemm)
+add_subdirectory(split_k_gemm)
 add_subdirectory(flash_attention)

 install(TARGETS xetla_kernel LIBRARY DESTINATION .)

benchmarks/xetla_kernel/python_main.cpp

Lines changed: 34 additions & 0 deletions

@@ -2,6 +2,7 @@
 #include "flash_attention/fmha_forward_v5.h"
 #include "gemm/gemm.h"
 #include "softmax/softmax.h"
+#include "split_k_gemm/split_k_gemm.h"
 #include "stream_k_gemm/stream_k_gemm.h"
 #include <CL/sycl.hpp>
 #include <c10/core/ScalarType.h>

@@ -95,6 +96,29 @@ at::Tensor bf16_stream_k_gemm(const at::Tensor &a, const at::Tensor &b,
   return acc;
 }

+template <int m, int k, int n,
+          kslicing_impl_t kslicing_type = kslicing_impl_t::none>
+at::Tensor bf16_split_k_gemm(const at::Tensor &a, const at::Tensor &b,
+                             const at::Tensor &c, const at::Tensor &acc,
+                             const at::Tensor &cnt) {
+  CHECK_INPUT(a);
+  CHECK_INPUT(b);
+  CHECK_INPUT(c);
+  CHECK_INPUT(acc);
+#ifdef USE_IPEX
+  RECORD_FUNCTION("xetla split_k_gemm", {});
+#endif
+
+  auto queue = get_current_sycl_queue();
+  auto evt = split_k_gemm_run<m, k, n, kslicing_type>(
+      a.data_ptr(), b.data_ptr(), c.data_ptr(), acc.data_ptr(), cnt.data_ptr(),
+      queue);
+#ifdef USE_IPEX
+  xpu::profiler_record("xetla kernel", evt);
+#endif
+  return acc;
+}
+
 #define CALL_IMPL_ATTENTION_FWD_FUNC(P)                                        \
   fmha::fmha_forward_impl<P, T, use_mask, IsCausal, use_dropout>(              \
       queue, q.data_ptr(), k.data_ptr(), v.data_ptr(), out.data_ptr(),        \

@@ -283,6 +307,16 @@ PYBIND11_MODULE(xetla_kernel, m) {
   // gemm stream k
   m.def("gemm_streamk_shape_3072_4096_3072", &bf16_stream_k_gemm,
         "bf16_gemm_streamk (XeTLA)");
+  // gemm split k
+  m.def("gemm_splitk_shape_512_32768_8192",
+        &bf16_split_k_gemm<512, 32768, 8192, kslicing_impl_t::global>,
+        "bf16_gemm_splitk (XeTLA)");
+  m.def("gemm_splitk_shape_1024_28672_8192",
+        &bf16_split_k_gemm<1024, 28672, 8192, kslicing_impl_t::global>,
+        "bf16_gemm_splitk (XeTLA)");
+  m.def("gemm_splitk_shape_3072_4096_3072",
+        &bf16_split_k_gemm<3072, 4096, 3072, kslicing_impl_t::global>,
+        "bf16_gemm_splitk (XeTLA)");
   // flash_attn
   m.def("flash_attn_causal_false", &flash_attn<false, false, false>,
         "flash attn fwd (XeTLA)");
