
Commit 15f2e14

Merge branch 'main' into lesh/conda-oct
2 parents 70549ea + 4355afd · commit 15f2e14

File tree

4 files changed: +29 −14 lines changed


.pre-commit-config.yaml

Lines changed: 7 additions & 7 deletions

```diff
@@ -22,7 +22,7 @@ repos:
   - id: ruff
     files: '^python/.*'
     args: ["--fix", "--line-length", "120"]
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
     exclude: |
       (?x)(
         ^python/triton/runtime/.*|
@@ -35,14 +35,14 @@ repos:
   hooks:
   - id: yapf
     args: ["-p", "-i"]
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
     exclude: "python/test/unit/language/test_line_info.py"
 
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v16.0.6
   hooks:
   - id: clang-format
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
 
 # Expand YAML anchors in files used by github workflows, because github can't
 # do this itself. This lets us use anchors, which avoids code duplication.
@@ -69,15 +69,15 @@ repos:
   - id: bandit
     files: '^(benchmarks|scripts|third_party/intel)/.*\.py$'
     args: ["-c", "bandit.yaml", "-s", "B404,B603,B607"]
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.1.3
   hooks:
   - id: ruff
     files: '^(benchmarks|third_party/intel|scripts)/.*'
     args: ["--fix", "--line-length", "120"]
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
 
 - repo: https://github.com/pycqa/pylint
   rev: v3.2.6
@@ -105,7 +105,7 @@ repos:
     - --disable=too-many-locals
     - --disable=too-many-statements
     - --disable=too-many-arguments
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
 
   - id: pylint
     name: pylint for benchmarks
@@ -136,7 +136,7 @@ repos:
     - --disable=too-many-statements
     - --disable=too-many-arguments
     - --disable=fixme
-    stages: [commit, push, manual]
+    stages: [pre-commit, pre-push, manual]
 
 
 exclude: |
```
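The `stages` rename follows pre-commit itself: since pre-commit 3.x the legacy stage names `commit` and `push` are deprecated in favor of `pre-commit` and `pre-push`, while `manual` is unchanged, so hooks in that stage still run only via `pre-commit run --hook-stage manual`.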

benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py

Lines changed: 17 additions & 2 deletions

```diff
@@ -10,6 +10,7 @@
 import triton.language as tl
 
 import triton_kernels_benchmark as benchmark_suit
+import xetla_kernel
 
 if benchmark_suit.USE_IPEX_OPTION:
     import intel_extension_for_pytorch  # type: ignore  # noqa: F401
@@ -253,9 +254,9 @@ def matmul(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
         # possible values for `line_arg``
-        line_vals=['triton'],
+        line_vals=['triton', 'xetla'],
         # label name for the lines
-        line_names=['Triton'],
+        line_names=['Triton', 'XeTLA'],
         # line styles
         styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
         ylabel=['GB/s', 'TFlops'],  # label name for the y-axis
@@ -281,6 +282,20 @@ def benchmark(M, N, K, provider):
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles,
                                                                  kernel_name=['first_wave', 'full_tiles'])
+    elif provider == 'xetla':
+        c = torch.empty((M, N), device='xpu', dtype=torch.float32)
+        acc = torch.empty((M, N), device='xpu', dtype=torch.float32)
+        cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)
+
+        name = f'gemm_streamk_shape_{M}_{K}_{N}'
+        func = getattr(xetla_kernel, name)
+        xetla_fn = lambda: func(a, b, c, acc, cnt)
+        torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
+
+        # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(
+            xetla_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
+            kernel_name='gpu::xetla::kernel::gemm_universal_t<dispatch_stream_k')
     else:
         raise NotImplementedError(f'Unsupported provider {provider}')
 
```
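The new `xetla` provider resolves its kernel by name at run time, so it only works for shapes that have a matching binding. A minimal standalone sketch of that dispatch, assuming an XPU device and bf16 inputs; the buffer dtypes simply mirror the benchmark code above and are not a documented `xetla_kernel` API:

```python
# Sketch of the getattr-based dispatch added above. Device, dtypes, and the
# extra acc/cnt buffers are assumptions taken from the benchmark code.
import torch
import xetla_kernel

M, K, N = 3072, 4096, 3072  # the only stream-k shape currently exported
a = torch.randn((M, K), device='xpu', dtype=torch.bfloat16)
b = torch.randn((K, N), device='xpu', dtype=torch.bfloat16)
c = torch.empty((M, N), device='xpu', dtype=torch.float32)    # result
acc = torch.empty((M, N), device='xpu', dtype=torch.float32)  # partial-sum accumulator
cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)    # stream-k tile counters

func = getattr(xetla_kernel, f'gemm_streamk_shape_{M}_{K}_{N}')  # AttributeError for unbound shapes
func(a, b, c, acc, cnt)
```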

benchmarks/xetla_kernel/python_main.cpp

Lines changed: 4 additions & 1 deletion

```diff
@@ -280,7 +280,10 @@ PYBIND11_MODULE(xetla_kernel, m) {
         &bf16_gemm<Test_4096x8x128x16384_row_row>, "bf16_gemm (XeTLA)");
   m.def("gemm_shape_4096_8_16384_128",
         &bf16_gemm<Test_4096x8x16384x128_row_row>, "bf16_gemm (XeTLA)");
-  // flash_attn_fwd
+  // gemm stream k
+  m.def("gemm_streamk_shape_3072_4096_3072", &bf16_stream_k_gemm,
+        "bf16_gemm_streamk (XeTLA)");
+  // flash_attn
   m.def("flash_attn_causal_false", &flash_attn<false, false, false>,
         "flash attn fwd (XeTLA)");
   m.def("flash_attn_causal_true", &flash_attn<false, true, false>,
```

benchmarks/xetla_kernel/stream_k_gemm/stream_k_gemm.h

Lines changed: 1 addition & 4 deletions

```diff
@@ -36,9 +36,6 @@ sycl::event stream_k_gemm_run(void *_A, void *_B, void *_C, void *_Acc,
   using data_type_c = float;
   using data_type_acc = float;
 
-  auto context = queue.get_info<sycl::info::queue::context>();
-  auto device = queue.get_info<sycl::info::queue::device>();
-
   data_type_a *A = static_cast<data_type_a *>(_A);
   data_type_b *B = static_cast<data_type_b *>(_B);
   data_type_c *C = static_cast<data_type_c *>(_C);
@@ -52,7 +49,7 @@
   constexpr uint32_t sg_tile_k = 32;
 
   // StreamK parameters - xecores available for stream_k dispatch
-  uint32_t avail_xecores = 32;
+  uint32_t avail_xecores = 64;
 
   // Org the compute shape for sub-matrix
   using tile_shape =
```
