intel
diff --git a/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/pins/pytorch-upstream.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 8 additions & 0 deletions b/‎.gitignore‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 23 additions & 23 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 23 additions & 23 deletions
diff --git a/‎benchmarks/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎benchmarks/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎benchmarks/cmake/FindXeTLALibrary.cmake‎
Lines changed: 3 additions & 1 deletion b/‎benchmarks/cmake/FindXeTLALibrary.cmake‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎benchmarks/setup.py‎
Lines changed: 91 additions & 39 deletions b/‎benchmarks/setup.py‎
Lines changed: 91 additions & 39 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/benchmark_testing.py‎
Lines changed: 6 additions & 4 deletions b/‎benchmarks/triton_kernels_benchmark/benchmark_testing.py‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/gemm_benchmark.py‎
Lines changed: 5 additions & 0 deletions b/‎benchmarks/triton_kernels_benchmark/gemm_benchmark.py‎
Lines changed: 5 additions & 0 deletions
@@ -1 +1 @@
-0a2685160140656e3e53818611dd2c65c4397be5
+8321eec009c8c79145ebccd51fdfc336e5f8b848
@@ -6,11 +6,19 @@ build-*/
 python/build/
 python/dist/
 python/triton*.egg-info/
+python/*.whl
 
 python/triton/_C/*.pyd
 python/triton/_C/*.so
 python/triton/_C/*.dylib
 
+benchmarks/dist
+benchmarks/*.egg-info/
+benchmarks/**/*.so
+
+# Logs
+inductor_log/
+
 # Backends copied from submodules
 python/triton/backends/
 !python/triton/backends/__init__.py
 
@@ -22,7 +22,7 @@ repos:
       - id: ruff
         files: '^python/.*'
         args: ["--fix", "--line-length", "120"]
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
         exclude: |
           (?x)(
             ^python/triton/runtime/.*|
@@ -35,49 +35,49 @@ repos:
     hooks:
       - id: yapf
         args: ["-p", "-i"]
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
         exclude: "python/test/unit/language/test_line_info.py"
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v16.0.6
     hooks:
       - id: clang-format
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
 
   # Expand YAML anchors in files used by github workflows, because github can't
   # do this itself.  This lets us use anchors, which avoids code duplication.
-#  - repo: local
-#    hooks:
-#    - id: expand-yaml-anchors
-#      name: Expand YAML anchors
-#      language: golang
-#      additional_dependencies: [github.com/mikefarah/yq/v4@latest]
-#      entry: >
-#        bash -c '
-#          OUT=".github/workflows/integration-tests.yml"
-#          IN="$OUT.in"
-#          echo "# AUTOGENERATED by pre-commit, modify the .in file instead." > "$OUT" &&
-#          echo >> "$OUT"
-#          yq "explode(.)" "$IN" >> "$OUT"
-#        '
-#      files: ^.github/workflows/integration-tests.yml.*
-#      pass_filenames: false
+  - repo: local
+    hooks:
+    - id: expand-yaml-anchors
+      name: Expand YAML anchors
+      language: golang
+      additional_dependencies: [github.com/mikefarah/yq/v4@latest]
+      entry: >
+        bash -c '
+          OUT=".github/workflows/integration-tests.yml"
+          IN="$OUT.in"
+          echo "# AUTOGENERATED by pre-commit, modify the .in file instead." > "$OUT" &&
+          echo >> "$OUT"
+          yq "explode(.)" "$IN" >> "$OUT"
+        '
+      files: ^.github/workflows/integration-tests.yml.*
+      pass_filenames: false
 
   - repo: https://github.com/PyCQA/bandit
     rev: '1.7.9'
     hooks:
     - id: bandit
       files: '^(benchmarks|scripts|third_party/intel)/.*\.py$'
       args: ["-c", "bandit.yaml", "-s", "B404,B603,B607"]
-      stages: [commit, push, manual]
+      stages: [pre-commit, pre-push, manual]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.1.3
     hooks:
       - id: ruff
         files: '^(benchmarks|third_party/intel|scripts)/.*'
         args: ["--fix", "--line-length", "120"]
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
 
   - repo: https://github.com/pycqa/pylint
     rev: v3.2.6
@@ -105,7 +105,7 @@ repos:
           - --disable=too-many-locals
           - --disable=too-many-statements
           - --disable=too-many-arguments
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
 
       - id: pylint
         name: pylint for benchmarks
@@ -136,7 +136,7 @@ repos:
           - --disable=too-many-statements
           - --disable=too-many-arguments
           - --disable=fixme
-        stages: [commit, push, manual]
+        stages: [pre-commit, pre-push, manual]
 
 
 exclude: |
 
@@ -10,9 +10,11 @@ if(NOT WIN32)
     list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 endif()
 
-find_package(Python3 COMPONENTS Interpreter)
+find_package(Python3 REQUIRED
+  COMPONENTS Development.Module)
 find_package(Torch REQUIRED)
 find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib")
+find_package(XeTLALibrary REQUIRED)
 
 if(USE_IPEX)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_IPEX")
 
@@ -3,13 +3,15 @@
 include(FetchContent)
 
 if (NOT XeTLALibrary_FOUND)
+    # TODO: switch to FetchContent_MakeAvailable once XeTLA supports it
+    cmake_policy(SET CMP0169 OLD)
 
     set(XeTLALibrary_SOURCE_DIR
             "${CMAKE_CURRENT_BINARY_DIR}/XeTLALibrary")
     message(STATUS "XeTLALibrary is not specified. Will try to download
                   XeTLA library from https://github.com/intel/xetla into
                   ${XeTLALibrary_SOURCE_DIR}")
-    file(READ xetla-library.conf XeTLALibrary_TAG)
+    file(READ xetla_kernel/xetla-library.conf XeTLALibrary_TAG)
     # Strip the potential trailing newline from tag
     string(STRIP "${XeTLALibrary_TAG}" XeTLALibrary_TAG)
     FetchContent_Declare(xetla-library
 
@@ -1,83 +1,135 @@
 import os
-import re
 import shutil
 import subprocess
-import sysconfig
 import sys
 
-from setuptools import setup
+# TODO: update once there is a replacement for clean:
+#  https://github.com/pypa/setuptools/discussions/2838
+from distutils import log  # pylint: disable=[deprecated-module]
+from distutils.dir_util import remove_tree  # pylint: disable=[deprecated-module]
+from distutils.command.clean import clean as _clean  # pylint: disable=[deprecated-module]
+
+from setuptools import setup, Extension
+from setuptools.command.build_ext import build_ext as _build_ext
 
 import torch
 
-ipex_cmake_prefix_path = ""
-USE_IPEX_OPTION = os.getenv("USE_IPEX", "1")
-if USE_IPEX_OPTION == "1":
-    import intel_extension_for_pytorch
-    ipex_cmake_prefix_path = f";{intel_extension_for_pytorch.cmake_prefix_path}"
+
+class CMakeExtension(Extension):
+
+    def __init__(self, name):
+        # don't invoke the original build_ext for this special extension
+        super().__init__(name, sources=[])
 
 
 class CMakeBuild():
 
-    def __init__(self):
+    def __init__(self, debug=False, dry_run=False):
         self.current_dir = os.path.abspath(os.path.dirname(__file__))
         self.build_temp = self.current_dir + "/build/temp"
         self.extdir = self.current_dir + "/triton_kernels_benchmark"
+        self.build_type = self.get_build_type(debug)
+        self.cmake_prefix_paths = [torch.utils.cmake_prefix_path]
+        self.use_ipex = False
+        self.dry_run = dry_run
+
+    def get_build_type(self, debug):
+        DEBUG_OPTION = os.getenv("DEBUG", "0")
+        return "Debug" if debug or (DEBUG_OPTION == "1") else "Release"
 
     def run(self):
-        try:
-            out = subprocess.check_output(["cmake", "--version"])
-        except OSError as error:
-            raise RuntimeError("CMake must be installed") from error
+        self.check_ipex()
+        self.build_extension()
 
-        match = re.search(r"version\s*(?P<major>\d+)\.(?P<minor>\d+)([\d.]+)?", out.decode())
-        cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor"))
-        if (cmake_major, cmake_minor) < (3, 18):
-            raise RuntimeError("CMake >= 3.18.0 is required")
+    def check_ipex(self):
+        self.use_ipex = os.getenv("USE_IPEX", "1") == "1"
+        if not self.use_ipex:
+            return
+        try:
+            import intel_extension_for_pytorch
+        except ImportError:
+            log.warn("ipex is not installed trying to build without ipex")
+            self.use_ipex = False
+            return
+        self.cmake_prefix_paths.append(intel_extension_for_pytorch.cmake_prefix_path)
 
-        self.build_extension()
+    def check_call(self, *popenargs, **kwargs):
+        log.info(" ".join(popenargs[0]))
+        if not self.dry_run:
+            subprocess.check_call(*popenargs, **kwargs)
 
     def build_extension(self):
         ninja_dir = shutil.which("ninja")
         # create build directories
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
-        # python directories
-        python_include_dir = sysconfig.get_path("platinclude")
         cmake_args = [
             "-G",
             "Ninja",  # Ninja is much faster than make
             "-DCMAKE_MAKE_PROGRAM=" +
             ninja_dir,  # Pass explicit path to ninja otherwise cmake may cache a temporary path
-            f"-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}{ipex_cmake_prefix_path}",
-            f"-DUSE_IPEX={USE_IPEX_OPTION}",
-            "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
-            "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=" + self.extdir,
-            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + self.extdir,
-            "-DPython3_EXECUTABLE:FILEPATH=" + sys.executable,
-            "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON",
-            "-DPYTHON_INCLUDE_DIRS=" + python_include_dir,
+            "-DCMAKE_PREFIX_PATH=" + ";".join(self.cmake_prefix_paths),
+            f"-DUSE_IPEX={int(self.use_ipex)}",
+            "-DCMAKE_INSTALL_PREFIX=" + self.extdir,
+            "-DPython3_ROOT_DIR:FILEPATH=" + sys.exec_prefix,
+            "-DCMAKE_VERBOSE_MAKEFILE=TRUE",
             "-DCMAKE_C_COMPILER=icx",
             "-DCMAKE_CXX_COMPILER=icpx",
+            "-DCMAKE_BUILD_TYPE=" + self.build_type,
+            "-S",
+            self.current_dir,
+            "-B",
+            self.build_temp,
         ]
 
-        # configuration
-        build_type = "Debug"
-        build_args = ["--config", build_type]
-        cmake_args += ["-DCMAKE_BUILD_TYPE=" + build_type]
         max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count()))
-        build_args += ["-j" + max_jobs]
+        build_args = [
+            "--build",
+            self.build_temp,
+            "-j" + max_jobs,
+        ]
+
+        install_args = [
+            "--build",
+            self.build_temp,
+            "--target",
+            "install",
+        ]
 
         env = os.environ.copy()
-        cmake_dir = self.build_temp
-        subprocess.check_call(["cmake", self.current_dir] + cmake_args, cwd=cmake_dir, env=env)
-        subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=cmake_dir)
+        self.check_call(["cmake"] + cmake_args, env=env)
+        self.check_call(["cmake"] + build_args)
+        self.check_call(["cmake"] + install_args)
+
+    def clean(self):
+        if os.path.exists(self.build_temp):
+            remove_tree(self.build_temp, dry_run=self.dry_run)
+        else:
+            log.warn("'%s' does not exist -- can't clean it", os.path.relpath(self.build_temp,
+                                                                              os.path.dirname(__file__)))
+
 
+class build_ext(_build_ext):
+
+    def run(self):
+        cmake = CMakeBuild(debug=self.debug, dry_run=self.dry_run)
+        cmake.run()
+        super().run()
+
+
+class clean(_clean):
+
+    def run(self):
+        cmake = CMakeBuild(dry_run=self.dry_run)
+        cmake.clean()
+        super().run()
 
-cmake = CMakeBuild()
-cmake.run()
 
 setup(name="triton-kernels-benchmark", packages=[
     "triton_kernels_benchmark",
 ], package_dir={
     "triton_kernels_benchmark": "triton_kernels_benchmark",
-}, package_data={"triton_kernels_benchmark": ["xetla_kernel.so"]})
+}, package_data={"triton_kernels_benchmark": ["xetla_kernel.cpython-*.so"]}, cmdclass={
+    "build_ext": build_ext,
+    "clean": clean,
+}, ext_modules=[CMakeExtension("triton_kernels_benchmark")])
@@ -213,16 +213,18 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no
 
     function_events = prof.events()
 
-    functions = []
+    all_functions = []
     if isinstance(kernel_name, str):
         kernel_name = [kernel_name]
     for ker_name in kernel_name:
-        functions.extend(list(filter(lambda x: x.name.startswith(ker_name), function_events)))  # pylint: disable=cell-var-from-loop
+        functions = list(filter(lambda x: x.name.startswith(ker_name), function_events))  # pylint: disable=cell-var-from-loop
+        assert len(functions) == n_repeat, f"the profiling number for kernel: '{ker_name}' not match, {len(functions)}"
+        all_functions.append(functions)
     # profiling_func_filter = filter(lambda x: x.name.startswith("__profile_kernel_of_func"), function_events)
 
-    assert len(functions) == n_repeat, f"the profiling number not match, {len(functions)}"
     # Convert the times to milliseconds.
-    times = torch.tensor([f.self_device_time_total * 1e-3 for f in functions], dtype=torch.float)
+    times = torch.tensor([sum(map(lambda elem: elem.self_device_time_total, f)) * 1e-3 for f in zip(*all_functions)],
+                         dtype=torch.float)
     return _summarize_statistics(times, quantiles, return_mode)
 
 
 
@@ -309,6 +309,10 @@ def benchmark(B, M, N, K, provider):
             acc = torch.empty((B, M, N), device='xpu', dtype=torch.float32)
             cnt = torch.empty((B, M, N), device='xpu', dtype=torch.int32)
         name = f'gemm_shape_{B}_{M}_{K}_{N}'
+        # FIXME: Use gemm_streamk_benchmark.py when Triton streamk can get
+        # better performance.
+        if (B, M, N, K) == (1, 3072, 4096, 3072):
+            name = 'gemm_streamk_shape_3072_4096_3072'
         func = getattr(xetla_kernel, name)
         xetla_fn = lambda: func(a, b, c, acc, cnt)
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
@@ -338,6 +342,7 @@ def benchmark(B, M, N, K, provider):
             'gemm_shape_32_4096_4096_128': 'Test_32x4096x4096x128_row_row',
             'gemm_shape_4096_8_128_16384': 'Test_4096x8x128x16384_row_row',
             'gemm_shape_4096_8_16384_128': 'Test_4096x8x16384x128_row_row',
+            'gemm_streamk_shape_3072_4096_3072': 'stream_k_gemm_run',
         }
 
         # benchmark_suit.assert_close(xetla_fn(), torch_fn(), atol=1e-4, rtol=1.0, err_msg='xetla to torch')
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-0a2685160140656e3e53818611dd2c65c4397be5`
	`1`	`+8321eec009c8c79145ebccd51fdfc336e5f8b848`