Merge branch 'main' into amyachev/issue2736

anmyachev · web-flow · commit 04d334979c82 · 2024-11-19T14:07:19.000+01:00
diff --git a/.github/workflows/build-test-gpu.yml b/.github/workflows/build-test-gpu.yml
@@ -39,6 +39,10 @@ on:
         description: Ignore pytest.skip
         type: boolean
         default: false
+      use_system_python:
+        description: Use system Python
+        type: boolean
+        default: false
 
 permissions: read-all
 
@@ -60,3 +64,4 @@ jobs:
       skip_list: ${{ inputs.skip_list }}
       run_name: ${{ inputs.run_name || format('Build and test {0}', inputs.runner_label) }}
       enable_unskip: ${{ inputs.enable_unskip }}
+      use_system_python: ${{ inputs.use_system_python || false }}
diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
@@ -56,6 +56,10 @@ on:
         description: Runner label for version
         type: string
         default: runner-0.0.20
+      use_system_python:
+        description: Use system Python
+        type: boolean
+        default: false
 
 permissions: read-all
 
@@ -91,10 +95,16 @@ jobs:
           key: pip-${{ inputs.python_version }}-${{ hashFiles('python/pyproject.toml', 'python/setup.py') }}-${{ env.CACHE_NUMBER }}
 
       - name: Install Python ${{ inputs.python_version }}
+        if: ${{ !inputs.use_system_python }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ inputs.python_version }}
 
+      - name: Identify Python version  # records the active interpreter's major.minor in PYTHON_VERSION
+        run: |
+          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{sys.version_info[1]}")')"
+          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a "$GITHUB_ENV"
+
       - name: Setup PyTorch
         uses: ./.github/actions/setup-pytorch
         with:
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
@@ -28,6 +28,10 @@ on:
         description: JSON list of benchmarks to skip
         type: string
         default: "[]"
+      use_system_python:
+        description: Use system Python
+        type: boolean
+        default: false
   schedule:
     - cron: "5 23 * * *"
   pull_request:
@@ -67,10 +71,16 @@ jobs:
           key: pip-$PYTHON_VERSION-$GITHUB_SHA
 
       - name: Install Python
+        if: ${{ !(inputs.use_system_python || false) }}
         uses: actions/setup-python@v5
         with:
           python-version: ${{ env.PYTHON_VERSION }}
 
+      - name: Identify Python version  # records the active interpreter's major.minor in PYTHON_VERSION
+        run: |
+          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{sys.version_info[1]}")')"
+          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a "$GITHUB_ENV"
+
       - name: Install Python build dependencies
         run: |
           pip install wheel cmake
diff --git a/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_splitk_benchmark.py
@@ -128,6 +128,7 @@ def forward(ctx, a, b, c, acc_dtype=None):
             [512, 32768, 8192],
             [1024, 28672, 8192],
             [3072, 4096, 3072],
+            [4096, 4096, 4096],
         ],
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
@@ -152,17 +153,17 @@ def benchmark(M, N, K, provider):
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b), n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles)
     elif provider == 'triton':
-        c = torch.empty((M, N), device='xpu', dtype=torch.float32)
+        c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
         triton_fn = lambda: matmul(a, b, c)
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
         rtol = 1e-2 if a.dtype == torch.bfloat16 else 1e-3
         benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=rtol, err_msg='triton to torch')
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles, kernel_name='_kernel')
     elif provider == 'xetla':
-        c = torch.empty((M, N), device='xpu', dtype=torch.float32)
-        acc = torch.empty((M, N), device='xpu', dtype=torch.float32)
-        cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)
+        c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+        acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+        cnt = torch.zeros((M, N), device='xpu', dtype=torch.int32)
 
         name = f'gemm_splitk_shape_{M}_{K}_{N}'
         func = getattr(xetla_kernel, name)
diff --git a/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_streamk_benchmark.py
@@ -275,17 +275,17 @@ def benchmark(M, N, K, provider):
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b), n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles)
     elif provider == 'triton':
-        c = torch.empty((M, N), device=a.device, dtype=torch.float32)
+        c = torch.zeros((M, N), device=a.device, dtype=torch.float32)
         triton_fn = lambda: matmul(a, b, c)
         torch_fn = lambda: torch.matmul(a, b).to(torch.float32)
         benchmark_suit.assert_close(triton_fn(), torch_fn(), atol=1e-4, rtol=1e-2, err_msg='triton to torch')
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles,
                                                                  kernel_name=['first_wave', 'full_tiles'])
     elif provider == 'xetla':
-        c = torch.empty((M, N), device='xpu', dtype=torch.float32)
-        acc = torch.empty((M, N), device='xpu', dtype=torch.float32)
-        cnt = torch.empty((M, N), device='xpu', dtype=torch.int32)
+        c = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+        acc = torch.zeros((M, N), device='xpu', dtype=torch.float32)
+        cnt = torch.zeros((M, N), device='xpu', dtype=torch.int32)
 
         name = f'gemm_streamk_shape_{M}_{K}_{N}'
         func = getattr(xetla_kernel, name)
diff --git a/benchmarks/xetla_kernel/python_main.cpp b/benchmarks/xetla_kernel/python_main.cpp
@@ -317,6 +317,9 @@ PYBIND11_MODULE(xetla_kernel, m) {
   m.def("gemm_splitk_shape_3072_4096_3072",
         &bf16_split_k_gemm<3072, 4096, 3072, kslicing_impl_t::global>,
         "bf16_gemm_splitk (XeTLA)");
+  m.def("gemm_splitk_shape_4096_4096_4096",
+        &bf16_split_k_gemm<4096, 4096, 4096, kslicing_impl_t::global>,
+        "bf16_gemm_splitk (XeTLA)");
   // flash_attn
   m.def("flash_attn_causal_false", &flash_attn<false, false, false>,
         "flash attn fwd (XeTLA)");
diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt
@@ -1 +1 @@
-fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73
+bd9145c8c21334e099d51b3e66f49d51d24931ee
diff --git a/python/setup.py b/python/setup.py
@@ -207,7 +207,7 @@ def get_llvm_package_info():
     with open(llvm_hash_path, "r") as llvm_hash_file:
         rev = llvm_hash_file.read(8)
     name = f"llvm-{rev}-{system_suffix}"
-    url = f"https://github.com/intel/intel-xpu-backend-for-triton/releases/download/llvm-{rev}/{name}.tar.gz"
+    url = f"https://oaitriton.blob.core.windows.net/public/llvm-builds/{name}.tar.gz"
     return Package("llvm", name, url, "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH")
 
 
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1571,7 +1571,7 @@ struct AtomicRMWOpConversion
     auto lowPtrBits = and_(intPtr, i64_val(3));
     auto elemIndex = trunc(i32_ty, lshr(lowPtrBits, i64_val(1)));
     auto alignPtr = inttoptr(rmwPtr.getType(), sub(intPtr, lowPtrBits));
-    auto firstValInt = load(i32_ty, alignPtr, 4, false, false, false,
+    auto firstValInt = load(i32_ty, alignPtr, 4, false, false, false, false,
                             LLVM::AtomicOrdering::acquire);
 
     // Create a loop body block. It has a single parameter which holds the
diff --git a/third_party/nvidia/backend/driver.c b/third_party/nvidia/backend/driver.c
@@ -1,5 +1,11 @@
 #include "cuda.h"
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#else
 #include <dlfcn.h>
+#endif
 #include <stdbool.h>
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
@@ -161,6 +167,27 @@ typedef CUresult (*cuTensorMapEncodeTiled_t)(
     CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
     CUtensorMapFloatOOBfill oobFill);
 
+#ifdef WIN32
+#define defineGetFunctionHandle(name, symbolName)                              \
+  static symbolName##_t name() {                                               \
+    /* Open the shared library */                                              \
+    HMODULE handle = LoadLibraryA("nvcuda.dll");                               \
+    if (!handle) {                                                             \
+      PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");        \
+      return NULL;                                                             \
+    }                                                                          \
+    symbolName##_t funcHandle =                                                \
+        (symbolName##_t)GetProcAddress((HMODULE)handle, #symbolName);          \
+    /* GetProcAddress reports failure by returning NULL; GetLastError() may */ \
+    /* hold a stale code from an earlier call, so test the pointer itself. */  \
+    if (!funcHandle) {                                                         \
+      PyErr_SetString(PyExc_RuntimeError,                                      \
+                      "Failed to retrieve " #symbolName " from nvcuda.dll");   \
+      return NULL;                                                             \
+    }                                                                          \
+    return funcHandle;                                                         \
+  }
+#else
 #define defineGetFunctionHandle(name, symbolName)                              \
   static symbolName##_t name() {                                               \
     /* Open the shared library */                                              \
@@ -182,6 +209,7 @@ typedef CUresult (*cuTensorMapEncodeTiled_t)(
     }                                                                          \
     return funcHandle;                                                         \
   }
+#endif
 
 defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
                         cuOccupancyMaxActiveClusters);
diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py
@@ -167,7 +167,12 @@ def format_of(ty):
 #include \"cuda.h\"
 #include <stdbool.h>
 #include <Python.h>
+#ifndef _WIN32
 #include <dlfcn.h>
+#else
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
 
 static inline void gpuAssert(CUresult code, const char *file, int line)
 {{
@@ -190,6 +195,7 @@ def format_of(ty):
 
 typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
 
+#ifndef _WIN32
 static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
   // Open the shared library
   void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
@@ -208,6 +214,25 @@ def format_of(ty):
   }}
   return cuLaunchKernelExHandle;
 }}
+#else
+static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
+  // Open the shared library
+  HMODULE handle = LoadLibraryA("nvcuda.dll");
+  if (!handle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");
+    return NULL;
+  }}
+  cuLaunchKernelEx_t cuLaunchKernelExHandle =
+      (cuLaunchKernelEx_t)GetProcAddress((HMODULE)handle, "cuLaunchKernelEx");
+  // GetProcAddress signals failure by returning NULL; GetLastError() can hold
+  // a stale code from an earlier call, so check the pointer itself.
+  if (!cuLaunchKernelExHandle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from nvcuda.dll");
+    return NULL;
+  }}
+  return cuLaunchKernelExHandle;
+}}
+#endif
 
 static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
   void *params[] = {{ {', '.join(params)} }};

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73`
	`1`	`+bd9145c8c21334e099d51b3e66f49d51d24931ee`