
Commit c099a86

Merge branch 'main' into etiotto.remove_masks
2 parents: 9140e7f + 9e23713

10 files changed: +99 −87 lines

python/test/unit/language/test_core.py

Lines changed: 1 addition & 12 deletions
@@ -1352,8 +1352,6 @@ def test_atomic_rmw(op, dtype_x_str, mode, sem, device):
         pytest.xfail("Only test atomic bfloat16/float16 ops on GPU")
     if "uint" in dtype_x_str and mode in ["min_neg", "all_neg"]:
         pytest.xfail("uint cannot be negative")
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")

     n_programs = 5

@@ -1442,8 +1440,6 @@ def kernel(X):
                           for check_return_val in ([True, False] if is_hip() else [True])])
 def test_tensor_atomic_rmw(shape, axis, num_ctas, dtype_x_str, check_return_val, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
     shape0, shape1 = shape
     # triton kernel

@@ -1523,8 +1519,6 @@ def torch_to_triton_dtype(t):
                           for dtype_x_str in ['bfloat16', 'float16', 'float32']])
 def test_tensor_atomic_add_non_exclusive_offset(size, num_ctas, dtype_x_str, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")

     @triton.jit
     def kernel(X, val, NUM: tl.constexpr):

@@ -1549,8 +1543,6 @@ def kernel(X, val, NUM: tl.constexpr):
                           for dtype_x_str in ['bfloat16', 'float16', 'float32']])
 def test_tensor_atomic_add_shift_1(size, num_ctas, dtype_x_str, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")

     @triton.jit
     def kernel(X, val, NUM: tl.constexpr):

@@ -1587,9 +1579,6 @@ def test_tensor_atomic_add_access_patterns(shape, idx_order, mask_step, num_ctas
     if is_interpreter():
         pytest.xfail("not supported in the interpreter")

-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
-
     @triton.jit
     def kernel(in_ptr, idx_ptr, out_ptr, shape0, shape1, mask_step, XBLOCK: tl.constexpr):
         xoffset = tl.program_id(0) * XBLOCK

@@ -5872,7 +5861,7 @@ def simple(data, out):

 def test_num_ctas_pre_sm90(device):
     if not is_cuda() and not is_hip():
-        pytest.skip("Only supported on CUDA and HIP")
+        pytest.xfail("Only supported on CUDA and HIP")

     @triton.jit
     def _kernel(src):
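
Note on the last hunk: pytest.skip and pytest.xfail both abort the test immediately when called imperatively; the change only affects reporting, so unsupported backends now show up as expected failures (XFAIL) rather than skips. A minimal sketch of the pattern, where backend_supported() is a hypothetical stand-in for is_cuda() or is_hip():

    import pytest

    def test_num_ctas_pre_sm90_pattern():
        if not backend_supported():  # hypothetical guard
            pytest.xfail("Only supported on CUDA and HIP")  # reported as XFAIL, not SKIPPED
        ...  # rest of the test runs only on supported backends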

python/test/unit/language/test_tensor_descriptor.py

Lines changed: 0 additions & 3 deletions
@@ -1566,9 +1566,6 @@ def test_tensor_descriptor_reduce(kind, descriptor, dtype_str, num_ctas, M_BLOCK
         pytest.xfail("Multi-CTA not supported")
     if is_hip_cdna3() and (kind, dtype_str, M_BLOCK, N_BLOCK) in REDUCE_SKIP_HIP_CDNA3:
         pytest.skip("Broken on rocm")
-    if is_xpu():
-        if (kind, dtype_str) in [("add", "bfloat16")]:
-            pytest.skip("FIXME: issue #3914")

     @triton.jit(debug=True)
     def kernel(out_desc, out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr, kind: tl.constexpr):

python/triton/compiler/compiler.py

Lines changed: 12 additions & 7 deletions
@@ -138,12 +138,19 @@ def parse(full_name, ext, context):
         return module
     if ext == "llir" or ext == "ptx" or ext == "amdgcn":
         return Path(full_name).read_text()
-    if ext == "cubin" or ext == "hsaco":
+    if ext == "cubin" or ext == "hsaco" or ext == "zebin":
         return Path(full_name).read_bytes()
     if ext == "spv":
         return Path(full_name).read_bytes()


+def read_file(full_name, ext):
+    if ext in ["cubin", "hsaco", "spv", "zebin"]:
+        return Path(full_name).read_bytes()
+    else:
+        return Path(full_name).read_text()
+
+
 def filter_traceback(e: BaseException):
     """
     Removes code_generator.py and related files from tracebacks.

@@ -332,7 +339,7 @@ def compile(src, target=None, options=None, _env_vars=None):
             print(f"\nOverriding kernel with file {full_name}")
             next_module = parse(full_name, ext, context)
         # If TRITON_STORE_BINARY_ONLY is 1, only store cubin/hsaco/json
-        if (not store_only_binary) or (ext in ("cubin", "hsaco", "json", "spv")):
+        if (not store_only_binary) or (ext in ("cubin", "hsaco", "zebin", "json", "spv")):
             metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
         if fn_dump_manager is not None:
             fn_dump_manager.put(next_module, ir_filename)

@@ -422,11 +429,9 @@ def __init__(self, src, metadata_group, hash):
         self.name = self.metadata.name
         # stores the text of each level of IR that was generated during compilation
         asm_files = [Path(p) for c, p in metadata_group.items() if not c.endswith(".json")]
-        binary_ext = backend.binary_ext
-        self.asm = AsmDict({
-            file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
-            for file in asm_files
-        })
+
+        self.asm = AsmDict({file.suffix[1:]: read_file(file, file.suffix[1:]) for file in asm_files})
+        binary_ext = metadata.get("binary_ext", backend.binary_ext)
         self.metadata_group = metadata_group
         self.kernel = self.asm[binary_ext]
         # binaries are lazily initialized
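
Note: all cached artifacts are now read through the single read_file helper, and the kernel binary is selected by the binary_ext recorded in the compile metadata, falling back to the backend default. A hedged sketch of that selection (the "spv" default mirrors the Intel backend; CUDA/HIP backends default to "cubin"/"hsaco"):

    # metadata is the dict loaded from the cache group's .json entry
    metadata = {"binary_ext": "zebin"}   # written by make_zebin (see below)
    backend_default = "spv"              # i.e. backend.binary_ext

    binary_ext = metadata.get("binary_ext", backend_default)
    assert binary_ext == "zebin"         # self.kernel = self.asm["zebin"], raw bytes

    metadata = {}                        # SPIR-V-only compile: key never written
    assert metadata.get("binary_ext", backend_default) == "spv"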

python/triton/tools/compile.py

Lines changed: 2 additions & 1 deletion
@@ -183,7 +183,8 @@ def constexpr(s):
         if hints.get((i, ), None) == 16:
             suffix += 'd'
     func_name = '_'.join([out_name, sig_hash, suffix])
-    asm = ccinfo.asm[backend.binary_ext]  # store binary data once
+    binary_ext = getattr(ccinfo.metadata, "binary_ext", backend.binary_ext)
+    asm = ccinfo.asm[binary_ext]  # store binary data once

     hex_ = str(binascii.hexlify(asm))[2:-1]
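
Note the asymmetry with compiler.py above: there the metadata is a plain dict (hence .get), while ccinfo.metadata in the AOT tool is an object, hence getattr with the same backend fallback. A minimal illustration with hypothetical stand-ins:

    from types import SimpleNamespace

    backend_default = "spv"  # stand-in for backend.binary_ext
    assert getattr(SimpleNamespace(binary_ext="zebin"), "binary_ext", backend_default) == "zebin"
    assert getattr(SimpleNamespace(), "binary_ext", backend_default) == "spv"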

scripts/skiplist/lts/language.txt

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
 # https://github.com/intel/intel-xpu-backend-for-triton/issues/4665
 python/test/unit/language/test_core.py::test_dot3d[8-1-32-32-32-32-32-float64-float64]
 python/test/unit/language/test_core.py::test_dot3d[4-1-64-64-64-32-32-float64-float64]
+# Below bfloat16 tests require IGC 1188 or above
+python/test/unit/language/test_core.py::test_atomic_rmw[r".*bfloat16.*"]@regexp
+python/test/unit/language/test_core.py::test_tensor_atomic_rmw[r".*bfloat16.*"]@regexp
+python/test/unit/language/test_core.py::test_tensor_atomic_add_non_exclusive_offset[r".*bfloat16.*"]@regexp
+python/test/unit/language/test_core.py::test_tensor_atomic_add_shift_1[r".*bfloat16.*"]@regexp
+python/test/unit/language/test_core.py::test_tensor_atomic_add_access_patterns[r".*bfloat16.*"]@regexp
+python/test/unit/language/test_tensor_descriptor.py::test_tensor_descriptor_reduce[r".*bfloat16.*"]@regexp
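
Note: the @regexp suffix is a convention of this repo's skiplist tooling, not of pytest: the bracketed part is matched as a regular expression against the test's parametrization, so a single line covers every bfloat16 variant. A rough sketch of such matching, assuming that convention:

    import re

    def entry_matches(entry: str, test_id: str) -> bool:
        # 'name[r"PATTERN"]@regexp' -> regex match over parameters; otherwise literal.
        if entry.endswith("@regexp"):
            name, _, rest = entry.partition('[r"')
            pattern = rest[:-len('"]@regexp')]
            return test_id.startswith(name + "[") and re.search(pattern, test_id) is not None
        return entry == test_id

    print(entry_matches(
        'python/test/unit/language/test_core.py::test_atomic_rmw[r".*bfloat16.*"]@regexp',
        'python/test/unit/language/test_core.py::test_atomic_rmw[add-bfloat16-all_neg-acq_rel]'))  # True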

setup.py

Lines changed: 9 additions & 0 deletions
@@ -481,6 +481,15 @@ def build_extension(self, ext):
             cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir)
         cmake_args.extend(thirdparty_cmake_args)

+        result = subprocess.run(["bash", "./scripts/capture-hw-details.sh"], stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE, check=True, text=True, env=os.environ.copy())
+        agama_version = None
+        for line in result.stdout.splitlines():
+            if line.startswith("AGAMA_VERSION="):
+                agama_version = line.split("=", 1)[1].strip()
+                break
+        cmake_args.append(f"-DAGAMA_VERSION={agama_version}")
+
         # configuration
         cfg = get_build_type()
         build_args = ["--config", cfg]
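
Note: the build now shells out to scripts/capture-hw-details.sh and forwards the detected driver version to CMake. Assuming the script prints KEY=VALUE lines (the format the loop above expects), the round trip looks like this; the GPU_DEVICE line is a hypothetical example:

    stdout = "GPU_DEVICE=Intel(R) Data Center GPU Max 1550\nAGAMA_VERSION=1146\n"

    agama_version = None
    for line in stdout.splitlines():
        if line.startswith("AGAMA_VERSION="):
            agama_version = line.split("=", 1)[1].strip()
            break

    print(f"-DAGAMA_VERSION={agama_version}")  # -> -DAGAMA_VERSION=1146

The resulting -DAGAMA_VERSION value is what FindSPIRVToLLVMTranslator.cmake (below) compares against "1146" to decide whether to apply 3122.patch.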

third_party/intel/backend/compiler.py

Lines changed: 44 additions & 40 deletions
@@ -376,55 +376,57 @@ def make_spv(src, metadata, options, device_arch):

         if knobs.intel.disable_igc_opt:
             metadata["build_flags"] += " -cl-opt-disable"
+        return spirv
+
+    @staticmethod
+    def make_zebin(src, metadata, options, device_arch):
+        metadata["binary_ext"] = "zebin"

         shader_dump_opt = ""
         if knobs.intel.dump_shader_info:
             # The IGC (Intel Graphic Compiler) only parses the options at first time in JIT-ing the binary per process.
             # Have to use the `ocloc` to generate the binary in sub-process to work around the limitation.
-            assert options.generate_native_code, "Only support native code generation with shader dump"
             shader_dump_opt = f" -igc_opts ',DumpToCustomDir={metadata['cache_dir']},ShaderDumpEnable=1'"

         metadata["generate_native_code"] = options.generate_native_code

-        if options.generate_native_code:
-            with track("generate_native_code"), tempfile.TemporaryDirectory() as temp_dir:
-                with tempfile.NamedTemporaryFile(mode='wb', suffix='.spv', dir=temp_dir, delete=False) as fsrc:
-                    fsrc.write(spirv)
-                fbin = fsrc.name + '.o'
-
-                ocloc_cmd = [
-                    'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', device_arch,
-                    '-options', metadata["build_flags"] + shader_dump_opt
-                ]
-
-                try:
-                    output = subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
-                    if 'spilled' in output and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1:
-                        """
-                        The exact message is something like:
-                            warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217
-                        is "spilled" enough for now?
-                        """
-                        metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
-                        # re-run with new build flags
-                        ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
-                        subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
-                except subprocess.CalledProcessError as e:
-                    if e.returncode == 255:
-                        error = 'Internal Triton ZEBIN codegen error'
-                    elif e.returncode == 128 + signal.SIGSEGV:
-                        error = '`ocloc` raised SIGSEGV'
-                    else:
-                        error = f'`ocloc` failed with error code {e.returncode}'
-
-                    raise RuntimeError(f'{error}\n'
-                                       f'`ocloc` stderr:\n{e.output}\n'
-                                       f'Repro command: {ocloc_cmd}\n') from e
-
-                with open(fbin, 'rb') as f:
-                    zebin = f.read()
-                return zebin
-        return spirv
+        with track("generate_native_code"), tempfile.TemporaryDirectory() as temp_dir:
+            with tempfile.NamedTemporaryFile(mode='wb', suffix='.spv', dir=temp_dir, delete=False) as fsrc:
+                fsrc.write(src)
+            fbin = fsrc.name + '.o'
+
+            ocloc_cmd = [
+                'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', device_arch, '-options',
+                metadata["build_flags"] + shader_dump_opt
+            ]
+
+            try:
+                output = subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
+                if 'spilled' in output and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1:
+                    """
+                    The exact message is something like:
+                        warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217
+                    is "spilled" enough for now?
+                    """
+                    metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
+                    # re-run with new build flags
+                    ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
+                    subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
+            except subprocess.CalledProcessError as e:
+                if e.returncode == 255:
+                    error = 'Internal Triton ZEBIN codegen error'
+                elif e.returncode == 128 + signal.SIGSEGV:
+                    error = '`ocloc` raised SIGSEGV'
+                else:
+                    error = f'`ocloc` failed with error code {e.returncode}'
+
+                raise RuntimeError(f'{error}\n'
+                                   f'`ocloc` stderr:\n{e.output}\n'
+                                   f'Repro command: {ocloc_cmd}\n') from e
+
+            with open(fbin, 'rb') as f:
+                zebin = f.read()
+            return zebin

     def add_stages(self, stages, options, language):
         if language == Language.TRITON:

@@ -434,6 +436,8 @@ def add_stages(self, stages, options, language):
             stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options)
         stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
         stages["spv"] = lambda src, metadata: self.make_spv(src, metadata, options, self.device_arch)
+        if options.generate_native_code:
+            stages["zebin"] = lambda src, metadata: self.make_zebin(src, metadata, options, self.device_arch)
         if knobs.runtime.add_stages_inspection_hook is not None:
             knobs.runtime.add_stages_inspection_hook(self, stages, options, language, None)
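
Note: the refactor splits the old all-in-one make_spv. It now always returns SPIR-V, and the ocloc invocation lives in a dedicated make_zebin stage registered only when options.generate_native_code is set. Each stage's output feeds the next; a simplified sketch of the chaining (the runner below is illustrative, not the actual driver loop):

    # make_spv / make_zebin stand in for the bound backend methods.
    stages = {
        "spv": lambda src, md: make_spv(src, md, options, device_arch),      # LLVM IR -> SPIR-V bytes
        "zebin": lambda src, md: make_zebin(src, md, options, device_arch),  # SPIR-V -> zebin via ocloc
    }

    def run_pipeline(src, metadata):
        for _name, stage in stages.items():  # dict preserves insertion order: spv, then zebin
            src = stage(src, metadata)
        return src  # zebin bytes; metadata["binary_ext"] == "zebin"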

third_party/intel/cmake/FindSPIRVToLLVMTranslator.cmake

Lines changed: 22 additions & 20 deletions
@@ -26,28 +26,30 @@ if (NOT SPIRVToLLVMTranslator_FOUND)

   FetchContent_MakeAvailable(spirv-llvm-translator)

-  # FIXME: Don't apply patch when Agama driver is updated.
-  execute_process(
-    COMMAND git apply --check ${CMAKE_CURRENT_LIST_DIR}/3122.patch
-    WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-    ERROR_QUIET
-    RESULT_VARIABLE PATCH_RESULT
-  )
-  if(PATCH_RESULT EQUAL 0)
+  # FIXME: Don't apply patch when LTS driver is updated.
+  if(DEFINED AGAMA_VERSION AND AGAMA_VERSION STREQUAL "1146")
     execute_process(
-      COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/3122.patch
-      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-      RESULT_VARIABLE PATCH_RESULT
+      COMMAND git apply --check ${CMAKE_CURRENT_LIST_DIR}/3122.patch
+      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
+      ERROR_QUIET
+      RESULT_VARIABLE PATCH_RESULT
     )
-  else()
-    execute_process( # Check if the patch is already applied
-      COMMAND git apply --reverse --check ${CMAKE_CURRENT_LIST_DIR}/3122.patch
-      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-      RESULT_VARIABLE PATCH_RESULT
-    )
-  endif()
-  if(NOT PATCH_RESULT EQUAL 0)
-    message(FATAL_ERROR "Failed to apply 3122.patch to SPIRV-LLVM-Translator")
+    if(PATCH_RESULT EQUAL 0)
+      execute_process(
+        COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/3122.patch
+        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
+        RESULT_VARIABLE PATCH_RESULT
+      )
+    else()
+      execute_process( # Check if the patch is already applied
+        COMMAND git apply --reverse --check ${CMAKE_CURRENT_LIST_DIR}/3122.patch
+        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
+        RESULT_VARIABLE PATCH_RESULT
+      )
+    endif()
+    if(NOT PATCH_RESULT EQUAL 0)
+      message(FATAL_ERROR "Failed to apply 3122.patch to SPIRV-LLVM-Translator")
+    endif()
   endif()

   # FIXME: Don't apply patch when Agama driver is updated to incorporate with the SPV_INTEL_bfloat16_arithmetic extension.

third_party/intel/lib/Target/SPIRV/SPIRVTranslation.cpp

Lines changed: 2 additions & 1 deletion
@@ -107,7 +107,7 @@ class SmallVectorBuffer : public std::streambuf {

 static SPIRV::TranslatorOpts getSPIRVOpts() {
   SPIRV::TranslatorOpts SPIRVOpts{SPIRV::VersionNumber::SPIRV_1_4};
-  static constexpr std::array<SPIRV::ExtensionID, 18> AllowedExtensions{
+  static constexpr std::array<SPIRV::ExtensionID, 19> AllowedExtensions{
       SPIRV::ExtensionID::SPV_EXT_shader_atomic_float_add,
       SPIRV::ExtensionID::SPV_INTEL_2d_block_io,
       SPIRV::ExtensionID::SPV_INTEL_arbitrary_precision_integers,

@@ -124,6 +124,7 @@ static SPIRV::TranslatorOpts getSPIRVOpts() {
       SPIRV::ExtensionID::SPV_INTEL_tensor_float32_conversion,
       SPIRV::ExtensionID::SPV_INTEL_unstructured_loop_controls,
       SPIRV::ExtensionID::SPV_INTEL_vector_compute,
+      SPIRV::ExtensionID::SPV_KHR_bfloat16,
       SPIRV::ExtensionID::SPV_KHR_bit_instructions,
       SPIRV::ExtensionID::SPV_KHR_non_semantic_info};

third_party/intel/tools/intel/compile.cpp

Lines changed: 0 additions & 3 deletions
@@ -138,9 +138,6 @@ int32_t {kernel_name}(sycl::queue &stream, {signature}) {{
   size_t global_range_y = {gridY};
   size_t global_range_z = {gridZ};
   size_t local_range_x = {num_warps} * {threads_per_warp};
-  if (driver_version.find("+") != std::string::npos) {{
-    local_range_x = 16;
-  }}
   size_t local_range_y = 1;
   size_t local_range_z = 1;
