Fix issue #5544: the double GRF mode is not used when building a native binary. (#5560) (#5576)

anmyachev · chengjunlu · web-flow · commit f3892f070347 · 2025-12-01T12:22:57.000+01:00
Get the spill size from the zebin instead of from the ocloc output string. This uses the extra Python package `pyelftools`. --------- (cherry picked from commit ba1d008) Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com> Co-authored-by: Lu, Chengjun <chengjun.lu@intel.com>
diff --git a/python/test/unit/intel/test_native_code_generation.py b/python/test/unit/intel/test_native_code_generation.py
@@ -13,3 +13,18 @@ def kernel(X, SIZE: tl.constexpr):
 
     x = to_triton(numpy_random(SIZE, dtype_str="bfloat16"), device=device, dst_type="bfloat16")
     kernel[(1, )](x, SIZE=SIZE, num_warps=4, generate_native_code=True)
+
+
+def test_auto_large_grf(device):
+    """Check that a native-code build with grf_mode='default' ends up with the 256-GRF flag.
+
+    The kernel is launched with generate_native_code=True and grf_mode='default',
+    and the test passes when the compiled kernel's build flags contain
+    "-cl-intel-256-GRF-per-thread".
+    """
+    SIZE = 1024
+
+    @triton.jit
+    def kernel(X, SIZE: tl.constexpr):
+        # Sort the index range [0, SIZE) in descending order and store it into X.
+        # NOTE(review): presumably this sort on a single warp spills enough registers
+        # to trigger the backend's large-GRF retry -- confirm against the backend threshold.
+        x = tl.arange(0, SIZE)
+        y = tl.sort(x, descending=True)
+        tl.store(X + x, y)
+
+    x = to_triton(numpy_random(SIZE, dtype_str="float32"), device=device, dst_type="float32")
+    # Triton XPU will auto choose large GRF mode for grf_mode='default'
+    k = kernel[(1, )](x, SIZE=SIZE, num_warps=1, generate_native_code=True, grf_mode='default')
+    # The flag is expected to be present in the build flags recorded on the launched kernel.
+    assert "-cl-intel-256-GRF-per-thread" in k.metadata.build_flags
diff --git a/setup.py b/setup.py
@@ -856,6 +856,7 @@ def get_triton_version_suffix():
     description="A language and compiler for custom Deep Learning operations",
     long_description="",
     install_requires=[
+        "pyelftools",
         "importlib-metadata; python_version < '3.10'",
     ],
     packages=list(get_packages()),
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -11,9 +11,11 @@
 import hashlib
 import tempfile
 import signal
+import re
 import os
 import subprocess
 from pathlib import Path
+from elftools.elf.elffile import ELFFile
 
 try:  # XPUBackend allows metaclasses injection
     from .meta import XPUBackendMeta
@@ -68,6 +70,23 @@ def hash(self):
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
 
 
+# Matches "spill_size" followed by ':' or '=' and captures the decimal value after it.
+SPILL_SIZE_RE = re.compile(r'spill_size\s*[:=]\s*(\d+)')
+
+
+def extract_spill_size_from_zebin(file):
+    """Return the spill size recorded in the ``.ze_info`` section of a zebin.
+
+    Args:
+        file: Path of the zebin (ELF) file to inspect.
+
+    Returns:
+        The integer value of the first ``spill_size`` entry found in the
+        decoded section text, or 0 when the section contains no such entry.
+
+    Raises:
+        RuntimeError: If the file has no ``.ze_info`` section.
+    """
+    with open(file, 'rb') as f:
+        elf = ELFFile(f)
+        zeinfo = elf.get_section_by_name(".ze_info")
+        if zeinfo is None:
+            # The two literals are concatenated, so the first one must end with a space.
+            raise RuntimeError('Internal Triton ZEBIN codegen error: '
+                               'Section .ze_info not found in zebin')
+        # Decode while the file is still open; the text is all that is needed afterwards.
+        text = zeinfo.data().decode('utf-8')
+
+    # NOTE(review): only the first match is considered; if a zebin can describe
+    # several kernels, confirm that the first entry is the one that matters.
+    match = SPILL_SIZE_RE.search(text)
+    return int(match.group(1)) if match else 0
+
+
 class XPUBackend(BaseBackend, metaclass=XPUBackendMeta):
     arch_to_impl = {}  # Architecture id to backend implementation class mapping
     binary_ext = "spv"
@@ -427,21 +446,20 @@ def make_zebin(cls, src, metadata, options):
 
             ocloc_cmd = [
                 'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', cls.device_arch,
-                '-options', metadata["build_flags"] + shader_dump_opt
+                '-options', metadata['build_flags'] + shader_dump_opt
             ]
 
             try:
-                output = subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
-                if 'spilled' in output and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1:
-                    """
-                    The exact message is something like:
-                        warning: kernel matmul_kernel  compiled SIMD16 allocated 128 regs and spilled around 217
-                    is "spilled" enough for now?
-                    """
-                    metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
-                    # re-run with new build flags
-                    ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
-                    subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
+                subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
+                if options.grf_mode == 'default':
+                    spill_size = extract_spill_size_from_zebin(fbin)
+                    # The threshold of 1000 for spill_size is chosen based on empirical observations
+                    # and aligned with triton/backends/intel/driver.c
+                    if spill_size > 1000:
+                        metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
+                        # re-run with double GRF mode
+                        ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
+                        subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
             except subprocess.CalledProcessError as e:
                 if e.returncode == 255:
                     error = 'Internal Triton ZEBIN codegen error'