|
11 | 11 | import hashlib |
12 | 12 | import tempfile |
13 | 13 | import signal |
| 14 | +import re |
14 | 15 | import os |
15 | 16 | import subprocess |
16 | 17 | from pathlib import Path |
| 18 | +from elftools.elf.elffile import ELFFile |
17 | 19 |
|
18 | 20 | try: # XPUBackend allows metaclasses injection |
19 | 21 | from .meta import XPUBackendMeta |
@@ -68,6 +70,23 @@ def hash(self): |
68 | 70 | return hashlib.sha256(key.encode("utf-8")).hexdigest() |
69 | 71 |
|
70 | 72 |
|
# Matches a ``spill_size`` entry followed by ``:`` or ``=`` and captures its
# decimal value.
SPILL_SIZE_RE = re.compile(r'spill_size\s*[:=]\s*(\d+)')


def extract_spill_size_from_zebin(file):
    """Return the spill size recorded in the ``.ze_info`` section of a zebin.

    Args:
        file: Path of the zebin (ELF) file to inspect; it is opened in
            binary mode.

    Returns:
        The integer value of the first ``spill_size`` entry found in the
        decoded section text, or 0 when the section contains no such entry.

    Raises:
        RuntimeError: If the file has no ``.ze_info`` section.
    """
    with open(file, 'rb') as f:
        elf = ELFFile(f)
        zeinfo = elf.get_section_by_name(".ze_info")
        if zeinfo is None:
            # The trailing space after the colon keeps the two adjacent
            # literals from being glued together as "error:Section".
            raise RuntimeError('Internal Triton ZEBIN codegen error: '
                               'Section .ze_info not found in zebin')
        text = zeinfo.data().decode('utf-8')

    # Only the first occurrence is considered; no match means no spill entry.
    match = SPILL_SIZE_RE.search(text)
    if match:
        return int(match.group(1))
    return 0
| 88 | + |
| 89 | + |
71 | 90 | class XPUBackend(BaseBackend, metaclass=XPUBackendMeta): |
72 | 91 | arch_to_impl = {} # Architecture id to backend implementation class mapping |
73 | 92 | binary_ext = "spv" |
@@ -428,21 +447,20 @@ def make_zebin(cls, src, metadata, options): |
428 | 447 |
|
429 | 448 | ocloc_cmd = [ |
430 | 449 | 'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', cls.device_arch, |
431 | | - '-options', metadata["build_flags"] + shader_dump_opt |
| 450 | + '-options', metadata['build_flags'] + shader_dump_opt |
432 | 451 | ] |
433 | 452 |
|
434 | 453 | try: |
435 | | - output = subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True) |
436 | | - if 'spilled' in output and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1: |
437 | | - """ |
438 | | - The exact message is something like: |
439 | | - warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217 |
440 | | - is "spilled" enough for now? |
441 | | - """ |
442 | | - metadata["build_flags"] += " -cl-intel-256-GRF-per-thread" |
443 | | - # re-run with new build flags |
444 | | - ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt |
445 | | - subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True) |
| 454 | + subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True) |
| 455 | + if options.grf_mode == 'default': |
| 456 | + spill_size = extract_spill_size_from_zebin(fbin) |
| 457 | + # The threshold of 1000 for spill_size is chosen based on empirical observations |
| 458 | + # and aligned with triton/backends/intel/driver.c |
| 459 | + if spill_size > 1000: |
| 460 | + metadata["build_flags"] += " -cl-intel-256-GRF-per-thread" |
| 461 | + # re-run with double GRF mode |
| 462 | + ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt |
| 463 | + subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True) |
446 | 464 | except subprocess.CalledProcessError as e: |
447 | 465 | if e.returncode == 255: |
448 | 466 | error = 'Internal Triton ZEBIN codegen error' |
|
0 commit comments