@@ -376,55 +376,57 @@ def make_spv(src, metadata, options, device_arch):
376376
377377 if knobs .intel .disable_igc_opt :
378378 metadata ["build_flags" ] += " -cl-opt-disable"
379+ return spirv
380+
@staticmethod
def make_zebin(src, metadata, options, device_arch):
    """Compile a SPIR-V module to a native binary by running `ocloc` in a subprocess.

    Args:
        src: SPIR-V payload; written in binary mode to a temporary ``.spv`` file
            that is handed to `ocloc` with ``-spirv_input``.
        metadata: Mutable compilation metadata. Reads ``build_flags`` (and
            ``cache_dir`` when shader dumping is enabled); sets ``binary_ext``
            and ``generate_native_code``; may append
            ``-cl-intel-256-GRF-per-thread`` to ``build_flags``.
        options: Backend options; only ``generate_native_code`` is read here.
        device_arch: Value passed to `ocloc` as ``-device``.

    Returns:
        The bytes of the object file produced by `ocloc`.

    Raises:
        RuntimeError: If `ocloc` exits with a non-zero status. The message
            carries the captured output and the command line for reproduction.
    """
    metadata["binary_ext"] = "zebin"

    shader_dump_opt = ""
    if knobs.intel.dump_shader_info:
        # The IGC (Intel Graphic Compiler) only parses the options at first time in JIT-ing the binary per process.
        # Have to use the `ocloc` to generate the binary in sub-process to work around the limitation.
        shader_dump_opt = f" -igc_opts ',DumpToCustomDir={metadata['cache_dir']},ShaderDumpEnable=1'"

    metadata["generate_native_code"] = options.generate_native_code

    with track("generate_native_code"), tempfile.TemporaryDirectory() as temp_dir:
        # delete=False: the file must outlive this inner `with` so `ocloc` can read it
        # after it has been closed; it is removed together with `temp_dir`.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.spv', dir=temp_dir, delete=False) as fsrc:
            fsrc.write(src)
        fbin = fsrc.name + '.o'

        ocloc_cmd = [
            'ocloc', 'compile', '-file', fsrc.name, '-o', fbin, '-spirv_input', '-device', device_arch, '-options',
            metadata["build_flags"] + shader_dump_opt
        ]

        try:
            output = subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
            # The exact message is something like:
            #     warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217
            # is "spilled" enough for now?
            if 'spilled' in output and "-cl-intel-256-GRF-per-thread" not in metadata["build_flags"]:
                metadata["build_flags"] += " -cl-intel-256-GRF-per-thread"
                # re-run with new build flags
                ocloc_cmd[-1] = metadata["build_flags"] + shader_dump_opt
                subprocess.check_output(ocloc_cmd, stderr=subprocess.STDOUT, text=True)
        except subprocess.CalledProcessError as e:
            if e.returncode == 255:
                error = 'Internal Triton ZEBIN codegen error'
            elif e.returncode == 128 + signal.SIGSEGV:
                error = '`ocloc` raised SIGSEGV'
            else:
                error = f'`ocloc` failed with error code {e.returncode}'

            raise RuntimeError(f'{error}\n'
                               f'`ocloc` stderr:\n{e.output}\n'
                               f'Repro command: {ocloc_cmd}\n') from e

        # Read the result before the temporary directory is cleaned up.
        with open(fbin, 'rb') as f:
            return f.read()
428430
429431 def add_stages (self , stages , options , language ):
430432 if language == Language .TRITON :
@@ -434,6 +436,8 @@ def add_stages(self, stages, options, language):
434436 stages ["ttgir" ] = lambda src , metadata : self .gluon_to_ttgir (src , metadata , options )
435437 stages ["llir" ] = lambda src , metadata : self .make_llir (src , metadata , options )
436438 stages ["spv" ] = lambda src , metadata : self .make_spv (src , metadata , options , self .device_arch )
439+ if options .generate_native_code :
440+ stages ["zebin" ] = lambda src , metadata : self .make_zebin (src , metadata , options , self .device_arch )
437441 if knobs .runtime .add_stages_inspection_hook is not None :
438442 knobs .runtime .add_stages_inspection_hook (self , stages , options , language , None )
439443
0 commit comments