
Commit 902ee15

Sync changes from NV compiler.py (#5122)
1. Renamed `threads_per_warp` to `warp_size`.
2. Changed some passes under `gluon_to_ttgir` from `ttgpuir` to `gluon`.

Signed-off-by: Whitney Tsang <[email protected]>
Co-authored-by: Ilya Enkovich <[email protected]>
1 parent 6704582 commit 902ee15
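For illustration, a minimal before/after sketch of the user-facing rename on an XPU target; the kernel, tensors, and values below are hypothetical, and only the keyword change is taken from this commit.

# Before this commit (hypothetical kernel and arguments)
my_kernel[(1, )](x, y, BLOCK=128, num_warps=4, threads_per_warp=16)

# After this commit: same launch, renamed keyword
my_kernel[(1, )](x, y, BLOCK=128, num_warps=4, warp_size=16)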

3 files changed: 13 additions, 14 deletions

python/test/unit/language/test_core.py

Lines changed: 2 additions & 3 deletions
@@ -2393,8 +2393,7 @@ def kernel(X, Z, BLOCK: tl.constexpr):
     # triton result
     z_tri = to_triton(numpy_random((1, ), dtype_str=z_dtype_str), device=device, dst_type=z_tri_dtype_str)
     if is_xpu():
-        kernel[(1, )](x_tri, z_tri, BLOCK=shape, num_ctas=num_ctas, num_warps=num_warps,
-                      threads_per_warp=threads_per_warp)
+        kernel[(1, )](x_tri, z_tri, BLOCK=shape, num_ctas=num_ctas, num_warps=num_warps, warp_size=threads_per_warp)
     else:
         kernel[(1, )](x_tri, z_tri, BLOCK=shape, num_ctas=num_ctas)
     z_tri = to_numpy(z_tri)
@@ -2527,7 +2526,7 @@ def kernel(X, Z, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.const
     kern_kwargs = {}
     if is_xpu():
         kern_kwargs['num_warps'] = num_warps
-        kern_kwargs['threads_per_warp'] = threads_per_warp
+        kern_kwargs['warp_size'] = threads_per_warp
     if axis is not None and axis >= len(shape):
         with pytest.raises(triton.TritonError):
             kernel[(1, )](x_tri, z_tri, BLOCK_M=shape[0], BLOCK_N=shape[1], BLOCK_K=BLOCK_K, IS_3D=IS_3D, AXIS=axis,

python/tutorials/02-fused-softmax.py

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ def allocated_slm_size(size_smem):
 
     # pre-compile kernel to get register usage and compute thread occupancy.
     kernel = softmax_kernel.warmup(y, x, x.stride(0), y.stride(0), n_rows, n_cols, num_warps=num_warps,
-                                   threads_per_warp=WARP_SIZE, BLOCK_SIZE=BLOCK_SIZE, grid=(1, ))
+                                   warp_size=WARP_SIZE, BLOCK_SIZE=BLOCK_SIZE, grid=(1, ))
     kernel._init_handles()
     size_smem = kernel.metadata.shared
     num_programs = occupancy(num_warps, size_smem)

third_party/intel/backend/compiler.py

Lines changed: 10 additions & 10 deletions
@@ -21,7 +21,7 @@ class XPUOptions:
     num_ctas: int = 1
     num_stages: int = 2
     cluster_dims: tuple = (1, 1, 1)
-    threads_per_warp: int = 32
+    warp_size: int = 32
     optimize_epilogue: bool = False
     enable_fp_fusion: bool = True
     launch_cooperative_grid: bool = False
@@ -177,10 +177,10 @@ def load_dialects(self, ctx):
 
     @staticmethod
     def validate_options(opt, properties):
-        # Check threads_per_warp and num_threads are within limits.
-        if opt.threads_per_warp not in properties['sub_group_sizes']:
+        # Check warp_size and num_threads are within limits.
+        if opt.warp_size not in properties['sub_group_sizes']:
             raise ValueError(
-                f"threads_per_warp={opt.threads_per_warp} is unsupported for the target (supported values are {properties['sub_group_sizes']})"
+                f"warp_size={opt.warp_size} is unsupported for the target (supported values are {properties['sub_group_sizes']})"
             )
         if opt.num_warps > properties['max_num_sub_groups']:
             raise ValueError(
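As a side note, here is a minimal sketch of what the renamed check amounts to; the `opt` and `properties` values below are hypothetical stand-ins rather than anything produced by the real driver query, and the snippet simply mirrors the condition above outside the backend class.

from types import SimpleNamespace

# Hypothetical stand-ins for the compiler options and device properties.
opt = SimpleNamespace(warp_size=64, num_warps=8)
properties = {"sub_group_sizes": [16, 32], "max_num_sub_groups": 64}

try:
    # warp_size must be one of the device's supported sub-group sizes.
    if opt.warp_size not in properties["sub_group_sizes"]:
        raise ValueError(f"warp_size={opt.warp_size} is unsupported for the target "
                         f"(supported values are {properties['sub_group_sizes']})")
except ValueError as e:
    print(e)  # warp_size=64 is unsupported for the target (supported values are [16, 32])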
@@ -197,7 +197,7 @@ def annotate_module(mod, properties, opt, target_arch):
         module_opts.support_sg_2d_block = properties["has_subgroup_2d_block_io"]
         module_opts.support_dpas = properties["has_subgroup_matrix_multiply_accumulate"]
         module_opts.support_bf16_conversion = properties["has_bfloat16_conversions"]
-        module_opts.threads_per_warp = opt.threads_per_warp
+        module_opts.threads_per_warp = opt.warp_size
         module_opts.target_arch = target_arch
         intel.passes.ttgpuir.add_triton_annotate_module(pm, module_opts)
         pm.run(mod)
@@ -241,8 +241,8 @@ def make_ttgir(mod, metadata, opt, properties):
         # Annotate module with information required by subsequent transformations.
         XPUBackend.annotate_module(mod, properties, opt, "spir64")
 
-        # Overwrite the threads_per_warp option with the module annotation.
-        opt.threads_per_warp = intel.get_threads_per_warp(mod)
+        # Overwrite the warp_size option with the module annotation.
+        opt.warp_size = intel.get_threads_per_warp(mod)
         XPUBackend.validate_options(opt, properties)
 
         if (properties["has_subgroup_2d_block_io"] and properties["has_subgroup_matrix_multiply_accumulate"]
@@ -251,7 +251,7 @@ def make_ttgir(mod, metadata, opt, properties):
 
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
+        passes.ttir.add_convert_to_ttgpuir(pm, "xpu", opt.num_warps, opt.warp_size, opt.num_ctas)
         # optimize TTGIR
         intel.passes.ttgpuir.add_coalesce(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
@@ -296,11 +296,11 @@ def gluon_to_ttgir(self, src, metadata, options):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
 
-        passes.ttgpuir.add_inliner(pm)
+        passes.gluon.add_inliner(pm)
         passes.gluon.add_resolve_auto_encodings(pm)
         passes.common.add_sccp(pm)
         passes.ttir.add_loop_aware_cse(pm)
-        passes.ttgpuir.add_canonicalizer(pm)
+        passes.gluon.add_canonicalizer(pm)
         passes.ttgpuir.add_combine_tensor_select_and_if(pm)
 
         pm.run(mod)
