@@ -128,16 +128,15 @@ def forward(ctx, expt_scal, expt_indx, n_expts_tot, bitmatrix):
         expt_offs = torch.empty(n_expts_tot, dtype=torch.int32, device=device)
         combined_indx = torch.empty(n_gates_pad * 2, dtype=torch.int32, device=device)
         gate_scal = torch.empty(n_gates_pad, dtype=dtype, device=device)
-        token_offs_combined = empty_aligned((block_m_num + 1, n_expts_tot + 1), torch.int32, device, MEMSET_BLOCK_A)
-        block_pid_map = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates_pad)), torch.int32, device,
-                                      MEMSET_BLOCK_A)
+        token_offs_combined, _ = empty_aligned((block_m_num + 1, n_expts_tot + 1), torch.int32, device, MEMSET_BLOCK_A)
+        block_pid_map, block_pid_map_n_elts = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates_pad)),
+                                                            torch.int32, device, MEMSET_BLOCK_A)
         # slice padded allocations
         combine_indx = combined_indx[:n_gates_pad]
         dispatch_indx = combined_indx[n_gates_pad:]
         token_offs_raw, token_offs_pad = token_offs_combined[0], token_offs_combined[1:]

         # grid sizes
-        block_pid_map_n_elts = block_pid_map.untyped_storage().size() // block_pid_map.dtype.itemsize
         blocks1a = exact_div(block_pid_map_n_elts, MEMSET_BLOCK_A) + token_offs_combined.shape[0]
         blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1
         blocks2a = n_expts_tot * token_offs_pad.shape[0]
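A note on why the removed storage query and the new return value agree: empty_aligned hands back a view of a larger padded allocation, so the view's own numel() undercounts, while the storage-based count matches the padded tensor's numel(), which is what the helper now returns alongside the view. A minimal check, with made-up shapes for illustration:

import torch

full = torch.empty((4, 1024), dtype=torch.int32)    # padded backing allocation
view = full[:, :1000]                                # the trimmed view handed to the caller
n_elts_old = view.untyped_storage().size() // view.dtype.itemsize  # removed computation: whole storage
assert n_elts_old == full.numel() == 4096            # equals the count empty_aligned now returns
assert view.numel() == 4000                          # the view's own numel would undercount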
@@ -198,7 +197,7 @@ def empty_aligned(shape, dtype, device, pad_size):
     pad = lambda x: cdiv(x, pad_size) * pad_size
     ret = torch.empty((*shape[:-1], pad(shape[-1])), dtype=dtype, device=device)
     ret_slices = (*[slice(None)] * (len(shape) - 1), slice(0, shape[-1]))
-    return ret[ret_slices]
+    return ret[ret_slices], ret.numel()


 def max_n_tiles(n_expts_tot, n_gates):
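With this change, empty_aligned returns both the trimmed view and the element count of the padded backing tensor. A self-contained sketch of the new contract; the cdiv helper and the example sizes below are assumptions for illustration:

import torch

def cdiv(x, y):
    # assumed rounding-up division, matching how cdiv is used elsewhere in this diff
    return (x + y - 1) // y

def empty_aligned(shape, dtype, device, pad_size):
    # pad the last dimension up to a multiple of pad_size; return the trimmed view
    # plus the element count of the padded allocation
    pad = lambda x: cdiv(x, pad_size) * pad_size
    ret = torch.empty((*shape[:-1], pad(shape[-1])), dtype=dtype, device=device)
    ret_slices = (*[slice(None)] * (len(shape) - 1), slice(0, shape[-1]))
    return ret[ret_slices], ret.numel()

view, n_elts = empty_aligned((3, 100), torch.int32, "cpu", 512)
assert view.shape == (3, 100)   # callers still see the requested shape
assert n_elts == 3 * 512        # padded element count, divisible by pad_size by construction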
@@ -217,10 +216,11 @@ def compute_expt_data(expt_hist, n_expts_tot, n_gates):
     MEMSET_BLOCK = 512
     dtype = torch.int32
     device = expt_hist.device
-    token_offs_combined = empty_aligned((block_m_num + 1, n_expts_tot + 1), dtype, device, MEMSET_BLOCK)
-    block_pid_map = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates)), dtype, device, MEMSET_BLOCK)
+    token_offs_combined, _ = empty_aligned((block_m_num + 1, n_expts_tot + 1), dtype, device, MEMSET_BLOCK)
+    block_pid_map, block_pid_map_size = empty_aligned((block_m_num, max_n_tiles(n_expts_tot, n_gates)), dtype, device,
+                                                      MEMSET_BLOCK)
     token_offs_raw, token_offs_pad = token_offs_combined[0], token_offs_combined[1:]
-    n_memset_blocks = exact_div(block_pid_map.storage().size(), MEMSET_BLOCK)
+    n_memset_blocks = exact_div(block_pid_map_size, MEMSET_BLOCK)

     _expt_data_memset[(token_offs_combined.shape[0] + n_memset_blocks, )](
         expt_hist, n_expts_tot,  #
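For concreteness, the launch-grid arithmetic for _expt_data_memset with made-up sizes; block_m_num and the padded tile count below are hypothetical:

MEMSET_BLOCK = 512
block_m_num = 9
token_offs_rows = block_m_num + 1                      # token_offs_combined.shape[0]
block_pid_map_size = block_m_num * 1024                # padded element count from empty_aligned
n_memset_blocks = block_pid_map_size // MEMSET_BLOCK   # exact_div in the real code -> 18
grid = (token_offs_rows + n_memset_blocks, )           # 28 programs: one per token_offs row,
                                                       # plus one per MEMSET_BLOCK chunk of block_pid_map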