[KERNELS] Add an option to avoid device sync on launch_metadata. (#7296)

yongjik · web-flow · commit 1ab9f6530b61 · 2025-06-25T11:46:56.000-07:00
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_common.py
@@ -78,17 +78,37 @@ def convert_dtype(dtype):
 
 
 def matmul_launch_metadata(grid, kernel, args):
+    from ..proton_opts import launch_metadata_allow_sync
+
     ret = dict()
     M, N, K = args["M"], args["N"], args["K"]
     Y, X, W = [t.base if isinstance(t, TensorDescriptor) else t for t in [args["Y"], args["X"], args["W"]]]
+    tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION")
     hist = args["ExptHist"]
     if hist is not None:
-        n_tokens = float(hist.sum())
-        n_w_bytes = (W.numel() * W.element_size() // hist.numel()) * (hist > 0).sum()
+        # If annotation is given, use that to generate name for profiling.
+        if tokens_per_expt is not None:
+            n_rows = f"{tokens_per_expt}*"
+        elif launch_metadata_allow_sync():
+            n_rows = int(hist.float().mean())
+        else:
+            n_rows = "unknown"
+
+        if launch_metadata_allow_sync():
+            n_tokens = float(hist.sum())
+            n_w_bytes = (W.numel() * W.element_size() // hist.numel()) * (hist > 0).sum()
+        elif tokens_per_expt is not None:
+            n_tokens = tokens_per_expt * args["N_EXPTS_TOT"]
+            # This may not be totally correct (e.g., we might not be using all experts)
+            # but it's better than nothing.
+            n_w_bytes = W.numel() * W.element_size()
+        else:
+            n_tokens = None
+            n_w_bytes = 0
 
         # If annotation is given, use that to generate name for profiling.
         tokens_per_expt = args.get("TOKENS_PER_EXPT_FOR_ANNOTATION")
-        n_rows = f"{tokens_per_expt}*" if tokens_per_expt is not None else int(hist.float().mean())
+        n_rows = f"{tokens_per_expt}*" if tokens_per_expt is not None else n_rows
     else:
         n_tokens = None
         n_w_bytes = W.numel() * W.element_size()
@@ -101,6 +121,10 @@ def matmul_launch_metadata(grid, kernel, args):
     ep_subtile = args["EPILOGUE_SUBTILE"]
     if ep_subtile is not None and ep_subtile > 1:
         ret["name"] += f" ep/{ep_subtile}"
+
+    if hist is not None and n_tokens is None:
+        return ret  # Don't fill metadata because we can't compute them properly.
+
     fM = M if M is not None else n_tokens
     fK = K if K is not None else n_tokens
     ret[f"flops{nbits}"] = 2.0 * fM * N * fK
@@ -115,7 +139,7 @@ def matmul_launch_metadata(grid, kernel, args):
         assert n_tokens is not None
         n_expts_act = args["N_EXPTS_ACT"]
 
-        if gindx is not None:
+        if (gindx is not None) and launch_metadata_allow_sync():
             # recreate inverse GatherIndx.
             dst = torch.full_like(gindx, -1)
             idx = torch.arange(len(gindx), device=gindx.device, dtype=torch.int32)
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py
@@ -29,7 +29,8 @@ def _zero_masked_rows(
 
 
 _matmul_ogs_repr = make_matmul_repr("_matmul_ogs", [0, 1, 2])
-@triton.jit(repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
+@triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"],
+            repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
 def _matmul_ogs(
              Y, Out, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
              YExpectedScale, YActualScale, YChecksumScale,
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py
@@ -96,7 +96,8 @@ def _load_writeback_idx_and_mask(WriteBackIndx, writeback_size, offs, mask):
 
 
 _matmul_ogs_repr = make_matmul_repr("_p_matmul_ogs", [0, 1, 2])
-@triton.jit(repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
+@triton.jit(do_not_specialize=["TOKENS_PER_EXPT_FOR_ANNOTATION"],
+            repr=_matmul_ogs_repr, launch_metadata=matmul_launch_metadata)
 def _p_matmul_ogs(
              Y, Out, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
              YExpectedScale, YActualScale, YChecksumScale,
diff --git a/python/triton_kernels/triton_kernels/proton_opts.py b/python/triton_kernels/triton_kernels/proton_opts.py
@@ -0,0 +1,31 @@
+# proton options
+#
+# Process-wide switches consulted by kernel launch-metadata hooks.
+
+import os
+
+# Cached setting: None until first queried or explicitly set, then a bool.
+_launch_metadata_allow_sync = None
+
+
+def launch_metadata_allow_sync() -> bool:
+    """Return whether launch-metadata hooks may synchronize with the device.
+
+    Unless a value was stored via set_launch_metadata_allow_sync(), the first
+    call reads the PROTON_LAUNCH_METADATA_NOSYNC environment variable and
+    caches the result: syncing is allowed unless the variable is exactly "1".
+    Later changes to the environment variable are not observed.
+    """
+    global _launch_metadata_allow_sync
+    if _launch_metadata_allow_sync is None:
+        _launch_metadata_allow_sync = os.getenv("PROTON_LAUNCH_METADATA_NOSYNC") != "1"
+    return _launch_metadata_allow_sync
+
+
+def set_launch_metadata_allow_sync(allow_sync: bool) -> None:
+    """Store the setting directly, overriding PROTON_LAUNCH_METADATA_NOSYNC.
+
+    The stored value is what launch_metadata_allow_sync() returns afterwards.
+    """
+    global _launch_metadata_allow_sync
+    _launch_metadata_allow_sync = allow_sync