Skip to content

Commit e84d1fa

Browse files
Add kernel-run-level tensor blob save controls
Summary: Add env vars `TRITONPARSE_TENSOR_SAVE_SKIP_RUNS` and `TRITONPARSE_TENSOR_SAVE_MAX_RUNS` to skip/limit tensor blob saving at the kernel-run granularity.

A "kernel run" counts all autotune benchmark launches plus the winner launch for a single kernel invocation as one run (autotune benchmarks are detected via stack frame inspection and excluded from the run counter). While disk writes are already deduped via content-addressed BLAKE2b hashing, the serialization and hashing overhead still occurs per launch. These controls let users skip that overhead for runs they don't need blobs from.

- `TRITONPARSE_TENSOR_SAVE_SKIP_RUNS=N`: skip blob saving for the first N kernel runs
- `TRITONPARSE_TENSOR_SAVE_MAX_RUNS=N`: save blobs for at most N runs after skipping (0 = unlimited)

Also exposes these as `tensor_save_skip_runs` / `tensor_save_max_runs` params on `init()` and `TritonParseManager`.

Reviewed By: FindHao

Differential Revision: D96661726

fbshipit-source-id: c423799da37d21ead475740d3b33326444bd4fac
1 parent 04ea4d3 commit e84d1fa

File tree

3 files changed

+134
-2
lines changed

3 files changed

+134
-2
lines changed

tests/gpu/test_tensor_blob.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,74 @@ def count_all_blobs(manager_dir_path):
265265
)
266266
print("✓ Storage correctly disabled when enable_tensor_blob_storage=False")
267267

268+
# === Test 5: Skip/Max Runs ===
269+
print("\n=== Test 5: Skip/Max Runs ===")
270+
271+
# Test 5a: tensor_save_max_runs=1 — only first kernel run saves blobs
272+
print("\n--- Test 5a: max_runs=1 ---")
273+
temp_output_dir_5a = tempfile.mkdtemp()
274+
275+
with tritonparse.context_manager.TritonParseManager(
276+
enable_trace_launch=True,
277+
enable_tensor_blob_storage=True,
278+
tensor_save_max_runs=1,
279+
out=temp_output_dir_5a,
280+
) as manager:
281+
# Run 3 kernels with different inputs so blobs are not deduped
282+
for i in range(3):
283+
x = torch.randn(
284+
(512,), device=self.cuda_device, dtype=torch.float32
285+
) * (i + 1)
286+
y = run_kernel(x)
287+
y.sum()
288+
torch.cuda.synchronize()
289+
290+
blobs_5a = count_all_blobs(manager.dir_path)
291+
print(f" Blobs with max_runs=1 over 3 launches: {blobs_5a}")
292+
# With max_runs=1, only the first kernel run saves blobs (input + output = 2)
293+
# Runs 2 and 3 should not save any blobs
294+
self.assertGreater(
295+
blobs_5a, 0, "Should have at least 1 blob from first run"
296+
)
297+
self.assertLessEqual(
298+
blobs_5a, 2, "Should have at most 2 blobs (input+output of first run)"
299+
)
300+
print(f"✓ max_runs=1: {blobs_5a} blob(s) saved (only first kernel run)")
301+
302+
# Test 5b: skip_runs=1, max_runs=2 — only second and third kernel runs save blobs
303+
print("\n--- Test 5b: skip_runs=1, max_runs=2 ---")
304+
temp_output_dir_5b = tempfile.mkdtemp()
305+
306+
with tritonparse.context_manager.TritonParseManager(
307+
enable_trace_launch=True,
308+
enable_tensor_blob_storage=True,
309+
tensor_save_skip_runs=1,
310+
tensor_save_max_runs=2,
311+
out=temp_output_dir_5b,
312+
) as manager:
313+
for i in range(4):
314+
x = torch.randn(
315+
(512,), device=self.cuda_device, dtype=torch.float32
316+
) * (i + 1)
317+
y = run_kernel(x)
318+
y.sum()
319+
torch.cuda.synchronize()
320+
321+
blobs_5b = count_all_blobs(manager.dir_path)
322+
print(f" Blobs with skip=1, max=2 over 4 launches: {blobs_5b}")
323+
# skip=1 skips first run, max=2 saves runs 2 and 3, skips run 4
324+
# Each saved run has input + output = 2 blobs, so expect 3-4 blobs
325+
# (dedup may reduce if outputs happen to match)
326+
self.assertGreater(
327+
blobs_5b, 0, "Should have at least 1 blob from saved runs"
328+
)
329+
self.assertLessEqual(
330+
blobs_5b,
331+
4,
332+
"Should have at most 4 blobs (input+output of runs 2 and 3)",
333+
)
334+
print(f"✓ skip=1, max=2: {blobs_5b} blob(s) saved (runs 2 and 3 only)")
335+
268336
# Clean up all test outputs
269337
try:
270338
if TEST_KEEP_OUTPUT:
@@ -273,14 +341,18 @@ def count_all_blobs(manager_dir_path):
273341
f" Test 1: {temp_output_dir_1}\n"
274342
f" Test 2: {temp_output_dir_2}\n"
275343
f" Test 3: {temp_output_dir_3}\n"
276-
f" Test 4: {temp_output_dir_4}"
344+
f" Test 4: {temp_output_dir_4}\n"
345+
f" Test 5a: {temp_output_dir_5a}\n"
346+
f" Test 5b: {temp_output_dir_5b}"
277347
)
278348
else:
279349
for temp_dir in [
280350
temp_output_dir_1,
281351
temp_output_dir_2,
282352
temp_output_dir_3,
283353
temp_output_dir_4,
354+
temp_output_dir_5a,
355+
temp_output_dir_5b,
284356
]:
285357
if os.path.exists(temp_dir):
286358
shutil.rmtree(temp_dir)

tritonparse/context_manager.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ def __init__(
2020
split_inductor_compilations=True,
2121
enable_tensor_blob_storage=False,
2222
tensor_storage_quota=None,
23+
tensor_save_skip_runs=None,
24+
tensor_save_max_runs=None,
2325
log_dir=None,
2426
keep_logs=False,
2527
**parse_kwargs,
@@ -32,6 +34,8 @@ def __init__(
3234
split_inductor_compilations: Whether to split inductor compilations in the output
3335
enable_tensor_blob_storage: Whether to enable tensor blob storage
3436
tensor_storage_quota: Storage quota in bytes for tensor blobs (default: 100GB)
37+
tensor_save_skip_runs: Skip tensor blob saving for the first N kernel runs
38+
tensor_save_max_runs: Save tensor blobs for at most N kernel runs after skipping
3539
log_dir: Optional directory path to store raw trace logs. If not provided,
3640
a temporary directory will be created and cleaned up after parsing.
3741
If provided, the directory will be created if it doesn't exist and
@@ -45,6 +49,8 @@ def __init__(
4549
self.split_inductor_compilations = split_inductor_compilations
4650
self.enable_tensor_blob_storage = enable_tensor_blob_storage
4751
self.tensor_storage_quota = tensor_storage_quota
52+
self.tensor_save_skip_runs = tensor_save_skip_runs
53+
self.tensor_save_max_runs = tensor_save_max_runs
4854
self.user_log_dir = log_dir
4955
self.keep_logs = keep_logs
5056
self.parse_kwargs = parse_kwargs
@@ -69,6 +75,10 @@ def __enter__(self):
6975
}
7076
if self.tensor_storage_quota is not None:
7177
init_kwargs["tensor_storage_quota"] = self.tensor_storage_quota
78+
if self.tensor_save_skip_runs is not None:
79+
init_kwargs["tensor_save_skip_runs"] = self.tensor_save_skip_runs
80+
if self.tensor_save_max_runs is not None:
81+
init_kwargs["tensor_save_max_runs"] = self.tensor_save_max_runs
7282

7383
init(self.dir_path, **init_kwargs)
7484
return self

tritonparse/structured_logging.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@
8989

9090
# The flag to mark if launch is traced. It is used to avoid initilizing the launch hook twice.
9191
_trace_launch_enabled = False
92+
# Kernel run counter and per-launch blob save flag for skip/max runs gating
93+
_kernel_run_count = 0
94+
_save_blobs_for_current_launch = True
9295
# Enable tensor blob storage
9396
TRITONPARSE_SAVE_TENSOR_BLOBS = os.getenv("TRITONPARSE_SAVE_TENSOR_BLOBS", "0") in [
9497
"1",
@@ -109,6 +112,14 @@
109112
TRITONPARSE_COMPRESSION_LEVEL = 4
110113
# Log statistics every N saved blobs
111114
TRITONPARSE_STATS_LOG_FREQUENCY = 100
115+
# Skip tensor blob saving for the first N kernel runs (0 = no skip)
116+
TRITONPARSE_TENSOR_SAVE_SKIP_RUNS = int(
117+
os.getenv("TRITONPARSE_TENSOR_SAVE_SKIP_RUNS", "0")
118+
)
119+
# Save tensor blobs for at most N kernel runs after skipping (0 = unlimited)
120+
TRITONPARSE_TENSOR_SAVE_MAX_RUNS = int(
121+
os.getenv("TRITONPARSE_TENSOR_SAVE_MAX_RUNS", "0")
122+
)
112123

113124
TRITON_TRACE_HANDLER = None
114125
# Global tensor blob manager instance
@@ -607,7 +618,11 @@ def _log_torch_tensor_info(tensor_value):
607618
arg_info["tensor_capture_error"] = str(e)
608619

609620
# Add tensor blob storage if enabled
610-
if TRITONPARSE_SAVE_TENSOR_BLOBS and TENSOR_BLOB_MANAGER is not None:
621+
if (
622+
TRITONPARSE_SAVE_TENSOR_BLOBS
623+
and TENSOR_BLOB_MANAGER is not None
624+
and _save_blobs_for_current_launch
625+
):
611626
blob_info = TENSOR_BLOB_MANAGER.save_tensor_blob(tensor_value)
612627
arg_info.update(blob_info)
613628
return arg_info
@@ -1428,6 +1443,8 @@ def extract_arg_info(arg_dict):
14281443

14291444

14301445
def add_launch_metadata(grid, metadata, arg_dict, inductor_args=None):
1446+
global _kernel_run_count, _save_blobs_for_current_launch
1447+
14311448
# Check if we're in CUDA graph capture mode - if so, skip detailed argument extraction
14321449
# to avoid CUDA errors (cudaErrorStreamCaptureUnsupported)
14331450
is_capturing = False
@@ -1448,6 +1465,25 @@ def add_launch_metadata(grid, metadata, arg_dict, inductor_args=None):
14481465
)
14491466
}
14501467

1468+
# Gate tensor blob saving based on skip/max runs
1469+
if TRITONPARSE_SAVE_TENSOR_BLOBS:
1470+
skip = TRITONPARSE_TENSOR_SAVE_SKIP_RUNS
1471+
max_runs = TRITONPARSE_TENSOR_SAVE_MAX_RUNS
1472+
if skip > 0 or max_runs > 0:
1473+
# Only capture the stack when we actually need kernel run counting
1474+
from .parse.sourcemap_utils import _is_autotune_benchmark_launch
1475+
1476+
if not _is_autotune_benchmark_launch(get_stack_trace()):
1477+
_kernel_run_count += 1
1478+
_save_blobs_for_current_launch = not (
1479+
_kernel_run_count <= skip
1480+
or (max_runs > 0 and _kernel_run_count > skip + max_runs)
1481+
)
1482+
else:
1483+
_save_blobs_for_current_launch = True
1484+
else:
1485+
_save_blobs_for_current_launch = False
1486+
14511487
# Extract detailed argument information (only when NOT capturing)
14521488
extracted_args = extract_arg_info(arg_dict)
14531489
extracted_inductor_args = extract_arg_info(inductor_args) if inductor_args else {}
@@ -1691,6 +1727,8 @@ def init(
16911727
enable_tensor_blob_storage: bool = False,
16921728
tensor_storage_quota: Optional[int] = None,
16931729
compression: Optional[str] = None,
1730+
tensor_save_skip_runs: Optional[int] = None,
1731+
tensor_save_max_runs: Optional[int] = None,
16941732
):
16951733
"""
16961734
This function is a wrapper around init_basic() that also sets up the compilation listener. Its arguments have higher priority than the environment variables for same settings.
@@ -1712,12 +1750,15 @@ def init(
17121750
tensor_storage_quota (Optional[int]): Storage quota in bytes for tensor blobs (default: 100GB).
17131751
compression (Optional[str]): Compression format for trace files ("none", "gzip", or "clp").
17141752
If not specified, respects TRITON_TRACE_COMPRESSION env var, or defaults to "none".
1753+
tensor_save_skip_runs (Optional[int]): Skip tensor blob saving for the first N kernel runs.
1754+
tensor_save_max_runs (Optional[int]): Save tensor blobs for at most N kernel runs after skipping.
17151755
"""
17161756
global TRITON_TRACE_LAUNCH, TRITON_TRACE_LAUNCH_WITHIN_PROFILING
17171757
global TRITONPARSE_MORE_TENSOR_INFORMATION
17181758
global TORCHINDUCTOR_RUN_JIT_POST_COMPILE_HOOK, TRITONPARSE_DUMP_SASS
17191759
global TRITONPARSE_SAVE_TENSOR_BLOBS, TRITONPARSE_TENSOR_STORAGE_QUOTA
17201760
global TRITON_TRACE_COMPRESSION
1761+
global TRITONPARSE_TENSOR_SAVE_SKIP_RUNS, TRITONPARSE_TENSOR_SAVE_MAX_RUNS
17211762

17221763
# Set global flags BEFORE calling init_basic, so init_logs() can see them
17231764
# TRITON_TRACE_LAUNCH and TRITON_TRACE_LAUNCH_WITHIN_PROFILING are mutually exclusive.
@@ -1750,6 +1791,12 @@ def init(
17501791
if os.getenv("TRITON_TRACE_COMPRESSION") is None:
17511792
TRITON_TRACE_COMPRESSION = compression
17521793

1794+
# Set tensor save skip/max runs (Python API overrides env var)
1795+
if tensor_save_skip_runs is not None:
1796+
TRITONPARSE_TENSOR_SAVE_SKIP_RUNS = tensor_save_skip_runs
1797+
if tensor_save_max_runs is not None:
1798+
TRITONPARSE_TENSOR_SAVE_MAX_RUNS = tensor_save_max_runs
1799+
17531800
init_basic(trace_folder)
17541801
from triton import knobs
17551802

@@ -1779,6 +1826,7 @@ def clear_logging_config():
17791826
global TRITON_TRACE_HANDLER, triton_trace_folder, _KERNEL_ALLOWLIST_PATTERNS
17801827
global _trace_launch_enabled
17811828
global TENSOR_BLOB_MANAGER
1829+
global _kernel_run_count, _save_blobs_for_current_launch
17821830
# 1. Clean up the log handler
17831831
if TRITON_TRACE_HANDLER is not None:
17841832
if TRITON_TRACE_HANDLER in triton_trace_log.handlers:
@@ -1793,6 +1841,8 @@ def clear_logging_config():
17931841

17941842
# 3. Reset tensor blob manager and related flags
17951843
TENSOR_BLOB_MANAGER = None
1844+
_kernel_run_count = 0
1845+
_save_blobs_for_current_launch = True
17961846

17971847
# 4. Reset Triton knobs
17981848
# Check if triton was actually imported and used

0 commit comments

Comments (0)