Skip to content

Commit 12f1d1b

Browse files
wychimeta-codesync[bot]
authored and committed
Fix non-contiguous tensor recreation in TritonParse reproducer
Summary: The TritonParse reproducer was generating contiguous tensors when recreating kernel arguments from JSON, but the original tensors had non-contiguous strides and storage offsets. This caused CUDA out-of-bounds memory access errors when kernels computed pointer offsets using the stride information. For example, the K tensor had shape [995415, 4, 64] with stride [1024, 64, 1] and storage_offset 768, but the reproducer created a contiguous tensor with stride [256, 64, 1] and no offset. When the kernel computed addresses like `K + off_h * stride_kh + seq_start * stride_kn`, it would access invalid memory. This change adds proper handling for non-contiguous tensors by: 1. Extracting stride and storage_offset from JSON metadata 2. Creating a properly sized storage buffer 3. Using as_strided() to create views with the correct memory layout The logic is refactored into helper functions to maintain clean code structure: - _apply_stride_and_offset(): Handles stride/offset application - _create_base_tensor(): Creates the base contiguous tensor with data - _create_tensor(): Orchestrates the full tensor creation pipeline Reviewed By: FindHao Differential Revision: D84100085 fbshipit-source-id: c9618cd797bfe65c3ebeeff03ec054be56103d2f
1 parent 9fc4edd commit 12f1d1b

File tree

1 file changed

+166
-115
lines changed

1 file changed

+166
-115
lines changed

tritonparse/reproducer/templates/example.py

Lines changed: 166 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,171 @@ def create_args_from_json(data):
142142
return grid, args_dict
143143

144144

145+
def _apply_stride_and_offset(tensor, shape, stride, storage_offset):
146+
"""
147+
Apply custom stride and storage offset to a tensor if needed.
148+
149+
Args:
150+
tensor: The base contiguous tensor
151+
shape: The desired shape
152+
stride: The desired stride (or None for contiguous)
153+
storage_offset: The desired storage offset
154+
155+
Returns:
156+
torch.Tensor: The strided tensor view or original tensor if contiguous
157+
"""
158+
if stride is None:
159+
return tensor
160+
161+
# Calculate expected contiguous stride
162+
expected_contiguous_stride = []
163+
s = 1
164+
for dim_size in reversed(shape):
165+
expected_contiguous_stride.insert(0, s)
166+
s *= dim_size
167+
168+
# If stride matches contiguous stride and no storage offset, return as-is
169+
if tuple(stride) == tuple(expected_contiguous_stride) and storage_offset == 0:
170+
return tensor
171+
172+
# Calculate required storage size
173+
if len(shape) > 0 and len(stride) > 0:
174+
max_offset = storage_offset
175+
for dim_stride, dim_size in zip(stride, shape):
176+
if dim_size > 0:
177+
max_offset += dim_stride * (dim_size - 1)
178+
storage_size = max_offset + 1
179+
else:
180+
storage_size = storage_offset + 1
181+
182+
# Create larger storage tensor and create strided view
183+
storage_tensor = torch.empty(storage_size, dtype=tensor.dtype, device=tensor.device)
184+
185+
# Create strided view
186+
strided_view = storage_tensor.as_strided(
187+
size=shape, stride=stride, storage_offset=storage_offset
188+
)
189+
190+
# Copy data from the base tensor into the strided layout
191+
strided_view.copy_(tensor.flatten()[: strided_view.numel()].view(shape))
192+
193+
return strided_view
194+
195+
196+
def _create_base_tensor(arg_info) -> torch.Tensor:
197+
if arg_info.get("blob_path"):
198+
return load_tensor(arg_info.get("blob_path"), arg_info.get("device"))
199+
200+
# Extract basic tensor properties
201+
dtype_str = arg_info.get("dtype")
202+
try:
203+
torch_dtype = getattr(torch, dtype_str.split(".")[-1])
204+
except AttributeError:
205+
logging.error(f"Unsupported dtype: {dtype_str}. Defaulting to float32.")
206+
torch_dtype = torch.float32
207+
208+
shape = arg_info.get("shape", [])
209+
device = arg_info.get("device", "cpu")
210+
211+
# Extract statistical information if available
212+
mean = arg_info.get("mean")
213+
std = arg_info.get("std")
214+
min_val = arg_info.get("min")
215+
max_val = arg_info.get("max")
216+
has_stats = (
217+
mean is not None
218+
and std is not None
219+
and min_val is not None
220+
and max_val is not None
221+
)
222+
223+
if arg_info.get("tensor_capture_error", False):
224+
logging.error(
225+
f"Error: Tensor '{arg_info.get('name', '')}' had capture error. Generating random tensor instead."
226+
)
227+
228+
# Use a dummy tensor to check properties of the dtype
229+
tensor_props = torch.empty(0, dtype=torch_dtype)
230+
231+
# Case 1: Floating point types
232+
if tensor_props.is_floating_point():
233+
if has_stats:
234+
# Generate tensor with statistical properties matching original data
235+
if std == 0 or min_val == max_val:
236+
# Constant tensor
237+
return torch.full(shape, mean, dtype=torch_dtype, device=device)
238+
# Generate normal distribution with mean and std, then clamp to [min, max]
239+
tensor = torch.randn(shape, dtype=torch.float32, device=device) * std + mean
240+
tensor = torch.clamp(tensor, min=min_val, max=max_val)
241+
return tensor.to(torch_dtype)
242+
else:
243+
# Fallback to original random generation
244+
if torch_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
245+
tmp = torch.rand(shape, dtype=torch.float32, device=device)
246+
return tmp.to(torch_dtype)
247+
else:
248+
return torch.empty(shape, dtype=torch_dtype, device=device).random_()
249+
250+
# Case 2: Integer types
251+
elif torch_dtype in [
252+
torch.int8,
253+
torch.int16,
254+
torch.int32,
255+
torch.int64,
256+
torch.uint8,
257+
torch.bool,
258+
]:
259+
if has_stats and torch_dtype != torch.bool:
260+
# Generate tensor with statistical properties, then round for integers
261+
if std == 0 or min_val == max_val:
262+
# Constant tensor
263+
return torch.full(shape, int(mean), dtype=torch_dtype, device=device)
264+
tensor = torch.randn(shape, dtype=torch.float32, device=device) * std + mean
265+
tensor = torch.clamp(tensor, min=min_val, max=max_val)
266+
return torch.round(tensor).to(torch_dtype)
267+
else:
268+
# Fallback to original random generation
269+
return torch.empty(shape, dtype=torch_dtype, device=device).random_()
270+
271+
# Case 3: Complex numbers need special handling
272+
elif tensor_props.is_complex():
273+
# Complex types: fallback to original logic for now
274+
# TODO: Could be improved to use statistical info if available
275+
float_dtype = torch.float32 if torch_dtype == torch.complex64 else torch.float64
276+
real_part = torch.rand(shape, dtype=float_dtype, device=device)
277+
imag_part = torch.rand(shape, dtype=float_dtype, device=device)
278+
return torch.complex(real_part, imag_part)
279+
280+
# Case 4: Handle other unsigned integers (like uint32) which fail with random_()
281+
elif "uint" in str(torch_dtype):
282+
if has_stats:
283+
# Generate tensor with statistical properties for unsigned integers
284+
if std == 0 or min_val == max_val:
285+
return torch.full(shape, int(mean), dtype=torch_dtype, device=device)
286+
tensor = torch.randn(shape, dtype=torch.float32, device=device) * std + mean
287+
tensor = torch.clamp(tensor, min=min_val, max=max_val)
288+
return torch.round(tensor).to(torch_dtype)
289+
else:
290+
# Fallback to original random generation
291+
return torch.randint(0, 1000, shape, dtype=torch_dtype, device=device)
292+
293+
# Case 5: If we don't know how to handle the type, raise an error
294+
else:
295+
raise NotImplementedError(
296+
f"Random data generation not implemented for dtype: {torch_dtype}"
297+
)
298+
299+
300+
def _create_tensor(arg_info) -> torch.Tensor:
    """Create a tensor for ``arg_info``, restoring its original memory layout.

    Builds the base contiguous tensor first, then re-applies the captured
    stride/storage_offset so non-contiguous inputs are reproduced faithfully.
    """
    base = _create_base_tensor(arg_info)
    return _apply_stride_and_offset(
        base,
        arg_info.get("shape", []),
        arg_info.get("stride"),
        arg_info.get("storage_offset", 0),
    )
308+
309+
145310
def _create_arg_from_info(arg_info):
146311
"""
147312
Recursively construct a kernel argument from its JSON schema.
@@ -166,121 +331,7 @@ def _create_arg_from_info(arg_info):
166331
return arg_info.get("value")
167332

168333
elif arg_type == "tensor":
169-
if arg_info.get("blob_path"):
170-
return load_tensor(arg_info.get("blob_path"), arg_info.get("device"))
171-
172-
# Extract basic tensor properties
173-
dtype_str = arg_info.get("dtype")
174-
try:
175-
torch_dtype = getattr(torch, dtype_str.split(".")[-1])
176-
except AttributeError:
177-
logging.error(f"Unsupported dtype: {dtype_str}. Defaulting to float32.")
178-
torch_dtype = torch.float32
179-
180-
shape = arg_info.get("shape", [])
181-
device = arg_info.get("device", "cpu")
182-
183-
# Extract statistical information if available
184-
mean = arg_info.get("mean")
185-
std = arg_info.get("std")
186-
min_val = arg_info.get("min")
187-
max_val = arg_info.get("max")
188-
has_stats = (
189-
mean is not None
190-
and std is not None
191-
and min_val is not None
192-
and max_val is not None
193-
)
194-
195-
if arg_info.get("tensor_capture_error", False):
196-
logging.error(
197-
f"Error: Tensor '{arg_info.get('name', '')}' had capture error. Generating random tensor instead."
198-
)
199-
200-
# Use a dummy tensor to check properties of the dtype
201-
tensor_props = torch.empty(0, dtype=torch_dtype)
202-
203-
# Case 1: Floating point types
204-
if tensor_props.is_floating_point():
205-
if has_stats:
206-
# Generate tensor with statistical properties matching original data
207-
if std == 0 or min_val == max_val:
208-
# Constant tensor
209-
return torch.full(shape, mean, dtype=torch_dtype, device=device)
210-
# Generate normal distribution with mean and std, then clamp to [min, max]
211-
tensor = (
212-
torch.randn(shape, dtype=torch.float32, device=device) * std + mean
213-
)
214-
tensor = torch.clamp(tensor, min=min_val, max=max_val)
215-
return tensor.to(torch_dtype)
216-
else:
217-
# Fallback to original random generation
218-
if torch_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
219-
tmp = torch.rand(shape, dtype=torch.float32, device=device)
220-
return tmp.to(torch_dtype)
221-
else:
222-
return torch.empty(
223-
shape, dtype=torch_dtype, device=device
224-
).random_()
225-
226-
# Case 2: Integer types
227-
elif torch_dtype in [
228-
torch.int8,
229-
torch.int16,
230-
torch.int32,
231-
torch.int64,
232-
torch.uint8,
233-
torch.bool,
234-
]:
235-
if has_stats and torch_dtype != torch.bool:
236-
# Generate tensor with statistical properties, then round for integers
237-
if std == 0 or min_val == max_val:
238-
# Constant tensor
239-
return torch.full(
240-
shape, int(mean), dtype=torch_dtype, device=device
241-
)
242-
tensor = (
243-
torch.randn(shape, dtype=torch.float32, device=device) * std + mean
244-
)
245-
tensor = torch.clamp(tensor, min=min_val, max=max_val)
246-
return torch.round(tensor).to(torch_dtype)
247-
else:
248-
# Fallback to original random generation
249-
return torch.empty(shape, dtype=torch_dtype, device=device).random_()
250-
251-
# Case 3: Complex numbers need special handling
252-
elif tensor_props.is_complex():
253-
# Complex types: fallback to original logic for now
254-
# TODO: Could be improved to use statistical info if available
255-
float_dtype = (
256-
torch.float32 if torch_dtype == torch.complex64 else torch.float64
257-
)
258-
real_part = torch.rand(shape, dtype=float_dtype, device=device)
259-
imag_part = torch.rand(shape, dtype=float_dtype, device=device)
260-
return torch.complex(real_part, imag_part)
261-
262-
# Case 4: Handle other unsigned integers (like uint32) which fail with random_()
263-
elif "uint" in str(torch_dtype):
264-
if has_stats:
265-
# Generate tensor with statistical properties for unsigned integers
266-
if std == 0 or min_val == max_val:
267-
return torch.full(
268-
shape, int(mean), dtype=torch_dtype, device=device
269-
)
270-
tensor = (
271-
torch.randn(shape, dtype=torch.float32, device=device) * std + mean
272-
)
273-
tensor = torch.clamp(tensor, min=min_val, max=max_val)
274-
return torch.round(tensor).to(torch_dtype)
275-
else:
276-
# Fallback to original random generation
277-
return torch.randint(0, 1000, shape, dtype=torch_dtype, device=device)
278-
279-
# Case 5: If we don't know how to handle the type, raise an error
280-
else:
281-
raise NotImplementedError(
282-
f"Random data generation not implemented for dtype: {torch_dtype}"
283-
)
334+
return _create_tensor(arg_info)
284335

285336
elif arg_type == "triton_kernels.tensor.Tensor":
286337
if not TRITON_KERNELS_CUSTOM_TYPES:

0 commit comments

Comments
 (0)