Fix strided slice support for static slices (e.g., buf[::2]) (#426)

yf225 · web-flow · commit 4718678118bf · 2025-08-05T22:02:20.000-07:00
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
@@ -17,6 +17,7 @@
 from .device_function import DeviceFunction
 from .host_function import HostFunction
 from .tile_strategy import DeviceLoopState
+from .utils import compute_slice_size
 from .variable_origin import BlockSizeOrigin
 
 if TYPE_CHECKING:
@@ -227,7 +228,10 @@ def valid_block_size(
             if k is None:
                 continue
             size, stride = size_stride.popleft()
-            if str(k) == "slice(None, None, None)":
+            if isinstance(k, slice):
+                # Slices with steps are not supported in tensor descriptor mode
+                if k.step is not None and k.step != 1:
+                    return False
                 block_size = env.allocate_reduction_dimension(size).from_config(config)
                 if not valid_block_size(block_size, stride, i):
                     return False
@@ -476,10 +480,13 @@ def compute_shape(
                             output_size.append(k)
                         else:
                             output_size.append(1)
-            elif isinstance(k, slice) and str(k) == "slice(None, None, None)":
+            elif isinstance(k, slice):
                 size = input_size.popleft()
-                if size != 1:
-                    rdim = env.allocate_reduction_dimension(size)
+                # Handle slices with steps
+                slice_size = compute_slice_size(k, size)
+
+                if slice_size != 1:
+                    rdim = env.allocate_reduction_dimension(slice_size)
                     output_size.append(rdim.var)
                 else:
                     output_size.append(1)
@@ -531,18 +538,40 @@ def create(
                     # When the index is a scalar (no BlockSizeOrigin), the corresponding dim is eliminated.
                     val = state.device_function.literal_expr(k)
                     index_values.append(f"({val})")
-            elif isinstance(k, slice) and str(k) == "slice(None, None, None)":
+            elif isinstance(k, slice):
                 expand = tile_strategy.expand_str(output_size, output_idx)
                 size = fake_value.size(len(index_values))
-                if size != 1:
-                    rdim = env.allocate_reduction_dimension(size)
-                    block_idx = rdim.block_id
-                    index_var = state.codegen.index_var(block_idx)
-                    index_values.append(f"({index_var}){expand}")
-                    if mask := state.codegen.mask_var(block_idx):
-                        mask_values.setdefault(f"({mask}){expand}")
+
+                # Handle slices with steps
+                if k.step is not None and k.step != 1:
+                    # For strided slices, we need to generate: start + index * step
+                    start = k.start if k.start is not None else 0
+                    step = k.step
+                    slice_size = compute_slice_size(k, size)
+
+                    if slice_size != 1:
+                        rdim = env.allocate_reduction_dimension(slice_size)
+                        block_idx = rdim.block_id
+                        index_var = state.codegen.index_var(block_idx)
+                        # Generate strided index: start + index * step
+                        index_values.append(
+                            f"({start} + ({index_var}) * {step}){expand}"
+                        )
+                        if mask := state.codegen.mask_var(block_idx):
+                            mask_values.setdefault(f"({mask}){expand}")
+                    else:
+                        index_values.append(f"{start}{expand}")
                 else:
-                    index_values.append(f"tl.zeros([1], {dtype}){expand}")
+                    # Full slice or slice without step
+                    if size != 1:
+                        rdim = env.allocate_reduction_dimension(size)
+                        block_idx = rdim.block_id
+                        index_var = state.codegen.index_var(block_idx)
+                        index_values.append(f"({index_var}){expand}")
+                        if mask := state.codegen.mask_var(block_idx):
+                            mask_values.setdefault(f"({mask}){expand}")
+                    else:
+                        index_values.append(f"tl.zeros([1], {dtype}){expand}")
                 output_idx += 1
             elif isinstance(k, torch.Tensor) and k.ndim == 1:
                 expand = tile_strategy.expand_str(output_size, output_idx)
@@ -772,8 +801,15 @@ def create(
                 else:
                     res.offsets.append(state.device_function.literal_expr(k))
                     res.block_shape.append(1)
-            elif isinstance(k, slice) and str(k) == "slice(None, None, None)":
+            elif isinstance(k, slice):
                 size = fake_value.size(len(res.offsets))
+                # Handle slices with steps
+                if k.step is not None and k.step != 1:
+                    # Slices with steps are not supported in block_ptr mode
+                    raise exc.InvalidIndexingType(
+                        f"Strided slices not supported in block_ptr mode: {k}"
+                    )
+                # Full slice or slice without step
                 if size != 1:
                     env = CompileEnvironment.current()
                     rdim = env.allocate_reduction_dimension(size)
diff --git a/helion/_compiler/type_propagation.py b/helion/_compiler/type_propagation.py
@@ -42,6 +42,7 @@
 from .host_function import SymbolOrigin
 from .output_header import library_imports
 from .source_location import current_location
+from .utils import compute_slice_size
 from .variable_origin import ArgumentOrigin
 from .variable_origin import AttributeOrigin
 from .variable_origin import BuiltinOrigin
@@ -437,14 +438,19 @@ def _device_indexing_size(self, key: TypeInfo) -> list[int | torch.SymInt]:
             elif isinstance(k, SymIntType):
                 inputs_consumed += 1
             elif isinstance(k, SliceType):
-                assert str(k.proxy()) == "slice(None, None, None)"
+                # Handle slices - including those with steps
+                slice_obj = k.proxy()
                 size = self.fake_value.size(inputs_consumed)
                 inputs_consumed += 1
+
+                # For slices with steps, we need to calculate the output size differently
+                output_size = compute_slice_size(slice_obj, size)
+
                 if self.origin.is_device():
-                    output_sizes.append(size)
-                elif size != 1:
+                    output_sizes.append(output_size)
+                elif output_size != 1:
                     rdim = CompileEnvironment.current().allocate_reduction_dimension(
-                        size
+                        output_size
                     )
                     output_sizes.append(rdim.var)
                 else:
diff --git a/helion/_compiler/utils.py b/helion/_compiler/utils.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import torch
+
+
+def compute_slice_size(
+    slice_obj: slice, original_size: int | torch.SymInt
+) -> int | torch.SymInt:
+    """
+    Compute the size of a slice operation.
+
+    Slices whose step is ``None`` or ``1`` report ``original_size`` unchanged;
+    their start/stop are not consulted. For any other step, plain-integer
+    inputs follow Python's own slicing rules (out-of-range bounds are clamped,
+    negative indices count from the end, negative steps walk backwards).
+    Non-integer inputs fall back to a ceiling division that is only exact when
+    ``0 <= start <= stop <= original_size`` and ``step > 0``.
+
+    Args:
+        slice_obj: The slice object with start, stop, and step attributes
+        original_size: The size of the dimension being sliced
+
+    Returns:
+        The size of the resulting sliced dimension
+
+    Raises:
+        ValueError: If the step is zero and all inputs are plain integers
+    """
+    step = slice_obj.step
+    if step is None or step == 1:
+        # NOTE(review): start/stop are ignored here, so a bounded unit-step
+        # slice such as buf[2:5] is sized like buf[:] -- confirm the callers
+        # never pass one before relying on this.
+        return original_size
+
+    bounds = [b for b in (slice_obj.start, slice_obj.stop) if b is not None]
+    if all(isinstance(v, int) for v in (*bounds, step, original_size)):
+        # slice.indices() clamps start/stop into range and resolves negative
+        # indices, so range() over its result has exactly the sliced length.
+        return len(range(*slice_obj.indices(original_size)))
+
+    # At least one operand is not a plain int, so slice.indices() is not
+    # used; keep the arithmetic form with the preconditions listed above.
+    start = slice_obj.start if slice_obj.start is not None else 0
+    stop = slice_obj.stop if slice_obj.stop is not None else original_size
+    return (stop - start + step - 1) // step
diff --git a/test/test_indexing.py b/test/test_indexing.py
@@ -689,7 +689,6 @@ def kernel(
         torch.testing.assert_close(src_result, expected_src)
         torch.testing.assert_close(dst_result, expected_dst)
 
-    @skipIfNormalMode("InternalError: AssertionError")
     def test_strided_slice(self):
         """Test both setter from scalar and getter for strided slices [::2] and [1::3]"""