
Commit db41224

Support reshape with block_size expressions (#495)
1 parent 01c831e commit db41224

6 files changed, +147 -22 lines changed
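For orientation, the pattern this commit enables is reshaping a tile-local tensor with expressions built from tile.block_size inside a Helion kernel. The sketch below is adapted from the new test added in this commit; the `import helion.language as hl` spelling is an assumption based on Helion's usual examples, and the kernel is illustrative rather than a tuned implementation.

import torch
import helion
import helion.language as hl  # assumed import path for the `hl` namespace used in the test

@helion.kernel(static_shapes=True)
def matmul_with_block_size_reshape(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    m, k = x.size()
    k2, n = y.size()
    out = torch.zeros([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
    for tile_m, tile_n in hl.tile([m, n]):
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for tile_k in hl.tile(k):
            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
        # Reshape driven by block_size expressions, the case this commit adds support for.
        flat = acc.reshape(-1, tile_m.block_size * tile_n.block_size)
        out[tile_m, tile_n] = flat.reshape(tile_m.block_size, tile_n.block_size)
    return out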

helion/_compiler/compile_environment.py

Lines changed: 5 additions & 1 deletion
@@ -74,12 +74,16 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self._symint_cache: dict[object, torch.SymInt] = {}
 
     def add_kernel_tensor_size(self, sizes: Sequence[int | torch.SymInt]) -> None:
+        from .device_function import contains_only_block_size_symbols
+
         for size in sizes:
             if isinstance(size, torch.SymInt):
                 block_idx = self.get_block_id(size)
                 if block_idx is None:
                     value = self.shape_env.replace(size._sympy_())
-                    if value.free_symbols:
+                    if value.free_symbols and not contains_only_block_size_symbols(
+                        value
+                    ):
                         raise exc.ShapeSpecializingAllocation
         self.kernel_tensor_sizes[(*map(_to_sympy, sizes),)] += 1
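The relaxed guard above only admits sizes whose free symbols are all block-size symbols. A minimal standalone sketch of that predicate follows, using plain sympy with made-up symbol names; the real check resolves each symbol's origin through HostFunction.expr_to_origin rather than a fixed set.

import sympy

# Hypothetical symbols standing in for Helion's internal ones.
block_m, block_n = sympy.symbols("_BLOCK_SIZE_0 _BLOCK_SIZE_1", integer=True)
other = sympy.Symbol("s_other", integer=True)  # some non-block-size symbol

KNOWN_BLOCK_SYMBOLS = {block_m, block_n}

def only_block_size_symbols(expr: sympy.Expr) -> bool:
    # Mirrors contains_only_block_size_symbols, but against a fixed symbol set.
    return all(sym in KNOWN_BLOCK_SYMBOLS for sym in expr.free_symbols)

print(only_block_size_symbols(block_m * block_n))  # True: allocation is now allowed
print(only_block_size_symbols(block_m * other))    # False: still raises ShapeSpecializingAllocation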

helion/_compiler/device_function.py

Lines changed: 63 additions & 0 deletions
@@ -58,6 +58,40 @@ class VarInfo(NamedTuple):
     fx_node: torch.fx.Node
 
 
+def find_block_size_symbols(
+    expr: sympy.Expr,
+) -> tuple[dict[sympy.Symbol, int], set[sympy.Symbol]]:
+    """
+    Find block size symbols in a sympy expression.
+
+    Returns:
+        tuple of (block_size_mapping, non_block_size_symbols) where:
+        - block_size_mapping: dict mapping block size symbols to their block_id
+        - non_block_size_symbols: set of symbols that are NOT block sizes
+    """
+    if not isinstance(expr, sympy.Expr):
+        return {}, set()
+
+    hf = HostFunction.current()
+    block_sizes = {}
+    non_block_size_symbols = set()
+
+    for symbol in expr.free_symbols:
+        origin_info = hf.expr_to_origin.get(symbol)  # pyright: ignore[reportArgumentType]
+        if origin_info is None or not isinstance(origin_info.origin, BlockSizeOrigin):
+            non_block_size_symbols.add(symbol)
+        else:
+            block_sizes[symbol] = origin_info.origin.block_id
+
+    return block_sizes, non_block_size_symbols
+
+
+def contains_only_block_size_symbols(expr: sympy.Expr) -> bool:
+    """Check if expression contains only block size symbols (no other variables)."""
+    _, non_block = find_block_size_symbols(expr)
+    return len(non_block) == 0
+
+
 @dataclasses.dataclass
 class Argument:
     name: str  # in the device function

@@ -209,6 +243,35 @@ def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
     def block_size_var(self, block_id: int) -> str | None:
         return self.block_size_var_cache.get((block_id,))
 
+    def try_map_block_symbols_to_vars(self, expr: sympy.Expr) -> sympy.Expr | None:
+        """Try to map all block size symbols in expression to their variable names.
+
+        Returns:
+            - The expression with symbols replaced if ALL symbols are block sizes and have variables
+            - None if the expression contains non-block symbols or unmapped block symbols
+        """
+        block_mapping, non_block_symbols = find_block_size_symbols(expr)
+
+        # Can't map if there are non-block symbols
+        if non_block_symbols:
+            return None
+
+        # No symbols to map - return as-is
+        if not block_mapping:
+            return expr
+
+        # Try to map all block symbols to their variables
+        var_map = {}
+        for symbol, block_id in block_mapping.items():
+            block_var = self.block_size_var(block_id)
+            if not block_var:
+                # Can't map this block symbol - fail
+                return None
+            var_map[symbol] = sympy.Symbol(block_var, integer=True)
+
+        # Successfully mapped all symbols
+        return expr.xreplace(var_map)
+
     def merge_variable_names(self, a: str, b: str) -> None:
         name_group = [
             *self._variable_renames.get(a, [a]),
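The substitution step in try_map_block_symbols_to_vars is an ordinary sympy xreplace. Below is a self-contained sketch with hypothetical names; in the real code the target variable names come from block_size_var() and the BlockSizeOrigin table.

import sympy

s0, s1 = sympy.symbols("s0 s1", integer=True)  # stand-ins for host-side size symbols
var_map = {
    s0: sympy.Symbol("_BLOCK_SIZE_0", integer=True),
    s1: sympy.Symbol("_BLOCK_SIZE_1", integer=True),
}

expr = s0 * s1
print(expr.xreplace(var_map))  # _BLOCK_SIZE_0*_BLOCK_SIZE_1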

helion/_compiler/tile_dispatch.py

Lines changed: 25 additions & 6 deletions
@@ -4,8 +4,12 @@
 import operator
 from typing import TYPE_CHECKING
 
+import sympy
+import torch
+
 from .compile_environment import CompileEnvironment
 from .device_function import DeviceFunction
+from .device_function import texpr
 from .device_ir import ForLoopGraphInfo
 from .device_ir import ReductionLoopGraphInfo
 from .host_function import HostFunction

@@ -21,9 +25,6 @@
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
-    import sympy
-    import torch
-
     from .. import Config
     from .inductor_lowering import CodegenState

@@ -120,9 +121,9 @@ def _compact_shape(self, shapes: ShapeLike) -> list[CompactedShape]:
         for idx, shape in enumerate(shapes):
             block_idx = CompileEnvironment.current().get_block_id(shape)
             if block_idx is None:
-                compacted_shapes.append(
-                    CompactedShape(self.strategies[0].fn.literal_expr(shape), [idx], [])
-                )
+                # Check if this is a symbolic expression with block sizes
+                shape_str = self._get_shape_string(shape)
+                compacted_shapes.append(CompactedShape(shape_str, [idx], []))
             else:
                 block_size = DeviceFunction.current().block_size_var(block_idx)
                 if block_size is None:

@@ -132,6 +133,24 @@ def _compact_shape(self, shapes: ShapeLike) -> list[CompactedShape]:
         compacted_shapes = strategy.compact_shape(compacted_shapes)
         return compacted_shapes
 
+    def _get_shape_string(self, shape: SymIntLike) -> str:
+        """Get string representation of a shape"""
+        # Extract sympy expression
+        if isinstance(shape, torch.SymInt):
+            expr = shape._sympy_()
+        elif isinstance(shape, sympy.Expr):
+            expr = shape
+        else:
+            return self.strategies[0].fn.literal_expr(shape)
+
+        # Try to map block symbols to their variable names
+        mapped_expr = DeviceFunction.current().try_map_block_symbols_to_vars(expr)
+        if mapped_expr is not None:
+            return texpr(mapped_expr)
+
+        # Fallback: use literal expression if mapping failed
+        return self.strategies[0].fn.literal_expr(shape)
+
     def shape_str(self, shape: ShapeLike) -> str:
         compacted_shapes = self._compact_shape(shape)
         result = [s.size_str for s in compacted_shapes]
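_get_shape_string therefore has two outcomes: a printed expression over block-size variables, or the literal fallback. The sketch below is a rough approximation with sympy's default printer standing in for texpr and a stubbed symbol-to-variable map; every name in it is hypothetical.

import sympy

def shape_string(expr: sympy.Expr, var_map: dict[sympy.Symbol, sympy.Symbol]) -> str:
    # Succeed only when every free symbol maps to a block-size variable,
    # mirroring try_map_block_symbols_to_vars; otherwise fall back.
    if all(sym in var_map for sym in expr.free_symbols):
        return str(expr.xreplace(var_map))  # the real code prints via texpr()
    return f"<literal {expr}>"  # placeholder for literal_expr()

s0, s1, u0 = sympy.symbols("s0 s1 u0", integer=True)
var_map = {s0: sympy.Symbol("_BLOCK_SIZE_0"), s1: sympy.Symbol("_BLOCK_SIZE_1")}
print(shape_string(s0 * s1, var_map))  # _BLOCK_SIZE_0*_BLOCK_SIZE_1
print(shape_string(s0 * u0, var_map))  # falls back to the literal form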

helion/_utils.py

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@ def convert_size_arg(size: object) -> object:
     """Convert a size argument that may contain RefTile objects.
 
     Handles:
-    - Single RefTile -> int
+    - Single RefTile -> int (block_size)
     - List/tuple containing RefTiles -> list with converted sizes
     - Other values -> unchanged
     """

@@ -40,7 +40,7 @@ def convert_size_arg(size: object) -> object:
     if isinstance(size, (list, tuple)):
        return [convert_size_arg(item) for item in size]
     if isinstance(size, RefTile):
-        return size._slice.stop - size._slice.start
+        return size._block_size
     return size
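convert_size_arg is a small recursive normalizer. Below is a self-contained approximation of the same pattern, with a hypothetical _FakeTile standing in for RefTile (only the _block_size attribute is mimicked).

class _FakeTile:
    """Hypothetical stand-in for RefTile; carries only a block size."""

    def __init__(self, block_size: int) -> None:
        self._block_size = block_size


def convert_size(size: object) -> object:
    # Same recursion as convert_size_arg: lists/tuples are walked item by item,
    # tiles collapse to their block size, everything else passes through.
    if isinstance(size, (list, tuple)):
        return [convert_size(item) for item in size]
    if isinstance(size, _FakeTile):
        return size._block_size
    return size


print(convert_size([_FakeTile(16), 4, (_FakeTile(8), 2)]))  # [16, 4, [8, 2]]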

helion/runtime/ref_mode.py

Lines changed: 16 additions & 13 deletions
@@ -128,6 +128,9 @@ def __init__(self) -> None:
             torch.Tensor.view: lambda args, kwargs: self._handle_size_arg_method(
                 args, kwargs, "view"
             ),
+            torch.Tensor.reshape: lambda args, kwargs: self._handle_size_arg_method(
+                args, kwargs, "reshape"
+            ),
             torch.reshape: lambda args, kwargs: self._handle_size_arg_method(
                 args, kwargs, "reshape"
             ),

@@ -217,20 +220,20 @@ def _handle_size_arg_method(
         tensor = cast("torch.Tensor", args[0])
 
         if method_name == "reshape":
-            # torch.reshape expects shape as a single tuple/list argument
-            # It can be passed as torch.reshape(tensor, shape) or torch.reshape(tensor, shape=shape)
-            shape = args[1] if len(args) > 1 else kwargs.get("shape")
-            if shape is not None:
-                shape = convert_size_arg(shape)
-                if len(args) > 1:
-                    return torch.reshape(
-                        tensor,
-                        shape,  # type: ignore[arg-type]
-                        *args[2:],
-                        **kwargs,
-                    )
+            # reshape can take shape as multiple positional args or as a single tuple/list
+            # e.g., tensor.reshape(2, 3) or tensor.reshape((2, 3))
+            if "shape" in kwargs:
+                # Handle kwargs case: tensor.reshape(shape=(2, 3))
+                shape = convert_size_arg(kwargs["shape"])
+                kwargs = dict(kwargs)  # Make a copy to avoid modifying the original
                 kwargs["shape"] = shape
-            return torch.reshape(tensor, **kwargs)  # type: ignore[arg-type]
+                return torch.reshape(tensor, **kwargs)  # type: ignore[arg-type]
+            # Handle positional args case
+            sizes = args[1:]
+            new_sizes = convert_size_arg(sizes)
+            method = getattr(tensor, method_name)
+            assert isinstance(new_sizes, list)
+            return method(*new_sizes, **kwargs)
 
         # view/expand take sizes as positional args
         sizes = args[1:]
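The dispatcher above has to normalize reshape's different spellings before substituting block sizes; the equivalence it relies on holds in plain eager PyTorch:

import torch

t = torch.arange(6.0)
a = t.reshape(2, 3)           # sizes as separate positional arguments
b = t.reshape((2, 3))         # sizes as a single tuple
c = torch.reshape(t, (2, 3))  # functional form, routed through the same hook
assert torch.equal(a, b) and torch.equal(b, c)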

test/test_views.py

Lines changed: 36 additions & 0 deletions
@@ -156,6 +156,42 @@ def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         _code, result = code_and_output(fn, args)
         torch.testing.assert_close(result, args[0] + args[1])
 
+    def test_reshape_input_types(self):
+        @helion.kernel(static_shapes=True)
+        def reshape_reduction_dim(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            m, k = x.size()
+            k2, n = y.size()
+            assert k == k2, f"size mismatch {k} != {k2}"
+
+            out = torch.zeros(
+                [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+            )
+
+            for tile_m, tile_n in hl.tile([m, n]):
+                acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+                for tile_k in hl.tile(k):
+                    acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+
+                # Test different reshape input types
+                reshaped_acc = acc.reshape(-1, tile_m.block_size * tile_n.block_size)
+                reshaped_acc = reshaped_acc.reshape(
+                    tile_m.block_size, tile_n.block_size
+                )
+                reshaped_acc = reshaped_acc.flatten(0)
+                reshaped_acc = reshaped_acc.reshape(tile_m, tile_n)
+                reshaped_acc = reshaped_acc.reshape(
+                    tile_m.block_size * 2 // 2, tile_n.block_size + 1 - 1
+                )
+                out[tile_m, tile_n] = reshaped_acc
+
+            return out
+
+        x = torch.randn(8, 16, device=DEVICE)
+        y = torch.randn(16, 32, device=DEVICE)
+        _code, result = code_and_output(reshape_reduction_dim, (x, y))
+        expected = torch.matmul(x, y)
+        torch.testing.assert_close(result, expected, rtol=1e-2, atol=1e-2)
+
 
 if __name__ == "__main__":
     unittest.main()
