pytorch
diff --git a/‎helion/_compiler/compile_environment.py
Lines changed: 6 additions & 1 deletion b/‎helion/_compiler/compile_environment.py
Lines changed: 6 additions & 1 deletion
diff --git a/‎helion/_compiler/device_function.py
Lines changed: 13 additions & 5 deletions b/‎helion/_compiler/device_function.py
Lines changed: 13 additions & 5 deletions
diff --git a/‎helion/_compiler/generate_ast.py
Lines changed: 3 additions & 3 deletions b/‎helion/_compiler/generate_ast.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎helion/_compiler/indexing_strategy.py
Lines changed: 6 additions & 5 deletions b/‎helion/_compiler/indexing_strategy.py
Lines changed: 6 additions & 5 deletions
diff --git a/‎helion/_compiler/inductor_lowering.py
Lines changed: 6 additions & 3 deletions b/‎helion/_compiler/inductor_lowering.py
Lines changed: 6 additions & 3 deletions
diff --git a/‎helion/_compiler/reduction_strategy.py
Lines changed: 14 additions & 5 deletions b/‎helion/_compiler/reduction_strategy.py
Lines changed: 14 additions & 5 deletions
@@ -256,7 +256,7 @@ def _to_fake_tensor(self, tensor: torch.Tensor, source: Source) -> torch.Tensor:
     def size_hint(self, n: int | torch.SymInt) -> int:
         if isinstance(n, torch.SymInt):
             expr = n._sympy_()
-            if any(s.name.startswith("u") for s in expr.free_symbols):
+            if _has_unbacked(expr):
                 # If the size is a symbolic expression with unbacked symbols, then the shape environment
                 # hint will be wrong since we assign a default value to unbacked symbols.  Return a default hint.
                 return 8192
@@ -489,3 +489,8 @@ def _to_sympy(x: int | torch.SymInt) -> sympy.Expr:
     if isinstance(x, torch.SymInt):
         return x._sympy_()
     return sympy.sympify(x)
+
+
+def _has_unbacked(expr: sympy.Expr) -> bool:  # True if any free symbol's name starts with "u" (treated as unbacked)
+    # pyre-ignore[16]
+    return any(n.name.startswith("u") for n in expr.free_symbols)
@@ -7,6 +7,7 @@
 import math
 import threading
 from typing import TYPE_CHECKING
+from typing import NamedTuple
 from typing import Protocol
 from typing import TypeVar
 from typing import cast
@@ -47,6 +48,13 @@ class _TLS(Protocol):
 tls: _TLS = cast("_TLS", threading.local())
 
 
+class VarInfo(NamedTuple):
+    """Name and originating FX node of a variable that holds a sympy expression's value."""
+
+    name: str  # text emitted for the expression: a variable name, or the repr() of a constant
+    fx_node: torch.fx.Node  # the node being run when this entry was recorded
+
+
 @dataclasses.dataclass
 class Argument:
     name: str  # in the device function
@@ -152,7 +160,7 @@ def __init__(self, name: str, config: Config) -> None:
         self._variable_renames: dict[str, list[str]] = {}
         self.dce_vars: list[str] = []
         self.block_size_var_cache: dict[tuple[int, ...], str] = {}
-        self.expr_to_var_name: dict[sympy.Expr, str] = {}
+        self.expr_to_var_info: dict[sympy.Expr, VarInfo] = {}
 
         from .indexing_strategy import IndexingStrategy
         from .tile_dispatch import TileStrategyDispatch
@@ -179,17 +187,17 @@ def sympy_expr(self, expr: sympy.Expr) -> str:
         expr = CompileEnvironment.current().shape_env.simplify(expr)
         if not expr.free_symbols:
             return texpr(expr)
-        if expr in self.expr_to_var_name:
-            return self.expr_to_var_name[expr]
+        if expr in self.expr_to_var_info:
+            return self.expr_to_var_info[expr].name
         expr_to_origin = HostFunction.current().expr_to_origin
         if expr in expr_to_origin:
             return self._lift_sympy_arg(expr)
         replacements = {}
         for sym in sorted(expr.free_symbols, key=lambda x: x.name):
             assert isinstance(sym, sympy.Symbol)
-            if sym in self.expr_to_var_name:
+            if sym in self.expr_to_var_info:
                 replacements[sym] = sympy.Symbol(
-                    self.expr_to_var_name[sym], integer=True
+                    self.expr_to_var_info[sym].name, integer=True
                 )
             else:
                 assert sym in expr_to_origin, f"no origin found for {sym.name}"
 
@@ -80,15 +80,15 @@ def set_statements(self, new_statements: list[ast.AST] | None) -> Iterator[None]
         if new_statements is None:
             yield
         else:
-            expr_to_var_name = self.device_function.expr_to_var_name
+            expr_to_var_info = self.device_function.expr_to_var_info
             # We don't want to reuse vars assigned in a nested scope, so copy it
-            self.device_function.expr_to_var_name = expr_to_var_name.copy()
+            self.device_function.expr_to_var_info = expr_to_var_info.copy()
             self.statements_stack.append(new_statements)
             try:
                 yield
             finally:
                 self.statements_stack.pop()
-                self.device_function.expr_to_var_name = expr_to_var_name
+                self.device_function.expr_to_var_info = expr_to_var_info
 
     @contextlib.contextmanager
     def set_on_device(self) -> Iterator[None]:
 
@@ -436,7 +436,9 @@ def is_supported(
         if extra_mask is not None:
             # TODO(jansel): support block_ptr with extra_mask
             return False
+        input_sizes = collections.deque(fake_tensor.size())
         for k in index:
+            input_size = 1 if k is None else input_sizes.popleft()
             if isinstance(k, torch.SymInt):
                 symbol = k._sympy_()
                 origin = None
@@ -455,14 +457,13 @@ def is_supported(
                         In this case, the block masking will be incorrect.  So we check if the
                         masking is needed and bail if it is.
                         """
-                        end = loop_state.end_bounds[block_index]
-                        if (
-                            not CompileEnvironment.current()
-                            .block_sizes[block_index]
-                            .size_matches(end)
+                        if not loop_state.block_id_to_info[block_index].is_end_matching(
+                            input_size
                         ):
                             assert state.fx_node is not None
                             if "masked_value" in state.fx_node.meta:
+                                # TODO(jansel): in this case we should be able to lower to block_ptr+tl.where
+                                # see test/test_loops.py::TestLoops::test_data_dependent_bounds2
                                 return False
             if isinstance(k, torch.Tensor):
                 # indirect loads don't work with block_ptr
 
@@ -46,6 +46,7 @@
 from .ast_extension import expr_from_string
 from .ast_extension import statement_from_string
 from .compile_environment import CompileEnvironment
+from .device_function import VarInfo
 from .node_masking import apply_masking
 from .node_masking import cached_masked_value
 from .node_masking import getitem_masked_value
@@ -940,11 +941,13 @@ def run_node(self, n: Node) -> object:
                         # Keep track of what variable symints are stored in to support DeviceFunction.sympy_expr()
                         expr = CompileEnvironment.current().shape_env.simplify(expr)
                         if isinstance(result, ast.Name):
-                            self.cg.device_function.expr_to_var_name[expr] = result.id
+                            self.cg.device_function.expr_to_var_info[expr] = VarInfo(
+                                result.id, n
+                            )
                         else:
                             assert isinstance(result, ast.Constant)
-                            self.cg.device_function.expr_to_var_name[expr] = repr(
-                                result.value
+                            self.cg.device_function.expr_to_var_info[expr] = VarInfo(
+                                repr(result.value), n
                             )
                     return result
                 if not isinstance(result, (ast.Name, ast.Constant)):
 
@@ -177,9 +177,14 @@ def codegen_preamble(self, state: CodegenState) -> None:
                 f"{mask_var} = {index_var} < {self.fn.sympy_expr(numel)}"
             )
         # Extract end_var_name from the numel expression
-        end_var_name = {self.block_index: self.fn.sympy_expr(numel)}
+        from .tile_strategy import LoopDimInfo
+
+        end_var_name = self.fn.sympy_expr(numel)
+        block_id_to_info = {
+            self.block_index: LoopDimInfo(end_var_name=end_var_name, end_expr=numel)
+        }
         state.codegen.set_active_loops(
-            PersistentReductionState(self, end_var_name=end_var_name)
+            PersistentReductionState(self, block_id_to_info=block_id_to_info)
         )
 
     def codegen_reduction(
@@ -258,13 +263,17 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
             type_comment=None,
         )
         # Extract end_var_name from the actual numel expression used in the range()
-        end_var_name = {block_index: state.sympy_expr(numel)}
+        from .tile_strategy import LoopDimInfo
+
+        end_var_name = state.sympy_expr(numel)
+        block_id_to_info = {
+            block_index: LoopDimInfo(end_var_name=end_var_name, end_expr=numel)
+        }
         return DeviceLoopState(
             self,
             for_node=for_node,
             inner_statements=body,
-            end_bounds={block_index: numel},
-            end_var_name=end_var_name,
+            block_id_to_info=block_id_to_info,
         )
 
     def codegen_reduction(