Commit fa08371

[BC breaking] Simplify block size configs (#127)
This removes the prior nested config structure, where you would have:

* block_sizes=[[8, 8], 8] for nested loops
* block_sizes=[64, 8] for a flattened loop

Instead, you now have:

* block_sizes=[8, 8, 8], flatten_loops=[False] for nested loops
* block_sizes=[8, 8, 8], flatten_loops=[True] for a flattened loop

This makes config structures more predictable and easier to work with.
1 parent c145af0 commit fa08371

23 files changed: 389 additions, 391 deletions
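For illustration, here is a minimal sketch of what the flattened structure looks like in practice, using the hypothetical (untuned) block sizes from the commit message above, for a kernel with one 2D tile and one 1D tile:

import helion

# Before this change the same kernel needed nested entries:
#   helion.Config(block_sizes=[[8, 8], 8])   # nested 2D loop + 1D loop
#   helion.Config(block_sizes=[64, 8])       # flattened 2D loop + 1D loop
# After this change block_sizes is always a flat list of ints:
nested_config = helion.Config(block_sizes=[8, 8, 8], flatten_loops=[False])
flattened_config = helion.Config(block_sizes=[8, 8, 8], flatten_loops=[True])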

README.md

Lines changed: 7 additions & 5 deletions

@@ -179,16 +179,18 @@ and configurations directly from your code.
 
 Helion configurations include the following options:
 
-* **block\_sizes** (`list[int | list[int]]`):
-  Controls tile sizes corresponding to each `hl.tile` invocation in the
-  kernel. For tiles with two or more dimensions, you can use either an
-  integer to flatten the iteration space into a single dimension or a list
-  of integers for multi-dimensional tiling.
+* **block\_sizes** (`list[int]`):
+  Controls tile sizes corresponding to each dimension passed `hl.tile` or call
+  to `hl.register_block_size` in the kernel.
 
 * **loop\_orders** (`list[list[int]]`):
   Contains one entry per `hl.tile` call with two or more dimensions,
   allowing you to permute the iteration order of the tiles.
 
+* **flatten_loops** (`list[bool]`):
+  Contains one entry per `hl.tile` call with two or more dimensions,
+  allowing you to flatten the iteration space into a single dimension.
+
 * **reduction\_loops** (`list[int | None]`):
   Contains one entry per reduction dimension (see
   `examples/softmax.py`). Using `None` triggers a persistent reduction,
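As a concrete illustration of how these options combine, here is a hypothetical (untuned) config for a kernel with one 2D `hl.tile` loop and one reduction dimension; the values are made up for this sketch:

import helion

config = helion.Config(
    block_sizes=[64, 32],     # one entry per tiled dimension
    loop_orders=[[1, 0]],     # permute the iteration order of the 2D tile
    flatten_loops=[False],    # keep the 2D tile as a nested loop
    reduction_loops=[None],   # None triggers a persistent reduction
)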

examples/attention.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 @helion.kernel(
     config=helion.Config(
         # This config was autotuned on a 3090, it won't be fast for other architectures
-        block_sizes=[[32], [16]],
+        block_sizes=[32, 16],
         num_warps=1,
         num_stages=2,
         indexing="block_ptr",

examples/embedding.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 
 @helion.kernel(
     config=helion.Config(
-        block_size=[512, 32], loop_order=[0, 1], num_warps=8, indexing="block_ptr"
+        block_sizes=[512, 32], loop_order=[0, 1], num_warps=8, indexing="block_ptr"
     )
 )
 def embedding(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:

examples/jagged_dense_add.py

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@
 
 @helion.kernel(
     config=helion.Config(
-        block_sizes=[[1], [512], [512]], num_warps=8, num_stages=4, indexing="block_ptr"
+        block_sizes=[1, 512, 512], num_warps=8, num_stages=4, indexing="block_ptr"
     )
 )
 def jagged_dense_add_2d(

examples/long_sum.py

Lines changed: 3 additions & 3 deletions

@@ -13,7 +13,7 @@ def baseline_sum(x: torch.Tensor) -> torch.Tensor:
 # Naive Reduction: Load the entire reduction dim at once, and reduce in reg.
 @helion.kernel(
     config=helion.Config(
-        block_sizes=[[1]],
+        block_sizes=[1],
         reduction_loops=[None],
         num_warps=32,
         num_stages=4,
@@ -32,7 +32,7 @@ def longsum(x: torch.Tensor) -> torch.Tensor:
 # Looped reduction
 @helion.kernel(
     config=helion.Config(
-        block_sizes=[[1]],
+        block_sizes=[1],
         reduction_loops=[
             32768
         ], # [None] for naive reduction, [tile_size] for looped reduction
@@ -53,7 +53,7 @@ def longsum_w_red_loop(x: torch.Tensor) -> torch.Tensor:
 # This generates the same code as above, but manually implements looped reduction.
 @helion.kernel(
     config=helion.Config(
-        block_sizes=[[32768], [1]], num_warps=16, num_stages=5, indexing="pointer"
+        block_sizes=[32768, 1], num_warps=16, num_stages=5, indexing="pointer"
     )
 )
 def longsum_manual(x: torch.Tensor) -> torch.Tensor:

examples/template_via_closure.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 @helion.kernel(
     # This was tuned on a 3090 and likely isn't optimal for other GPUs
     config=helion.Config(
-        block_sizes=[[64, 64], [16]],
+        block_sizes=[64, 64, 16],
         loop_orders=[[0, 1]],
         num_warps=2,
         num_stages=3,

helion/_compiler/compile_environment.py

Lines changed: 28 additions & 45 deletions

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import collections
+import contextlib
 import dataclasses
 import threading
 import types
@@ -86,9 +87,8 @@ def finalize_config_spec(self) -> None:
         from .tile_strategy import FlattenedTileStrategy
 
         for shape in self.kernel_tensor_sizes:
-            FlattenedTileStrategy.update_allow_flattened(
-                self.config_spec.block_size_specs, shape
-            )
+            FlattenedTileStrategy.update_allow_flattened(shape)
+        self.config_spec._remove_duplicates()
 
     def allocate_block_size(
         self,
@@ -343,68 +343,66 @@ def mark_alternate_size(self, size: torch.SymInt | int | None) -> None:
         if isinstance(self.size, AutoSize):
             # The block size was created by hl.register_block_size, and we didn't know the size yet.
             self.size = size
-            if isinstance(size, (int, torch.SymInt)) and isinstance(
-                source := self.block_size_source, LoopSpecBlockSizeSource
-            ):
-                # update the size hint now that we know the size
+            if size is not None:
                 env = CompileEnvironment.current()
-                env.config_spec.block_size_specs[source.loop_spec].update_hint(
-                    source.dim, env.size_hint(size)
-                )
+                with contextlib.suppress(KeyError):
+                    # update the size hint now that we know the size
+                    env.config_spec.block_sizes.block_id_lookup(
+                        self.block_size_idx
+                    ).update_hint(env.size_hint(size))
         elif size is None or self.size is None or self.size != size:
             self.size = None
 
     def symbol(self) -> sympy.Symbol:
         return self.var._sympy_()
 
     def from_config(self, config: Config) -> int | torch.SymInt | None:
-        return self.block_size_source.from_config(config)
+        return self.block_size_source.from_config(config, self.block_size_idx)
 
     def from_config_assert(self, config: Config) -> int | torch.SymInt:
         val = self.from_config(config)
         assert val is not None
         return val
 
     def is_flattened(self, config: Config) -> bool:
-        return self.block_size_source.is_flattened(config)
+        spec = CompileEnvironment.current().config_spec
+        return spec.flatten_loops.config_get(
+            config.flatten_loops, self.block_size_idx, False
+        )
 
     def is_grid(self) -> bool:
         return self.block_size_source.is_grid()
 
     def update_min_block(self, value: int, *, allow_flattened: bool = True) -> None:
-        return self.block_size_source.update_min_block(
-            value, allow_flattened=allow_flattened
-        )
+        spec = CompileEnvironment.current().config_spec
+        if not allow_flattened:
+            spec.flatten_loops.disable_block_id(self.block_size_idx)
+        with contextlib.suppress(KeyError):
+            spec.block_sizes.block_id_lookup(self.block_size_idx).update_min(value)
 
 
 class BlockSizeSource:
-    def from_config(self, config: Config) -> int | torch.SymInt | None:
+    def from_config(self, config: Config, block_id: int) -> int | torch.SymInt | None:
         raise NotImplementedError
 
-    def is_flattened(self, config: Config) -> bool:
-        return False
-
     def is_grid(self) -> bool:
         return False
 
     def l2_grouping(self, config: Config) -> int:
         return 1
 
-    def update_min_block(self, value: int, *, allow_flattened: bool = True) -> None:
-        return None
-
 
 @dataclasses.dataclass
 class FixedBlockSizeSource(BlockSizeSource):
     value: int | torch.SymInt
 
-    def from_config(self, config: Config) -> int | torch.SymInt:
+    def from_config(self, config: Config, block_id: int) -> int | torch.SymInt:
         return self.value
 
 
 @dataclasses.dataclass
 class GridBlockSizeSource(BlockSizeSource):
-    def from_config(self, config: Config) -> int:
+    def from_config(self, config: Config, block_id: int) -> int:
         raise NotImplementedError
 
     def is_grid(self) -> bool:
@@ -413,33 +411,18 @@ def is_grid(self) -> bool:
 
 @dataclasses.dataclass
 class LoopSpecBlockSizeSource(BlockSizeSource):
-    loop_spec: int
-    dim: int
-
-    def from_config(self, config: Config) -> int:
-        value = config.block_sizes[self.loop_spec]
-        if isinstance(value, int):
-            assert self.dim == 0
-            return value
-        return value[self.dim]
-
-    def is_flattened(self, config: Config) -> bool:
-        return isinstance(config.block_sizes[self.loop_spec], int)
-
-    def update_min_block(self, value: int, *, allow_flattened: bool = True) -> None:
-        """
-        Update the minimum block size for the given block index, only increases the minimum size.
-        """
-        spec = CompileEnvironment.current().config_spec.block_size_specs[self.loop_spec]
-        spec.update_min(self.dim, value)
-        spec.allow_flattened = spec.allow_flattened and allow_flattened
+    def from_config(self, config: Config, block_id: int) -> int:
+        index = CompileEnvironment.current().config_spec.block_sizes.block_id_to_index(
+            block_id
+        )
+        return config.block_sizes[index]
 
 
 @dataclasses.dataclass
 class ReductionLoopBlockSizeSource(BlockSizeSource):
     reduction_loop: int
 
-    def from_config(self, config: Config) -> int | None:
+    def from_config(self, config: Config, block_id: int) -> int | None:
         return config.reduction_loops[self.reduction_loop]
 
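Under the new scheme, a loop block size is identified by a block_id and read out of the flat `config.block_sizes` list through an index lookup, rather than through a (loop_spec, dim) pair. A simplified standalone sketch of that mapping (hypothetical names, not the actual helion classes):

# Hypothetical, simplified model of the block_id -> flat index lookup that
# LoopSpecBlockSizeSource.from_config now performs.
block_id_to_index = {0: 0, 1: 1, 2: 2}  # one flat entry per registered block size

def block_size_from_config(block_sizes: list[int], block_id: int) -> int:
    return block_sizes[block_id_to_index[block_id]]

assert block_size_from_config([64, 32, 8], block_id=2) == 8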

helion/_compiler/tile_dispatch.py

Lines changed: 6 additions & 1 deletion

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import collections
+import functools
+import operator
 from typing import TYPE_CHECKING
 
 from helion._compiler.compile_environment import CompileEnvironment
@@ -74,10 +76,13 @@ def _add_loop_strategy(
                 loop_order=loop_order,
             )
         elif block_size_infos[0].is_flattened(config):
+            block_size = functools.reduce(
+                operator.mul, [bs.from_config_assert(config) for bs in block_size_infos]
+            )
             strategy: TileStrategy = FlattenedTileStrategy(
                 fn,
                 block_indices,
-                block_size=block_size_infos[0].from_config_assert(config),
+                block_size=block_size,
                 loop_order=loop_order,
             )
         else:
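With this change, the flattened tile strategy derives its single block size as the product of the per-dimension entries in `config.block_sizes`. A standalone sketch of that arithmetic with hypothetical sizes:

import functools
import operator

# Hypothetical per-dimension block sizes for one hl.tile call; with
# flatten_loops=[True] the flattened strategy tiles the combined iteration
# space with their product (8 * 8 * 8 = 512 elements per tile).
per_dim_block_sizes = [8, 8, 8]
flattened_block_size = functools.reduce(operator.mul, per_dim_block_sizes)
assert flattened_block_size == 512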

helion/_compiler/tile_strategy.py

Lines changed: 7 additions & 27 deletions

@@ -18,7 +18,6 @@
 from .ast_extension import expr_from_string
 from .ast_extension import statement_from_string
 from .compile_environment import CompileEnvironment
-from .compile_environment import LoopSpecBlockSizeSource
 from .compile_environment import _to_sympy
 from .host_function import HostFunction
 from .program_id import GridProgramIDs
@@ -31,7 +30,6 @@
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
-    from ..autotuner.config_spec import BlockSizeSpec
     from .device_function import DeviceFunction
     from .inductor_lowering import CodegenState
 
@@ -302,43 +300,25 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
         )
 
     @classmethod
-    def update_allow_flattened(
-        cls, specs: list[BlockSizeSpec], shape: Sequence[sympy.Expr]
-    ) -> None:
+    def update_allow_flattened(cls, shape: Sequence[sympy.Expr]) -> None:
         used_indices = {}
         for i, x in enumerate(shape):
             block_idx = cls.get_block_index(x)
             if block_idx is not None:
-                if block_idx in used_indices:
-                    # multiple usages of the same block size??? bail out
-                    for spec in specs:
-                        spec.allow_flattened = False
-                    return
                 used_indices[block_idx] = i
-        env = CompileEnvironment.current()
-        for spec_idx, group in itertools.groupby(
-            [
-                bs
-                for bs in env.block_sizes
-                if isinstance(bs.block_size_source, LoopSpecBlockSizeSource)
-            ],
-            key=lambda x: x.block_size_source.loop_spec,
-        ):
-            spec = specs[spec_idx]
-            if not spec.allow_flattened:
-                continue
-            block_indices = [bs.block_size_idx for bs in group]
-            if len(block_indices) == 1 or not (
+        flatten_loops = CompileEnvironment.current().config_spec.flatten_loops
+        for spec in [*flatten_loops]:
+            block_indices = spec.block_ids
+            if not (
                 all(x in used_indices for x in block_indices)
                 or all(x not in used_indices for x in block_indices)
            ):
-                # A shape must use all or none of the block indices in the group
-                spec.allow_flattened = False
+                flatten_loops.disable_block_id(block_indices[0])
                 continue
             for i, j in itertools.pairwise(block_indices):
                 if i in used_indices and used_indices[i] + 1 != used_indices[j]:
                     # The block indices must be contiguous
-                    spec.allow_flattened = False
+                    flatten_loops.disable_block_id(block_indices[0])
                     break
 
     def compact_shape(self, shapes: list[CompactedShape]) -> list[CompactedShape]:

helion/_compiler/type_propagation.py

Lines changed: 7 additions & 15 deletions

@@ -964,25 +964,17 @@ def proxy(self) -> object:
 
     @staticmethod
     def allocate(
-        numels: list[int | torch.SymInt | AutoSize | None], origin: Origin
-    ) -> list[TileIndexType]:
+        numel: int | torch.SymInt | AutoSize | None, origin: Origin
+    ) -> TileIndexType:
         env = CompileEnvironment.current()
-        spec_id = len(env.config_spec.block_size_specs)
-        env.config_spec.block_size_specs.append(
+        block_id = env.allocate_block_size(numel, source=LoopSpecBlockSizeSource())
+        env.config_spec.block_sizes.append(
             BlockSizeSpec(
-                size_hints=[*map(_get_hint, numels)],
-                allow_flattened=len(numels) > 1,
+                block_id=block_id,
+                size_hint=_get_hint(numel),
             )
         )
-        return [
-            TileIndexType(
-                origin,
-                env.allocate_block_size(
-                    x, source=LoopSpecBlockSizeSource(spec_id, dim)
-                ),
-            )
-            for dim, x in enumerate(numels)
-        ]
+        return TileIndexType(origin, block_id)
 
     @staticmethod
     def allocate_fixed(
