
Commit 8141d52

[TKW] Propagate index from reduce nodes (#644)
* If the kernel doesn't have `mma`s but does have reduce ops, use them to determine the indexing of the other ops.
* The `tkw.WorkgroupConstraint(N, N, 0)` hack is no longer needed, as reduce nodes inside the reduction loop are now automatically distributed across threads.
* Fix `test_toy_online_softmax` for unaligned shapes.
* Refactor constraints and introduce a `DistributionConstraint` base class.

---------

Signed-off-by: Ivan Butygin <[email protected]>
1 parent b37040c commit 8141d52
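For context, here is a minimal sketch of the kind of kernel this change targets, loosely adapted from the `get_reduce_mapping` docstring example added in this commit. The imports, symbol setup, hardware-constraint values, and the `row_max`/`a`/`c` names are illustrative assumptions, not part of the diff:

```python
# Sketch (assumed setup): an online-max kernel with no mma ops. After this
# commit, the index of tkw.max seeds thread-dependent indexing for the rest
# of the graph, so no dummy WorkgroupConstraint on N is needed.
import iree.turbine.kernel.lang as tkl
import iree.turbine.kernel.wave as tkw

M, N = tkl.sym.M, tkl.sym.N
BLOCK_M, BLOCK_N = tkl.sym.BLOCK_M, tkl.sym.BLOCK_N
ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE

constraints = [
    tkw.HardwareConstraint(threads_per_wave=64, vector_shapes={M: 1, N: BLOCK_N}),
    tkw.WorkgroupConstraint(M, BLOCK_M, 1),
    tkw.TilingConstraint(N, BLOCK_N),
]

@tkw.wave(constraints)
def row_max(
    a: tkl.Memory[M, N, ADDRESS_SPACE, tkl.f32],
    c: tkl.Memory[M, ADDRESS_SPACE, tkl.f32],
):
    init_max = tkl.Register[M, tkl.f32](-1e6)

    @tkw.reduction(N, init_args=[init_max])
    def repeat(partial_max: tkl.Register[M, tkl.f32]) -> tkl.Register[M, tkl.f32]:
        res = tkw.read(a)  # [M, N] tile
        # Reduce over N; its index distributes N across wave-dim-0 threads.
        return tkw.max(res, partial_max, dim=N)

    tkw.write(repeat, c, elements_per_thread=1)
```

Previously a kernel like this needed a degenerate `tkw.WorkgroupConstraint(N, N, 0)` to get N distributed at all; now the reduce op's index seeds the propagation.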

File tree: 6 files changed, +273 −58 lines

iree/turbine/kernel/wave/analysis/index_sequence_analysis.py

Lines changed: 145 additions & 4 deletions
@@ -15,6 +15,7 @@
     Output,
     Placeholder,
     Read,
+    ReduceOp,
     Reduction,
     Write,
     get_custom,
@@ -33,6 +34,7 @@
 from ..utils.general_utils import (
     get_hardware_constraint,
     get_largest_index_and_size,
+    get_workgroup_constraints,
     partial,
 )
 from ..utils.mma_utils import (
@@ -145,12 +147,21 @@ def set_node_indices(
         print_trace(trace)

     graph_passes = []
-    if mma_mapping != {}:
+    if mma_mapping:
         graph_passes += [
             partial(
                 set_thread_dependent_index_from_mma, constraints, mma_mapping, trace
             )
         ]
+    elif reduce_mapping := get_reduce_mapping(trace, constraints):
+        graph_passes += [
+            partial(
+                set_thread_dependent_index_from_reduce,
+                constraints,
+                trace,
+                reduce_mapping,
+            )
+        ]
     else:
         graph_passes += [
             partial(set_thread_dependent_index_from_read_write, constraints, trace)
@@ -516,9 +527,7 @@ def set_thread_dependent_index_from_read_write(
     assert sources, "No read nodes found in the graph."

     visited = set()
-    workgroup_constraints = [
-        c for c in constraints if isinstance(c, WorkgroupConstraint)
-    ]
+    workgroup_constraints = get_workgroup_constraints(constraints)
     symbolic_constraints = [c for c in constraints if isinstance(c, SymbolicAlias)]
     for source in sources:
         visited = visited.union(set([x for x in sources]))
@@ -533,6 +542,138 @@
         )


+def get_reduce_mapping(
+    trace: CapturedTrace, constraints: list[Constraint]
+) -> dict[ReduceOp, dict[IndexSymbol, IndexSequence]]:
+    """
+    Get the mapping of the reduce ops to the index sequence.
+
+    The resulting index will have the reduction dim distributed across wg0
+    threads and the rest of the dims distributed like read/write nodes,
+    according to the WorkgroupConstraints.
+
+    Example:
+    ```
+    constraints += [tkw.WorkgroupConstraint(M, BLOCK_M, 1)]
+    ...
+    @tkw.reduction(N, init_args=[init_max, init_sum])
+    def repeat(
+        partial_max: tkl.Register[M, tkl.f32],
+    ) -> tkl.Register[M, tkl.f32]:
+        res = tkw.read(a)  # [M, N]
+        partial_max = tkw.max(res, partial_max, dim=N)  # {N: 2*$T0 : 2 : 1, M: $T1 : 1 : 1}
+        ...
+    ```
+
+    """
+    sources = trace.walk(lambda node: isinstance(get_custom(node), ReduceOp))
+    hardware_constraint = get_hardware_constraint(constraints)
+    workgroup_constraints = get_workgroup_constraints(constraints)
+
+    reduce_mapping = {}
+    for source in sources:
+        custom = get_custom(source)
+        index = {}
+
+        dim = custom.dim
+
+        # Compute the index sequence for the reduction dimension based on the
+        # threads per wave and the vector size.
+        threads_per_wave = hardware_constraint.threads_per_wave
+        vector_size = hardware_constraint.vector_shapes[dim]
+        assert (
+            vector_size % threads_per_wave == 0
+        ), f"Vector size {dim}={vector_size} must be divisible by threads per wave {threads_per_wave}"
+        elements_per_thread = vector_size // threads_per_wave
+        stride = compute_stride(
+            custom.indexing_dims, hardware_constraint.vector_shapes, dim
+        )
+        index[dim] = hardware_constraint.apply_read_write_thread_mapping(
+            dim, 0, elements_per_thread, stride
+        )
+
+        for dim in custom.indexing_dims:
+            elements_per_thread = 1
+            stride = compute_stride(
+                custom.indexing_dims, hardware_constraint.vector_shapes, dim
+            )
+            wg_constraint = [x for x in workgroup_constraints if x.dim == dim]
+            assert (
+                len(wg_constraint) <= 1
+            ), f"Multiple workgroup constraints for dimension {dim}"
+            if wg_constraint:
+                workgroup_dim = wg_constraint[0].workgroup_dim
+            else:
+                continue

+            index[dim] = hardware_constraint.apply_read_write_thread_mapping(
+                dim, workgroup_dim, elements_per_thread, stride
+            )
+
+        reduce_mapping[custom] = index
+
+    return reduce_mapping
+
+
+def populate_reduce_source_indices(
+    node: ReduceOp,
+    hardware_constraint: HardwareConstraint,
+    workgroup_constraints: list[WorkgroupConstraint],
+    index: dict[IndexSymbol, IndexSequence],
+):
+    """
+    Populate the source indices for the reduce op.
+    """
+    vector_shapes = hardware_constraint.vector_shapes
+    ret = []
+    if isinstance(node.arg, Sequence):
+        ret += [(get_custom(a), index, vector_shapes) for a in node.arg]
+    else:
+        ret += [(get_custom(node.arg), index, vector_shapes)]
+
+    # Reduce args must contain an index for the reduction dimension,
+    # but the init args and the reduction result itself do not.
+    res_index = copy(index)
+    del res_index[node.dim]
+
+    if node.init:
+        ret += [(get_custom(node.init), res_index, vector_shapes)]
+
+    ret += [(node, res_index, vector_shapes)]
+
+    return ret
+
+
+def set_thread_dependent_index_from_reduce(
+    constraints: Sequence[Constraint],
+    trace: CapturedTrace,
+    reduce_mapping: dict[ReduceOp, dict[IndexSymbol, IndexSequence]],
+):
+    """
+    Set the thread dependent index, rooting on reduce ops.
+    """
+    hardware_constraint = get_hardware_constraint(constraints)
+    sources = trace.walk(lambda node: isinstance(get_custom(node), ReduceOp))
+    sources = [get_custom(x) for x in sources]
+    assert sources, "No reduce nodes found in the graph."
+
+    visited = set()
+    workgroup_constraints = get_workgroup_constraints(constraints)
+    symbolic_constraints = [c for c in constraints if isinstance(c, SymbolicAlias)]
+    for source in sources:
+        visited = visited.union(set([x for x in sources]))
+        visited.remove(source)
+        index = reduce_mapping[source]
+        new_sources = populate_reduce_source_indices(
+            source, hardware_constraint, workgroup_constraints, index
+        )
+        visited = propagate_indices(
+            new_sources,
+            visited,
+            symbolic_constraints,
+        )
+
+
 def set_post_expansion_indices(trace: CapturedTrace, constraints: list[Constraint]):
     """
     Add offsets to the indices based on the expanded dims.
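To make the index arithmetic in `get_reduce_mapping` concrete: for the reduction dim, each wave-dim-0 thread owns `vector_size // threads_per_wave` elements. The sketch below is a plain-Python model of that split, mirroring the removed `compute_access_pattern_using_vector_shapes` helper (whose `IndexSequence` was `thread_id * elements_per_thread, elements_per_thread, stride`); the concrete sizes are assumptions for illustration.

```python
from dataclasses import dataclass

@dataclass
class IndexSeq:
    start: int   # first element this thread touches
    size: int    # number of elements per thread
    stride: int  # step between consecutive elements

def reduction_dim_index(
    thread_id: int, vector_size: int, threads_per_wave: int, stride: int = 1
) -> IndexSeq:
    # Mirrors get_reduce_mapping: split the vector along the reduction dim
    # so each wave-dim-0 thread owns a contiguous chunk.
    assert vector_size % threads_per_wave == 0
    elements_per_thread = vector_size // threads_per_wave
    return IndexSeq(thread_id * elements_per_thread, elements_per_thread, stride)

# With 64 threads per wave and a vector size of 128 along N, each thread
# gets 2 elements and thread t starts at 2*t, matching the
# `N: 2*$T0 : 2 : 1` sequence in the docstring example above.
print(reduction_dim_index(0, 128, 64))  # IndexSeq(start=0, size=2, stride=1)
print(reduction_dim_index(1, 128, 64))  # IndexSeq(start=2, size=2, stride=1)
```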

iree/turbine/kernel/wave/constraints.py

Lines changed: 28 additions & 14 deletions
@@ -82,6 +82,24 @@ def apply(self) -> IndexSequence:
         ...


+@dataclass
+class DistributionConstraint(Constraint):
+    """
+    Base class for constraints that distribute a dimension across a
+    workgroup or reduction loop.
+    """
+
+    @property
+    def work_bound(self) -> IndexExpr:
+        """
+        Returns the work bound for the constraint.
+
+        It may be different from the dimension of the tensor if the
+        dimension is not divisible by the tile size.
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+
+
 @dataclass
 class HardwareConstraint(Constraint):
     """
@@ -263,18 +281,6 @@ def subs_vector_shapes(self, index_map: dict[IndexSymbol, int]):
             if isinstance(vector_size, IndexExpr):
                 self.vector_shapes[vector_dim] = vector_size.subs(index_map)

-    def compute_access_pattern_using_vector_shapes(
-        self,
-        dim: IndexSymbol,
-        workgroup_dim: int,
-        elements_per_thread: int | IndexSymbol,
-        stride: int,
-    ) -> IndexSequence:
-        thread_id = self.get_thread_id_from_workgroup_dim(workgroup_dim)
-        return IndexSequence(
-            thread_id * elements_per_thread, elements_per_thread, stride
-        )
-
     def apply(self):
         assert False, "Call either apply_read_write_thread_mapping or apply_mma_mapping"

@@ -370,7 +376,7 @@ def apply_mma_mapping(


 @dataclass
-class WorkgroupConstraint(Constraint):
+class WorkgroupConstraint(DistributionConstraint):
     """
     A constraint of the form `tkw.WorkgroupConstraint(M, BLOCK_M, 0)`
     specifies that we want to distribute dimension M along workgroup dim 0
@@ -410,6 +416,10 @@ def apply(self) -> IndexSequence:
             return IndexSequence(self.apply_fn(self.wg_dim), 1)
         return IndexSequence(self.wg_dim * self.tile_size, 1)

+    @property
+    def work_bound(self) -> IndexExpr:
+        return self.count * self.tile_size
+

 def get_grid_shape(wg_constraints: list[WorkgroupConstraint]) -> list[IndexExpr]:
     sorted_constraints = sorted(
@@ -428,7 +438,7 @@ def get_grid_shape(wg_constraints: list[WorkgroupConstraint]) -> list[IndexExpr]


 @dataclass
-class TilingConstraint(Constraint):
+class TilingConstraint(DistributionConstraint):
     """
     A constraint of the form `tkw.TilingConstraint(K, BLOCK_K)` specifies
     that we want to tile the K dimension with a tile size of BLOCK_K. This
@@ -469,6 +479,10 @@ def apply(self) -> IndexSequence:
         )
         return IndexSequence(self.start + self.induction_var * self.tile_size, 1)

+    @property
+    def work_bound(self) -> IndexExpr:
+        return self.start + self.count * self.tile_size
+

 @dataclass
 class WaveConstraint(Constraint):
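A self-contained sketch of the `work_bound` contract introduced here, with plain integers standing in for the real `IndexExpr` symbols (the class names are reused for illustration only): each subclass reports how much work its distribution covers, which can overshoot the dim when shapes are unaligned.

```python
from dataclasses import dataclass

@dataclass
class DistributionConstraint:
    dim: str
    tile_size: int
    count: int

    @property
    def work_bound(self) -> int:
        raise NotImplementedError("Subclasses must implement this method")

@dataclass
class WorkgroupConstraint(DistributionConstraint):
    @property
    def work_bound(self) -> int:
        # Total work covered by all workgroups along this dim.
        return self.count * self.tile_size

@dataclass
class TilingConstraint(DistributionConstraint):
    start: int = 0

    @property
    def work_bound(self) -> int:
        # Tiling may begin at a nonzero offset, so the bound includes it.
        return self.start + self.count * self.tile_size

# E.g. M = 100 tiled by BLOCK_M = 32 needs 4 workgroups, so the bound (128)
# overshoots the dim (100) and partial reads/writes must be bounds-checked.
wg = WorkgroupConstraint("M", tile_size=32, count=4)
assert wg.work_bound == 128
```

Callers can now ask any `DistributionConstraint` for its bound instead of special-casing `count * tile_size` per class, which is exactly what the `general_utils.py` changes below rely on.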

iree/turbine/kernel/wave/utils/general_utils.py

Lines changed: 6 additions & 5 deletions
@@ -18,6 +18,7 @@
 from ..assumptions import Assumption
 from ..constraints import (
     Constraint,
+    DistributionConstraint,
     HardwareConstraint,
     TilingConstraint,
     WorkgroupConstraint,
@@ -144,10 +145,10 @@ def align_index_vars(
     need partial reads/writes.
     """
     key_subs = {
-        c.dim: (c.count * c.tile_size)
+        c.dim: (c.work_bound)
         for c in constraints
-        if isinstance(c, (TilingConstraint, WorkgroupConstraint))
-        and subs_idxc(c.dim) != subs_idxc(c.count * c.tile_size)
+        if isinstance(c, DistributionConstraint)
+        and subs_idxc(c.dim) != subs_idxc(c.work_bound)
     }
     return {safe_subs(key, key_subs): index[key] for key in index}

@@ -157,14 +158,14 @@ def find_index_bounds(
 ) -> Optional[list[IndexExpr]]:
     bounds = []
     for constraint in constraints:
-        if not isinstance(constraint, (WorkgroupConstraint, TilingConstraint)):
+        if not isinstance(constraint, DistributionConstraint):
             continue

         dim = constraint.dim
         if dim not in index:
             continue

-        work_size = constraint.count * constraint.tile_size
+        work_size = constraint.work_bound
         if subs_idxc(work_size) == subs_idxc(dim):
             continue
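A toy model of the bound check in `find_index_bounds` above, with assumed concrete sizes in place of the symbolic `subs_idxc` comparison; `needs_bound` is a hypothetical helper, not part of the library:

```python
# A dim needs a runtime bound exactly when the distributed work
# (work_bound) differs from the actual dim size, i.e. the last tile
# is partial.
def needs_bound(dim_size: int, tile_size: int, count: int) -> bool:
    work_bound = count * tile_size  # WorkgroupConstraint.work_bound
    return work_bound != dim_size

assert needs_bound(100, 32, 4)      # unaligned: partial last tile, bound M
assert not needs_bound(128, 32, 4)  # aligned: no bound emitted
```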
