|
1 | | -from collections.abc import Iterable |
| 1 | +from collections.abc import Iterable, Sequence |
2 | 2 |
|
3 | 3 | import numpy as np |
4 | | -from numpy.core.numeric import normalize_axis_tuple |
| 4 | +from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple |
5 | 5 |
|
6 | 6 | from pytensor import Variable |
7 | 7 | from pytensor.graph import Constant, node_rewriter |
|
29 | 29 | from pytensor.tensor.rewriting.elemwise import local_dimshuffle_lift |
30 | 30 | from pytensor.tensor.rewriting.subtensor import is_full_slice, register_useless |
31 | 31 | from pytensor.tensor.shape import Shape, SpecifyShape, Unbroadcast, unbroadcast |
| 32 | +from pytensor.tensor.special import Softmax, softmax |
32 | 33 | from pytensor.tensor.subtensor import ( |
33 | 34 | AdvancedSubtensor1, |
34 | 35 | Subtensor, |
|
42 | 43 |
|
43 | 44 |
|
44 | 45 | def _dims_dropped_by_basic_index(idxs) -> tuple[int, ...]: |
| 46 | + # Inputs can be slice or integer indexes |
| 47 | + # Slices keep the dimensions, integers collapse them |
45 | 48 | return tuple(i for i, idx in enumerate(idxs) if not isinstance(idx, slice)) |
46 | 49 |
|
47 | 50 |
|
| 51 | +def _ndim_dropped_left_of_axis_by_basic_index(idxs, axis: int) -> int: |
| 52 | + return len(_dims_dropped_by_basic_index(idxs[:axis])) |
| 53 | + |
| 54 | + |
| 55 | +def _axis_is_indexed_by_basic_index( |
| 56 | + idxs: tuple[Variable], axis: int | Sequence[int] |
| 57 | +) -> bool: |
| 58 | + if isinstance(axis, int): |
| 59 | + axis = (axis,) |
| 60 | + return any(ax < len(idxs) and not is_full_slice(idxs[ax]) for ax in axis) |
| 61 | + |
| 62 | + |
48 | 63 | @register_canonicalize |
49 | 64 | @register_stabilize |
50 | 65 | @register_specialize |
@@ -235,6 +250,84 @@ def local_subtensor_of_reduce(fgraph, node): |
235 | 250 | return [out] |
236 | 251 |
|
237 | 252 |
|
| 253 | +@register_canonicalize |
| 254 | +@register_specialize |
| 255 | +@node_rewriter([Subtensor]) |
| 256 | +def local_subtensor_of_softmax(fgraph, node): |
| 257 | + """Lift a Subtensor through a Softmax. |
| 258 | +
|
| 259 | + softmax(x, axis=1)[0] -> softmax(x[0], axis=0) |
| 260 | + softmax(x, axis=1)[:, :, 0] -> softmax(x[:, :, 0], axis=1) |
| 261 | +
|
| 262 | + If part of the indexing acts on the axis of reduction, we split it |
| 263 | + softmax(x, axis=1)[:, 0, 1:] -> softmax(x[:, :, 1:], axis=1)[:, 0] |
| 264 | +
|
| 265 | + """ |
| 266 | + sm, *idx = node.inputs |
| 267 | + |
| 268 | + if not (sm.owner and isinstance(sm.owner.op, Softmax)): |
| 269 | + return None |
| 270 | + |
| 271 | + if len(fgraph.clients[sm]) > 1: |
| 272 | + return None |
| 273 | + |
| 274 | + [x] = sm.owner.inputs |
| 275 | + axis = sm.owner.op.axis |
| 276 | + |
| 277 | + if axis is None: |
| 278 | + if x.type.ndim == 1: |
| 279 | + axis = 0 |
| 280 | + else: |
| 281 | + # All dimensions are mixed; we can't lift the subtensor |
| 282 | + return None |
| 283 | + else: |
| 284 | + # Softmax currently only allows None or a single integer axis |
| 285 | + # Unlike CAReduce it does not normalize negative indices |
| 286 | + axis = normalize_axis_index(axis, sm.ndim) |
| 287 | + |
| 288 | + [old_out] = node.outputs |
| 289 | + idx_tuple = indices_from_subtensor(idx, node.op.idx_list) |
| 290 | + |
| 291 | + if _axis_is_indexed_by_basic_index(idx_tuple, axis): |
| 292 | + # If other dimensions are indexed as well, we can split the indexing: |
| 293 | + # lift the non-axis indices and reapply the axis index afterwards |
| 294 | + real_indices = [idx for idx in idx_tuple if not is_full_slice(idx)] |
| 295 | + if len(real_indices) > 1 and sm.type.ndim > 1: |
| 296 | + # Split the subtensor |
| 297 | + idx_to_keep = idx_tuple[axis] |
| 298 | + idxs_to_lift = (*idx_tuple[:axis], slice(None), *idx_tuple[axis + 1 :]) |
| 299 | + |
| 300 | + # Lift the non-axis indexes by calling the rewrite itself |
| 301 | + opt_sm = sm[idxs_to_lift] |
| 302 | + [opt_sm] = local_subtensor_of_softmax.transform(fgraph, opt_sm.owner) |
| 303 | + copy_stack_trace([old_out, sm], opt_sm) |
| 304 | + |
| 305 | + # Then reintroduce the axis index |
| 306 | + ndim_reduced_left = _ndim_dropped_left_of_axis_by_basic_index( |
| 307 | + idx_tuple, axis |
| 308 | + ) |
| 309 | + new_axis = axis - ndim_reduced_left |
| 310 | + idxs_to_keep = (*(slice(None),) * new_axis, idx_to_keep) |
| 311 | + new_out = opt_sm[idxs_to_keep] |
| 312 | + copy_stack_trace(old_out, new_out) |
| 313 | + return [new_out] |
| 314 | + |
| 315 | + else: |
| 316 | + return None |
| 317 | + |
| 318 | + # Index input to softmax |
| 319 | + x_sub = x[idx_tuple] |
| 320 | + |
| 321 | + # Adjust axis of reduction when indexing drops dimensions (integer indexing as opposed to slice indexing) |
| 322 | + axis -= len( |
| 323 | + [idx_item for idx_item in idx_tuple[:axis] if not isinstance(idx_item, slice)] |
| 324 | + ) |
| 325 | + |
| 326 | + out = softmax(x_sub, axis=axis) |
| 327 | + copy_stack_trace(old_out, out) |
| 328 | + return [out] |
| 329 | + |
| 330 | + |
238 | 331 | @register_canonicalize("shape_unsafe") |
239 | 332 | @register_specialize("shape_unsafe") |
240 | 333 | @node_rewriter([Subtensor]) |
|
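For reviewers who want to see the new rewrite in action, here is a minimal sketch of how it could be exercised. It is not part of the diff; it assumes `rewrite_graph` is importable from `pytensor.graph.rewriting.utils` and that the rewrite is applied through the "canonicalize" database it is registered in above.

import pytensor.tensor as pt
from pytensor.graph.rewriting.utils import rewrite_graph
from pytensor.tensor.special import softmax

# Indexing a non-reduced dimension: the whole Subtensor is lifted,
# so the softmax is computed on the smaller, already-indexed input.
x = pt.matrix("x")
rewritten = rewrite_graph(softmax(x, axis=1)[0], include=("canonicalize",))
# expected: softmax(x[0], axis=0)

# Indexing the reduced axis as well: only the non-axis indices are lifted,
# and the axis index is reapplied on top of the new softmax.
y = pt.tensor3("y")
rewritten = rewrite_graph(softmax(y, axis=1)[:, 0, 1:], include=("canonicalize",))
# expected: softmax(y[:, :, 1:], axis=1)[:, 0]

Other canonicalize rewrites may transform these graphs further; the comments describe only the effect of local_subtensor_of_softmax.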