
Commit 228fd7e

jessegrabowski authored and ricardoV94 committed
Resolve conflicts with main

Generalize Blockwise inplace logic:
- Introduce `make_inplace` helper function for destructive rewrites
- Refactor the Cholesky destructive rewrite to use the `make_inplace` helper
- Add destructive in-place rewrite for `pt.linalg.cholesky`
1 parent 7eca252 · commit 228fd7e

File tree

9 files changed: +374 −38 lines changed
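Taken together, these changes let `fast_run` graphs factorize eligible buffers in place. A minimal sketch of the intended user-visible behavior on this branch (the rewrite only fires on intermediate values that nothing else needs, so the input itself is never clobbered):

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.matrix("x")
# `x + 1` is an intermediate buffer, so the destructive rewrite may replace
# Cholesky(overwrite_a=False) with Cholesky(overwrite_a=True) under fast_run.
L = pt.linalg.cholesky(x + 1)

f = pytensor.function([x], L)
a = np.array([[3.0, 1.0], [1.0, 2.0]])
np.testing.assert_allclose(f(a) @ f(a).T, a + 1)  # L is lower-triangular
```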

pytensor/graph/op.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -583,6 +583,11 @@ def make_thunk(
         )
         return self.make_py_thunk(node, storage_map, compute_map, no_recycling)

+    def try_inplace_inputs(self, candidate_inputs: list[int]) -> "Op":
+        """Try to return a version of self that can inplace on candidate_inputs."""
+        # TODO: Document this in the Create your own op docs
+        raise NotImplementedError()
+
     def __str__(self):
         return getattr(type(self), "__name__", super().__str__())
```
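The new hook gives rewrites a generic way to ask a core Op for a destructive variant of itself. A hypothetical sketch of an Op honoring it (`MyCholesky` and its `overwrite_a` flag are invented for illustration; only the hook's signature comes from this diff):

```python
from pytensor.graph.op import Op


class MyCholesky(Op):  # hypothetical Op, for illustration only
    def __init__(self, overwrite_a=False):
        self.overwrite_a = overwrite_a
        if overwrite_a:
            # Output 0 reuses (destroys) the buffer of input 0
            self.destroy_map = {0: [0]}

    def try_inplace_inputs(self, candidate_inputs: list[int]) -> "Op":
        # Only input 0 can be destroyed; fall back to the default otherwise
        if 0 in candidate_inputs and not self.overwrite_a:
            return type(self)(overwrite_a=True)
        raise NotImplementedError()
```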

pytensor/link/numba/dispatch/basic.py

Lines changed: 35 additions & 1 deletion

```diff
@@ -36,7 +36,7 @@
 from pytensor.tensor.blas import BatchedDot
 from pytensor.tensor.math import Dot
 from pytensor.tensor.shape import Reshape, Shape, Shape_i, SpecifyShape
-from pytensor.tensor.slinalg import Solve
+from pytensor.tensor.slinalg import Cholesky, Solve
 from pytensor.tensor.type import TensorType
 from pytensor.tensor.type_other import MakeSlice, NoneConst
@@ -646,6 +646,40 @@ def softplus(x):
     return softplus


+@numba_funcify.register(Cholesky)
+def numba_funcify_Cholesky(op, node, **kwargs):
+    lower = op.lower
+    out_dtype = node.outputs[0].type.numpy_dtype
+
+    if lower:
+        inputs_cast = int_to_float_fn(node.inputs, out_dtype)
+
+        @numba_njit
+        def cholesky(a):
+            return np.linalg.cholesky(inputs_cast(a)).astype(out_dtype)
+
+    else:
+        # TODO: Use SciPy's BLAS/LAPACK Cython wrappers.
+
+        warnings.warn(
+            (
+                "Numba will use object mode to allow the "
+                "`lower` argument to `scipy.linalg.cholesky`."
+            ),
+            UserWarning,
+        )
+
+        ret_sig = get_numba_type(node.outputs[0].type)
+
+        @numba_njit
+        def cholesky(a):
+            with numba.objmode(ret=ret_sig):
+                ret = scipy.linalg.cholesky(a, lower=lower).astype(out_dtype)
+            return ret
+
+    return cholesky
+
+
 @numba_funcify.register(Solve)
 def numba_funcify_Solve(op, node, **kwargs):
     assume_a = op.assume_a
```
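A quick smoke test of the new dispatch might look like the sketch below (assuming a Numba-enabled install; with `lower=False` the object-mode warning above should be emitted instead):

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

x = pt.matrix("x")
# lower=True (the default) takes the fully jitted np.linalg.cholesky path
f = pytensor.function([x], pt.linalg.cholesky(x), mode="NUMBA")

a = np.array([[4.0, 2.0], [2.0, 3.0]])
L = f(a)
np.testing.assert_allclose(L @ L.T, a)
```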

pytensor/tensor/blockwise.py

Lines changed: 11 additions & 0 deletions

```diff
@@ -45,6 +45,7 @@ def __init__(
         signature: str | None = None,
         name: str | None = None,
         gufunc_spec: tuple[str, int, int] | None = None,
+        destroy_map=None,
         **kwargs,
     ):
         """
@@ -79,6 +80,16 @@ def __init__(
         self.inputs_sig, self.outputs_sig = _parse_gufunc_signature(signature)
         self.gufunc_spec = gufunc_spec
         self._gufunc = None
+        if destroy_map is not None:
+            # TODO: Check core_op destroy_map is compatible with Blockwise destroy_map
+            self.destroy_map = destroy_map
+        if self.destroy_map != core_op.destroy_map:
+            # Note: Should be fine for destroy_map of Blockwise to be more extensive than that of core_op
+            # But we are not using that anywhere yet, so this check is fine for now
+            raise ValueError(
+                "Blockwise destroy_map must be the same as that of the core_op"
+            )
+
         super().__init__(**kwargs)

     def __getstate__(self):
```
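The new `destroy_map` argument is validated against the core Op's own `destroy_map`. A sketch of the contract, using the `overwrite_a` flag this commit adds to `Cholesky` (the exact signature string is an assumption based on Cholesky's gufunc shape):

```python
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.slinalg import Cholesky

core_op = Cholesky(overwrite_a=True)  # core Op destroys input 0

# OK: the Blockwise destroy_map matches the core Op's
good = Blockwise(core_op, signature="(m,m)->(m,m)", destroy_map={0: [0]})

# Raises ValueError: the two maps disagree
bad = Blockwise(core_op, signature="(m,m)->(m,m)", destroy_map={0: [0, 1]})
```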

pytensor/tensor/rewriting/blockwise.py

Lines changed: 80 additions & 2 deletions

```diff
@@ -1,7 +1,11 @@
+import itertools
+from typing import Optional
+
+from pytensor.compile import Supervisor
 from pytensor.compile.mode import optdb
 from pytensor.graph import Constant, node_rewriter
 from pytensor.graph.replace import vectorize_node
-from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
+from pytensor.graph.rewriting.basic import copy_stack_trace, in2out, out2in
 from pytensor.tensor.basic import Alloc, ARange, alloc, shape_padleft
 from pytensor.tensor.blockwise import Blockwise
 from pytensor.tensor.math import Dot
@@ -56,7 +60,7 @@ def local_useless_unbatched_blockwise(fgraph, node):
     "fast_run",
     "fast_compile",
     "blockwise",
-    position=49,
+    position=99,  # TODO: Check if this makes sense
 )
@@ -225,3 +229,77 @@ def local_blockwise_reshape(fgraph, node):
     new_out = x.reshape([*tuple(batched_shape), *tuple(core_reshape)])
     copy_stack_trace(node.outputs[0], new_out)
     return [new_out]
+
+
+@node_rewriter([Blockwise], inplace=True)
+def node_blockwise_inplace(fgraph, node):
+    # Find inputs that are candidates for inplacing
+    blockwise_op = node.op
+
+    if blockwise_op.destroy_map:
+        # Op already has inplace
+        return False
+
+    core_op = blockwise_op.core_op
+    batch_ndim = blockwise_op.batch_ndim(node)
+    out_batch_bcast = node.outputs[0].type.broadcastable[:batch_ndim]
+
+    # TODO: Refactor this code, which is also present in Elemwise Inplacer
+    protected_inputs = [
+        f.protected for f in fgraph._features if isinstance(f, Supervisor)
+    ]
+    protected_inputs = list(itertools.chain.from_iterable(protected_inputs))
+    protected_inputs.extend(fgraph.outputs)
+
+    # TODO: Add test for the broadcastable logic (don't inplace inputs that are being broadcasted)
+    candidate_inputs = [
+        idx
+        for idx, inp in enumerate(node.inputs)
+        if (
+            not isinstance(inp, Constant)
+            and inp.type.broadcastable[:batch_ndim] == out_batch_bcast
+            and not fgraph.has_destroyers([inp])
+            and inp not in protected_inputs
+        )
+    ]
+
+    if not candidate_inputs:
+        return None
+
+    try:
+        inplace_core_op = core_op.try_inplace_inputs(candidate_inputs)
+    except NotImplementedError:
+        return False
+
+    core_destroy_map = inplace_core_op.destroy_map
+
+    if not core_destroy_map:
+        return False
+
+    # Check Op is not trying to inplace on non-candidate inputs
+    for destroyed_inputs in core_destroy_map.values():
+        for destroyed_input in destroyed_inputs:
+            if destroyed_input not in candidate_inputs:
+                raise ValueError("core_op did not respect candidate inputs")
+
+    # Recreate core_op with inplace
+    inplace_blockwise_op = Blockwise(
+        core_op=inplace_core_op,
+        signature=blockwise_op.signature,
+        name=blockwise_op.name,
+        gufunc_spec=blockwise_op.gufunc_spec,
+        destroy_map=core_destroy_map,
+    )
+
+    return inplace_blockwise_op.make_node(*node.inputs).outputs
+
+
+# After destroyhandler(49.5) but before we try to make elemwise things inplace (75)
+blockwise_inplace = in2out(node_blockwise_inplace, name="blockwise_inplace")
+optdb.register(
+    "blockwise_inplace",
+    blockwise_inplace,
+    "fast_run",
+    "inplace",
+    position=69.0,
+)
```
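The pass walks Blockwise nodes after the destroy handler and swaps in a destructive core Op when a safe input exists (non-constant, batch dims matching the output, not protected, not already destroyed). A sketch of the intended effect on a batched graph (the printed destroy_map is illustrative):

```python
import pytensor
import pytensor.tensor as pt

x = pt.tensor3("x")            # batched input -> Blockwise(Cholesky)
L = pt.linalg.cholesky(x + 1)  # `x + 1` is an intermediate, safe to destroy

f = pytensor.function([x], L)
# After the blockwise_inplace pass, the Blockwise node should carry its
# core Op's destroy_map, e.g. {0: [0]}, so the factorization reuses x + 1.
pytensor.dprint(f)
```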

pytensor/tensor/rewriting/elemwise.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -186,9 +186,8 @@ def apply(self, fgraph):
                     for i in range(len(node.inputs))
                     if i not in baseline.values()
                     and not isinstance(node.inputs[i], Constant)
-                    and
                     # the next line should not be costly most of the time.
-                    not fgraph.has_destroyers([node.inputs[i]])
+                    and not fgraph.has_destroyers([node.inputs[i]])
                     and node.inputs[i] not in protected_inputs
                 ]
             else:
```

pytensor/tensor/rewriting/linalg.py

Lines changed: 24 additions & 3 deletions

```diff
@@ -4,9 +4,7 @@

 from pytensor import Variable
 from pytensor.graph import Apply, FunctionGraph
-from pytensor.graph.rewriting.basic import (
-    copy_stack_trace,
-    node_rewriter,
+from pytensor.graph.rewriting.basic import (copy_stack_trace, node_rewriter,
 )
 from pytensor.scalar.basic import Mul
 from pytensor.tensor.basic import (
@@ -611,3 +609,26 @@ def rewrite_inv_inv(fgraph, node):
     ):
         return None
     return [potential_inner_inv.inputs[0]]
+
+
+cholesky_no_inplace = Cholesky(overwrite_a=False)
+cholesky_inplace = Cholesky(overwrite_a=True)
+
+
+@node_rewriter([cholesky_no_inplace], inplace=True)
+def local_inplace_cholesky(fgraph, node):
+    return make_inplace(node, "overwrite_a")
+
+
+# After destroyhandler(49.5) but before we try to make elemwise things
+# inplace (75)
+linalg_opt_inplace = in2out(local_inplace_cholesky, name="linalg_opt_inplace")
+optdb.register(
+    "InplaceLinalgOpt",
+    linalg_opt_inplace,
+    "fast_run",
+    "inplace",
+    "linalg_opt_inplace",
+    position=69.0,
+)
```
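One way to confirm the destructive rewrite fired is to scan the compiled graph for a Cholesky node with a non-empty destroy_map; a sketch, assuming the default `fast_run` mode:

```python
import pytensor
import pytensor.tensor as pt
from pytensor.tensor.slinalg import Cholesky

x = pt.matrix("x")
f = pytensor.function([x], pt.linalg.cholesky(x + 1))

# Expect Cholesky(overwrite_a=True), whose output 0 destroys input 0
assert any(
    isinstance(node.op, Cholesky) and node.op.destroy_map == {0: [0]}
    for node in f.maker.fgraph.apply_nodes
)
```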

0 commit comments