
Commit e58618f

ricardoV94 authored and jessegrabowski committed
Fix Elemwise and Blockwise gradient for Ops with mixed discrete and continuous output types
1 parent 676296c commit e58618f
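
Before this change, Elemwise.L_op and Blockwise.L_op replaced the gradient of every input with zeros whenever any output had a non-continuous dtype, which silently discarded legitimate gradients for Ops that produce both discrete and continuous outputs. The following minimal sketch reproduces that failure mode; it is adapted from the test added in this commit, and MixedDtypeCoreOp is that test's dummy Op, not part of the public PyTensor API.

import numpy as np

from pytensor import config, grad
from pytensor.graph import Op
from pytensor.tensor import ones_like, scalar, vector
from pytensor.tensor.blockwise import Blockwise


class MixedDtypeCoreOp(Op):
    # One continuous (float) output and one discrete (int) output per scalar input
    gufunc_signature = "()->(),()"
    itypes = [scalar().type]
    otypes = [scalar().type, scalar(dtype=int).type]

    def perform(self, node, inputs, outputs):
        # Never executed here: only the gradient graph is evaluated
        raise NotImplementedError()

    def L_op(self, inputs, outputs, output_gradients):
        # The gradient flows only through the continuous output
        return [ones_like(inputs[0]) * output_gradients[0]]


x = vector("x")
y, _ = Blockwise(MixedDtypeCoreOp())(x)

# With this fix the result is ones(3); previously the integer output caused
# Blockwise.L_op to replace it with zeros. The NaN inputs show the gradient
# does not depend on the Op's computed values.
print(grad(y.sum(), x).eval({x: np.full(3, np.nan, dtype=config.floatX)}))
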

File tree

3 files changed (+32, -44 lines)

pytensor/tensor/blockwise.py

Lines changed: 5 additions & 22 deletions
@@ -18,7 +18,7 @@
 from pytensor.scalar import ScalarType
 from pytensor.tensor import as_tensor_variable
 from pytensor.tensor.shape import shape_padleft
-from pytensor.tensor.type import TensorType, continuous_dtypes, discrete_dtypes, tensor
+from pytensor.tensor.type import TensorType, tensor
 from pytensor.tensor.utils import (
     _parse_gufunc_signature,
     broadcast_static_dim_lengths,
@@ -256,6 +256,10 @@ def as_core(t, core_t):
                 as_core(ograd, core_ograd)
                 for ograd, core_ograd in zip(ograds, core_node.outputs, strict=True)
             ]
+            # FIXME: These core_outputs do not depend on core_inputs, not pretty
+            # It's not necessarily a problem because if they are referenced by the gradient,
+            # they get replaced later in vectorize. But if the Op was to make any decision
+            # by introspecting the dependencies of output on inputs it would fail badly!
             core_outputs = core_node.outputs

             core_igrads = self.core_op.L_op(core_inputs, core_outputs, core_ograds)
@@ -283,27 +287,6 @@ def L_op(self, inputs, outs, ograds):
         # Compute grad with respect to broadcasted input
         rval = self._bgrad(inputs, outs, ograds)

-        # TODO: (Borrowed from Elemwise) make sure that zeros are clearly identifiable
-        # to the gradient.grad method when the outputs have
-        # some integer and some floating point outputs
-        if any(out.type.dtype not in continuous_dtypes for out in outs):
-            # For integer output, return value may only be zero or undefined
-            # We don't bother with trying to check that the scalar ops
-            # correctly returned something that evaluates to 0, we just make
-            # the return value obviously zero so that gradient.grad can tell
-            # this op did the right thing.
-            new_rval = []
-            for elem, inp in zip(rval, inputs, strict=True):
-                if isinstance(elem.type, NullType | DisconnectedType):
-                    new_rval.append(elem)
-                else:
-                    elem = inp.zeros_like()
-                    if str(elem.type.dtype) not in continuous_dtypes:
-                        elem = elem.astype(config.floatX)
-                    assert str(elem.type.dtype) not in discrete_dtypes
-                    new_rval.append(elem)
-            return new_rval
-
         # Sum out the broadcasted dimensions
         batch_ndims = self.batch_ndim(outs[0].owner)
         batch_shape = outs[0].type.shape[:batch_ndims]
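
For reference, the branch removed here (and below, from Elemwise.L_op) amounts to the following standalone paraphrase: as soon as any output dtype fell outside continuous_dtypes, the gradients computed by _bgrad were discarded and every connected input gradient was forced to an explicit floatX zero. That is only appropriate when all outputs are discrete; with mixed discrete and continuous outputs it zeroed a legitimate gradient, which is the bug this commit fixes. The helper name old_mixed_output_override is invented for illustration and does not exist in PyTensor.

from pytensor import config
from pytensor.gradient import DisconnectedType, NullType
from pytensor.tensor.type import continuous_dtypes


def old_mixed_output_override(inputs, outs, rval):
    # Paraphrase of the removed logic: discard rval if any output is discrete
    if any(out.type.dtype not in continuous_dtypes for out in outs):
        new_rval = []
        for elem, inp in zip(rval, inputs, strict=True):
            if isinstance(elem.type, NullType | DisconnectedType):
                # Null/disconnected gradients are passed through unchanged
                new_rval.append(elem)
            else:
                # Every other input gradient is replaced by zeros of floatX dtype
                elem = inp.zeros_like()
                if str(elem.type.dtype) not in continuous_dtypes:
                    elem = elem.astype(config.floatX)
                new_rval.append(elem)
        return new_rval
    return rval
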

pytensor/tensor/elemwise.py

Lines changed: 0 additions & 21 deletions
@@ -515,27 +515,6 @@ def L_op(self, inputs, outs, ograds):
         # Compute grad with respect to broadcasted input
         rval = self._bgrad(inputs, outs, ograds)

-        # TODO: make sure that zeros are clearly identifiable
-        # to the gradient.grad method when the outputs have
-        # some integer and some floating point outputs
-        if any(out.type.dtype not in continuous_dtypes for out in outs):
-            # For integer output, return value may only be zero or undefined
-            # We don't bother with trying to check that the scalar ops
-            # correctly returned something that evaluates to 0, we just make
-            # the return value obviously zero so that gradient.grad can tell
-            # this op did the right thing.
-            new_rval = []
-            for elem, ipt in zip(rval, inputs, strict=True):
-                if isinstance(elem.type, NullType | DisconnectedType):
-                    new_rval.append(elem)
-                else:
-                    elem = ipt.zeros_like()
-                    if str(elem.type.dtype) not in continuous_dtypes:
-                        elem = elem.astype(config.floatX)
-                    assert str(elem.type.dtype) not in discrete_dtypes
-                    new_rval.append(elem)
-            return new_rval
-
         # sum out the broadcasted dimensions
         for i, ipt in enumerate(inputs):
             if isinstance(rval[i].type, NullType | DisconnectedType):

tests/tensor/test_blockwise.py

Lines changed: 27 additions & 1 deletion
@@ -12,7 +12,7 @@
 from pytensor.graph import Apply, Op
 from pytensor.graph.replace import vectorize_node
 from pytensor.raise_op import assert_op
-from pytensor.tensor import diagonal, log, tensor
+from pytensor.tensor import diagonal, log, ones_like, scalar, tensor, vector
 from pytensor.tensor.blockwise import Blockwise, vectorize_node_fallback
 from pytensor.tensor.nlinalg import MatrixInverse
 from pytensor.tensor.rewriting.blas import specialize_matmul_to_batched_dot
@@ -28,6 +28,9 @@
 from pytensor.tensor.utils import _parse_gufunc_signature


+config.floatX = "float32"
+
+
 def test_perform_method_per_node():
     """Confirm that Blockwise uses one perform method per node.

@@ -603,3 +606,26 @@ def core_scipy_fn(A, b):
     # Confirm input was destroyed
     assert (A_val == A_val_copy).all() == (op.destroy_map.get(0, None) != [0])
     assert (b_val == b_val_copy).all() == (op.destroy_map.get(0, None) != [1])
+
+
+def test_gradient_mixed_discrete_output_core_op():
+    class MixedDtypeCoreOp(Op):
+        gufunc_signature = "()->(),()"
+        itypes = [scalar().type]
+        otypes = [scalar().type, scalar(dtype=int).type]
+
+        def perform(self, node, inputs, outputs):
+            raise NotImplementedError()
+
+        def L_op(self, inputs, outputs, output_gradients):
+            return [ones_like(inputs[0]) * output_gradients[0]]
+
+    op = Blockwise(MixedDtypeCoreOp())
+    x = vector("x")
+    y, _ = op(x)
+
+    np.testing.assert_array_equal(
+        grad(y.sum(), x).eval({x: np.full(12, np.nan, dtype=config.floatX)}),
+        np.ones(12, dtype=config.floatX),
+        strict=True,
+    )
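
The new test builds only the gradient graph: perform raises NotImplementedError and is never executed, and the NaN input values confirm that the resulting gradient is the constant ones_like(x) returned by the core Op's L_op rather than the zeros the removed branch would have produced. Assuming a standard pytest setup for the repository, it can be run with, for example:

python -m pytest tests/tensor/test_blockwise.py::test_gradient_mixed_discrete_output_core_op
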
