
Commit a8ef423

jbschlosser authored and pytorchmergebot committed
Fix NJT min / max backward() for non-ragged reductions (pytorch#144583)
Part of my BE project addressing NJT bugs surfaced via OpInfo tests. `value_selecting_reduction_backward()` is used in the backward for min / max, so this PR implements it for NJT. Notably, this isn't enough for reducing over the ragged dim, since that results in a dense tensor and thus NJT's torch_dispatch will not be called for this op. We need factory function support for nested ints to fix that case.

Pull Request resolved: pytorch#144583
Approved by: https://github.com/soulitzer
ghstack dependencies: pytorch#144582
1 parent cac10b8 commit a8ef423
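
A minimal sketch of the case this change addresses (shapes are illustrative and not taken from the PR; assumes a PyTorch build with jagged-layout NJT support):

import torch

# NJT of logical shape (B=2, j1, D=8); dim 1 is the ragged dim.
njt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)],
    layout=torch.jagged,
    requires_grad=True,
)

# Reducing over a non-ragged dim keeps the output jagged, so NJT's
# torch_dispatch sees value_selecting_reduction_backward during backward
# and the new handler in torch/nested/_internal/ops.py runs.
max_vals, max_idx = njt.max(dim=-1)
max_vals.values().sum().backward()  # previously failed for non-ragged reductions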

File tree: 4 files changed, +54 -4 lines changed


aten/src/ATen/native/native_functions.yaml

Lines changed: 1 addition & 0 deletions
@@ -3866,6 +3866,7 @@
   device_guard: False
   dispatch:
     CompositeImplicitAutograd: value_selecting_reduction_backward_symint
+    NestedTensorCPU, NestedTensorCUDA: value_selecting_reduction_backward_nested_symint
 
 - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
   variants: function, method

aten/src/ATen/native/nested/NestedTensorUtils.cpp

Lines changed: 12 additions & 0 deletions
@@ -10,6 +10,7 @@
 #include <ATen/ops/_nested_tensor_strides_native.h>
 #include <ATen/ops/chunk_native.h>
 #include <ATen/ops/split_with_sizes_native.h>
+#include <ATen/ops/value_selecting_reduction_backward_native.h>
 #endif
 
 namespace at::native {

@@ -166,4 +167,15 @@ std::vector<Tensor> split_with_sizes_nested(
   return splits;
 }
 
+Tensor value_selecting_reduction_backward_nested_symint(
+    const Tensor& grad,
+    int64_t dim,
+    const Tensor& indices,
+    c10::SymIntArrayRef sizes,
+    bool keepdim) {
+  TORCH_INTERNAL_ASSERT(
+      false, "value_selecting_reduction_backward(): expected to be implemented in Python"
+  );
+}
+
 } // namespace at::native
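
The stub above is presumably there so the op has real NestedTensor kernels and reaches NJT's __torch_dispatch__, where the Python handler added in torch/nested/_internal/ops.py runs; per its own message, the C++ body should never execute. One hedged way to inspect the registration (torch._C._dispatch_dump is an internal debugging helper and may change between releases):

import torch

# Dumps the kernels registered per dispatch key for the op; with this change,
# NestedTensorCPU / NestedTensorCUDA entries should appear alongside
# CompositeImplicitAutograd.
print(torch._C._dispatch_dump("aten::value_selecting_reduction_backward"))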

test/test_nestedtensor.py

Lines changed: 5 additions & 4 deletions
@@ -8317,15 +8317,16 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         op_match_fn=lambda device, op: (op.full_name == "narrow"),
         name="broken_narrow_backward",
     ),
-    # min / max: need to examine backwards formula for non-full reduction
+    # min / max: need factory function support for ragged dim reductions
+    # where the output is dense but sizes still contain a nested int
     XFailRule(
         error_type=RuntimeError,
         error_msg="SymIntArrayRef expected to contain only concrete integers",
         op_match_fn=lambda device, op: (
             op.full_name in {"max.reduction_with_dim", "min.reduction_with_dim"}
         ),
-        sample_match_fn=lambda device, sample: ("full reduction" not in sample.name),
-        name="broken_min_max_reduction_with_dim_backward",
+        sample_match_fn=lambda device, sample: ("ragged dim" in sample.name),
+        name="broken_min_max_reduction_with_dim_backward_on_ragged_dim",
     ),
     # matmul(): unimplemented backward
     XFailRule(

@@ -8496,7 +8497,7 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         op_match_fn=lambda device, op: (
             op.full_name in {"max.reduction_with_dim", "min.reduction_with_dim"}
         ),
-        sample_match_fn=lambda device, sample: ("full reduction" not in sample.name),
+        sample_match_fn=lambda device, sample: ("ragged dim" in sample.name),
        name="broken_min_max_compile_backward",
     ),
     # to() fails with data-dependent guards OR Unknown layout in record_stream_any_impl;
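
For reference, a hedged sketch of the still-xfailed ragged-dim case targeted by the rules above (shapes are illustrative; the quoted error text comes from the rule's error_msg):

import torch

njt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)],
    layout=torch.jagged,
    requires_grad=True,
)

# Reducing over the ragged dim (dim 1) produces a dense (B, D) tensor, so NJT's
# torch_dispatch never sees value_selecting_reduction_backward; backward then
# needs to build a tensor from sizes that still contain a nested int and is
# expected to raise "SymIntArrayRef expected to contain only concrete integers".
vals, idx = njt.max(dim=1)
vals.sum().backward()  # expected to fail until nested-int factory support lands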

torch/nested/_internal/ops.py

Lines changed: 36 additions & 0 deletions
@@ -2014,6 +2014,42 @@ def argmax_default(func, *args, **kwargs):
     return _apply_reduction(func, "argmax", dtype_min, *args, **kwargs)
 
 
+@register_jagged_func(
+    torch.ops.aten.value_selecting_reduction_backward.default,
+    "grad: jt_all, dim: any, indices: jt_all, sizes: any, keepdim: any",
+)
+def value_selecting_reduction_backward_default(func, *args, **kwargs):
+    from torch.fx.experimental.symbolic_shapes import is_nested_int
+
+    _, new_kwargs = normalize_function(  # type: ignore[misc]
+        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
+    )
+
+    grad = new_kwargs.pop("grad")
+    new_kwargs["grad"] = grad._values
+    indices = new_kwargs.pop("indices")
+    new_kwargs["indices"] = indices._values
+    # should always succeed; sizes should contain a nested int
+    ragged_idx = next(i for i, s in enumerate(new_kwargs["sizes"]) if is_nested_int(s))
+    # convert dim -> values-space dim
+    new_kwargs["dim"] = _wrap_jagged_dim(
+        len(new_kwargs["sizes"]),
+        new_kwargs["dim"],
+        ragged_idx,
+        "value_selecting_reduction_backward",
+    )
+    # convert saved NJT sizes -> values-space sizes
+    sizes = new_kwargs.pop("sizes")
+    sizes[ragged_idx] = indices._values.size(indices._ragged_idx - 1)
+    sizes = sizes[1:]
+    new_kwargs["sizes"] = sizes
+
+    output_kwargs = extract_kwargs(indices)
+    output_kwargs["_ragged_idx"] = ragged_idx
+
+    return NestedTensor(func(**new_kwargs), **output_kwargs)
+
+
 @register_jagged_func(torch.ops.aten.stack.default, "tensors: any, dim: any")
 def stack_default(func, *args, **kwargs):
     _, new_kwargs = normalize_function(  # type: ignore[misc]
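
As a plain-Python illustration of the sizes conversion in the handler above (hypothetical numbers, no torch required): for an NJT of logical shape (B=2, j1, D=8) built from components with 3 and 5 rows, the packed values buffer has shape (8, 8), and the saved NJT sizes map to that buffer's sizes as follows.

# Saved sizes from the forward input, with the nested int shown symbolically.
saved_sizes = [2, "j1", 8]
ragged_idx = 1                 # position of the nested int in saved_sizes
total_length = 3 + 5           # what indices._values.size(indices._ragged_idx - 1) yields

values_space_sizes = list(saved_sizes)
values_space_sizes[ragged_idx] = total_length  # nested int -> packed ragged length
values_space_sizes = values_space_sizes[1:]    # drop the batch dim
print(values_space_sizes)                      # [8, 8] == shape of the values buffer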
