@@ -73,19 +73,6 @@ std::tuple<Tensor, Tensor> nll_loss_forward_decomposition(
     const c10::optional<Tensor> & weight,
     int64_t reduction, int64_t ignore_index) {
 
-  bool has_ignore_index = ignore_index >= 0;
-  if (has_ignore_index) {
-    // fallback
-    if (target.dim() > 1) {
-      static auto op = c10::Dispatcher::singleton()
-        .findSchemaOrThrow("aten::nll_loss_nd", "");
-      return slow_fallback<Tensor, Tensor>(op, {self, target, weight, reduction, ignore_index});
-    } else {
-      static auto op = c10::Dispatcher::singleton()
-        .findSchemaOrThrow("aten::nll_loss_forward", "");
-      return slow_fallback<Tensor, Tensor>(op, {self, target, weight, reduction, ignore_index});
-    }
-  }
   // self can be [N, C, ...] or [C]
   // target can be [N, ...] or []
@@ -117,17 +104,35 @@ std::tuple<Tensor, Tensor> nll_loss_forward_decomposition(
       {}, result.numel(), self_.scalar_type(),
       self_.layout(), self_.device(), nullopt);
 
+  bool has_ignore_index = ignore_index >= 0;
+  Tensor ignore_index_mask;
+  if (has_ignore_index) {
+    ignore_index_mask = target != ignore_index;
+    result = result * ignore_index_mask;
+    total_weight = ignore_index_mask.sum().to(self_);
+  }
+
   // Apply the reduction
   if (result.dim() > 0) {
     if (reduction == Reduction::Sum) {
       result = result.sum();
     } else if (reduction == Reduction::Mean) {
       if (!weight || !weight->defined()) {
-        result = result.mean();
+        if (has_ignore_index) {
+          TORCH_INTERNAL_ASSERT(ignore_index_mask.defined());
+          // total_weight is ignore_index_mask.sum()
+          result = result.sum() / total_weight;
+        } else {
+          result = result.mean();
+        }
       } else {
         TORCH_INTERNAL_ASSERT(weight_.defined());
         weight_ = weight_.expand(self_.sizes());
         auto wsum = at::gather(weight_, channel_dim, target_).squeeze(channel_dim);
+        if (has_ignore_index) {
+          TORCH_INTERNAL_ASSERT(ignore_index_mask.defined());
+          wsum = wsum * ignore_index_mask;
+        }
         wsum = wsum.sum();
         result = result.sum() / wsum;
         total_weight = wsum;
@@ -136,6 +141,10 @@ std::tuple<Tensor, Tensor> nll_loss_forward_decomposition(
   } else if (reduction == Reduction::Mean && weight && weight->defined()) {
     // here weight is [C] and target is [1]
     auto wsum = at::gather(*weight, channel_dim, target_).squeeze(channel_dim);
+    if (has_ignore_index) {
+      TORCH_INTERNAL_ASSERT(ignore_index_mask.defined());
+      wsum = wsum * ignore_index_mask;
+    }
     total_weight = wsum.sum();
   }
@@ -244,6 +253,7 @@ at::Tensor nll_loss_backward_plumbing(
 
 TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   m.impl("nll_loss_forward", nll_loss_forward_decomposition);
+  m.impl("nll_loss2d_forward", nll_loss_forward_decomposition);
   m.impl("nll_loss_backward", nll_loss_backward_plumbing);
   VMAP_SUPPORT("mse_loss", mse_loss_batch_rule);
   VMAP_SUPPORT("mse_loss_backward", mse_loss_backward_batch_rule);