
Commit 88a4619

Generate 2^n tests, not 3^n tests for vmap (#937)
Previously, our vmap tests were generating 3^n tests per OpInfo sample. For each tensor argument, we would generate all permutations of bdim = (0, -1, None). This is pretty redundant and also performance intensive. The original purpose of this was to make sure functorch's batching rules work with bdim other than 0 (it's really easy to forget that the bdim is not always at the front of the tensor). The new strategy is to generate all permutations of bdim = (-1, None) and also include the case where all bdims are 0 as a sanity check. This leads to 2^n tests. On my machine test_vmap goes from 3m25s to 2m45s, which is promising. However the biggest wins are going to be in test_ops where n can be as high as 10.
1 parent 590e861 commit 88a4619
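
As a quick illustration of the new strategy (not part of the commit itself), the sketch below mirrors the enumeration performed by the get_bdim_choices helper added in test/common_utils.py: one all-zeros sanity case plus every combination of bdim in (-1, None) except the all-None one, which comes out to exactly 2^n choices for n tensor arguments.

import itertools

def bdim_choices_sketch(num_tensors):
    # Sanity case: every tensor argument gets its batch dimension at 0.
    choices = [(0,) * num_tensors]
    # All combinations of (-1, None), dropping the all-unbatched (all-None) case.
    for choice in itertools.product((-1, None), repeat=num_tensors):
        if any(bdim is not None for bdim in choice):
            choices.append(choice)
    return tuple(choices)

print(len(bdim_choices_sketch(4)))   # 16 == 2 ** 4; the old (0, -1, None) product gave 3 ** 4 == 81
print(len(bdim_choices_sketch(10)))  # 1024, versus 59049 under the old strategy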

File tree: 2 files changed, +87 -15 lines


test/common_utils.py

Lines changed: 87 additions & 10 deletions
@@ -38,7 +38,90 @@ def loop(op, in_dims, out_dim, batch_size, *batched_args, **kwarg_values):
     return loop_out


-def get_exhaustive_batched_inputs(arg_values, kwarg_values, batch_size=3, bdims=(0, -1), for_batch_norm=False):
+# This is kind of dangerous, please think carefully before using it.
+# Known risks:
+# - the return better not be mutated so it's best to return immutable types
+#   (e.g. prefer tuples to list)
+# - Don't hash tensors in a global context, that'll keep them around forever
+def memoize(fn):
+    memo = {}
+    def wrapped(*args):
+        if args not in memo:
+            memo[args] = fn(*args)
+        return memo[args]
+    return wrapped
+
+
+# NB: This is O(2 ** num_tensors).
+# num_tensors ranges from 1 to 10, with 2-4 being most common.
+# Try not to extravagate it if you're modifying it.
+@memoize
+def get_bdim_choices(num_tensors):
+    choices = []
+
+    # full of zeros
+    choices.append((0,) * num_tensors)
+
+    # All permutations of (-1, None)
+    options = (-1, None)
+    for choice in itertools.product(options, repeat=num_tensors):
+        choices.append(choice)
+
+    assert choices[-1] == (None,) * num_tensors
+    return tuple(choices[:-1])
+
+
+def add_batch_dim(arg, bdim, batch_size=3):
+    assert bdim == 0 or bdim == -1
+    assert isinstance(arg, torch.Tensor)
+    if bdim == 0:
+        shape = [1] * len(arg.shape)
+        shape.insert(bdim, batch_size)
+        return (arg.repeat(shape), bdim)
+    if bdim == -1:
+        arg = arg.unsqueeze(-1).expand(*arg.shape, batch_size).contiguous()
+        return (arg, bdim)
+
+
+def construct_in_dims(bdim_choice_for_tensors, is_tensors):
+    result = []
+    bdim = iter(bdim_choice_for_tensors)
+    for is_tensor in is_tensors:
+        if not is_tensor:
+            result.append(None)
+            continue
+        result.append(next(bdim))
+    return tuple(result)
+
+
+def get_exhaustive_batched_inputs(arg_values, kwarg_values, batch_size=2, *, for_batch_norm=False):
+    if for_batch_norm:
+        # TODO: delete this path
+        return get_exhaustive_batched_inputs_batch_norm(arg_values, kwarg_values, batch_size)
+
+    flat_args, arg_spec = pytree.tree_flatten(tuple(arg_values))
+    is_tensors = [isinstance(a, torch.Tensor) for a in flat_args]
+    bdim_choices = get_bdim_choices(sum(is_tensors))
+
+    @memoize
+    def get_batched_arg(arg, bdim):
+        assert isinstance(arg, torch.Tensor)
+        assert bdim is not None
+        result, _ = add_batch_dim(arg, bdim, batch_size)
+        return result
+
+    for bdim_choice in bdim_choices:
+        flat_in_dims = construct_in_dims(bdim_choice, is_tensors)
+
+        flat_batched_args = tuple(arg if in_dim is None else get_batched_arg(arg, in_dim)
+                                  for arg, in_dim in zip(flat_args, flat_in_dims))
+        batched_args = pytree.tree_unflatten(flat_batched_args, arg_spec)
+        in_dims = pytree.tree_unflatten(flat_in_dims, arg_spec)
+        yield batched_args, in_dims, kwarg_values
+
+
+def get_exhaustive_batched_inputs_batch_norm(arg_values, kwarg_values, batch_size=3, bdims=(0, -1)):
+    for_batch_norm = True
     assert bdims == (0,) or bdims == (0, -1)

     def add_batch_dim(arg, bdim, batch_size=3):
@@ -112,18 +195,12 @@ def add_batch_choices(a):
         yield batched_args_tuple, in_dims_tuple, kwarg_values


-def get_exhaustive_batched_inputs_for_batch_norm(arg_values, kwarg_values, batch_size=3, bdims=(0, -1)):
-    return get_exhaustive_batched_inputs(arg_values, kwarg_values,
-                                         batch_size=batch_size, bdims=bdims, for_batch_norm=True)
-
-
-def get_fallback_and_vmap_exhaustive(op, arg_values, kwarg_values, opinfo=None, compute_loop_out=True, bdims=(0, -1)):
+def get_fallback_and_vmap_exhaustive(op, arg_values, kwarg_values, opinfo=None, compute_loop_out=True):
     out_dim = 0
     batch_size = 2
-    generator = get_exhaustive_batched_inputs(arg_values, kwarg_values, batch_size, bdims=bdims)
     batch_norm_fns = ("nn.functional.batch_norm", "nn.functional.instance_norm")  # instance norm calls batch norm
-    if opinfo is not None and opinfo.name in batch_norm_fns:
-        generator = get_exhaustive_batched_inputs_for_batch_norm(arg_values, kwarg_values, batch_size, bdims=bdims)
+    for_batch_norm = opinfo is not None and opinfo.name in batch_norm_fns
+    generator = get_exhaustive_batched_inputs(arg_values, kwarg_values, batch_size, for_batch_norm=for_batch_norm)
     for batched_args, in_dims, kwarg_values in generator:
         if compute_loop_out:
             loop_out = loop(op, in_dims, out_dim, batch_size, *batched_args, **kwarg_values)
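
For readers unfamiliar with the two bdim placements these tests exercise, here is a minimal, self-contained sketch (illustrative only, not part of the commit) of what add_batch_dim above does to a sample tensor: bdim=0 tiles the sample along a new leading batch dimension, while bdim=-1 expands a new trailing one.

import torch

def add_batch_dim_sketch(arg, bdim, batch_size=3):
    # Hypothetical helper for demonstration; mirrors add_batch_dim in the diff above.
    assert bdim in (0, -1)
    if bdim == 0:
        # Repeat the sample batch_size times along a new leading dimension.
        repeats = [batch_size] + [1] * arg.dim()
        return arg.unsqueeze(0).repeat(*repeats), bdim
    # bdim == -1: append a trailing batch dimension and materialize it.
    return arg.unsqueeze(-1).expand(*arg.shape, batch_size).contiguous(), bdim

x = torch.randn(2, 5)
front, _ = add_batch_dim_sketch(x, 0)   # front.shape == torch.Size([3, 2, 5])
back, _ = add_batch_dim_sketch(x, -1)   # back.shape == torch.Size([2, 5, 3])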

test/test_ops.py

Lines changed: 0 additions & 5 deletions
@@ -653,14 +653,9 @@ def test_vmapvjp(self, device, dtype, op):

         # The following are bugs that we should fix
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
-        xfail('nn.functional.batch_norm', device_type='cuda'),
-        xfail('nn.functional.batch_norm', 'without_cudnn', device_type='cuda'),
         xfail('_masked.mean'),
         xfail('_masked.prod'),

-        # Causing issues with multiple cpu levels of forward mode AD
-        xfail('nn.functional.batch_norm', device_type='cpu'),
-
         # Not actually a problem: embedding with max_norm mutates the weight
         # and causes different runs to produce different results.
         # skip because this is flaky depending on what the max_norm is!
