This repository was archived by the owner on Aug 21, 2025. It is now read-only.

Commit 5710831

Added chunks arg to vmap (#774)
* Added chunks arg to vmap and a test
* Created chunk_vmap in experimental
* Code formatting
* Updated tests: refactored common code and fixed random state with randomness = same
* Updated docstring and split tests by randomness
1 parent a60ef90 commit 5710831

File tree

functorch/_src/vmap.py
functorch/experimental/__init__.py
test/test_eager_transforms.py
test/test_ops.py
test/test_pythonkey.py
test/test_vmap.py

6 files changed: +174 -24 lines

functorch/_src/vmap.py

Lines changed: 130 additions & 9 deletions
@@ -352,18 +352,139 @@ def vmap(
     vmap does not provide general autobatching or handle variable-length
     sequences out of the box.
     """
-    if randomness not in ['error', 'different', 'same']:
-        raise RuntimeError(f"Only allowed values for randomness are 'error', 'different', or 'same'. Got {randomness}")
+    _check_randomness_arg(randomness)
 
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
         _check_out_dims_is_int_or_int_pytree(out_dims, func)
         batch_size, flat_in_dims, flat_args, args_spec = _process_batched_inputs(in_dims, args, func)
-        vmap_level = _vmap_increment_nesting(batch_size, randomness)
-        try:
-            batched_inputs = _create_batched_inputs(flat_in_dims, flat_args, vmap_level, args_spec)
-            batched_outputs = func(*batched_inputs, **kwargs)
-            return _unwrap_batched(batched_outputs, out_dims, vmap_level, batch_size, func)
-        finally:
-            _vmap_decrement_nesting()
+        return _flat_vmap(
+            func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs
+        )
+
     return wrapped
+
+
+def chunk_vmap(
+        func: Callable,
+        in_dims: in_dims_t = 0,
+        out_dims: out_dims_t = 0,
+        randomness: str = 'error',
+        chunks=2) -> Callable:
+    """
+    chunk_vmap is the vectorizing map (vmap) applied to chunks of input data. It is a mix of vmap (which
+    vectorizes everything) and map (which executes things sequentially): ``chunk_vmap`` processes the input
+    in the given number of chunks, vectorizing over each chunk in turn. For more details about vectorizing
+    map, see :func:`vmap`.
+
+    Args:
+        func (function): A Python function that takes one or more arguments.
+            Must return one or more Tensors.
+        in_dims (int or nested structure): Specifies which dimension of the
+            inputs should be mapped over. :attr:`in_dims` should have a
+            structure like the inputs. If the :attr:`in_dim` for a particular
+            input is None, then that indicates there is no map dimension.
+            Default: 0.
+        out_dims (int or Tuple[int]): Specifies where the mapped dimension
+            should appear in the outputs. If :attr:`out_dims` is a Tuple, then
+            it should have one element per output. Default: 0.
+        randomness (str): Specifies whether the randomness in this
+            vmap should be the same or different across batches. If 'different',
+            the randomness for each batch will be different. If 'same', the
+            randomness will be the same across batches. If 'error', any calls to
+            random functions will error. Default: 'error'. WARNING: this flag
+            only applies to random PyTorch operations and does not apply to
+            Python's random module or numpy randomness.
+        chunks (int): Number of chunks used to split the input data. Default is 2.
+            If equal to 1, :func:`vmap` is called.
+
+    Returns:
+        Returns a new "batched" function. It takes the same inputs as
+        :attr:`func`, except each input has an extra dimension at the index
+        specified by :attr:`in_dims`. It returns the same outputs as
+        :attr:`func`, except each output has an extra dimension at the index
+        specified by :attr:`out_dims`.
+    """
+    _check_randomness_arg(randomness)
+
+    if chunks == 1:
+        return vmap(func, in_dims=in_dims, out_dims=out_dims, randomness=randomness)
+
+    def _get_chunk_flat_args(flat_args_, flat_in_dims_, chunks_):
+        flat_args_chunks = tuple(
+            t.chunk(chunks_, dim=in_dim) if in_dim is not None else [t, ] * chunks_
+            for t, in_dim in zip(flat_args_, flat_in_dims_)
+        )
+        # transpose chunk dim and flatten structure
+        # chunks_flat_args is a list of flattened args
+        chunks_flat_args = zip(*flat_args_chunks)
+        return chunks_flat_args
+
+    def _flatten_chunks_output(chunks_output_):
+        # chunks_output is a list of chunked outputs
+        # flatten chunked outputs:
+        flat_chunks_output = []
+        arg_spec_list = []
+        for output in chunks_output_:
+            flat_output, arg_specs = tree_flatten(output)
+            flat_chunks_output.append(flat_output)
+            arg_spec_list.append(arg_specs)
+
+        arg_spec = arg_spec_list[0]  # all specs should be the same
+        # transpose chunk dim and flatten structure
+        # flat_output_chunks is a flat list of chunks
+        flat_output_chunks = list(zip(*flat_chunks_output))
+        return flat_output_chunks, arg_spec
+
+    @functools.wraps(func)
+    def wrapped_with_chunks(*args, **kwargs):
+        _check_out_dims_is_int_or_int_pytree(out_dims, func)
+        _, flat_in_dims, flat_args, args_spec = _process_batched_inputs(in_dims, args, func)
+        # Chunk flat arguments
+        chunks_flat_args = _get_chunk_flat_args(flat_args, flat_in_dims, chunks)
+
+        # Apply vmap on chunks
+        chunks_output = []
+        rs = torch.get_rng_state() if randomness == "same" else None
+        for flat_args in chunks_flat_args:
+            batch_size = _validate_and_get_batch_size(flat_in_dims, flat_args)
+            if rs is not None:
+                torch.set_rng_state(rs)
+            chunks_output.append(
+                _flat_vmap(
+                    func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs
+                )
+            )
+        flat_output_chunks, arg_spec = _flatten_chunks_output(chunks_output)
+        # Removing temporary variables helps to reduce memory usage on devices like CUDA
+        del chunks_output
+
+        # concat chunks on out_dim
+        flat_out_dims = _broadcast_to_and_flatten(out_dims, arg_spec)
+        assert len(flat_out_dims) == len(flat_output_chunks)
+        flat_output = []
+        for out_dim in flat_out_dims:
+            flat_output.append(torch.cat(flat_output_chunks[0], dim=out_dim))
+            # release source data
+            del flat_output_chunks[0]
+        del flat_output_chunks
+
+        # finally unflatten the output
+        return tree_unflatten(flat_output, arg_spec)
+
+    return wrapped_with_chunks
+
+
+# Vmap refactored helper functions:
+def _check_randomness_arg(randomness):
+    if randomness not in ['error', 'different', 'same']:
+        raise RuntimeError(f"Only allowed values for randomness are 'error', 'different', or 'same'. Got {randomness}")
+
+
+def _flat_vmap(func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs):
+    vmap_level = _vmap_increment_nesting(batch_size, randomness)
+    try:
+        batched_inputs = _create_batched_inputs(flat_in_dims, flat_args, vmap_level, args_spec)
+        batched_outputs = func(*batched_inputs, **kwargs)
+        return _unwrap_batched(batched_outputs, out_dims, vmap_level, batch_size, func)
+    finally:
+        _vmap_decrement_nesting()
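
For quick reference, here is a minimal usage sketch of the new chunk_vmap API added above. It assumes functorch is importable; the import path follows the functorch/experimental/__init__.py change below, and the call pattern mirrors the new tests in test/test_vmap.py.

import torch
from functorch import vmap
from functorch.experimental import chunk_vmap

x = torch.randn(4, 5, 6)

def f(t):
    # deterministic element-wise computation, so randomness='error' is fine
    return t.sin() + t.cos()

# Plain vmap: one vectorized pass over dim 0.
expected = vmap(f, in_dims=0, out_dims=0)(x)

# chunk_vmap: split dim 0 into 2 chunks, vmap each chunk, then
# concatenate the per-chunk results along out_dims.
out = chunk_vmap(f, in_dims=0, out_dims=0, randomness='error', chunks=2)(x)
assert torch.allclose(out, expected)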

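The chunking helpers (_get_chunk_flat_args and _flatten_chunks_output) both rely on a chunk-then-transpose pattern: split each flat argument along its in_dim, then zip so that each element groups one chunk of every argument. A standalone sketch of that idea (illustrative variable names, not the library internals):

import torch

# Two flat arguments, mapped over dim 0 and dim 1 respectively.
flat_args = (torch.randn(6, 3), torch.randn(3, 6))
flat_in_dims = (0, 1)
chunks = 2

# Per-argument tuples of chunks: ((a0, a1), (b0, b1)).
per_arg_chunks = tuple(
    t.chunk(chunks, dim=d) if d is not None else [t] * chunks
    for t, d in zip(flat_args, flat_in_dims)
)

# Transpose so that each element holds one chunk of every argument:
# ((a0, b0), (a1, b1)) -- each tuple can be fed to a single vmap call.
chunks_flat_args = list(zip(*per_arg_chunks))
assert len(chunks_flat_args) == chunks
assert chunks_flat_args[0][0].shape == (3, 3)  # first chunk of the dim-0 arg
assert chunks_flat_args[0][1].shape == (3, 3)  # first chunk of the dim-1 arg
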
functorch/experimental/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 from .batch_norm_replacement import replace_all_batch_norm_modules_
 # PyTorch forward-mode is not mature yet
 from .._src.eager_transforms import jvp, jacfwd, hessian, functionalize
+from .._src.vmap import chunk_vmap

test/test_eager_transforms.py

Lines changed: 0 additions & 1 deletion
@@ -2946,7 +2946,6 @@ def f(x: torch.Tensor) -> torch.Tensor:
             return x
         self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device))
 
-
     def test_inplace_view(self, device):
 
         def f(x: torch.Tensor) -> torch.Tensor:

test/test_ops.py

Lines changed: 2 additions & 2 deletions
@@ -484,8 +484,8 @@ def _test(_op):
     @skipOps('TestOperators', 'test_vjpvjp', vjp_fail.union({
         skip('nn.functional.max_unpool1d'),  # Flaky
         skip('nn.functional.max_unpool2d'),  # Flaky
-        skip('nn.functional.fractional_max_pool2d'),  # randomness
-        skip('nn.functional.fractional_max_pool3d'),  # randomness
+        skip('nn.functional.fractional_max_pool2d'),  # randomness
+        skip('nn.functional.fractional_max_pool3d'),  # randomness
     }))
     @opsToleranceOverride('TestOperators', 'test_vjpvjp', (
         tol1('nn.functional.conv_transpose3d',

test/test_pythonkey.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def f(x):
     def test_make_fx_no_decompose(self, device):
         # FIXME
         return self.skipTest("error: maximum recursion reached")
+
         def f(x):
             return torch.tanh(x).sum()
 

test/test_vmap.py

Lines changed: 40 additions & 12 deletions
@@ -39,6 +39,7 @@
 
 import functorch
 from functorch import vmap, grad, grad_and_value, jvp, vjp
+from functorch.experimental import chunk_vmap
 from functorch._C import reshape_dim_into, reshape_dim_outof
 from functorch._src.make_functional import functional_init_with_buffers
 
@@ -879,7 +880,6 @@ def test_backward_unsupported_interaction(self):
         def backward_on_vmapped_tensor(x):
             x.sum().backward()
 
-
         # FIXME
         return self.skipTest("error: element 0 of tensors does not require grad and does not have a grad_fn")
         with self.assertRaisesRegex(RuntimeError, err_msg):
@@ -2719,14 +2719,28 @@ def naive_f(x, shape):
 
         self.assertTrue(torch.randn(()).dim() == 0)
 
-    @parametrize('op', [torch.cos, torch.sinh], name_fn=lambda f: f.__name__)
-    def test_foobar_parametrize(self, op):
-        pass
+    @parametrize('in_dim', [0, 1, 2])
+    @parametrize('out_dim', [0, 1, 2])
+    @parametrize('randomness', ['error', 'same'])
+    def test_chunk_vmap(self, in_dim, out_dim, randomness):
+
+        x = torch.randn(4, 5, 6)
+
+        def f(x):
+            y = x.sin()
+            if randomness != "error":
+                y = y + torch.rand_like(x)
+            return y
+
+        rs = torch.get_rng_state()
+        expected = vmap(f, in_dims=in_dim, out_dims=out_dim, randomness=randomness)(x)
 
-    @parametrize('op2', [torch.cos, torch.sinh], name_fn=lambda f: f.__name__)
-    @parametrize('op1', [torch.abs, torch.acos], name_fn=lambda f: f.__name__)
-    def test_parametrize_multiple(self, op1, op2):
-        pass
+        for chunks in [1, 2, 3, 4, 7, 10, 16]:
+            torch.set_rng_state(rs)
+            output = chunk_vmap(
+                f, in_dims=in_dim, out_dims=out_dim, randomness=randomness, chunks=chunks
+            )(x)
+            self.assertEqual(output, expected)
 
 
 instantiate_parametrized_tests(TestVmapOperators)
@@ -2906,10 +2920,6 @@ def test_log1p(self, device):
         self._batched_grad_test(torch.log1p, (x,))
         self._batched_grad_grad_test(torch.log1p, (x,))
 
-    @parametrize('param', ['foo', 'bar'])
-    def test_param_device(self, device, param):
-        pass
-
     @allowVmapFallbackUsage
     def test_max(self, device):
         x = torch.randn(2, 3, requires_grad=True, device=device)
@@ -4160,6 +4170,24 @@ def f(z):
             return torch.rrelu(x)
         vmap(f, randomness='same')(z)
 
+    @parametrize('in_dim', [0, 1, 2])
+    @parametrize('out_dim', [0, 1, 2])
+    def test_chunk_vmap(self, in_dim, out_dim):
+
+        randomness = "different"
+
+        x = torch.randn(4, 5, 6)
+
+        def f(x):
+            y = x.sin() + torch.rand_like(x)
+            return y
+
+        for chunks in [1, 2, 3, 4, 7, 10, 16]:
+            output = chunk_vmap(
+                f, in_dims=in_dim, out_dims=out_dim, randomness=randomness, chunks=chunks
+            )(x)
+            self._assert_all_slices_unique(output)
+
 
 class TestTransformFailure(TestCase):
     @parametrize('transform', ['vmap', 'grad', 'grad_and_value', 'vjp', 'jvp', 'jacrev', 'jacfwd'])
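
The randomness='same' test above passes because chunk_vmap captures the RNG state once and restores it before every chunk, so the result does not depend on the chunk count. A minimal sketch of that property, mirroring the parametrized test (assumes the global CPU RNG is the only source of randomness):

import torch
from functorch import vmap
from functorch.experimental import chunk_vmap

x = torch.randn(4, 5, 6)

def f(t):
    # random op: its behavior is controlled by the randomness flag
    return t.sin() + torch.rand_like(t)

rs = torch.get_rng_state()
expected = vmap(f, randomness='same')(x)

for chunks in (1, 2, 4):
    # Reset the global RNG so every chunk_vmap call starts from the same state.
    torch.set_rng_state(rs)
    out = chunk_vmap(f, randomness='same', chunks=chunks)(x)
    assert torch.allclose(out, expected)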
