[Cherry-pick]Add DistributedBatchSampler and Colerjitter (#25242)

LielinJiang · web-flow · commit 43facfd375b9 · 2020-07-01T21:09:28.000+08:00
* add DistributedSampler and ColorJitter, test=develop
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
@@ -96,6 +96,7 @@ if (WITH_TESTING)
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
   add_subdirectory(paddle/fluid/contrib/slim/tests)
+  add_subdirectory(paddle/incubate/hapi/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
@@ -35,3 +35,6 @@
 batch = batch.batch
 import paddle.sysconfig
 import paddle.complex
+
+from . import incubate
+from .incubate import hapi
diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import hapi
+
+__all__ = []
+__all__ += hapi.__all__
diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/incubate/hapi/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import distributed
+from . import vision
+
+__all__ = ['distributed', 'vision']
diff --git a/python/paddle/incubate/hapi/distributed.py b/python/paddle/incubate/hapi/distributed.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import BatchSampler
+
+__all__ = ['DistributedBatchSampler']
+
+
+class DistributedBatchSampler(BatchSampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    In such case, each process can pass a DistributedBatchSampler instance 
+    as a DataLoader sampler, and load a subset of the original dataset that 
+    is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+        
+    Args:
+        dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement
+                     or other python object which implemented
+                     `__len__` for BatchSampler to get sample
+                     number of data source.
+        batch_size(int): sample indice number in a mini-batch indices.
+        shuffle(bool): whther to shuffle indices order before genrating
+            batch indices. Default False.
+        drop_last(bool): whether drop the last incomplete batch dataset size
+            is not divisible by the batch size. Default False
+    Examples:
+        .. code-block:: python
+            
+            from paddle.incubate.hapi.distributed import DistributedBatchSampler
+            class FakeDataset():
+                def __init__(self):
+                    pass
+
+                def __getitem__(self, idx):
+                    return idx,
+
+                def __len__(self):
+                    return 10
+
+            train_dataset = FakeDataset()
+            dist_train_dataloader = DistributedBatchSampler(train_dataset, batch_size=4)
+            for data in dist_train_dataloader:
+                # do something
+                break
+    """
+
+    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
+        self.dataset = dataset
+
+        assert isinstance(batch_size, int) and batch_size > 0, \
+                "batch_size should be a positive integer"
+        self.batch_size = batch_size
+        assert isinstance(shuffle, bool), \
+                "shuffle should be a boolean value"
+        self.shuffle = shuffle
+        assert isinstance(drop_last, bool), \
+                "drop_last should be a boolean number"
+
+        self.drop_last = drop_last
+        self.nranks = ParallelEnv().nranks
+        self.local_rank = ParallelEnv().local_rank
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
+        self.total_size = self.num_samples * self.nranks
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        if self.shuffle:
+            np.random.RandomState(self.epoch).shuffle(indices)
+            self.epoch += 1
+
+        # subsample
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(indices[
+                self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
diff --git a/python/paddle/incubate/hapi/tests/CMakeLists.txt b/python/paddle/incubate/hapi/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/incubate/hapi/tests/test_distributed_sampler.py b/python/paddle/incubate/hapi/tests/test_distributed_sampler.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import unittest
+
+from paddle.incubate.hapi.distributed import DistributedBatchSampler
+
+
+class FakeDataset():
+    def __init__(self):
+        pass
+
+    def __getitem__(self, index):
+        return index
+
+    def __len__(self):
+        return 10
+
+
+class TestDistributedBatchSampler(unittest.TestCase):
+    def test_sampler(self):
+        dataset = FakeDataset()
+        sampler = DistributedBatchSampler(dataset, batch_size=1, shuffle=True)
+        for batch_idx in sampler:
+            batch_idx
+            pass
+
+    def test_multiple_gpus_sampler(self):
+        dataset = FakeDataset()
+        sampler1 = DistributedBatchSampler(
+            dataset, batch_size=4, shuffle=True, drop_last=True)
+        sampler2 = DistributedBatchSampler(
+            dataset, batch_size=4, shuffle=True, drop_last=True)
+
+        sampler1.nranks = 2
+        sampler1.local_rank = 0
+        sampler1.num_samples = int(
+            math.ceil(len(dataset) * 1.0 / sampler1.nranks))
+        sampler1.total_size = sampler1.num_samples * sampler1.nranks
+
+        sampler2.nranks = 2
+        sampler2.local_rank = 1
+        sampler2.num_samples = int(
+            math.ceil(len(dataset) * 1.0 / sampler2.nranks))
+        sampler2.total_size = sampler2.num_samples * sampler2.nranks
+
+        for batch_idx in sampler1:
+            batch_idx
+            pass
+
+        for batch_idx in sampler2:
+            batch_idx
+            pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/incubate/hapi/tests/test_transforms.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from paddle.incubate.hapi.vision.transforms import transforms
+
+
+class TestTransforms(unittest.TestCase):
+    def do_transform(self, trans):
+        fake_img = (np.random.random((400, 300, 3)) * 255).astype('uint8')
+        for t in trans:
+            fake_img = t(fake_img)
+
+    def test_color_jitter(self):
+        trans = [
+            transforms.BrightnessTransform(0.0), transforms.HueTransform(0.0),
+            transforms.SaturationTransform(0.0),
+            transforms.ContrastTransform(0.0),
+            transforms.ColorJitter(0.2, 0.2, 0.2, 0.2)
+        ]
+        self.do_transform(trans)
+
+    def test_exception(self):
+
+        with self.assertRaises(ValueError):
+            transforms.ContrastTransform(-1.0)
+
+        with self.assertRaises(ValueError):
+            transforms.SaturationTransform(-1.0),
+
+        with self.assertRaises(ValueError):
+            transforms.HueTransform(-1.0)
+
+        with self.assertRaises(ValueError):
+            transforms.BrightnessTransform(-1.0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/incubate/hapi/vision/__init__.py b/python/paddle/incubate/hapi/vision/__init__.py
@@ -0,0 +1,18 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import transforms
+from .transforms import *
+
+__all__ = transforms.__all__
diff --git a/python/paddle/incubate/hapi/vision/transforms/__init__.py b/python/paddle/incubate/hapi/vision/transforms/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import transforms
+
+from .transforms import *
+
+__all__ = transforms.__all__
diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/incubate/hapi/vision/transforms/transforms.py
diff --git a/python/setup.py.in b/python/setup.py.in