Fix optimizer chunk offload (PaddlePaddle#76323)

Wennie396 · web-flow · commit 38161304fe0a · 2025-11-13T15:08:03.000+08:00
* fix opt chunk offload

* add test

* fix test

* fix test

* fix unit test
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
@@ -838,6 +838,9 @@ def _build_comm_buffers(
                     # here group_size is parameter size (GB)
                     # optimizer states(float32) size is 6 times as much as parameter(bfloat16) size
                     offload_buffer_size -= sum(opt_states_sizes)
+                else:
+                    for param in parameters:
+                        self._slice_params[param.name].is_offload_opt = False
 
                 self._comm_buffer_list.append(buffer)
 
diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt
@@ -372,6 +372,21 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
   set_tests_properties(test_parallel_dygraph_sharding_parallel
                        PROPERTIES TIMEOUT "400")
 endif()
+if((WITH_GPU) AND LOCAL_ALL_PLAT) # register the chunk-offload sharding test only when WITH_GPU and LOCAL_ALL_PLAT are both true
+  bash_test_modules(
+    test_parallel_dygraph_sharding_parallel_chunkoffload
+    START_BASH
+    ../../legacy_test/dist_test.sh # presumably the wrapper script that launches the test module - verify against bash_test_modules
+    TIMEOUT
+    "600"
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21218;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" # proxies are set to empty values; NOTE(review): port 21218 presumably must not clash with other dist tests - confirm
+  )
+  set_tests_properties(test_parallel_dygraph_sharding_parallel_chunkoffload
+                       PROPERTIES TIMEOUT "600") # CTest TIMEOUT property (seconds); same value as the TIMEOUT argument above
+endif()
 if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
   bash_test_modules(
     test_parallel_dygraph_tensor_parallel
diff --git a/test/collective/fleet/hybrid_parallel_sharding_model_chunkoffload.py b/test/collective/fleet/hybrid_parallel_sharding_model_chunkoffload.py
@@ -0,0 +1,251 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.distributed import fleet
+from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import (
+    DygraphShardingOptimizer,
+    DygraphShardingOptimizerV2,
+)
+from paddle.distributed.fleet.utils.mix_precision_utils import (
+    MixPrecisionLayer,
+    MixPrecisionOptimizer,
+)
+
+g_shard_split_param = int(os.environ.get("FLAGS_shard_split_param", 0))  # env flag parsed as int, 0 when unset; not read again in the visible part of this file
+g_shard_param_with_color = int(  # env flag parsed as int, 0 when unset; a non-zero value makes SimpleDPNet tag its linear params
+    os.environ.get("FLAGS_shard_param_with_color", 0)
+)
+
+vocab_size = 20  # embedding rows and exclusive upper bound of the random token ids
+hidden_size = 10  # embedding width; linear1 input, linear2 output and linear3 input size
+inner_size = 8  # linear1 output / linear2 input size
+output_size = 10  # linear3 output size
+seq_length = 2  # tokens per sample in each generated batch
+batch_size = 4  # samples per generated batch
+STEPS = 10  # number of generated batches, one per training step
+
+
+class SimpleDPNet(paddle.nn.Layer):  # toy model: embedding -> linear1 -> linear2 -> linear3 -> matmul with the embedding weight
+    def __init__(
+        self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
+    ):  # np_fc1 / np_fc2 are assigned as the initial weights of linear1 / linear2
+        super().__init__()
+        self.linear1 = paddle.nn.Linear(  # hidden_size -> inner_size
+            hidden_size,
+            inner_size,
+            weight_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Assign(np_fc1)
+            ),
+            bias_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(0.0)
+            ),
+        )
+
+        self.linear2 = paddle.nn.Linear(  # inner_size -> hidden_size
+            inner_size,
+            hidden_size,
+            weight_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Assign(np_fc2)
+            ),
+            bias_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(0.0)
+            ),
+        )
+
+        self.linear3 = paddle.nn.Linear(  # hidden_size -> output_size; weight and bias both start at 0.0
+            hidden_size,
+            output_size,
+            weight_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(0.0)
+            ),
+            bias_attr=paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Constant(0.0)
+            ),
+        )
+
+        self.embedding = paddle.nn.Embedding(
+            vocab_size,
+            hidden_size,
+            weight_attr=paddle.nn.initializer.Constant(value=0.5),  # every embedding entry starts at 0.5
+        )
+
+        if g_shard_param_with_color:  # optional: give each linear layer's params a per-layer color dict
+            for p in self.linear1.parameters():
+                p.color = {'color': "linear1"}  # NOTE(review): presumably read by the sharding optimizer to group params - confirm
+
+            for p in self.linear2.parameters():
+                p.color = {'color': "linear2"}
+
+            for p in self.linear3.parameters():
+                p.color = {'color': "linear3"}
+
+    def forward(self, x):  # x holds token ids that index the embedding table
+        x = self.embedding(x)
+        x = self.linear1(x)
+        x = self.linear2(x)
+        x = self.linear3(x)
+        x = paddle.matmul(x, self.embedding.weight, transpose_y=True)  # assumes embedding.weight is (vocab_size, hidden_size), which needs output_size == hidden_size - TODO confirm
+        return x
+
+
+class TestShardingV2ChunkOffload(unittest.TestCase):
+    def setUp(self):
+        random.seed(2021)
+        np.random.seed(2021)
+        paddle.seed(2021)
+
+        self.strategy = fleet.DistributedStrategy()
+
+        self.strategy.hybrid_configs = {
+            "sharding_degree": 2,
+            "dp_degree": 1,
+            "mp_degree": 1,
+            "pp_degree": 1,
+        }
+        self.strategy.hybrid_configs["sharding_configs"].split_param = True
+        self.strategy.hybrid_configs[
+            "sharding_configs"
+        ].offload_opt_buffer_size = 0
+        fleet.init(is_collective=True, strategy=self.strategy)
+        self.data = [
+            np.random.randint(
+                0,
+                vocab_size,
+                (
+                    batch_size,
+                    seq_length,
+                ),
+            )
+            for _ in range(STEPS)
+        ]
+
+    def train_batch(self, batch, model, optimizer):
+        output = model(batch)
+        loss = output.mean()
+        loss.backward()  # do backward
+        optimizer.step()  # update parameters
+        optimizer.clear_grad()
+        return loss
+
+    def build_optimizer(self, model, strategy=None, Optimizer="adam"):
+        clip = paddle.nn.ClipGradByGlobalNorm(0.5)
+        if Optimizer == "adam":
+            optimizer = paddle.optimizer.AdamW(
+                parameters=model.parameters(),
+                learning_rate=0.001,
+                weight_decay=0.00001,
+                grad_clip=clip,
+            )
+        else:
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                parameters=model.parameters(),
+                grad_clip=clip,
+            )
+        return optimizer
+
+    def build_model_optimizer(self, Optimizer="adam", amp_level=None):
+        np_fc1 = np.random.random_sample((hidden_size, inner_size))
+        np_fc2 = np.random.random_sample((inner_size, hidden_size))
+
+        model_a = SimpleDPNet(
+            vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
+        )
+        optimizer_a = self.build_optimizer(
+            model_a,
+            strategy=self.strategy,
+            Optimizer=Optimizer,
+        )
+
+        model_b = SimpleDPNet(
+            vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2
+        )
+        optimizer_b = self.build_optimizer(
+            model_b,
+            strategy=self.strategy,
+            Optimizer=Optimizer,
+        )
+
+        if amp_level is not None and amp_level == "O2":
+            model_a = MixPrecisionLayer(model_a)
+            optimizer_a = MixPrecisionOptimizer(optimizer_a)
+            model_b = MixPrecisionLayer(model_b)
+            optimizer_b = MixPrecisionOptimizer(optimizer_b)
+
+        model_a = fleet.distributed_model(model_a)
+        optimizer_a = fleet.distributed_optimizer(optimizer_a)
+        model_b = fleet.distributed_model(model_b)
+        optimizer_b = fleet.distributed_optimizer(optimizer_b)
+
+        optimizer_a._set_all_gather_overlap_forward(True, model_a)
+        optimizer_b._set_all_gather_overlap_forward(False, model_b)
+        return model_a, optimizer_a, model_b, optimizer_b
+
+    def sharding_model(self, Optimizer, sharded_accumulators, amp_level=None):
+        model_a, optimizer_a, model_b, optimizer_b = self.build_model_optimizer(
+            Optimizer=Optimizer,
+            amp_level=amp_level,
+        )
+        opt_cls = (
+            DygraphShardingOptimizerV2 if True else DygraphShardingOptimizer
+        )
+        self.assertTrue(isinstance(optimizer_a._inner_opt, opt_cls))
+
+        for idx in range(STEPS):
+            if idx == 2 and paddle.distributed.get_rank() == 0 and not True:
+                self.assertTrue(
+                    set(optimizer_a._inner_opt._inner_opt.state_dict().keys())
+                    == sharded_accumulators
+                )
+
+            if paddle.distributed.get_rank() == 0:
+                batch_sharding = paddle.to_tensor(self.data[idx][:2])
+            else:
+                batch_sharding = paddle.to_tensor(self.data[idx][2:])
+
+            batch_single = paddle.to_tensor(self.data[idx])
+            loss_a = self.train_batch(batch_sharding, model_a, optimizer_a)
+            loss_b = self.train_batch(batch_single, model_b, optimizer_b)
+
+            for j in range(len(model_a.parameters())):
+                np.testing.assert_allclose(
+                    model_a.parameters()[j].numpy(),
+                    model_b.parameters()[j].numpy(),
+                    rtol=1e-6,
+                )
+
+    def test_all_gather_overlap_forward(self):
+        if True:
+            sharded_accumulators = {
+                'linear_12.b_0_velocity_0',
+                'linear_13.b_0_velocity_0',
+                'linear_14.b_0_velocity_0',
+                'embedding_4.w_0_velocity_0',
+            }
+            self.sharding_model(
+                Optimizer="Momentum",
+                sharded_accumulators=sharded_accumulators,
+                amp_level="O2",
+            )
+
+
+if __name__ == "__main__":  # run the test case when this file is executed as a script
+    unittest.main()
diff --git a/test/collective/fleet/test_parallel_dygraph_sharding_parallel_chunkoffload.py b/test/collective/fleet/test_parallel_dygraph_sharding_parallel_chunkoffload.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+from legacy_test.test_parallel_dygraph_dataparallel import (
+    TestMultipleAccelerators,
+)
+
+
+class TestHybridParallelShardingV2ChunkOffload(TestMultipleAccelerators):
+    # check sharding logic as well as the accuracy with single mode
+    def test_hybrid_parallel_sharding_v2_chunk_offload(self):
+        # test sharding v2 chunk offload
+        os.environ["FLAGS_shard_split_param"] = "1"
+        self.run_mnist_2accelerators(
+            'hybrid_parallel_sharding_model_chunkoffload.py'
+        )
+
+
+if __name__ == "__main__":  # run the launcher test when this file is executed as a script
+    unittest.main()
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
@@ -519,6 +519,7 @@
     'test_new_group',
     'test_imperative_signal_handler',
     'test_parallel_dygraph_sharding_parallel',
+    'test_parallel_dygraph_sharding_parallel_chunkoffload',
     'test_dist_hapi_model',
     'test_dist_mnist_gradient_merge',
     'test_rnn_dp',