 from utils import get_timers, set_timers
 from types import MethodType
 from paddle import _C_ops
-from paddle.fluid import core
-from paddle.fluid.dygraph import to_variable
+from paddle.framework import core
 import paddle.distributed as dist
 from framework import assign_group_by_size, flatten_dense_tensors, obtain_storage, AdamW, group_sharded_parallel
 from paddle.incubate.distributed.models import moe
-from paddle.fluid.framework import in_dygraph_mode
 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler

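Note: a minimal sanity check for the new import path, assuming Paddle >= 2.0. paddle.framework.core exposes the same core module that the paddle.fluid path did, so the VarDesc.VarType enums used later in unscale_method stay available without any fluid import:

# Sketch (assumption: Paddle >= 2.0): the public framework package re-exports core,
# so dtype enums no longer require paddle.fluid.
import paddle
from paddle.framework import core

print(core.VarDesc.VarType.FP32)  # the dtype enum consumed below in unscale_method
print(paddle.in_dynamic_mode())   # dynamic (dygraph) mode is the default in Paddle 2.x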
@@ -179,8 +177,8 @@ def unscale_method(self, optimizer):
         if (param._grad_ivar() is not None) and (
             param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
     ]
-    temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
-    temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
+    temp_found_inf_fp16 = paddle.to_tensor(np.array([0]).astype(np.bool))
+    temp_found_inf_fp32 = paddle.to_tensor(np.array([0]).astype(np.bool))

     if len(param_grads_fp16):
         _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
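Note: a small sketch of the replacement call, assuming the same Paddle 2.x environment as this PR. paddle.to_tensor takes over from the removed fluid to_variable helper; separately, np.bool is a deprecated alias since NumPy 1.20, so plain bool (or np.bool_) is the safer dtype for the found-inf flags:

# Sketch: build the bool found-inf flags without paddle.fluid.dygraph.to_variable.
# bool replaces the deprecated np.bool alias; the resulting tensor dtype is paddle.bool.
import numpy as np
import paddle

temp_found_inf_fp16 = paddle.to_tensor(np.array([0]).astype(bool))
temp_found_inf_fp32 = paddle.to_tensor(np.array([0]).astype(bool))
print(temp_found_inf_fp16.dtype)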
@@ -443,7 +441,7 @@ def do_train(args):
             scaler = fleet.distributed_scaler(scaler)
             scaler._unscale = MethodType(unscale_method, scaler)
         else:
-            wrap_scale_func = GroupShardedScaler if in_dygraph_mode(
+            wrap_scale_func = GroupShardedScaler if paddle.in_dynamic_mode(
             ) else ShardingScaler
             scaler = wrap_scale_func(scaler)

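Note: a hedged sketch of the scaler selection, assuming the Paddle version targeted by this PR. paddle.in_dynamic_mode() is the public replacement for the private fluid in_dygraph_mode() check; it picks GroupShardedScaler when running in dynamic mode and falls back to ShardingScaler otherwise. The init_loss_scaling value here is illustrative only:

# Sketch: wrap an AMP GradScaler with the sharding-aware scaler for the current mode.
import paddle
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler

scaler = paddle.amp.GradScaler(init_loss_scaling=32768)  # illustrative loss-scaling value
wrap_scale_func = GroupShardedScaler if paddle.in_dynamic_mode() else ShardingScaler
scaler = wrap_scale_func(scaler)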