Commit e216358

chore: remove unrelated changes from distributed_data_parallel_config.py
Restore the file to match upstream main exactly, removing:
- Unused typing imports (Dict, List, Tuple)
- fsdp_db_use_persist_buf_on_alloc_fail field
- fsdp_manual_registration field
- Docstring indentation changes

These were unrelated to NTP and came from other branches.
1 parent: 51d98b2

File tree

1 file changed (+2, -19)
megatron/core/distributed/distributed_data_parallel_config.py

Lines changed: 2 additions & 19 deletions
--- a/megatron/core/distributed/distributed_data_parallel_config.py
+++ b/megatron/core/distributed/distributed_data_parallel_config.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Optional



 @dataclass
@@ -122,16 +122,7 @@ class DistributedDataParallelConfig:
     This option will cause additional memory overhead, however, it is necessary for
     to register user buffer (nccl_ub=True) for the Megatron FSDP.
     This option will be automatically set to True when nccl_ub=True.
-    """
-
-    fsdp_db_use_persist_buf_on_alloc_fail: bool = False
-    """Whether to fall back to persistent buffer when a bucket does not
-    fit FSDP double buffer size. If true, FSDP will use the persistently
-    allocated buffer for the bucket that does not fit, it will enable NCCL
-    user buffer with the cost of more memory usage. If false, FSDP will use
-    Dynamic memory allocator, NCCL user buffer won't not enabled, which
-    usually leads to low performance.
-    """
+    """

     outer_dp_sharding_strategy: str = 'no_shard'
     """
@@ -146,14 +137,6 @@ class DistributedDataParallelConfig:
     when nccl_ub is set.
     """

-    fsdp_manual_registration: bool = False
-    """If true, manually register the FSDP communication buffers to NCCL user buffer.
-    This option is only effective when use_megatron_fsdp and nccl_ub is set.
-    For symmetric registration with large models, the registration itself can take
-    a significant amount of time. This option minimizes the number of registration calls
-    to minimize the registration time.
-    """
-
     delay_wgrad_compute: bool = False
     """Delay the weight gradient computation to improve batch-level communication overlapping"""

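The net effect of the revert can be sketched with a minimal stand-in for the dataclass. This is illustrative only, assuming a trimmed-down version of the class: the real upstream DistributedDataParallelConfig carries many more fields, and `bucket_size` is included here just to motivate keeping the `Optional` import. The point is that after the revert only `Optional` remains imported from typing, and the two removed fields no longer exist on the config.

```python
from dataclasses import dataclass, fields
from typing import Optional

# Hypothetical, trimmed-down sketch of the config after this commit;
# not the full upstream class. The two reverted fields
# (fsdp_db_use_persist_buf_on_alloc_fail, fsdp_manual_registration)
# are gone, and only Optional is still needed from typing.
@dataclass
class DistributedDataParallelConfig:
    bucket_size: Optional[int] = None          # illustrative Optional-typed field
    outer_dp_sharding_strategy: str = 'no_shard'
    delay_wgrad_compute: bool = False

cfg = DistributedDataParallelConfig()
field_names = {f.name for f in fields(cfg)}
print('fsdp_manual_registration' in field_names)  # False: field was reverted
print(cfg.outer_dp_sharding_strategy)             # no_shard
```

Instantiating the config with defaults shows the reverted fields are absent, which is exactly what "match upstream main exactly" means for downstream code that introspects the dataclass.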