@@ -16,8 +16,10 @@
from typing import Dict, Tuple

# Third Party
from accelerate.logging import get_logger
from fms_acceleration import AccelerationPlugin
from peft import LoraConfig
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
from transformers import TrainingArguments
import torch

@@ -28,6 +30,8 @@
prepare_scattermoe,
)

logger = get_logger(__name__)


# pylint: disable=too-many-instance-attributes
class ScatterMoEAccelerationPlugin(AccelerationPlugin):
@@ -124,6 +128,18 @@ def get_callbacks_and_ready_for_train(
if layer.__class__.__name__ in _layers
]

# the plugin saves checkpoints as DTensors and only supports sharded
# state dicts, so override the FSDP setting if it is configured otherwise
if (
accelerator.state.fsdp_plugin.state_dict_type
!= StateDictType.SHARDED_STATE_DICT
):
accelerator.state.fsdp_plugin.state_dict_type = (
StateDictType.SHARDED_STATE_DICT
)
logger.warning(
"Overriding FSDP plugin state_dict_type to "
f"{StateDictType.SHARDED_STATE_DICT}, "
f"since the plugin does not support {StateDictType.FULL_STATE_DICT}"
)
# call this to patch the HF save and load functions to be able
# to save DTensors properly
patch_huggingface_save_and_load_for_dtensors()
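
For reference, the override added in this hunk amounts to the standalone check below against Accelerate's FSDP plugin state. This is an illustrative sketch, not code from the PR: the helper name force_sharded_state_dict is hypothetical, and an Accelerator already configured with an FSDP plugin is assumed.

# Illustrative sketch: force sharded checkpoints on an Accelerate-managed FSDP
# run, mirroring the check introduced in this PR.
from accelerate.logging import get_logger
from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType

logger = get_logger(__name__)

def force_sharded_state_dict(accelerator):
    # assumes accelerator.state.fsdp_plugin is populated, i.e. FSDP is in use
    fsdp_plugin = accelerator.state.fsdp_plugin
    if fsdp_plugin.state_dict_type != StateDictType.SHARDED_STATE_DICT:
        fsdp_plugin.state_dict_type = StateDictType.SHARDED_STATE_DICT
        logger.warning(
            "Overriding FSDP plugin state_dict_type to "
            f"{StateDictType.SHARDED_STATE_DICT}, "
            f"since the plugin does not support {StateDictType.FULL_STATE_DICT}"
        )

Setting fsdp_state_dict_type to SHARDED_STATE_DICT in the accelerate launch config keeps the plugin state consistent up front, so this override (and its warning) is never triggered.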