diff --git a/torchtitan/models/deepseek_v3/README.md b/torchtitan/models/deepseek_v3/README.md index 085403d47..92dd7bab7 100644 --- a/torchtitan/models/deepseek_v3/README.md +++ b/torchtitan/models/deepseek_v3/README.md @@ -61,6 +61,7 @@ python scripts/checkpoint_conversion/convert_from_hf.py dict[str, Any]: new_key = new_abstract_key.format(layer_num, expert_num) hf_state_dict[new_key] = split_values[expert_num].squeeze() + # Remove the GroupedExperts' weight from the state_dict to free memory + del value + elif "layers" in key: abstract_key = re.sub(r"(\d+)", "{}", key, count=1) layer_num = re.search(r"\d+", key).group(0)