From a2118b69b50e3f0838275a6b073f8f9ea9bb4559 Mon Sep 17 00:00:00 2001
From: Aaron Zheng
Date: Sun, 2 Nov 2025 22:11:18 +0000
Subject: [PATCH 1/3] deepspeed

---
 .../scripts/convert_deepspeed_to_hf.py        | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 skyrl-train/scripts/convert_deepspeed_to_hf.py

diff --git a/skyrl-train/scripts/convert_deepspeed_to_hf.py b/skyrl-train/scripts/convert_deepspeed_to_hf.py
new file mode 100644
index 000000000..79b23bab6
--- /dev/null
+++ b/skyrl-train/scripts/convert_deepspeed_to_hf.py
@@ -0,0 +1,154 @@
+"""
+Systematic converter: DeepSpeed ZeRO checkpoint → Hugging Face safetensors model.
+
+Assumptions:
+- You have a structure like:
+  data.pt
+  trainer_state.pt
+  policy/
+  ├── global_step_x/
+  │   ├── zero_pp_rank_0_mp_rank_00_model_states.pt
+  │   └── zero_pp_rank_0_mp_rank_00_optim_states.pt
+  ├── huggingface/
+  │   └── config.json, tokenizer.json, etc.
+  ├── zero_to_fp32.py
+  └── latest
+
+
+Output:
+  policy/huggingface_converted/model.safetensors (+ copied config/tokenizer)
+
+For DeepSpeed model shards, the output directory will be created with the following structure:
+.
+├── added_tokens.json
+├── chat_template.jinja (optional: this file is for chat-specific tasks)
+├── config.json
+├── generation_config.json (optional: default decoding parameters)
+├── merges.txt
+├── model.safetensors
+├── special_tokens_map.json
+├── tokenizer.json
+├── tokenizer_config.json
+└── vocab.json
+
+Example usage:
+uv run --isolated --frozen --extra vllm scripts/convert_deepspeed_to_hf.py --ckpt-dir [local_checkpoint] --out-dir [output_directory]
+"""
+
+import json
+import shutil
+import os
+import subprocess
+import argparse
+import torch
+from pathlib import Path
+from safetensors.torch import save_model
+from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForSeq2SeqLM, AutoModel
+
+# === Directories ===
+def main(deepspeed_model_path: Path, out_dir:Path = None) -> Path:
+    ROOT = deepspeed_model_path
+    POLICY_DIR = ROOT / "policy"
+    HF_BASE = POLICY_DIR / "huggingface"
+    OUT_DIR = POLICY_DIR / "huggingface_converted" if not out_dir else out_dir
+    MERGED_FP32 = OUT_DIR / "merged_model" # directory that will store the merged FP32 PyTorch weights.
+
+    OUT_DIR.mkdir(exist_ok=True, parents=True)
+
+    # === 1. Merge ZeRO shards into single FP32 checkpoint ===
+    zero2fp32_script = POLICY_DIR / "zero_to_fp32.py"
+
+    if not MERGED_FP32.exists():
+        print(f"[1/5] Merging ZeRO shards from {POLICY_DIR} ...")
+        cmd = f"python {zero2fp32_script} {POLICY_DIR} {MERGED_FP32}"
+        result = subprocess.run(cmd)
+        if result.returncode != 0:
+            raise RuntimeError("zero_to_fp32.py merge failed.")
+    else:
+        print(f"[1/5] Merged model already exists → {MERGED_FP32}")
+
+    # === 2. Load merged state dict ===
+    print("[2/5] Loading merged model ...")
+    state = torch.load(MERGED_FP32 / "pytorch_model.bin", map_location="cpu")
+
+    # Handle possible wrapper keys
+    if isinstance(state, dict):
+        for key in ["module", "model_state_dict", "state_dict"]:
+            if key in state:
+                state = state[key]
+                break
+
+    merged_bin = MERGED_FP32 / "pytorch_model.bin"
+    hf_model_bin = HF_BASE / "pytorch_model.bin"
+    shutil.copy2(merged_bin, hf_model_bin)
+    print(f"   Copied to: {hf_model_bin}")
+
+    # === 3. Load HF config and initialize model ===
+    print("[3/5] Initializing Hugging Face model ...")
+    model = AutoModelForCausalLM.from_pretrained(HF_BASE, torch_dtype=torch.float16)
+    missing, unexpected = model.load_state_dict(state, strict=False)
+    print(f"   → Missing keys: {len(missing)}, Unexpected keys: {len(unexpected)}")
+
+    # === 4. Save to safetensors ===
+    print("[4/5] Saving model.safetensors ...")
+    save_model(model, str(OUT_DIR / "model.safetensors"), metadata={"format": "pt"})
+
+    # === 5. Copy tokenizer + config files ===
+    print("[5/5] Copying tokenizer/config files ...")
+    for fname in os.listdir(HF_BASE):
+        if fname.endswith((".json", ".txt", ".jinja")):
+            shutil.copy(HF_BASE / fname, OUT_DIR / fname)
+
+    # === Summary ===
+    print("\n✅ Conversion complete!")
+    print(f"→ Hugging Face safetensors model located at: {OUT_DIR.resolve()}")
+    print(f"→ Load it via:\n\n"
+        f"from transformers import AutoModelForCausalLM, AutoTokenizer\n"
+        f"model = AutoModelForCausalLM.from_pretrained('{OUT_DIR}')\n"
+        f"tokenizer = AutoTokenizer.from_pretrained('{OUT_DIR}')\n")
+    return Path(OUT_DIR)

+def guess_hf_class(cfg: AutoConfig):
+    """
+    Tries to find a reasonable HF class from the config.
+    Falls back to AutoModel if an LM head can't be detected.
+    """
+    if getattr(cfg, "is_encoder_decoder", False):
+        return AutoModelForSeq2SeqLM
+    archs = getattr(cfg, "architectures", []) or []
+    if any(a.endswith("ForCausalLM") for a in archs):
+        return AutoModelForCausalLM
+    decoders = {"gpt2", "gpt_bigcode", "llama", "mistral", "qwen", "qwen2", "internlm", "mpt", "phi", "falcon"}
+    if getattr(cfg, "model_type", "") in decoders:
+        return AutoModelForCausalLM
+    return AutoModel

+def validate_load(out_dir: Path):
+    """
+    Optional: sanity-load with HF to ensure the saved safetensors is consumable.
+    Loads on the CPU to avoid device/dtype quirks that can surface when loading directly onto a GPU.
+    """
+    try:
+        cfg = AutoConfig.from_pretrained(out_dir, local_files_only=True, trust_remote_code=True)
+        HFClass = guess_hf_class(cfg)
+        _ = HFClass.from_pretrained(
+            out_dir, local_files_only=True, device_map=None, dtype="auto", trust_remote_code=True
+        )
+        print("[validate] HF Load OK")
+    except Exception as e:
+        print(f"[validate][error] HF Load failed: {e}")
+        raise RuntimeError("HF Load failed")

+if __name__ == "__main__":
+    ap = argparse.ArgumentParser(description="Convert DeepSpeed checkpoint shards to a Hugging Face safetensors model.")
+    ap.add_argument(
+        "--ckpt-dir", type=str, required=True, help="Path to the checkpoint directory containing the trainer_state.pt file"
+    )
+    ap.add_argument("--out-dir", type=str, default=None, help="Output directory for the HF model")
+    ap.add_argument("--validate-load", action="store_true", help="Try loading the saved model with Transformers after saving")
+    args = ap.parse_args()
+    ckpt_dir = Path(args.ckpt_dir).resolve()
+    output_dir = Path(args.out_dir).resolve()
+    out_path = main(ckpt_dir, output_dir)
+    if args.validate_load:
+        validate_load(out_path)
\ No newline at end of file
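
Note on PATCH 1/3, step 2: the loop over ["module", "model_state_dict", "state_dict"] unwraps checkpoints that nest the weights under a container key, but some ZeRO merges instead prefix every tensor name (e.g. "module.model.embed_tokens.weight"). A minimal sketch of that extra normalization, assuming such a checkpoint; the helper name and the "module." prefix are illustrative and not part of the patch:

import torch

def strip_wrapper_prefix(state: dict, prefix: str = "module.") -> dict:
    # Drop the wrapper prefix from every key so names line up with the HF
    # module tree; a no-op when no key carries the prefix.
    if not any(k.startswith(prefix) for k in state):
        return state
    return {k[len(prefix):]: v for k, v in state.items()}

state = torch.load("merged_model/pytorch_model.bin", map_location="cpu")
state = strip_wrapper_prefix(state)

Normalizing keys this way keeps the missing/unexpected counts reported by model.load_state_dict(state, strict=False) meaningful.
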
From d9c16edc39e47ff10cbbd3e95d8f45cb33453871 Mon Sep 17 00:00:00 2001
From: Aaron Zheng
Date: Sun, 2 Nov 2025 22:12:18 +0000
Subject: [PATCH 2/3] formatted

---
 .../scripts/convert_deepspeed_to_hf.py        | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/skyrl-train/scripts/convert_deepspeed_to_hf.py b/skyrl-train/scripts/convert_deepspeed_to_hf.py
index 79b23bab6..7a2a4cdc0 100644
--- a/skyrl-train/scripts/convert_deepspeed_to_hf.py
+++ b/skyrl-train/scripts/convert_deepspeed_to_hf.py
@@ -35,7 +35,6 @@
 uv run --isolated --frozen --extra vllm scripts/convert_deepspeed_to_hf.py --ckpt-dir [local_checkpoint] --out-dir [output_directory]
 """
 
-import json
 import shutil
 import os
 import subprocess
@@ -45,13 +44,14 @@
 from safetensors.torch import save_model
 from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForSeq2SeqLM, AutoModel
 
+
 # === Directories ===
-def main(deepspeed_model_path: Path, out_dir:Path = None) -> Path:
+def main(deepspeed_model_path: Path, out_dir: Path = None) -> Path:
     ROOT = deepspeed_model_path
     POLICY_DIR = ROOT / "policy"
     HF_BASE = POLICY_DIR / "huggingface"
     OUT_DIR = POLICY_DIR / "huggingface_converted" if not out_dir else out_dir
-    MERGED_FP32 = OUT_DIR / "merged_model" # directory that will store the merged FP32 PyTorch weights.
+    MERGED_FP32 = OUT_DIR / "merged_model"  # directory that will store the merged FP32 PyTorch weights.
 
     OUT_DIR.mkdir(exist_ok=True, parents=True)
 
@@ -102,12 +102,15 @@ def main(deepspeed_model_path: Path, out_dir: Path = None) -> Path:
     # === Summary ===
     print("\n✅ Conversion complete!")
     print(f"→ Hugging Face safetensors model located at: {OUT_DIR.resolve()}")
-    print(f"→ Load it via:\n\n"
+    print(
+        f"→ Load it via:\n\n"
         f"from transformers import AutoModelForCausalLM, AutoTokenizer\n"
         f"model = AutoModelForCausalLM.from_pretrained('{OUT_DIR}')\n"
-        f"tokenizer = AutoTokenizer.from_pretrained('{OUT_DIR}')\n")
+        f"tokenizer = AutoTokenizer.from_pretrained('{OUT_DIR}')\n"
+    )
     return Path(OUT_DIR)
 
+
 def guess_hf_class(cfg: AutoConfig):
     """
     Tries to find a reasonable HF class from the config.
@@ -123,6 +126,7 @@ def guess_hf_class(cfg: AutoConfig):
         return AutoModelForCausalLM
     return AutoModel
 
+
 def validate_load(out_dir: Path):
     """
     Optional: sanity-load with HF to ensure the saved safetensors is consumable.
@@ -138,17 +142,23 @@ def validate_load(out_dir: Path):
     except Exception as e:
         print(f"[validate][error] HF Load failed: {e}")
         raise RuntimeError("HF Load failed")
-    
+
+
 if __name__ == "__main__":
     ap = argparse.ArgumentParser(description="Convert DeepSpeed checkpoint shards to a Hugging Face safetensors model.")
     ap.add_argument(
-        "--ckpt-dir", type=str, required=True, help="Path to the checkpoint directory containing the trainer_state.pt file"
+        "--ckpt-dir",
+        type=str,
+        required=True,
+        help="Path to the checkpoint directory containing the trainer_state.pt file",
     )
     ap.add_argument("--out-dir", type=str, default=None, help="Output directory for the HF model")
-    ap.add_argument("--validate-load", action="store_true", help="Try loading the saved model with Transformers after saving")
+    ap.add_argument(
+        "--validate-load", action="store_true", help="Try loading the saved model with Transformers after saving"
+    )
     args = ap.parse_args()
     ckpt_dir = Path(args.ckpt_dir).resolve()
     output_dir = Path(args.out_dir).resolve()
     out_path = main(ckpt_dir, output_dir)
     if args.validate_load:
-        validate_load(out_path)
\ No newline at end of file
+        validate_load(out_path)
From 7b0334d26e71ca196afb03379961cc7b6d43635a Mon Sep 17 00:00:00 2001
From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com>
Date: Thu, 6 Nov 2025 22:04:52 -0800
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 skyrl-train/scripts/convert_deepspeed_to_hf.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/skyrl-train/scripts/convert_deepspeed_to_hf.py b/skyrl-train/scripts/convert_deepspeed_to_hf.py
index 7a2a4cdc0..2fedf4dfc 100644
--- a/skyrl-train/scripts/convert_deepspeed_to_hf.py
+++ b/skyrl-train/scripts/convert_deepspeed_to_hf.py
@@ -57,12 +57,15 @@ def main(deepspeed_model_path: Path, out_dir: Path = None) -> Path:
 
     # === 1. Merge ZeRO shards into single FP32 checkpoint ===
     zero2fp32_script = POLICY_DIR / "zero_to_fp32.py"
+    if not zero2fp32_script.exists():
+        raise FileNotFoundError(f"Conversion script not found at {zero2fp32_script}")
 
     if not MERGED_FP32.exists():
         print(f"[1/5] Merging ZeRO shards from {POLICY_DIR} ...")
-        cmd = f"python {zero2fp32_script} {POLICY_DIR} {MERGED_FP32}"
-        result = subprocess.run(cmd)
+        cmd = ["python", str(zero2fp32_script), str(POLICY_DIR), str(MERGED_FP32)]
+        result = subprocess.run(cmd, capture_output=True, text=True)
         if result.returncode != 0:
+            print(f"Error running zero_to_fp32.py:\n{result.stderr}")
             raise RuntimeError("zero_to_fp32.py merge failed.")
     else:
         print(f"[1/5] Merged model already exists → {MERGED_FP32}")
@@ -85,7 +88,7 @@ def main(deepspeed_model_path: Path, out_dir: Path = None) -> Path:
 
     # === 3. Load HF config and initialize model ===
     print("[3/5] Initializing Hugging Face model ...")
-    model = AutoModelForCausalLM.from_pretrained(HF_BASE, torch_dtype=torch.float16)
+    model = AutoModelForCausalLM.from_pretrained(HF_BASE, torch_dtype=torch.bfloat16)
     missing, unexpected = model.load_state_dict(state, strict=False)
     print(f"   → Missing keys: {len(missing)}, Unexpected keys: {len(unexpected)}")
 
@@ -158,7 +161,7 @@ def validate_load(out_dir: Path):
     )
     args = ap.parse_args()
     ckpt_dir = Path(args.ckpt_dir).resolve()
-    output_dir = Path(args.out_dir).resolve()
+    output_dir = Path(args.out_dir).resolve() if args.out_dir is not None else None
     out_path = main(ckpt_dir, output_dir)
     if args.validate_load:
         validate_load(out_path)
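
Note on PATCH 3/3: the review hardens the subprocess call into zero_to_fp32.py. When deepspeed itself is importable in the conversion environment, the same ZeRO merge can also run in-process — a minimal sketch, assuming a DeepSpeed release that ships this helper under deepspeed.utils.zero_to_fp32:

from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# POLICY_DIR is the same checkpoint directory the script hands to
# zero_to_fp32.py; the tag (e.g. "global_step_x") is read from the
# `latest` file when not passed explicitly.
state = get_fp32_state_dict_from_zero_checkpoint(str(POLICY_DIR))

This yields the merged FP32 state dict directly, so the intermediate pytorch_model.bin never needs to touch disk before step 3.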
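
Independent of --validate-load, a converted export can be spot-checked against the merged FP32 weights without instantiating the model — a small sketch, with paths written out for illustration (they follow the layout in the module docstring):

import torch
from safetensors import safe_open

merged = torch.load("policy/huggingface_converted/merged_model/pytorch_model.bin", map_location="cpu")
if isinstance(merged, dict) and "module" in merged:
    merged = merged["module"]  # unwrap a container key, as the script does

with safe_open("policy/huggingface_converted/model.safetensors", framework="pt", device="cpu") as f:
    for name in list(f.keys())[:5]:
        if name in merged:  # keys flagged missing/unexpected are skipped
            # The export is bf16 and the merge fp32, so compare in fp32 with a
            # tolerance on the order of bf16 rounding error.
            assert torch.allclose(f.get_tensor(name).float(), merged[name].float(), atol=1e-2), name
print("spot check passed")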