
Commit 2a1fe91

[docker] trim megatron and sglang patch (#552)
1 parent a5cadbd commit 2a1fe91

File tree

10 files changed: +439, -687 lines


docker/patch/latest/megatron.patch

Lines changed: 14 additions & 314 deletions
Large diffs are not rendered by default.

docker/patch/latest/sglang.patch

Lines changed: 21 additions & 308 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,6 @@
 accelerate
 datasets
+deepspeed
 httpx[http2]
 mcp[cli]
 pillow
@@ -10,4 +11,3 @@ tensorboard
 torch
 transformers
 wandb
-deepspeed

slime/backends/fsdp_utils/actor.py

Lines changed: 15 additions & 14 deletions
@@ -86,16 +86,16 @@ def init(self, args: Namespace, role: str, wandb_run_id: str, with_ref: bool = F
 
         if args.optimizer == "deepspeed_cpu_adam":
             optimizer_config = {
-                'lr': args.lr,
-                'betas': (args.adam_beta1, args.adam_beta2),
-                'eps': args.adam_eps,
-                'weight_decay': args.weight_decay,
-                'adamw_mode': True, # Use AdamW mode (decoupled weight decay)
-                'fp32_optimizer_states': True, # Keep optimizer states in FP32
+                "lr": args.lr,
+                "betas": (args.adam_beta1, args.adam_beta2),
+                "eps": args.adam_eps,
+                "weight_decay": args.weight_decay,
+                "adamw_mode": True,  # Use AdamW mode (decoupled weight decay)
+                "fp32_optimizer_states": True,  # Keep optimizer states in FP32
             }
-
+
             self.optimizer = FSDPCPUAdamWrapper(optimizer_config, self.model)
-
+
         elif args.optimizer == "adam":
             self.optimizer = torch.optim.AdamW(
                 self.model.parameters(),
@@ -104,9 +104,11 @@ def init(self, args: Namespace, role: str, wandb_run_id: str, with_ref: bool = F
                 eps=args.adam_eps,
                 weight_decay=args.weight_decay,
             )
-
+
         else:
-            raise ValueError(f"Unsupported optimizer: {args.optimizer}. Supported options: 'adam', 'deepspeed_cpu_adam'")
+            raise ValueError(
+                f"Unsupported optimizer: {args.optimizer}. Supported options: 'adam', 'deepspeed_cpu_adam'"
+            )
 
         # TODO: load
 
@@ -149,7 +151,7 @@ def sleep(self, tags: str | Iterable[str] | None) -> None:
 
         if isinstance(tags, str):
             tags = (tags,)
-
+
         if torch_memory_saver is not None:
             torch_memory_saver.pause()
 
@@ -164,10 +166,10 @@ def wake_up(self, tags: str | Iterable[str] | None) -> None:
         """
         if not getattr(self.args, "offload", False):
             return
-
+
         if isinstance(tags, str):
             tags = (tags,)
-
+
         if torch_memory_saver is not None:
             torch_memory_saver.resume()
 
@@ -555,7 +557,6 @@ def update_weights(self) -> None:  # type: ignore[override]
         self.weight_updater.connect_rollout_engines(rollout_engines, rollout_engine_lock)
         dist.barrier(group=get_gloo_group())
 
-
         with torch_memory_saver.disable() if self.args.offload and not torch.version.hip else nullcontext():
             self.weight_updater.update_weights()
 
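For orientation, a minimal sketch (not repo code) of what this optimizer selection does, assuming an argparse-style namespace with the attribute names the diff uses (`optimizer`, `lr`, `adam_beta1`, `adam_beta2`, `adam_eps`, `weight_decay`). The hyperparameter values below are made up, and the `deepspeed_cpu_adam` branch is stubbed out since it needs DeepSpeed plus the wrapper defined in the next file:

```python
from argparse import Namespace

import torch
import torch.nn as nn

# Hypothetical values; only the attribute names come from the diff above.
args = Namespace(
    optimizer="adam",  # "deepspeed_cpu_adam" would take the FSDPCPUAdamWrapper path instead
    lr=1e-5,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_eps=1e-8,
    weight_decay=0.1,
)
model = nn.Linear(8, 8)

if args.optimizer == "adam":
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.lr,
        betas=(args.adam_beta1, args.adam_beta2),
        eps=args.adam_eps,
        weight_decay=args.weight_decay,
    )
elif args.optimizer == "deepspeed_cpu_adam":
    # The real branch builds the optimizer_config dict shown above and hands it,
    # together with the FSDP-wrapped model, to FSDPCPUAdamWrapper (next file).
    raise NotImplementedError("requires DeepSpeed and the wrapper below")
else:
    raise ValueError(
        f"Unsupported optimizer: {args.optimizer}. Supported options: 'adam', 'deepspeed_cpu_adam'"
    )
```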

Lines changed: 36 additions & 35 deletions
@@ -1,64 +1,63 @@
-from typing import Dict, List, Any
+from typing import Any, Dict, List
 
 import torch
-import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.tensor import DTensor
 from deepspeed.ops.adam import DeepSpeedCPUAdam
+from torch.distributed.tensor import DTensor
 
 
 class FSDPCPUAdamWrapper:
     """
     Wrapper for DeepSpeedCPUAdam to work with FSDP models where parameters are on GPU.
-
+
     DeepSpeedCPUAdam requires both parameters and gradients to be on CPU. This wrapper:
     1. Maintains CPU shadow copies of GPU parameters (contiguous, proper dtype)
     2. Copies gradients from GPU to CPU before optimizer step (contiguous)
     3. Runs optimizer update on CPU
     4. Copies updated parameters back to GPU
-
+
     Following the parameter copy pattern from update_weight_utils.py
     """
-
+
     def __init__(self, optimizer_config: Dict[str, Any], model: nn.Module) -> None:
         self.model: nn.Module = model
         self.gpu_params: List[nn.Parameter] = list(model.parameters())
         self.optimizer_config: Dict[str, Any] = optimizer_config
         self.cpu_params: List[torch.Tensor] = []
         self.cpu_optimizer: DeepSpeedCPUAdam
-
+
         # Create CPU shadow copies of parameters using the pattern from update_weight_utils.py
         # Store only the LOCAL SHARD for each rank, not the full tensor
         for gpu_param in self.gpu_params:
             param_data = gpu_param.detach()
             if isinstance(param_data, DTensor):
                 param_data = param_data.to_local()
-
-            cpu_param = param_data.contiguous().to(device='cpu', dtype=torch.float32, non_blocking=True)
+
+            cpu_param = param_data.contiguous().to(device="cpu", dtype=torch.float32, non_blocking=True)
             cpu_param.requires_grad_(True)
-
+
             assert cpu_param.is_contiguous(), f"CPU param must be contiguous for AVX"
             assert cpu_param.dtype == torch.float32, f"CPU param must be FP32 for DeepSpeed"
-
+
             self.cpu_params.append(cpu_param)
-
+
         torch.cuda.synchronize()
 
         self.cpu_optimizer = DeepSpeedCPUAdam(
             self.cpu_params,
-            lr=self.optimizer_config['lr'],
-            betas=self.optimizer_config['betas'],
-            eps=self.optimizer_config['eps'],
-            weight_decay=self.optimizer_config['weight_decay'],
-            adamw_mode=self.optimizer_config['adamw_mode'],
-            fp32_optimizer_states=self.optimizer_config['fp32_optimizer_states'],
+            lr=self.optimizer_config["lr"],
+            betas=self.optimizer_config["betas"],
+            eps=self.optimizer_config["eps"],
+            weight_decay=self.optimizer_config["weight_decay"],
+            adamw_mode=self.optimizer_config["adamw_mode"],
+            fp32_optimizer_states=self.optimizer_config["fp32_optimizer_states"],
         )
-
+
         self.param_groups = self.cpu_optimizer.param_groups
-
+
     def zero_grad(self, set_to_none: bool = True) -> None:
         """Zero gradients on GPU parameters.
-
+
         Args:
             set_to_none: If True, set gradients to None; otherwise zero them.
         """
@@ -67,15 +66,15 @@ def zero_grad(self, set_to_none: bool = True) -> None:
                 param.grad = None
             elif param.grad is not None:
                 param.grad.zero_()
-
+
     def step(self) -> None:
         """Perform optimizer step.
-
+
         Steps:
         1. Copy gradients from GPU to CPU (handling DTensor, ensuring contiguous FP32)
         2. Run optimizer update on CPU
         3. Copy updated parameters back to GPU
-
+
         Uses the same .to() pattern as update_weight_utils.py for proper memory layout.
         """
         # Copy gradients from GPU to CPU - handle DTensor and ensure FP32 for DeepSpeed AVX
@@ -85,29 +84,31 @@ def step(self) -> None:
                 grad_data = gpu_param.grad.detach()
                 if isinstance(grad_data, DTensor):
                     grad_data = grad_data.to_local()
-
+
                 # DeepSpeed's AVX operations expect FP32 gradients to match FP32 params
-                cpu_grad = grad_data.contiguous().to(device='cpu', dtype=torch.float32, non_blocking=True)
-
+                cpu_grad = grad_data.contiguous().to(device="cpu", dtype=torch.float32, non_blocking=True)
+
                 # Verify gradient properties for DeepSpeed AVX
                 assert cpu_grad.is_contiguous(), "CPU gradient must be contiguous for AVX"
                 assert cpu_grad.dtype == torch.float32, "CPU gradient must be FP32 for DeepSpeed"
-
+
                 cpu_param.grad = cpu_grad
             else:
                 cpu_param.grad = None
-
+
         torch.cuda.synchronize()
-
+
         # Run optimizer step on CPU
         self.cpu_optimizer.step()
-
+
         for gpu_param, cpu_param in zip(self.gpu_params, self.cpu_params):
-            updated_param = cpu_param.data.to(device=torch.cuda.current_device(), dtype=gpu_param.dtype, non_blocking=True)
-
+            updated_param = cpu_param.data.to(
+                device=torch.cuda.current_device(), dtype=gpu_param.dtype, non_blocking=True
+            )
+
             if isinstance(gpu_param.data, DTensor):
                 gpu_param.data.to_local().copy_(updated_param, non_blocking=True)
            else:
                 gpu_param.data.copy_(updated_param, non_blocking=True)
-
-        torch.cuda.synchronize()
+
+        torch.cuda.synchronize()
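The docstring above already lists the four steps; the following self-contained sketch walks the same shadow-copy pattern with `torch.optim.AdamW` standing in for `DeepSpeedCPUAdam` (so it runs without DeepSpeed) and without the DTensor local-shard handling the real wrapper performs:

```python
import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(16, 4).to(device)

# 1. FP32, contiguous CPU shadow copies of the (possibly GPU) parameters.
#    copy=True guarantees a real copy even when the model is already CPU/FP32.
cpu_params = [
    p.detach().contiguous().to(device="cpu", dtype=torch.float32, copy=True).requires_grad_(True)
    for p in model.parameters()
]
cpu_opt = torch.optim.AdamW(cpu_params, lr=1e-3, weight_decay=0.1)  # stand-in for DeepSpeedCPUAdam

# One training step on the model itself.
loss = model(torch.randn(2, 16, device=device)).sum()
loss.backward()

# 2. Copy gradients to the CPU shadows (contiguous FP32), 3. step on CPU, ...
for gpu_p, cpu_p in zip(model.parameters(), cpu_params):
    cpu_p.grad = gpu_p.grad.detach().contiguous().to(device="cpu", dtype=torch.float32)
cpu_opt.step()
cpu_opt.zero_grad(set_to_none=True)

# 4. ... then copy the updated values back into the original parameters.
for gpu_p, cpu_p in zip(model.parameters(), cpu_params):
    gpu_p.data.copy_(cpu_p.data.to(device=gpu_p.device, dtype=gpu_p.dtype))
```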

slime/backends/fsdp_utils/update_weight_utils.py

Lines changed: 4 additions & 4 deletions
@@ -152,14 +152,14 @@ def update_weights(self) -> None:
 
         if self.full_params:
             print("Using FULL_STATE_DICT path with loading from CPU storage")
-
+
             # Load all parameters from CPU storage to GPU in one go
             # This is more memory intensive but faster than bucket-based approach
             named_tensors = []
             for name, cpu_param in self.weights["actor"].items():
                 gpu_param = cpu_param.to(device=torch.cuda.current_device(), non_blocking=True)
                 named_tensors.append((name, gpu_param))
-
+
             torch.cuda.synchronize()
 
             if use_flattened_tensor_bucket:
@@ -359,11 +359,11 @@ def update_weights(self) -> None:
             cpu_param = self.weights["actor"][name]
             gpu_param = cpu_param.to(device=torch.cuda.current_device(), dtype=torch.bfloat16, non_blocking=True)
             torch.cuda.synchronize()
-
+
             # Broadcast this single parameter
             single_param_dict = {name: gpu_param}
             self.request_update_params(single_param_dict)
-
+
             del gpu_param
             clear_memory()
 
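The second hunk belongs to the per-parameter update path: each CPU-cached weight is moved to the GPU in bf16, handed off, and freed before the next one to bound peak GPU memory. A rough sketch of that loop; `stream_weights` and `send_to_rollout` are placeholder names standing in for the surrounding method and its `request_update_params` call:

```python
import torch

def stream_weights(weights: dict[str, torch.Tensor], send_to_rollout) -> None:
    """Stream CPU-cached weights to the GPU one at a time to bound peak memory."""
    device = torch.cuda.current_device()
    for name, cpu_param in weights.items():
        gpu_param = cpu_param.to(device=device, dtype=torch.bfloat16, non_blocking=True)
        torch.cuda.synchronize()
        send_to_rollout({name: gpu_param})  # broadcast this single parameter
        del gpu_param  # release GPU memory before the next parameter
```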

slime/backends/megatron_utils/actor.py

Lines changed: 10 additions & 10 deletions
@@ -19,6 +19,8 @@
 from slime.utils.distributed_utils import get_gloo_group, init_process_group
 from slime.utils.memory_utils import clear_memory, print_memory
 from slime.utils.ray_utils import Box
+from slime.utils.reloadable_process_group import destroy_process_groups, monkey_patch_torch_dist, reload_process_groups
+from slime.utils.routing_replay import RoutingReplay
 from slime.utils.timer import Timer, timer
 from slime.utils.types import RolloutBatch
 from slime.utils.wandb_utils import init_wandb_secondary
@@ -40,6 +42,8 @@ def init(
         wandb_run_id: str,
         with_ref: bool = False,
     ) -> Optional[int]:
+        monkey_patch_torch_dist()
+
         super().init(args, role, wandb_run_id, with_ref)
 
         init(args)
@@ -158,8 +162,7 @@ def sleep(self, tags: Union[str, Tuple[str, ...]]) -> None:
 
         clear_memory()
         print_memory("before offload model")
-        if hasattr(mpu, "destroy_process_groups"):
-            mpu.destroy_process_groups()
+        destroy_process_groups()
 
         torch_memory_saver.pause()
 
@@ -184,8 +187,7 @@ def wake_up(self, tags: Union[str, Tuple[str, ...]]) -> None:
         torch_memory_saver.resume()
 
         clear_memory()
-        if hasattr(mpu, "reload_process_groups"):
-            mpu.reload_process_groups()
+        reload_process_groups()
         print_memory("after wake_up model")
 
     def _get_rollout_data(self, rollout_data_ref: Box) -> RolloutBatch:
@@ -375,8 +377,6 @@ def train_actor(self, rollout_id: int, rollout_data: RolloutBatch) -> None:
         )
 
         if self.args.use_routing_replay:
-            from megatron.core.transformer.moe.moe_utils import RoutingReplay
-
             RoutingReplay.clear_all()
 
         # update the cpu actor weight to the latest model
@@ -407,8 +407,8 @@ def update_weights(self) -> None:
         if self.args.debug_train_only or self.args.debug_rollout_only:
             return
 
-        if self.args.offload and hasattr(mpu, "reload_process_groups"):
-            mpu.reload_process_groups()
+        if self.args.offload:
+            reload_process_groups()
 
         rollout_engines, rollout_engine_lock, num_new_engines = ray.get(
             self.rollout_manager.get_rollout_engines_and_lock.remote()
@@ -434,8 +434,8 @@ def update_weights(self) -> None:
         else:
             self.update_cpu_params_dict(self.weights["old_actor"])
 
-        if self.args.offload and hasattr(mpu, "destroy_process_groups"):
-            mpu.destroy_process_groups()
+        if self.args.offload:
+            destroy_process_groups()
 
     def load_other_checkpoint(self, model_tag: str, path: str) -> None:
         old_args = self.args.load, self.args.no_load_optim, self.args.no_load_rng, self.args.finetune
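The `sleep`/`wake_up` hunks swap the old `hasattr` probes on `mpu` for direct imports from `slime.utils.reloadable_process_group`. A toy sketch of the resulting call order; the classes below are illustrative stand-ins, not slime or Megatron APIs:

```python
class FakeMemorySaver:
    """Stand-in for torch_memory_saver: pauses/resumes GPU allocations."""

    def pause(self) -> None:
        print("pause: GPU allocations released")

    def resume(self) -> None:
        print("resume: GPU allocations restored")


class FakeProcessGroups:
    """Stand-in for the reloadable_process_group helpers."""

    def destroy(self) -> None:
        print("destroy_process_groups()")

    def reload(self) -> None:
        print("reload_process_groups()")


def sleep(saver: FakeMemorySaver, groups: FakeProcessGroups) -> None:
    groups.destroy()  # tear down communicators before pausing GPU memory
    saver.pause()


def wake_up(saver: FakeMemorySaver, groups: FakeProcessGroups) -> None:
    saver.resume()
    groups.reload()  # rebuild groups before the next update_weights()


if __name__ == "__main__":
    saver, groups = FakeMemorySaver(), FakeProcessGroups()
    sleep(saver, groups)
    wake_up(saver, groups)
```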

slime/utils/data.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 # TODO: don't read the whole file into memory.
 def read_file(path):
     if path.endswith(".jsonl"):
-        df = pd.read_json(path, lines=True, dtype={'label': str})
+        df = pd.read_json(path, lines=True, dtype={"label": str})
     elif path.endswith(".parquet"):
         df = pd.read_parquet(path, dtype_backend="pyarrow")
     else:
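The `read_file` change is only a quote-style cleanup, but the `dtype={"label": str}` argument it keeps is what prevents numeric-looking labels from being coerced; a small illustration on synthetic data (not from the repo):

```python
import io

import pandas as pd

jsonl = io.StringIO('{"text": "a", "label": "007"}\n{"text": "b", "label": "1e3"}\n')
df = pd.read_json(jsonl, lines=True, dtype={"label": str})
print(df["label"].tolist())  # labels stay as strings, e.g. ['007', '1e3'], instead of numbers like 7.0 and 1000.0
```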
