
Commit 4f420e6

fix unit tests for the next release (#418)
* fix all unit tests
* Use local models and datasets
* Move multi-turn math example outside the `math` folder and update README
* Update README.md

  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
* .
* add more tests for examples
* search agent ut pending
* Use vllm/sglang server wrapper for local launching and fix an NCCL issue with vllm
* fix
* .
* .
* run pre-commit
* .
* fix sglang nccl weight update env var
* .
* fix

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 3c0bd3d commit 4f420e6

30 files changed: +500 -131 lines

areal/api/cli_args.py

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,7 @@
 uvloop.install()
 from hydra import compose as hydra_compose
 from hydra import initialize as hydra_init
+from hydra.core.global_hydra import GlobalHydra
 from omegaconf import MISSING, DictConfig, OmegaConf

 from areal.platforms import current_platform
@@ -1148,6 +1149,8 @@ def parse_cli_args(argv: List[str]):
     assert config_file.exists(), f"Config file {config_file} does not exist."
     # hydra only recognize relative paths
     relpath = Path(os.path.relpath(str(config_file), Path(__file__).parent.absolute()))
+    if GlobalHydra.instance().is_initialized():
+        GlobalHydra.instance().clear()
     hydra_init(config_path=str(relpath.parent), job_name="app", version_base=None)
     cfg = hydra_compose(
         config_name=str(relpath.name).split(".yaml")[0],
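The `GlobalHydra` guard is what lets `parse_cli_args` be invoked repeatedly in one process (as the unit tests now do): Hydra keeps a global singleton, and a second `hydra_init` call raises unless that singleton is cleared first. A minimal sketch of the pattern, assuming a hypothetical `conf/` directory and `config.yaml`:

```python
from hydra import compose as hydra_compose
from hydra import initialize as hydra_init
from hydra.core.global_hydra import GlobalHydra


def load_cfg():
    # Clear any previous Hydra state so repeated calls in the same process
    # (e.g. several tests that each parse CLI args) do not fail with
    # "GlobalHydra is already initialized".
    if GlobalHydra.instance().is_initialized():
        GlobalHydra.instance().clear()
    hydra_init(config_path="conf", job_name="app", version_base=None)
    return hydra_compose(config_name="config")


cfg_a = load_cfg()
cfg_b = load_cfg()  # would raise without the clear() guard above
```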

areal/engine/base_hf_engine.py

Lines changed: 1 addition & 0 deletions
@@ -316,6 +316,7 @@ def step_lr_scheduler(self):

     def prepare_mb_list(self, input_: Dict[str, Any]) -> MicroBatchList:
         assert "attention_mask" in input_ and "input_ids" in input_
+        input_ = input_.copy()

         if is_qwen2_vl_model(self.model_config.model_type):
             # Create the special t,h,w position IDs for qwen 2.5 VL
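The added `input_.copy()` is a defensive shallow copy: `prepare_mb_list` adds keys (such as the Qwen2.5-VL position IDs) to the dict, and without the copy those mutations would leak back into the caller's batch. A toy illustration (not the engine code) of the difference:

```python
batch = {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1]}


def prepare(input_):
    input_ = input_.copy()              # new dict, same underlying values/tensors
    input_["position_ids"] = [0, 1, 2]  # extra key stays local to this call
    return input_


prepared = prepare(batch)
assert "position_ids" in prepared
assert "position_ids" not in batch      # the caller's dict is left untouched
```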

areal/engine/sglang_remote.py

Lines changed: 3 additions & 6 deletions
@@ -34,11 +34,6 @@
 class RemoteSGLangEngine(InferenceEngine):

     def __init__(self, config: InferenceEngineConfig):
-        if current_platform.communication_backend == "nccl":
-            # Required by NCCL weight update group.
-            os.environ["NCCL_CUMEM_ENABLE"] = "0"
-            os.environ["NCCL_NVLS_ENABLE"] = "0"
-
         self.config = config

         self.rid_to_address = {}
@@ -102,11 +97,13 @@ def initialize(
                 timeout=1,
             )
             self.logger.info(f"Get server addresses from name_resolve.")
-        except TimeoutError:
+        except (TimeoutError, RuntimeError):
+            # RuntimeError happens when name_resolve is not properly configured.
             pass
         if not self.addresses and os.getenv("AREAL_LLM_SERVER_ADDRS"):
             # When addr is not provided, fallback to reading addrs from env var
             self.addresses = os.environ["AREAL_LLM_SERVER_ADDRS"].split(",")
+            self.logger.info(f"Get server addresses from environment variable.")
         if not self.addresses:
             raise RuntimeError(
                 "No configured SGLang servers. Please pass in SGLang server addresses by arguments "

areal/engine/vllm_remote.py

Lines changed: 3 additions & 1 deletion
@@ -104,11 +104,13 @@ def initialize(
                 timeout=1,
             )
             self.logger.info(f"Get server addresses from name_resolve.")
-        except TimeoutError:
+        except (TimeoutError, RuntimeError):
+            # RuntimeError happens when name_resolve is not properly configured.
             pass
         if not self.addresses and os.getenv("AREAL_LLM_SERVER_ADDRS"):
             # When addr is not provided, fallback to reading addrs from env var
             self.addresses = os.environ["AREAL_LLM_SERVER_ADDRS"].split(",")
+            self.logger.info(f"Get server addresses from environment variable.")
         if not self.addresses:
             raise RuntimeError(
                 "No configured vLLM servers. Please pass in vLLM server addresses by arguments "

areal/experimental/tests/test_megatron_engine.py

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@
 logger = logging.getLogger("MegatronEngine Test")

 VOCAB_SIZE = 100
-MODEL_PATH = "/storage/testing/models/Qwen__Qwen3-1.7B/"
+MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-0.6B/"
 if not os.path.exists(MODEL_PATH):
-    MODEL_PATH = "Qwen/Qwen3-1.7B"
+    MODEL_PATH = "Qwen/Qwen3-0.6B"


 @pytest.fixture(scope="module")

areal/experimental/tests/test_openai.py

Lines changed: 2 additions & 2 deletions
@@ -15,9 +15,9 @@

 EXPR_NAME = "test_openai"
 TRIAL_NAME = "trial_0"
-MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-1.7B/"
+MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-0.6B/"
 if not os.path.exists(MODEL_PATH):
-    MODEL_PATH = "Qwen/Qwen3-1.7B"
+    MODEL_PATH = "Qwen/Qwen3-0.6B"
 PORT, DIST_PORT = network.find_free_ports(2)
 HOST = network.gethostip()
 # set a large timeout since we may need to download the model from hub

areal/experimental/tests/test_sglang_local_engine.py

Lines changed: 2 additions & 2 deletions
@@ -29,9 +29,9 @@

 EXPR_NAME = "test_sglang_local_engine"
 TRIAL_NAME = "trial_0"
-MODEL_PATH = "/storage/testing/models/Qwen__Qwen3-1.7B/"
+MODEL_PATH = "/storage/openpsi/models/Qwen__Qwen3-0.6B/"
 if not os.path.exists(MODEL_PATH):
-    MODEL_PATH = "Qwen/Qwen2-0.5B"
+    MODEL_PATH = "Qwen/Qwen3-0.6B"


 def build_engine_config(**kwargs):

areal/experimental/tests/torchrun/run_megatron_engine_distributed.py

Lines changed: 2 additions & 2 deletions
@@ -26,11 +26,11 @@
 from areal.utils.data import broadcast_tensor_container

 MODEL_PATHS = {
-    "qwen3": "/storage/openpsi/models/Qwen__Qwen3-1.7B/",
+    "qwen3": "/storage/openpsi/models/Qwen__Qwen3-0.6B/",
     "qwen3moe": "/storage/openpsi/models/Qwen__Qwen3-30B-A3B/",
 }
 HF_MODEL_PATHS = {
-    "qwen3": "Qwen/Qwen3-1.7B",
+    "qwen3": "Qwen/Qwen3-0.6B",
     # TODO: switch Qwen3MoE to smaller model initialized from scratch
     "qwen3moe": "Qwen/Qwen3-30B-A3B",
 }

areal/launcher/local.py

Lines changed: 23 additions & 15 deletions
@@ -283,6 +283,7 @@ def local_main(config, run_id: int = 0):
         f"run_id={run_id}, is_recover_run={is_recover_run}"
     )

+    server_addrs = []
     if alloc_mode.gen_backend in ("sglang", "vllm"):
         # Launcher should launch llm servers according to allocation mode.
         if alloc_mode.gen_backend == "sglang":
@@ -328,19 +329,19 @@
             ),
         )

-    # Get llm server addresses by name resolve
-    try:
-        server_addrs = wait_llm_server_addrs(
-            config.experiment_name,
-            config.trial_name,
-            n_rollout_servers=alloc_mode.gen.dp_size,
-        )
-        logger.info(
-            f"LLM inference server launched at: AREAL_LLM_SERVER_ADDRS={','.join(server_addrs)}"
-        )
-    except (TimeoutError, KeyboardInterrupt) as e:
-        launcher.stop_all(signal="SIGINT")
-        raise e
+        # Get llm server addresses by name resolve
+        try:
+            server_addrs = wait_llm_server_addrs(
+                config.experiment_name,
+                config.trial_name,
+                n_rollout_servers=alloc_mode.gen.dp_size,
+            )
+            logger.info(
+                f"LLM inference server launched at: AREAL_LLM_SERVER_ADDRS={','.join(server_addrs)}"
+            )
+        except (TimeoutError, KeyboardInterrupt) as e:
+            launcher.stop_all(signal="SIGINT")
+            raise e

     # Launch trainer entrypoint
     if alloc_mode.type_ != AllocationType.LLM_SERVER_ONLY:
@@ -349,6 +350,14 @@
             nprocs = 1
         else:
            gpu = nprocs = alloc_mode.train.world_size
+        _env_vars = dict(
+            AREAL_LLM_SERVER_ADDRS=",".join(server_addrs),
+            AREAL_RECOVER_RUN=str(int(is_recover_run)),
+        )
+        if alloc_mode.gen_backend == "sglang":
+            # Required by NCCL weight update group.
+            _env_vars["NCCL_CUMEM_ENABLE"] = "0"
+            _env_vars["NCCL_NVLS_ENABLE"] = "0"
         launcher.submit(
             job_name="trainer",
             cmd=f"torchrun --nnodes 1 --nproc-per-node {nprocs} --master-addr localhost --master-port {find_free_ports(1, (10000, 50000))[0]} {' '.join(sys.argv[1:])}",
@@ -358,8 +367,7 @@
                     config.cluster.cluster_name,
                     config.launcher.trainer_env_vars,
                 ),
-                AREAL_LLM_SERVER_ADDRS=",".join(server_addrs),
-                AREAL_RECOVER_RUN=str(int(is_recover_run)),
+                **_env_vars,
             ),
         )

areal/launcher/ray.py

Lines changed: 9 additions & 2 deletions
@@ -534,6 +534,14 @@ def torch_env_hook(n_tasks: int, placement_group: PlacementGroup) -> List[Dict]:
         )
         return env_vars

+    _env_vars = dict(
+        AREAL_LLM_SERVER_ADDRS=",".join(llm_addrs),
+        AREAL_RECOVER_RUN=str(int(is_recover_run)),
+    )
+    if allocation_mode.gen_backend == "sglang":
+        # Required by NCCL weight update group.
+        _env_vars["NCCL_CUMEM_ENABLE"] = "0"
+        _env_vars["NCCL_NVLS_ENABLE"] = "0"
     launcher.submit_array(
         job_name="trainer",
         file_path=trainer_entry_point,
@@ -549,8 +557,7 @@
                 config.cluster.cluster_name,
                 config.launcher.trainer_env_vars,
             ),
-            AREAL_LLM_SERVER_ADDRS=",".join(llm_addrs),
-            AREAL_RECOVER_RUN=str(int(is_recover_run)),
+            **_env_vars,
         ),
         env_hook=partial(torch_env_hook, trainer_n_nodes * n_gpus_per_node),
     )
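Both launchers now assemble the trainer environment the same way: the SGLang-specific NCCL settings that used to be set inside `RemoteSGLangEngine.__init__` are exported only for the trainer processes, and only when SGLang is the generation backend. A hypothetical, standalone illustration of the resulting environment (addresses and flags made up; variable names mirror the diff):

```python
# Standalone illustration; values are made up, not taken from a real run.
server_addrs = ["127.0.0.1:30000", "127.0.0.1:30001"]
is_recover_run = False
gen_backend = "sglang"

_env_vars = dict(
    AREAL_LLM_SERVER_ADDRS=",".join(server_addrs),
    AREAL_RECOVER_RUN=str(int(is_recover_run)),
)
if gen_backend == "sglang":
    # Required by the NCCL weight update group; applied to the trainer only,
    # instead of being set globally in the inference client as before.
    _env_vars["NCCL_CUMEM_ENABLE"] = "0"
    _env_vars["NCCL_NVLS_ENABLE"] = "0"

print(_env_vars)
# {'AREAL_LLM_SERVER_ADDRS': '127.0.0.1:30000,127.0.0.1:30001',
#  'AREAL_RECOVER_RUN': '0', 'NCCL_CUMEM_ENABLE': '0', 'NCCL_NVLS_ENABLE': '0'}
```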
