
Commit 7660007

union_gen_batch_via_task_id is to be tested
1 parent a6c7e0e commit 7660007

10 files changed: +94 −52 lines

ajet/backbone/trainer_verl.py

Lines changed: 26 additions & 22 deletions
@@ -99,16 +99,20 @@ def parse_reward_from_dataproto(data: DataProto, return_dict=False) -> dict | to
         return reward_tensor
 
 
-def union_gen_batch_via_task_id(tasks, batch: DataProto, gen_batch_output: DataProto):
+def union_gen_batch_via_task_id(tasks, batch: DataProto, gen_batch_output: DataProto, discard_original_batch=False):
     """
     Union the gen_batch_output with the batch based on task_id.
     """
-    map_task_id_to_index = {t.task_id: i for i, t in enumerate(tasks)}
-    gen_task_task_ids = gen_batch_output.non_tensor_batch["task_ids"]
-    indices = [map_task_id_to_index[tid] for tid in gen_task_task_ids]
-    batch_extend = batch.select_idxs(indices)
-    batch_final = batch_extend.union(gen_batch_output)
-    return batch_final
+    if not discard_original_batch:
+        map_task_id_to_index = {t.task_id: i for i, t in enumerate(tasks)}
+        gen_task_task_ids = gen_batch_output.non_tensor_batch["task_ids"]
+        indices = [map_task_id_to_index[tid] for tid in gen_task_task_ids]
+        batch_extend = batch.select_idxs(indices)
+        batch_final = batch_extend.union(gen_batch_output)
+        return batch_final
+    else:
+        gen_batch_output.non_tensor_batch["uid"] = gen_batch_output.non_tensor_batch["task_ids"]
+        return gen_batch_output
 
 
 def compute_advantage(
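For context, the alignment this function performs can be sketched in isolation. FakeBatch below is a hypothetical stand-in for verl's DataProto that only stubs select_idxs and union; everything beyond the names taken from the diff is an assumption.

from dataclasses import dataclass, field
from typing import List


@dataclass
class FakeBatch:
    rows: List[dict] = field(default_factory=list)

    def select_idxs(self, indices):
        # reorder/duplicate rows so they line up with the generated batch
        return FakeBatch([self.rows[i] for i in indices])

    def union(self, other):
        # merge columns row-by-row (DataProto.union merges tensor/non-tensor data)
        return FakeBatch([{**a, **b} for a, b in zip(self.rows, other.rows)])


@dataclass
class FakeTask:
    task_id: str


tasks = [FakeTask("t0"), FakeTask("t1")]
batch = FakeBatch([{"prompt": "p0"}, {"prompt": "p1"}])
# two rollouts per task, arriving out of order:
gen = FakeBatch([{"task_ids": "t1", "resp": "a"},
                 {"task_ids": "t0", "resp": "b"},
                 {"task_ids": "t1", "resp": "c"},
                 {"task_ids": "t0", "resp": "d"}])

map_task_id_to_index = {t.task_id: i for i, t in enumerate(tasks)}
indices = [map_task_id_to_index[r["task_ids"]] for r in gen.rows]
aligned = batch.select_idxs(indices).union(gen)
print([r["prompt"] for r in aligned.rows])  # ['p1', 'p0', 'p1', 'p0']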
@@ -550,16 +554,17 @@ def fit(self): # noqa: C901
             # pass global_steps to trace
             gen_batch.meta_info["global_steps"] = self.global_steps
             is_last_step = self.global_steps >= self.total_training_steps
-
+            from ajet import bp
+            bp("BATCH")
             with marked_timer("step", timing_raw):
                 # generate a batch
-                logger.info("=== + rollout step begin ===")
+                logger.info("rollout step begin")
                 with marked_timer("gen", timing_raw, color="red"):
                     assert self.async_rollout_mode
-                    logger.info("=== wake up begin ===")
+                    logger.info("wake up begin")
                     self.async_rollout_manager.wake_up()
                     self._update_interchange_server_status_flag("ENGINE.ROLLING")
-                    logger.info("=== wake up end ===")
+                    logger.info("wake up end")
                     tasks: List[Task] = [
                         dict_to_ajet_task(dict(
                             task_id=gen_batch.non_tensor_batch["task_id"][i],

@@ -578,16 +583,14 @@ def fit(self): # noqa: C901
                         ]
                     )
                 )
-                logger.info("=" * 10 + "start fit rollout" + "=" * 10)
+                logger.info("start fit rollout")
                 self.parallel_env.current_global_steps = self.global_steps
                 context_tracker_arr: List[BaseContextTracker] = self.parallel_env.rollout(
                     tasks, mode="sample", epoch=f"train.{epoch}"
                 )
-                logger.info("=" * 10 + "end fit rollout" + "=" * 10)
-                self._update_interchange_server_status_flag("ENGINE.WEIGHT_SYNCING")
-                logger.info("begin to convert context_tracker_arr to dataproto")
+                logger.info("end fit rollout")
                 gen_batch_output = self.parallel_env.to_dataproto(context_tracker_arr)
-                logger.info("end conversion")
+                logger.info("end dataproto conversion")
 
                 success_rate = [
                     traj.reward_structure.success_rate for traj in context_tracker_arr

@@ -630,17 +633,17 @@ def fit(self): # noqa: C901
                 logger.info(
                     f"gen_batch_output.info batch.keys={gen_batch_output.batch.keys()}"
                 )
+                self._update_interchange_server_status_flag("ENGINE.WEIGHT_SYNCING")
                 self.async_rollout_manager.sleep()
-            logger.info("=== - rollout step end ===")
+            logger.info("rollout step end")
 
-            if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
-                raise NotImplementedError("REMAX is not supported in GRPO yet.")
 
             batch.non_tensor_batch["uid"] = np.array(
                 [str(uuid.uuid4()) for _ in range(len(batch.batch))],
                 dtype=object,
             )
-            batch = union_gen_batch_via_task_id(tasks, batch, gen_batch_output)
+            discard_original_batch = self.config.ajet.enable_tinkerscript_mode
+            batch = union_gen_batch_via_task_id(tasks, batch, gen_batch_output, discard_original_batch)
             batch.batch["response_mask"] = compute_response_mask(batch)
 
             if "response_mask" not in batch.batch.keys():

@@ -674,7 +677,7 @@ def fit(self): # noqa: C901
             )
 
             # recompute old_log_probs
-            logger.info("=== + compute log_probs begin ===")
+            logger.info("+ compute log_probs begin")
             with marked_timer("old_log_prob", timing_raw, color="blue"):
                 old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
                 entropys = old_log_prob.batch["entropys"]

@@ -946,7 +949,8 @@ def _validate(self):
             dtype=object,
         )
         tasks = tasks[: len(main_val_dataset)]
-        test_batch = union_gen_batch_via_task_id(tasks, test_batch, test_output_gen_batch)
+        discard_original_batch = self.config.ajet.enable_tinkerscript_mode
+        test_batch = union_gen_batch_via_task_id(tasks, test_batch, test_output_gen_batch, discard_original_batch)
         # test_batch = test_batch.union(test_output_gen_batch)
         test_batch.meta_info["validate"] = True
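The uid column assigned in fit() is what GRPO groups on downstream: rollouts sharing a uid are advantage-normalized together, which is why tinkerscript mode reuses task_ids as uids. A rough sketch of that grouping follows; only the column names come from the diff, the mean-baseline advantage is the standard GRPO idea with the usual std normalization omitted.

from collections import defaultdict

# one reward and one uid per rollout; uid == task_id in tinkerscript mode
rewards = [1.0, 0.0, 0.5, 0.5]
uids = ["t0", "t0", "t1", "t1"]

groups = defaultdict(list)
for i, uid in enumerate(uids):
    groups[uid].append(i)

advantages = [0.0] * len(rewards)
for uid, idxs in groups.items():
    mean = sum(rewards[i] for i in idxs) / len(idxs)  # group baseline
    for i in idxs:
        advantages[i] = rewards[i] - mean

print(advantages)  # [0.5, -0.5, 0.0, 0.0]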

ajet/launcher.py

Lines changed: 3 additions & 3 deletions
@@ -176,9 +176,9 @@ def setup_environment_vars(args, exp_config, main_yaml_fp):
         env["RAY_record_task_actor_creation_sites"] = "true"
         # assert exp_config["ajet"]["rollout"]["max_env_worker"] <= 4, "parallel worker too many for debugging mode"  # type: ignore
         if exp_config["ajet"]["rollout"]["max_env_worker"] > 1:  # type: ignore
-            exp_config["ajet"]["rollout"]["max_env_worker"] = 1
+            # exp_config["ajet"]["rollout"]["max_env_worker"] = 1
             logger.warning(
-                "For debugging mode, max_env_worker is set to 1 to facilitate debugging."
+                "For debugging mode, please set max_env_worker to 1 to facilitate debugging."
             )
         logger.warning("Debug mode is ON")
     else:

@@ -206,7 +206,7 @@ def start_tinkerscript_server(env, config):
     assert config.ajet.enable_experimental_interchange_server, \
         "Please enable_experimental_interchange_server in config to start tinkerscript server."
     from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import start_interchange_server
-    start_interchange_server(config, blocking=True)
+    start_interchange_server(config, blocking=True, env=env)
 
 
 def main():

ajet/schema/task.py

Lines changed: 5 additions & 5 deletions
@@ -8,11 +8,11 @@
 
 
 class Task(BaseModel):
-    main_query: str = Field(default="")
-    init_messages: List[dict] = Field(default=[])
-    task_id: str = Field(default="")
-    env_type: str = Field(default="")
-    metadata: dict = Field(default_factory=dict)
+    main_query: str = Field(default="", description="Main query or instruction for the task; may be absent if the task has valid init_messages.")
+    init_messages: List[dict] = Field(default=[], description="Initial messages for the task; may be absent if the task has a valid main_query.")
+    task_id: str = Field(default="", description="The same task_id means the same task, and therefore the same GRPO group.")
+    env_type: str = Field(default="", description="Valid when the task needs to interact with a gym env.")
+    metadata: dict = Field(default_factory=dict, description="Additional metadata for the task, e.g., a reference answer for eval tasks.")
 
 
 """

ajet/task_runner/tinkerscript_runner.py

Lines changed: 7 additions & 1 deletion
@@ -40,7 +40,7 @@ def register_episode_and_wait_output(self, episode_uuid: str, openai_base_url: s
         zmq_socket = zmq.Context().socket(zmq.REP)
         zmq_socket.bind(zmq_listen_result_addr)
         speicial_messages = [
-            "RUNNER.RESET_CONTEXT_TRACKER"
+            "RUNNER.SPECIAL.RESET_CONTEXT_TRACKER"
         ]
         while True:
             # <wait for 1/2>:

@@ -103,6 +103,12 @@ def execute(self, workflow_task: WorkflowTask) -> BaseContextTracker:
             context_tracker=context_tracker,
         )
 
+        # Crucially, pin task_id to the client's task_id: propagate it to
+        # both workflow_task and context_tracker.
+        assert "task_id" in workflow_output.metadata, "workflow_output.metadata must contain task_id"
+        task_id = workflow_output.metadata.get("task_id", "")
+        workflow_task.task_id = task_id
+        context_tracker.task_id = task_id
+
         if workflow_output.reward is not None:
             raw_reward, is_success = (
                 workflow_output.reward,

ajet/tuner_lib/weight_tuner/experimental/as_oai_model_server.py

Lines changed: 14 additions & 3 deletions
@@ -271,7 +271,7 @@ async def serve_with_monitor(additional_coro):
 
 
 # Convenience function for quick server startup
-def start_interchange_server(config, blocking=False) -> int:
+def start_interchange_server(config, blocking=False, env={}) -> int:
     # Read config
     already_started = config.ajet.interchange_server.already_started
     experiment_dir = config.ajet.experiment_dir

@@ -293,6 +293,9 @@ def start_interchange_server(config, blocking=False) -> int:
 
     # init interchange server sub-process
     if not already_started:
+        # apply env vars
+        os.environ.update(env)
+        # start interchange server
         interchange_server = InterchangeServer(
             experiment_dir,
             port,

@@ -342,6 +345,14 @@ def start_interchange_server(config, blocking=False) -> int:
                     f"URL 1: {localhost_url}\n------\n"
                     f"URL 2: {host_url}\n------\n"
                     f"Press Ctrl+C to stop.")
-    if interchange_server:
-        interchange_server.join()
+    try:
+        if interchange_server:
+            interchange_server.join()
+    except KeyboardInterrupt:
+        logger.info("Shutting down interchange server...")
+        try: httpx.get(f"http://127.0.0.1:{port}/stop_engine", timeout=8).status_code
+        except Exception: pass
+
+        if interchange_server:
+            interchange_server.terminate()
     return -1
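The join/terminate pattern added above can be sketched in isolation. The plain multiprocessing.Process below is a hypothetical stand-in for InterchangeServer, and the port is made up; only the control flow mirrors the diff.

import multiprocessing
import time

import httpx


def _serve():
    while True:  # stand-in server loop
        time.sleep(1)


if __name__ == "__main__":
    server = multiprocessing.Process(target=_serve, daemon=True)
    server.start()
    try:
        server.join()  # block until Ctrl+C
    except KeyboardInterrupt:
        try:
            # best-effort: ask the engine to stop over HTTP first
            httpx.get("http://127.0.0.1:10086/stop_engine", timeout=8)
        except Exception:
            pass
        server.terminate()  # then hard-stop the subprocess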

ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_client.py

Lines changed: 6 additions & 3 deletions
@@ -4,7 +4,7 @@
 import yaml
 from typing import List, Tuple
 from loguru import logger
-from ajet.schema.task import WorkflowOutput
+from ajet.schema.task import WorkflowOutput, Task
 from ajet.copilot.job import AgentJetJob
 from ajet.tuner_lib.weight_tuner.as_oai_baseurl_apikey import OpenaiBaseUrlAndApiKey
 from ajet.tuner_lib.weight_tuner.experimental.interchange_utils import (

@@ -76,16 +76,19 @@ def begin_episode(self, allow_discard_timeout=60, episode_type="train") -> Tuple
                 logger.error(f"Error claiming episode: {e}. Retrying in 5s...")
                 time.sleep(5)
 
-    def end_episode(self, episode_uuid: str, workflow_output: WorkflowOutput):
+    def end_episode(self, task: Task, episode_uuid: str, workflow_output: WorkflowOutput):
         if not episode_uuid:
             logger.error("No episode to end.")
             return
 
         try:
+            task_id = task.task_id
+            workflow_output.metadata["task_id"] = task_id
             req_obj = EndEpisodeRequest(
                 client_uuid=self.client_uuid,
                 episode_uuid=episode_uuid,
-                workflow_output=workflow_output
+                workflow_output=workflow_output,
+                task_id=task_id
             )
 
             resp = httpx.post(
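End to end, the new task_id plumbing is a stamp-and-verify handshake: the client writes the task_id into workflow_output.metadata before posting, and the server and runner assert it on receipt. A sketch with a stand-in dataclass (the real code uses ajet's WorkflowOutput, whose fields are not shown in this diff):

from dataclasses import dataclass, field


@dataclass
class FakeWorkflowOutput:  # stand-in for ajet.schema.task.WorkflowOutput
    reward: float = 0.0
    metadata: dict = field(default_factory=dict)


task_id = "math.gsm8k.0042"
out = FakeWorkflowOutput(reward=1.0)
out.metadata["task_id"] = task_id            # client: end_episode stamps it
assert out.metadata["task_id"] == task_id    # server: end_episode verifies it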

ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_server.py

Lines changed: 22 additions & 8 deletions
@@ -148,10 +148,11 @@ def _register_final_episode_output(episode_uuid, workflow_output, shared_mem_dic
 # --------------------------------------------------------------------------------------
 
 async def register_episode_ready_listener():
-    while True:
-        read_all_episode_status()
-        await asyncio.sleep(10)  # check every 10 seconds
-        find_claimed_episodes_that_need_to_be_unclaimed()
+    pass
+    # while True:
+    #     read_all_episode_status()
+    #     await asyncio.sleep(10)  # check every 10 seconds
+    #     find_claimed_episodes_that_need_to_be_unclaimed()
 
 def read_all_episode_status() -> Optional[EpisodeStatus]:
     print_buffer = []

@@ -242,17 +243,26 @@ async def start_engine():
 
     # Create args namespace
     args = SimpleNamespace(
-        conf=main_yaml_fp, backbone=backbone, exp_dir=exp_dir_final, with_logview=False, debug=False,
+        conf=main_yaml_fp, backbone=backbone, exp_dir=exp_dir_final, with_logview=False,
+        debug=False,
     )
+    # get debug param
+    should_debug = os.environ.get("RAY_DEBUG_POST_MORTEM", "0") == "1"
+    debug_tags = os.environ.get("DEBUG_TAGS", "")
+    if should_debug:
+        args.debug = debug_tags
+
+    def override_param_callback(config):
+        config['ajet']['interchange_server']['already_started'] = True
+        config['ajet']['interchange_server']['interchange_server_port'] = int(os.getenv("AJET_DAT_INTERCHANGE_PORT"))  # type: ignore
+        return config
 
     # Finalize experiment config
     main_yaml_fp, exe_exp_base, exp_name, exp_config = prepare_experiment_config(
-        main_yaml_fp, exp_dir_final, backbone
+        main_yaml_fp, exp_dir_final, backbone, override_param_callback
    )
 
     # Setup environment variables
-    exp_config['ajet']['interchange_server']['already_started'] = True
-    exp_config['ajet']['interchange_server']['interchange_server_port'] = int(os.getenv("AJET_DAT_INTERCHANGE_PORT"))  # type: ignore
     env, exp_config = setup_environment_vars(args, exp_config, main_yaml_fp)
 
     # Start ray if not already started

@@ -421,6 +431,10 @@ async def end_episode(req: EndEpisodeRequest):
     client_uuid = req.client_uuid
     episode_uuid = req.episode_uuid
     workflow_output = req.workflow_output
+    task_id = req.task_id
+
+    assert "task_id" in workflow_output.metadata, "workflow_output.metadata must contain task_id"
+    assert workflow_output.metadata["task_id"] == task_id, "workflow_output.metadata.task_id must match req.task_id"
 
     if 'episodes' not in shared_mem_dict:
         logger.error(f"[server] No episodes registered yet.")

ajet/tuner_lib/weight_tuner/experimental/interchange_utils.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ class EndEpisodeRequest(BaseModel):
     client_uuid: str
     episode_uuid: str
     workflow_output: WorkflowOutput
+    task_id: str
 
 class EndEpisodeResponse(BaseModel):
     success: bool
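Since task_id is declared without a default, pydantic now rejects any request that omits it. A minimal sketch; the model below is a trimmed stand-in with the workflow_output field dropped:

from pydantic import BaseModel, ValidationError


class FakeEndEpisodeRequest(BaseModel):  # trimmed stand-in
    client_uuid: str
    episode_uuid: str
    task_id: str


try:
    FakeEndEpisodeRequest(client_uuid="c", episode_uuid="e")
except ValidationError as err:
    print(err.errors()[0]["loc"])  # ('task_id',)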

ajet/utils/config_utils.py

Lines changed: 6 additions & 3 deletions
@@ -168,7 +168,7 @@ def config_safe_guard(config: dict, backbone: str) -> dict:
 
 
 def read_ajet_hierarchical_config(
-    yaml_fp, exp_name, backbone, write_to=None, exp_dir="saved_experiments"
+    yaml_fp, exp_name, backbone, write_to=None, exp_dir="saved_experiments", override_param_callback=None
 ):
     if yaml_fp is None:
         config = {

@@ -210,6 +210,9 @@ def read_ajet_hierarchical_config(
         config["defaults"].remove("trinity_default")
         config["hydra"]["searchpath"].remove("file://ajet/default_config/trinity")
 
+    if override_param_callback is not None:
+        config = override_param_callback(config)
+
     if write_to:
         with open(write_to, "w") as file:
             yaml.dump(config, file)

@@ -239,7 +242,7 @@ def expand_ajet_hierarchical_config(config, write_to=None):
     return config_final
 
 
-def prepare_experiment_config(yaml_path, exp_dir, backbone):
+def prepare_experiment_config(yaml_path, exp_dir, backbone, override_param_callback=None):
     """
     Prepare experiment configuration by reading YAML, setting up backup directories,
     and copying necessary files for the experiment.

@@ -317,7 +320,7 @@ def prepare_experiment_config(yaml_path, exp_dir, backbone):
 
     ## 4. edit new yaml
     config = read_ajet_hierarchical_config(
-        yaml_backup_dst, exp_name, backbone, write_to=yaml_backup_dst, exp_dir=exp_dir
+        yaml_backup_dst, exp_name, backbone, write_to=yaml_backup_dst, exp_dir=exp_dir, override_param_callback=override_param_callback
    )
     config_final = expand_ajet_hierarchical_config(config, write_to=yaml_backup_dst)
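Any callable that takes and returns the config dict works as override_param_callback, which runs just before the YAML is written back out. A sketch with a made-up override and a toy config:

def force_port_4399(config):  # hypothetical override
    config["ajet"]["interchange_server"]["interchange_server_port"] = 4399
    return config


cfg = {"ajet": {"interchange_server": {"interchange_server_port": 0}}}
cfg = force_port_4399(cfg)  # what read_ajet_hierarchical_config now applies
assert cfg["ajet"]["interchange_server"]["interchange_server_port"] == 4399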

ajet_tinkerscript_threading.py

Lines changed: 4 additions & 4 deletions
@@ -10,6 +10,7 @@
 from ajet import WorkflowOutput
 from ajet.task_reader import RouterTaskReader
 from ajet.utils.retry import retry_with_backoff
+from ajet.schema.task import Task
 from concurrent.futures import ThreadPoolExecutor
 
 # --------- configurations that take effect locally -------------

@@ -44,6 +45,7 @@ def main():
 
     # Hand shake with remote tinkerscript server
     tinkerscript_remote = TinkerScriptClient(REMOTE_TINKERJET_URL)
+    tinkerscript_remote.stop_engine()
     tinkerscript_remote.auto_sync_train_config_and_start_engine(
         AgentJetJob(
             algorithm="grpo",

@@ -52,8 +54,6 @@ def main():
             grpo_n=LOCAL_GRPO_N,
         )
     )
-    # tinkerscript_remote.stop_engine()
-
     # tinkerscript_remote = connect_to_tinkerscript_server(sync_train_config=False, start_engine=False)
     submit_sem = threading.BoundedSemaphore(LOCAL_MAX_PARALLEL)

@@ -67,7 +67,7 @@ def rollout(task):
         # execute agent
         workflow_output = execute_agent(task, api_baseurl_key)
         # report output back to tinkerscript remote
-        tinkerscript_remote.end_episode(episode_uuid, workflow_output)
+        tinkerscript_remote.end_episode(task, episode_uuid, workflow_output)
         # collect reward
         group_reward.append(workflow_output.reward)
         print(f"Group reward mean & std: {sum(group_reward)/len(group_reward)} +/- { (max(group_reward)-min(group_reward))/2 }")

@@ -94,7 +94,7 @@ def rollout(task):
 
 
 @retry_with_backoff(max_retry=2)
-def execute_agent(task, api_baseurl_key: OpenaiBaseUrlAndApiKey):
+def execute_agent(task: Task, api_baseurl_key: OpenaiBaseUrlAndApiKey):
     # Prepare base_url, api_key
     base_url, api_key = (api_baseurl_key.base_url, api_baseurl_key.api_key)
     # Read dataset item
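The submission pattern this script relies on, in isolation: a BoundedSemaphore caps in-flight rollouts at LOCAL_MAX_PARALLEL while a thread pool drains the task stream. The integer tasks and the empty rollout body below are stand-ins.

import threading
from concurrent.futures import ThreadPoolExecutor

LOCAL_MAX_PARALLEL = 4
submit_sem = threading.BoundedSemaphore(LOCAL_MAX_PARALLEL)


def rollout(task):
    try:
        pass  # begin_episode / execute_agent / end_episode would go here
    finally:
        submit_sem.release()  # free a slot for the next task


with ThreadPoolExecutor(max_workers=LOCAL_MAX_PARALLEL) as pool:
    for task in range(16):  # stand-in task stream
        submit_sem.acquire()  # blocks while LOCAL_MAX_PARALLEL are in flight
        pool.submit(rollout, task)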
