feat: enhance TinkerScript functionality with improved engine status handling and episode management

binary-husky · binary-husky · commit 968c2cf6487c · 2026-01-29T01:04:36.000+08:00
diff --git a/ajet/backbone/trainer_verl.py b/ajet/backbone/trainer_verl.py
@@ -444,7 +444,6 @@ def init_workers(self):
         )
 
     def _update_interchange_server_status_flag(self, status: str):
-        # if interchange server is enabled, change engine status to ROLLING
         if self.config.ajet.enable_experimental_interchange_server:
             if self.config.ajet.enable_tinkerscript_mode:
                 from ajet.tuner_lib.weight_tuner.experimental.interchange_utils import http_change_engine_status
diff --git a/ajet/task_runner/tinkerscript_runner.py b/ajet/task_runner/tinkerscript_runner.py
@@ -16,9 +16,10 @@
 from loguru import logger
 from ajet import Workflow
 
+DEBUG = False
+
 context = zmq.Context()
 atexit.register(context.term)
-DEBUG = True
 
 class TinkerScriptRunner(BaseAgentRunner):
 
@@ -33,12 +34,18 @@ def register_episode_and_wait_output(self, episode_uuid: str, openai_base_url: s
             openai_api_key=openai_api_key,
             zmq_listen_result_addr=zmq_listen_result_addr,
         )
-        logger.info(f"zmq_listen_result_addr: {zmq_listen_result_addr}")
+        if DEBUG: logger.info(f"zmq_listen_result_addr: {zmq_listen_result_addr}")
 
         # begin wait for result
         zmq_socket = zmq.Context().socket(zmq.REP)
         zmq_socket.bind(zmq_listen_result_addr)
+
+        # <wait for>:
+        #   <from_sourcefile>: ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_server.py
+        #   <from_code>: socket.send_string(workflow_output.model_dump_json())
+        #   <expect>: workflow_output: WorkflowOutput
         message = zmq_socket.recv_string()
+
         logger.success(f"Received workflow output for episode {episode_uuid}")
         zmq_socket.send_string("ack")
         zmq_socket.close()
diff --git a/ajet/tuner_lib/weight_tuner/experimental/as_oai_model_server.py b/ajet/tuner_lib/weight_tuner/experimental/as_oai_model_server.py
@@ -157,9 +157,9 @@ async def chat_completions(request: Request, authorization: str = Header(None)):
         if enable_tinkerscript_mode:
             assert shared_mem_dict is not None
             assert shared_mem_dict_lock is not None
-            if shared_mem_dict['engine_status'] != "ROLLING":
-                logger.error(f"The server is not in ROLLING status (current status: [{shared_mem_dict['engine_status']}]), cannot accept new requests.")
-                raise HTTPException(status_code=503, detail="The server is not in ROLLING status, cannot accept new requests.")
+            if shared_mem_dict['engine_status'] != "ENGINE.ROLLING":
+                logger.error(f"The server is not in ENGINE.ROLLING status (current status: [{shared_mem_dict['engine_status']}]), cannot accept new requests.")
+                raise HTTPException(status_code=503, detail="The server is not in ENGINE.ROLLING status, cannot accept new requests.")
             if (f"episodes-{episode_uuid}") not in shared_mem_dict:
                 raise HTTPException(status_code=404, detail=f"Episode {episode_uuid} not found.")
             # update activate timestamp
diff --git a/ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_client.py b/ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_client.py
@@ -25,6 +25,7 @@ class TinkerScriptClient(object):
     def __init__(self, server_url: str):
         self.server_url = server_url
         self.client_uuid = str(uuid.uuid4())
+        self.previous_warning_time = 0
 
 
     def begin_episode(self, allow_discard_timeout=60) -> Tuple[str, OpenaiBaseUrlAndApiKey]:
@@ -59,8 +60,18 @@ def begin_episode(self, allow_discard_timeout=60) -> Tuple[str, OpenaiBaseUrlAnd
                         episode_uuid=episode_uuid
                     )
                 else:
-                    logger.info(f"Failed to claim episode: {data.fail_cause}. Retrying in 5s...")
-                    time.sleep(5)
+                    need_wait_scenarios =[
+                        "Engine is syncing weights",
+                        "No available episodes to claim.",
+                    ]
+                    if any(scenario in data.fail_cause for scenario in need_wait_scenarios):
+                        if time.time() - self.previous_warning_time > 60:
+                            logger.info(f"{data.fail_cause}. Retrying in 30s...")
+                            self.previous_warning_time = time.time()
+                        time.sleep(30)
+                    else:
+                        logger.warning(f"Failed to claim episode: {data.fail_cause}. Retrying in 5s...")
+                        time.sleep(5)
             except Exception as e:
                 logger.error(f"Error claiming episode: {e}. Retrying in 5s...")
                 time.sleep(5)
@@ -98,6 +109,11 @@ def sync_train_config(self, agent_jet_job: AgentJetJob):
         Sync training configuration to the TinkerScript server.
         This sends the AgentJetJob config as YAML to the remote server.
         """
+        # Config can only be synced while the engine is offline, so check the status first.
+        current_status = self.get_engine_status()
+        if current_status != "ENGINE.OFFLINE":
+            raise RuntimeError(f"Cannot sync train config when engine is NOT ENGINE.OFFLINE. (current status: {current_status})")
+
         try:
             config_dict = agent_jet_job.config.to_dict()
             yaml_str = yaml.safe_dump(config_dict, sort_keys=False)
@@ -121,6 +137,12 @@ def start_engine(self):
         This triggers the server to begin the training process.
         Polls until engine status is "ENGINE.ROLLING".
         """
+        # The engine can only be started from the offline state, so check the status first.
+        current_status = self.get_engine_status()
+        if current_status != "ENGINE.OFFLINE":
+            raise RuntimeError(f"Cannot start engine when engine is NOT ENGINE.OFFLINE. (current status: {current_status})")
+
+        # Send start engine request
         try:
             resp = httpx.post(
                 f"{self.server_url}/start_engine",
@@ -139,8 +161,17 @@ def start_engine(self):
             raise
 
         # Poll until engine status is "ENGINE.ROLLING"
+        self._wait_until_avail()
+        logger.success("Training engine is now ROLLING and ready.")
+
+    def _wait_until_avail(self):
+        """
+        Poll engine status until it reaches ENGINE.ROLLING state.
+        Reports status every 5 seconds while waiting.
+        """
         logger.info("Polling engine status until ENGINE.ROLLING...")
         last_report_time = time.time()
+        init_poll_time = last_report_time
 
         while True:
             try:
@@ -149,7 +180,7 @@ def start_engine(self):
 
                 # Report status every 5 seconds
                 if current_time - last_report_time >= 5:
-                    logger.info(f"Current engine status: {current_status}")
+                    logger.info(f"Current engine status (already waited {current_time - init_poll_time:.1f}s): {current_status}")
                     last_report_time = current_time
 
                 # Check if engine has reached the desired status
@@ -210,3 +241,22 @@ def get_episode_buffer(self) -> List[EpisodeStatus]:
         except Exception as e:
             logger.error(f"Error getting episode buffer: {e}")
             return []
+
+    def auto_sync_train_config_and_start_engine(self, agent_jet_job: AgentJetJob):
+        """Start the engine if OFFLINE, wait if BOOTING, do nothing if ROLLING; raise RuntimeError for any other status."""
+        status_now = self.get_engine_status()
+        if status_now == "ENGINE.ROLLING":
+            logger.info("Engine is already ROLLING. No action needed.")
+            return
+        if status_now == "ENGINE.BOOTING":
+            logger.info("Engine is BOOTING. Waiting until it becomes ROLLING...")
+            self._wait_until_avail()
+            logger.success("Training engine is now ROLLING and ready.")
+            return
+        if status_now != "ENGINE.OFFLINE":
+            raise RuntimeError(f"Cannot sync train config or start engine when engine is in status: {status_now}")
+
+        # Engine is offline: push the config first, then start the engine.
+        logger.info("Engine is OFFLINE. Syncing train config and starting engine...")
+        self.sync_train_config(agent_jet_job)
+        self.start_engine()
diff --git a/ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_server.py b/ajet/tuner_lib/weight_tuner/experimental/as_tinkerscript_server.py
@@ -28,7 +28,7 @@
     UpdateEngineStatusRequest,
 )
 
-DEBUG = True
+DEBUG = False
 
 def register_enable_tinkerscript_mode_routes(
         app,
@@ -43,6 +43,84 @@ def register_enable_tinkerscript_mode_routes(
     if 'unclaimed_episodes' not in shared_mem_dict:
         shared_mem_dict['unclaimed_episodes'] = []
 
+    def find_claimed_episodes_that_need_to_be_unclaimed() -> List[str]:
+        """Revert every claimed episode that has been idle longer than its allow_discard_timeout; return their uuids."""
+        now = time.time()
+        # Collect the expired uuids first and revert afterwards, since reverting writes to shared_mem_dict.
+        expired = [
+            status.episode_uuid
+            for key, status in shared_mem_dict.items()
+            if key.startswith("episodes-")
+            and status.episode_status == "claimed"
+            and (now - status.latest_activity_timestamp) > status.allow_discard_timeout
+        ]
+
+        for expired_uuid in expired:
+            _revert_episode_to_unclaimed(expired_uuid)
+        return expired
+
+    def _revert_episode_to_unclaimed(episode_uuid: str):
+        """Put a claimed episode back into the unclaimed pool; no-op if the episode is missing or not claimed."""
+        key = f"episodes-{episode_uuid}"
+        with shared_mem_dict_lock:
+            # Re-check under the lock: another thread may have removed or updated the entry since the scan.
+            if key not in shared_mem_dict or shared_mem_dict[key].episode_status != "claimed":
+                return
+
+            logger.warning(f"Reverting episode {episode_uuid} to unclaimed due to client timeout.")
+            es: EpisodeStatus = shared_mem_dict[key]
+            es.episode_status = "registered"
+            es.client_uuid = ""
+            es.latest_activity_timestamp = time.time()
+            es.allow_discard_timeout = -1
+            shared_mem_dict[key] = es  # explicit write-back (presumably the shared dict does not track in-place mutation; confirm)
+            shared_mem_dict['unclaimed_episodes'] += [episode_uuid]
+
+
+    async def register_episode_ready_listener():
+        """Background loop: log episode statuses, sleep 10 seconds, then revert timed-out claims; never returns."""
+        while True:
+            read_all_episode_status()
+            await asyncio.sleep(10)  # check every 10 seconds
+            find_claimed_episodes_that_need_to_be_unclaimed()
+
+    def read_all_episode_status() -> Optional[EpisodeStatus]:
+        """Log the engine status and every tracked episode, grouped by episode status; always returns None."""
+        # Bucket every "episodes-*" entry by its episode_status; other keys are ignored.
+        episodes_by_status = {}
+        for key, status in shared_mem_dict.items():
+            if not key.startswith("episodes-"):
+                continue
+            episodes_by_status.setdefault(status.episode_status, []).append(status)
+
+        report_lines = []
+        for status_name, episodes in episodes_by_status.items():
+            report_lines.append(f"--- {status_name} (time since last activity) ---")
+            # One tab-separated line per status: "<first 6 uuid chars>(<idle seconds>s)".
+            report_lines.append("".join(
+                f"{ep.episode_uuid[:6]}({time.time() - ep.latest_activity_timestamp:.1f}s)\t"
+                for ep in episodes
+            ))
+
+        # The engine status is logged on every call, even when no episode is tracked.
+        logger.info(f"Current engine status: [{shared_mem_dict['engine_status']}]")
+        if report_lines:
+            report = "\n".join(report_lines)
+            logger.info(f"Current episode statuses:\n{report}")
+        else:
+            logger.info("Current episode statuses: [NA]")
+
+        return None
+
+
+    # Example status line logged by read_all_episode_status(): a1b2c3(15.1s)	d4e5f6(20.3s)	0a9b8c(5.0s)
+
+
+
+    # --------------------------------------------------------------------
+    # -------------------------- fastapi routes --------------------------
+    # --------------------------------------------------------------------
+
     @app.post("/sync_train_config")
     async def sync_train_config(req: SyncTrainConfigRequest):
         """
@@ -120,7 +198,7 @@ async def start_engine():
 
             # Setup environment variables
             exp_config['ajet']['interchange_server']['already_started'] = True
-            exp_config['ajet']['interchange_server']['interchange_server_port'] = int(os.getenv("AJET_DAT_INTERCHANGE_PORT"))
+            exp_config['ajet']['interchange_server']['interchange_server_port'] = int(os.getenv("AJET_DAT_INTERCHANGE_PORT"))   # type: ignore
             env, exp_config = setup_environment_vars(args, exp_config, main_yaml_fp)
 
             # Start ray if not already started
@@ -163,11 +241,12 @@ async def start_engine():
 
 
     # --- engine status ---
-    shared_mem_dict['engine_status'] = "ENGINE.OFF"
+    shared_mem_dict['engine_status'] = "ENGINE.OFFLINE"
     @app.post("/update_engine_status", response_model=BoolResponse)
     async def update_engine_status(req: UpdateEngineStatusRequest):
+        """Update the current engine status."""
         if req.engine_status not in [
-            "ENGINE.OFF",
+            "ENGINE.OFFLINE",
             "ENGINE.BOOTING",
             "ENGINE.ROLLING",
             "ENGINE.WEIGHT_SYNCING",
@@ -180,14 +259,15 @@ async def update_engine_status(req: UpdateEngineStatusRequest):
 
     @app.get("/get_engine_status")
     async def get_engine_status():
+        """Get the current engine status."""
         status = shared_mem_dict['engine_status']
         return {"engine_status": status}
 
 
     # --- episode status ---
     @app.post("/register_episode", response_model=BoolResponse)
     async def register_episode(req: RegisterEpisodeRequest):
-
+        """(From task_runner) Register a new episode as ready to roll."""
         episode_uuid = req.episode_uuid
         es = EpisodeStatus(
             episode_uuid=req.episode_uuid,
@@ -210,8 +290,30 @@ async def register_episode(req: RegisterEpisodeRequest):
 
     @app.post("/claim_episode", response_model=ClaimEpisodeResponse)
     async def claim_episode(req: ClaimEpisodeRequest):
+        """(From client) Claim an available episode to rollout."""
         find_claimed_episodes_that_need_to_be_unclaimed()
 
+        engine_status = shared_mem_dict['engine_status']
+        if engine_status != "ENGINE.ROLLING":
+            fail_cause = f"Engine not ready. Current status: [{engine_status}]."
+            advise = ""
+            if engine_status == "ENGINE.OFFLINE":
+                advise = "Please start the engine first. Please use one of the client to run `client.sync_train_config() + client.start_engine()` to start the engine."
+            elif engine_status == "ENGINE.BOOTING":
+                advise = "Please wait until the engine is fully booted. Try again (maybe 1 minute) later."
+            elif engine_status == "ENGINE.WEIGHT_SYNCING":
+                advise = "Engine is syncing weights. Try again (maybe 1 minute) later."
+            elif engine_status == "ENGINE.WEIGHT_EXPORTING":
+                advise = "Engine is exporting weights (fsdp -> hf safetensor). Try again (maybe 1 minute) later."
+            return ClaimEpisodeResponse(
+                success=False,
+                client_uuid=req.client_uuid,
+                episode_uuid="",
+                openai_base_url="",
+                openai_api_key="",
+                fail_cause=fail_cause + " " + advise,
+            )
+
         with shared_mem_dict_lock:
             if len(shared_mem_dict['unclaimed_episodes']) <= 0:
                 return ClaimEpisodeResponse(
@@ -248,41 +350,6 @@ async def claim_episode(req: ClaimEpisodeRequest):
         )
 
 
-    def find_claimed_episodes_that_need_to_be_unclaimed() -> List[str]:
-        result = []
-        current_time = time.time()
-
-        for k, v in shared_mem_dict.items():
-            if k.startswith("episodes-"):
-                es:EpisodeStatus = v
-                if es.episode_status == "claimed":
-                    if (current_time - es.latest_activity_timestamp) > es.allow_discard_timeout:
-                        result.append(es.episode_uuid)
-
-        for episode_uuid in result:
-            _revert_episode_to_unclaimed(episode_uuid)
-
-        return result
-
-
-    def _revert_episode_to_unclaimed(episode_uuid: str):
-        with shared_mem_dict_lock:
-            # check status again, because other thread may have changed it
-            if shared_mem_dict[f"episodes-{episode_uuid}"].episode_status != "claimed":
-                return
-
-            # revert
-            logger.info(f"Reverting episode {episode_uuid} to unclaimed due to client timeout.")
-            if f"episodes-{episode_uuid}" in shared_mem_dict:
-                es:EpisodeStatus = shared_mem_dict[f"episodes-{episode_uuid}"]
-                es.episode_status = "registered"
-                es.client_uuid = ""
-                es.latest_activity_timestamp = time.time()
-                es.allow_discard_timeout = -1
-                shared_mem_dict[f"episodes-{episode_uuid}"] = es
-                shared_mem_dict['unclaimed_episodes'] += [episode_uuid]
-
-
     @app.post("/end_episode", response_model=EndEpisodeResponse)
     async def end_episode(req: EndEpisodeRequest):
         # receive workflow output data
@@ -312,6 +379,10 @@ async def end_episode(req: EndEpisodeRequest):
         for _ in range(5):  # max 5 minutes wait
             try:
                 if DEBUG: logger.info(f"[server] episode_uuid: {episode_uuid} | recv_string begin.")
+                # <wait for>:
+                #   <from_sourcefile>: ajet/task_runner/tinkerscript_runner.py
+                #   <from_code>: zmq_socket.send_string("ack")
+                #   <expect>: "ack"
                 result_str = socket.recv_string()
                 break
             except zmq.Again as e:
@@ -345,9 +416,4 @@ async def get_episode_buffer():
         return EpisodeBufferResponse(buffer=result)
 
 
-
-    async def register_episode_ready_listener():
-        pass
-
-
     return app, register_episode_ready_listener()
diff --git a/ajet/tuner_lib/weight_tuner/experimental/interchange_utils.py b/ajet/tuner_lib/weight_tuner/experimental/interchange_utils.py
diff --git a/ajet_tinkerscript_threading.py b/ajet_tinkerscript_threading.py

Original file line number	Diff line number	Diff line change
`@@ -444,7 +444,6 @@ def init_workers(self):`
`444`	`444`	`)`
`445`	`445`
`446`	`446`	`def _update_interchange_server_status_flag(self, status: str):`
`447`		`- # if interchange server is enabled, change engine status to ROLLING`
`448`	`447`	`if self.config.ajet.enable_experimental_interchange_server:`
`449`	`448`	`if self.config.ajet.enable_tinkerscript_mode:`
`450`	`449`	`from ajet.tuner_lib.weight_tuner.experimental.interchange_utils import http_change_engine_status`