[data] feat: TransferQueue - Support AgentLoop performance metrics & minor fix (verl-project#4289)

0oshowero0 · web-flow · commit a8a32903849c · 2025-11-26T18:45:25.000+08:00
### What does this PR do? 1. Support performance metrics statistics that require tensor data 2. Add stand-alone config structure for TransferQueue 3. Modify TransferQueue initialization process to suit multiple backends 4. Fix `create_transferqueue_client` usage 5. Unify some function names 6. Add TODO ### Checklist Before Starting - [x] Search for similar PRs. Paste at least one query link here: ... - [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test > For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. ```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). 
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [x] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).) --------- Signed-off-by: 0oshowero0 <o0shower0o@outlook.com>
diff --git a/recipe/transfer_queue/agent_loop.py b/recipe/transfer_queue/agent_loop.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
+
 import numpy as np
 import ray
 from transfer_queue import BatchMeta
@@ -65,19 +67,33 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Batc
         timing["agent_loop/tool_calls/max"] = t_tool_calls.max()
         timing["agent_loop/tool_calls/mean"] = t_tool_calls.mean()
 
-        # TODO (TQ): pass tq info throughout AgentLoop so we can retrieve tensor for these metrics
+        # TODO (TQ): initialize the TQ client during init once the TQ enable switch is stable
+        tq_client = self._create_transferqueue_client()
         # batch sequence generation is bounded by the slowest sample
-        # slowest = np.argmax(t_generate_sequences + t_tool_calls)
-        # attention_mask = output.extra_info.pop("attention_mask_perf")[slowest]
-        # prompt_length = output.extra_info.pop("prompts_perf").shape[1]
-        # timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
-        # timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
-        # timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
-        # timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
+        slowest = np.argmax(t_generate_sequences + t_tool_calls)
+        attention_mask = asyncio.run(tq_client.async_get_data(output[slowest]))["attention_mask"]
+        prompt_length = output.samples[0].fields["prompts"].shape[0]
+        timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest]
+        timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest]
+        timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item()
+        timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item()
 
         return timing
 
-    def create_transferqueue_client(self, controller_info, config):
-        ray.get(
-            [worker.create_transferqueue_client.remote(controller_info, config) for worker in self.agent_loop_workers]
+    def create_transferqueue_client_for_workers(self):
+        # TODO (TQ): initialize the TQ client during worker init once the TQ enable switch is stable
+        ray.get([worker.create_transferqueue_client.remote() for worker in self.agent_loop_workers])
+
+    def _create_transferqueue_client(self):
+        """Create a client for data system (TransferQueue)."""
+        from verl.single_controller.ray.base import get_random_string
+        from verl.utils.transferqueue_utils import create_transferqueue_client
+
+        client_name = get_random_string(length=6)
+
+        tq_client = create_transferqueue_client(
+            client_id=f"AgentLoopManager_{client_name}",
+            config=self.config.transfer_queue,
         )
+
+        return tq_client
diff --git a/recipe/transfer_queue/config/transfer_queue_ppo_megatron_trainer.yaml b/recipe/transfer_queue/config/transfer_queue_ppo_megatron_trainer.yaml
@@ -9,3 +9,6 @@ defaults:
 # config for TransferQueue
 transfer_queue:
   enable: True
+  num_global_batch: 1
+  storage_backend: AsyncSimpleStorageManager
+  num_data_storage_units: 8
diff --git a/recipe/transfer_queue/config/transfer_queue_ppo_trainer.yaml b/recipe/transfer_queue/config/transfer_queue_ppo_trainer.yaml
@@ -9,3 +9,6 @@ defaults:
 # config for TransferQueue
 transfer_queue:
   enable: True
+  num_global_batch: 1
+  storage_backend: AsyncSimpleStorageManager
+  num_data_storage_units: 8
diff --git a/recipe/transfer_queue/ray_trainer.py b/recipe/transfer_queue/ray_trainer.py
diff --git a/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh b/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh
@@ -63,7 +63,5 @@ python3 -m recipe.transfer_queue.main_ppo \
     trainer.total_epochs=15 \
     trainer.total_training_steps=2 \
     trainer.val_before_train=False \
-    +trainer.num_global_batch=1 \
-    +trainer.num_data_storage_units=8 \
     2>&1 | tee "$log_file"
 echo "Finished, log is saved in: $log_file"
diff --git a/tests/special_e2e/run_transferqueue.sh b/tests/special_e2e/run_transferqueue.sh
@@ -122,8 +122,6 @@ common_params=(
     trainer.total_training_steps=2
     trainer.total_epochs=15
     trainer.val_before_train=True
-    +trainer.num_global_batch=1
-    +trainer.num_data_storage_units=8
 )
 
 if [ "${ACTOR_STRATEGY}" == "fsdp" ]; then
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
@@ -633,16 +633,18 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto:
             meta_info={"metrics": metrics, "reward_extra_keys": reward_extra_keys},
         )
 
-    def create_transferqueue_client(self, controller_info, role):
-        """Create a client for data system(transfer queue)."""
+    def create_transferqueue_client(
+        self,
+    ):
+        """Create a client for data system (TransferQueue)."""
         from verl.single_controller.ray.base import get_random_string
         from verl.utils.transferqueue_utils import create_transferqueue_client
 
         client_name = get_random_string(length=6)
-        create_transferqueue_client(
-            client_id=f"{role}_worker_{client_name}",
-            controller_info=controller_info,
-            config=self.config,
+
+        self.tq_client = create_transferqueue_client(
+            client_id=f"AgentLoopWorker_{client_name}",
+            config=self.config.transfer_queue,
         )
 
 
diff --git a/verl/single_controller/base/worker.py b/verl/single_controller/base/worker.py
@@ -163,13 +163,12 @@ def set_dispatch_collect(self, mesh_name: str, dispatch_dp_rank: dict[str, int],
             self.__collect_dp_rank[mesh_name] = is_collect
 
     @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=True)
-    def create_transferqueue_client(self, controller_info, config):
+    def create_transferqueue_client(self, config):
         from verl.utils.transferqueue_utils import create_transferqueue_client
 
         create_transferqueue_client(
             client_id=f"worker_{self.rank}",
-            controller_info=controller_info,
-            config=config,
+            config=config.transfer_queue,
         )
 
     @classmethod
diff --git a/verl/utils/transferqueue_utils.py b/verl/utils/transferqueue_utils.py
@@ -25,7 +25,6 @@
     from transfer_queue import (
         AsyncTransferQueueClient,
         BatchMeta,
-        ZMQServerInfo,
     )
 
 except ImportError:
@@ -44,18 +43,21 @@ class BatchMeta:
 
 def create_transferqueue_client(
     client_id: str,
-    controller_info: "ZMQServerInfo",
     config,
-) -> None:
+) -> "AsyncTransferQueueClient":
     global _TRANSFER_QUEUE_CLIENT
-    _TRANSFER_QUEUE_CLIENT = AsyncTransferQueueClient(client_id, controller_info)
-    _TRANSFER_QUEUE_CLIENT.initialize_storage_manager(manager_type="AsyncSimpleStorageManager", config=config)
+    if _TRANSFER_QUEUE_CLIENT is None:
+        _TRANSFER_QUEUE_CLIENT = AsyncTransferQueueClient(client_id, config.controller_info)
+        _TRANSFER_QUEUE_CLIENT.initialize_storage_manager(manager_type=config.storage_backend, config=config)
+
+    return _TRANSFER_QUEUE_CLIENT
 
 
 def get_transferqueue_client() -> "AsyncTransferQueueClient":
     return _TRANSFER_QUEUE_CLIENT
 
 
+# TODO (TQ): verl will make all actors async, so this can be cleaned up later.
 def _run_async_in_temp_loop(async_func: Callable[..., Any], *args, **kwargs) -> Any:
     # Use a temporary event loop in a new thread because event
     # loop may already exist in server mode
@@ -127,7 +129,7 @@ def _update_batchmeta_with_output(output: DataProto, batchmeta: "BatchMeta") ->
 
 
 def tqbridge(put_data: bool = True):
-    """ "Creates a decorator for bridging BatchMeta and DataProto.
+    """Creates a decorator for bridging BatchMeta and DataProto.
 
     This decorator automatically handles conversions between `BatchMeta` and
     `DataProto` in function parameters, and decides whether to sync function

Original file line number	Diff line number	Diff line change
`@@ -122,8 +122,6 @@ common_params=(`
`122`	`122`	`trainer.total_training_steps=2`
`123`	`123`	`trainer.total_epochs=15`
`124`	`124`	`trainer.val_before_train=True`
`125`		`- +trainer.num_global_batch=1`
`126`		`- +trainer.num_data_storage_units=8`
`127`	`125`	`)`
`128`	`126`
`129`	`127`	`if [ "${ACTOR_STRATEGY}" == "fsdp" ]; then`