From 87569f2bab0824e8de6ea64a057fb76f4ddb4f0b Mon Sep 17 00:00:00 2001 From: wangqijian <221900059@smail.nju.edu.cn> Date: Thu, 16 Jan 2025 12:08:31 +0000 Subject: [PATCH 01/18] vllm_test.py in ding/worker --- ding/worker/vllm_test_wqj.py | 222 +++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 ding/worker/vllm_test_wqj.py diff --git a/ding/worker/vllm_test_wqj.py b/ding/worker/vllm_test_wqj.py new file mode 100644 index 0000000000..de80dc7da1 --- /dev/null +++ b/ding/worker/vllm_test_wqj.py @@ -0,0 +1,222 @@ +from typing import List, Tuple +import os +import uuid +from loguru import logger +from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from vllm.assets.image import ImageAsset + +class VllmActor: + def __init__(self, model_path: str) -> None: + """ + Overview: + Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. + Arguments: + - model_path (str): The path to the language model. + """ + self.free_gpus = self.get_free_gpus() + self.num_gpus = len(self.free_gpus) + assert self.num_gpus > 0, "No GPUs found" + # Set CUDA_VISIBLE_DEVICES to use only free GPUs + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) + self.model_path = model_path + self._initialize() + + def get_free_gpus(self) -> List[int]: + """ + Overview: + Get IDs of GPUs with free memory. + Returns: + - List[int]: The IDs of the free GPUs. + """ + try: + # Get GPU memory usage using nvidia-smi + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader').readlines() + free_gpus = [] + + for gpu_id, stats in enumerate(gpu_stats): + mem_used, mem_total = map(int, stats.strip().split(',')) + # Consider GPU as free if less than 5% memory is used + if mem_used / mem_total < 0.05: + free_gpus.append(gpu_id) + + return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found + except Exception: + logger.warning("Failed to get GPU stats, defaulting to GPU 0") + return [0] + + def _initialize(self) -> None: + """ + Overview: + Initialize the vLLM actor with a series of arguments. + """ + logger.info("Initializing vLLM") + # TODO: Try other options in https://docs.vllm.ai/en/stable/models/engine_args.html#engine-args. + engine_args = AsyncEngineArgs( + model=self.model_path, + tensor_parallel_size=self.num_gpus, + max_num_batched_tokens=8192, + max_model_len=8192, + #max_model_len=4096, #see if 8192 works + #max_num_batched_tokens=4096, + #max_num_batched_tokens=2048, + #max_model_len=2048, + # enable_chunked_prefill=True, + max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: float = 0) -> RequestOutput: + """ + Overview: + Generate tactics for the current state. + Arguments: + - prompt : The prompt to generate tactics. + - num_samples (int): The number of tactics to generate. + - max_tokens (int): The maximum number of tokens to generate. + - temperature (float): The temperature for the language model, default to 0. + Returns: + - RequestOutput: The generated tactics and their log-probabilities. + """ + sampling_params = SamplingParams( + n=num_samples, + max_tokens=max_tokens, + temperature=temperature, + ) + + # Using async iterator to handle vLLM's generation process + # 1. 
vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs + # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results + # 3. This approach is particularly suitable for LLM inference which can be time-consuming + # 4. The request_id ensures unique identification for each generation request + async for oup in self.engine.generate( + prompt, sampling_params, request_id=str(uuid.uuid4().hex) + ): + final_output = oup + return final_output + + +class HuggingFaceModelGenerator: + """ + Overview: + A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. + """ + + def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = 0) -> None: + """ + Overview: + Initialize the Hugging Face model generator. + Arguments: + - model_path (str): The path to the language model. + - max_tokens (int): The maximum number of tokens to generate, default to 1024. + - temperature (float): The temperature for the language model, default to 0. + """ + self.vllm_actor = VllmActor(model_path) + self.max_tokens = max_tokens + self.temperature = temperature + + async def generate( + self, + prompt, + num_samples: int, + ) -> List[Tuple[str, float]]: + """ + Overview: + Generate tactics for the current state. + Arguments: + - prompt : The prompt to generate tactics. + - num_samples (int): The number of tactics to generate. + Returns: + - List[Tuple[str, float]]: The generated tactics and their log-probabilities. + + .. note:: + This method is asynchronous and returns a coroutine. + """ + response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature) + # Use raw logprobs as confidence scores + confidence_scores = [x.cumulative_logprob for x in response.outputs] + return [ + (x.text.strip(), conf) + for x, conf in zip(response.outputs, confidence_scores) + ] + + +model=HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B',temperature=0.5) #设置一个temperature就好了,可以做到生成多个候选答案 + +def get_prompts_qwen(questions: list,modality: str): + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + stop_token_ids = None + return prompts,stop_token_ids + +def get_multi_modal_input(modality,filenames,questions): + """ + return { + "data": image or video, + "question": question, + } + """ + if modality == "image": + # Input image and question + ret={} + ret["data"]=[] + ret["question"]=[] + for filename,question in zip(filenames,questions): + image = ImageAsset(filename) \ + .pil_image.convert("RGB") + #img_question = "What is the content of this image?" + img_question=question + ret["data"].append(image) + ret["question"].append(img_question) + return ret + + +questions=["What is the content of this image?","Please describe the image.","How many people are there in the image? 
What are they doing?"] +img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)','/mnt/afs/wangqijian/data/test/test'] +#questions=["What is the content of this image?"] +#img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)'] +num_prompts=len(questions) +image_repeat_prob=None +modality = 'image' + +mm_input = get_multi_modal_input(modality,img_names,questions) +data = mm_input["data"] +question = mm_input["question"] +batch_inference_mine=True +prompts, stop_token_ids = get_prompts_qwen(question,modality) + + +import asyncio +import nest_asyncio +nest_asyncio.apply() +async def main(): + inputs = [ + { + "prompt":prompt, + "multi_modal_data":{ + modality:data + } + } for prompt,data in zip(prompts,data) + ] + # 调用 generate 方法 + for in_data in inputs: + tactics = await model.generate(prompt=in_data, num_samples=3) + # 打印返回结果 + for tactic, confidence in tactics: + print(f"Tactic: {tactic}") + + +# 运行主程序 +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From ffd435b19b89f2d958cdd50b8195e8f3575a8410 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Tue, 21 Jan 2025 05:51:23 +0000 Subject: [PATCH 02/18] add vllm_collector and test_vllm --- .../collector/tests/test_vllm_collector.py | 96 ++++++++++++++++++ .../vllm_collector.py} | 97 ++----------------- 2 files changed, 105 insertions(+), 88 deletions(-) create mode 100644 ding/worker/collector/tests/test_vllm_collector.py rename ding/worker/{vllm_test_wqj.py => collector/vllm_collector.py} (64%) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py new file mode 100644 index 0000000000..2b2795e00f --- /dev/null +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -0,0 +1,96 @@ +from typing import List, Tuple +import os +import uuid +from loguru import logger +from ..vllm_collector import HuggingFaceModelGenerator +from vllm.assets.image import ImageAsset + +# set a temperature > 0 to get multiple responses +# note that HuggingFaceModelGenerator has a parameter "mm_processor_kwargs" which is set to align with the settings of Qwen in default +model=HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B',temperature=0.5) + +def get_prompts_qwen(questions: list,modality: str): + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + else : + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") for question in questions] + stop_token_ids = None + return prompts,stop_token_ids + +def get_multi_modal_input(modality:str ,filenames:list, questions:list): + """ + return { + "data": image or video, + "question": question, + } + """ + if modality == "image": + # Input image and question + ret={ + 'data':[], + 'question':[] + } + for filename,question in zip(filenames,questions): + image = ImageAsset(filename) \ + .pil_image.convert("RGB") + #img_question = "What is the content of this image?" + img_question=question + ret["data"].append(image) + ret["question"].append(img_question) + else: + msg = f"Modality {modality} is not supported." 
+ raise ValueError(msg) + return ret + + +questions=["What is the content of this image?","Please describe the image.","How many people are there in the image? What are they doing?"] +img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)','/mnt/afs/wangqijian/data/test/test'] + +num_prompts=len(questions) +image_repeat_prob=None +from enum import Enum + +class Modality(Enum): + IMAGE = 'image' + TEXT = 'text' + VIDEO = 'video' + +modality=Modality.IMAGE.value + +mm_input = get_multi_modal_input(modality,img_names,questions) +data = mm_input["data"] +question = mm_input["question"] +prompts, stop_token_ids = get_prompts_qwen(question,modality) + + +import asyncio +import nest_asyncio +nest_asyncio.apply() +async def main(): + inputs = [ + { + "prompt":prompt, + "multi_modal_data":{ + modality:data + } + } for prompt,data in zip(prompts,data) + ] + # generate responses + for in_data in inputs: + responses = await model.generate(prompt=in_data, num_samples=3) + # print response + for response, confidence in responses: + print(f"Response: {response}") + + +# run main +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/ding/worker/vllm_test_wqj.py b/ding/worker/collector/vllm_collector.py similarity index 64% rename from ding/worker/vllm_test_wqj.py rename to ding/worker/collector/vllm_collector.py index de80dc7da1..c5993c4bdf 100644 --- a/ding/worker/vllm_test_wqj.py +++ b/ding/worker/collector/vllm_collector.py @@ -3,10 +3,10 @@ import uuid from loguru import logger from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput -from vllm.assets.image import ImageAsset + class VllmActor: - def __init__(self, model_path: str) -> None: + def __init__(self, model_path: str,mm_processor_kwargs: dict) -> None: """ Overview: Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. @@ -19,6 +19,7 @@ def __init__(self, model_path: str) -> None: # Set CUDA_VISIBLE_DEVICES to use only free GPUs os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) self.model_path = model_path + self.mm_processor_kwargs=mm_processor_kwargs self._initialize() def get_free_gpus(self) -> List[int]: @@ -56,17 +57,10 @@ def _initialize(self) -> None: tensor_parallel_size=self.num_gpus, max_num_batched_tokens=8192, max_model_len=8192, - #max_model_len=4096, #see if 8192 works - #max_num_batched_tokens=4096, - #max_num_batched_tokens=2048, - #max_model_len=2048, # enable_chunked_prefill=True, max_num_seqs=5, # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - }, + mm_processor_kwargs=self.mm_processor_kwargs, ) self.engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -106,7 +100,10 @@ class HuggingFaceModelGenerator: A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. """ - def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = 0) -> None: + def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }) -> None: """ Overview: Initialize the Hugging Face model generator. @@ -115,7 +112,7 @@ def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = - max_tokens (int): The maximum number of tokens to generate, default to 1024. 
- temperature (float): The temperature for the language model, default to 0. """ - self.vllm_actor = VllmActor(model_path) + self.vllm_actor = VllmActor(model_path,mm_processor_kwargs) self.max_tokens = max_tokens self.temperature = temperature @@ -144,79 +141,3 @@ async def generate( for x, conf in zip(response.outputs, confidence_scores) ] - -model=HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B',temperature=0.5) #设置一个temperature就好了,可以做到生成多个候选答案 - -def get_prompts_qwen(questions: list,modality: str): - if modality == "image": - placeholder = "<|image_pad|>" - elif modality == "video": - placeholder = "<|video_pad|>" - - prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions] - stop_token_ids = None - return prompts,stop_token_ids - -def get_multi_modal_input(modality,filenames,questions): - """ - return { - "data": image or video, - "question": question, - } - """ - if modality == "image": - # Input image and question - ret={} - ret["data"]=[] - ret["question"]=[] - for filename,question in zip(filenames,questions): - image = ImageAsset(filename) \ - .pil_image.convert("RGB") - #img_question = "What is the content of this image?" - img_question=question - ret["data"].append(image) - ret["question"].append(img_question) - return ret - - -questions=["What is the content of this image?","Please describe the image.","How many people are there in the image? What are they doing?"] -img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)','/mnt/afs/wangqijian/data/test/test'] -#questions=["What is the content of this image?"] -#img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)'] -num_prompts=len(questions) -image_repeat_prob=None -modality = 'image' - -mm_input = get_multi_modal_input(modality,img_names,questions) -data = mm_input["data"] -question = mm_input["question"] -batch_inference_mine=True -prompts, stop_token_ids = get_prompts_qwen(question,modality) - - -import asyncio -import nest_asyncio -nest_asyncio.apply() -async def main(): - inputs = [ - { - "prompt":prompt, - "multi_modal_data":{ - modality:data - } - } for prompt,data in zip(prompts,data) - ] - # 调用 generate 方法 - for in_data in inputs: - tactics = await model.generate(prompt=in_data, num_samples=3) - # 打印返回结果 - for tactic, confidence in tactics: - print(f"Tactic: {tactic}") - - -# 运行主程序 -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file From 4d60f88406c65d123fd7d612c0309439f76d9db4 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Tue, 21 Jan 2025 13:04:18 +0000 Subject: [PATCH 03/18] formatted vllm & test_vllm --- .../collector/tests/test_vllm_collector.py | 75 ++++++++++--------- ding/worker/collector/vllm_collector.py | 34 +++++---- 2 files changed, 59 insertions(+), 50 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py index 2b2795e00f..58a675532e 100644 --- a/ding/worker/collector/tests/test_vllm_collector.py +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -4,28 +4,34 @@ from loguru import logger from ..vllm_collector import HuggingFaceModelGenerator from vllm.assets.image import ImageAsset - + # set a temperature > 0 to get multiple responses -# note that HuggingFaceModelGenerator has a 
parameter "mm_processor_kwargs" which is set to align with the settings of Qwen in default -model=HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B',temperature=0.5) +# note that HFModelGenerator has a parameter "mm_processor_kwargs" set to align with the settings of Qwen in default +model = HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B', temperature=0.5) + -def get_prompts_qwen(questions: list,modality: str): +def get_prompts_qwen(questions: list, modality: str): if modality == "image": placeholder = "<|image_pad|>" elif modality == "video": placeholder = "<|video_pad|>" - else : + else: msg = f"Modality {modality} is not supported." raise ValueError(msg) - prompts = [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions] + prompts = [ + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) for question in questions + ] stop_token_ids = None - return prompts,stop_token_ids + return prompts, stop_token_ids + -def get_multi_modal_input(modality:str ,filenames:list, questions:list): +def get_multi_modal_input(modality: str, filenames: list, questions: list): """ return { "data": image or video, @@ -34,15 +40,12 @@ def get_multi_modal_input(modality:str ,filenames:list, questions:list): """ if modality == "image": # Input image and question - ret={ - 'data':[], - 'question':[] - } - for filename,question in zip(filenames,questions): + ret = {'data': [], 'question': []} + for filename, question in zip(filenames, questions): image = ImageAsset(filename) \ .pil_image.convert("RGB") #img_question = "What is the content of this image?" - img_question=question + img_question = question ret["data"].append(image) ret["question"].append(img_question) else: @@ -51,46 +54,48 @@ def get_multi_modal_input(modality:str ,filenames:list, questions:list): return ret -questions=["What is the content of this image?","Please describe the image.","How many people are there in the image? What are they doing?"] -img_names=['/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)','/mnt/afs/wangqijian/data/test/test'] +questions = [ + "What is the content of this image?", "Please describe the image.", + "How many people are there in the image? What are they doing?" 
+] +img_names = [ + '/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)', '/mnt/afs/wangqijian/data/test/test' +] -num_prompts=len(questions) -image_repeat_prob=None +num_prompts = len(questions) +image_repeat_prob = None from enum import Enum + class Modality(Enum): IMAGE = 'image' TEXT = 'text' VIDEO = 'video' -modality=Modality.IMAGE.value -mm_input = get_multi_modal_input(modality,img_names,questions) +modality = Modality.IMAGE.value + +mm_input = get_multi_modal_input(modality, img_names, questions) data = mm_input["data"] question = mm_input["question"] -prompts, stop_token_ids = get_prompts_qwen(question,modality) - +prompts, stop_token_ids = get_prompts_qwen(question, modality) import asyncio import nest_asyncio nest_asyncio.apply() + + async def main(): - inputs = [ - { - "prompt":prompt, - "multi_modal_data":{ - modality:data - } - } for prompt,data in zip(prompts,data) - ] + inputs = [{"prompt": prompt, "multi_modal_data": {modality: data}} for prompt, data in zip(prompts, data)] # generate responses for in_data in inputs: responses = await model.generate(prompt=in_data, num_samples=3) # print response for response, confidence in responses: print(f"Response: {response}") - + # run main if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/ding/worker/collector/vllm_collector.py b/ding/worker/collector/vllm_collector.py index c5993c4bdf..3bd138587f 100644 --- a/ding/worker/collector/vllm_collector.py +++ b/ding/worker/collector/vllm_collector.py @@ -6,7 +6,8 @@ class VllmActor: - def __init__(self, model_path: str,mm_processor_kwargs: dict) -> None: + + def __init__(self, model_path: str, mm_processor_kwargs: dict) -> None: """ Overview: Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. @@ -19,7 +20,7 @@ def __init__(self, model_path: str,mm_processor_kwargs: dict) -> None: # Set CUDA_VISIBLE_DEVICES to use only free GPUs os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) self.model_path = model_path - self.mm_processor_kwargs=mm_processor_kwargs + self.mm_processor_kwargs = mm_processor_kwargs self._initialize() def get_free_gpus(self) -> List[int]: @@ -31,7 +32,8 @@ def get_free_gpus(self) -> List[int]: """ try: # Get GPU memory usage using nvidia-smi - gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader').readlines() + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ + .readlines() free_gpus = [] for gpu_id, stats in enumerate(gpu_stats): @@ -81,7 +83,7 @@ async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: max_tokens=max_tokens, temperature=temperature, ) - + # Using async iterator to handle vLLM's generation process # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results @@ -100,10 +102,16 @@ class HuggingFaceModelGenerator: A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. 
""" - def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { + def __init__( + self, + model_path: str, + max_tokens: int = 1024, + temperature: float = 0, + mm_processor_kwargs: dict = { "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, - }) -> None: + } + ) -> None: """ Overview: Initialize the Hugging Face model generator. @@ -112,14 +120,14 @@ def __init__(self, model_path: str, max_tokens: int = 1024, temperature: float = - max_tokens (int): The maximum number of tokens to generate, default to 1024. - temperature (float): The temperature for the language model, default to 0. """ - self.vllm_actor = VllmActor(model_path,mm_processor_kwargs) + self.vllm_actor = VllmActor(model_path, mm_processor_kwargs) self.max_tokens = max_tokens self.temperature = temperature async def generate( - self, - prompt, - num_samples: int, + self, + prompt, + num_samples: int, ) -> List[Tuple[str, float]]: """ Overview: @@ -136,8 +144,4 @@ async def generate( response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature) # Use raw logprobs as confidence scores confidence_scores = [x.cumulative_logprob for x in response.outputs] - return [ - (x.text.strip(), conf) - for x, conf in zip(response.outputs, confidence_scores) - ] - + return [(x.text.strip(), conf) for x, conf in zip(response.outputs, confidence_scores)] From 0d311ec76763f18cc9231e82dc134f87fc578808 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Wed, 22 Jan 2025 04:27:26 +0000 Subject: [PATCH 04/18] enum+typing lint --- .../collector/tests/test_vllm_collector.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py index 58a675532e..509c733ce1 100644 --- a/ding/worker/collector/tests/test_vllm_collector.py +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Tuple, Optional import os import uuid from loguru import logger @@ -9,11 +9,16 @@ # note that HFModelGenerator has a parameter "mm_processor_kwargs" set to align with the settings of Qwen in default model = HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B', temperature=0.5) +from enum import Enum +class Modality(Enum): + IMAGE = "image" + TEXT = "text" + VIDEO = "video" -def get_prompts_qwen(questions: list, modality: str): - if modality == "image": +def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Optional[List[int]]]: + if modality == Modality.IMAGE: placeholder = "<|image_pad|>" - elif modality == "video": + elif modality == Modality.VIDEO: placeholder = "<|video_pad|>" else: msg = f"Modality {modality} is not supported." 
@@ -28,17 +33,17 @@ def get_prompts_qwen(questions: list, modality: str): ) for question in questions ] stop_token_ids = None - return prompts, stop_token_ids + return prompts,stop_token_ids -def get_multi_modal_input(modality: str, filenames: list, questions: list): +def get_multi_modal_input(modality: Modality, filenames: list, questions: list) -> dict: """ return { "data": image or video, "question": question, } """ - if modality == "image": + if modality == Modality.IMAGE: # Input image and question ret = {'data': [], 'question': []} for filename, question in zip(filenames, questions): @@ -65,16 +70,11 @@ def get_multi_modal_input(modality: str, filenames: list, questions: list): num_prompts = len(questions) image_repeat_prob = None -from enum import Enum -class Modality(Enum): - IMAGE = 'image' - TEXT = 'text' - VIDEO = 'video' -modality = Modality.IMAGE.value +modality = Modality.IMAGE mm_input = get_multi_modal_input(modality, img_names, questions) data = mm_input["data"] @@ -87,7 +87,7 @@ class Modality(Enum): async def main(): - inputs = [{"prompt": prompt, "multi_modal_data": {modality: data}} for prompt, data in zip(prompts, data)] + inputs = [{"prompt": prompt, "multi_modal_data": {modality.value: data}} for prompt, data in zip(prompts, data)] # generate responses for in_data in inputs: responses = await model.generate(prompt=in_data, num_samples=3) From e119c8592a766c1956c14d1eb1b22a48c8bd3848 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Wed, 22 Jan 2025 12:23:27 +0000 Subject: [PATCH 05/18] add test_vllm_collector_multigpu.py --- .../tests/test_vllm_collector__multigpu.py | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 ding/worker/collector/tests/test_vllm_collector__multigpu.py diff --git a/ding/worker/collector/tests/test_vllm_collector__multigpu.py b/ding/worker/collector/tests/test_vllm_collector__multigpu.py new file mode 100644 index 0000000000..fe0813a7a4 --- /dev/null +++ b/ding/worker/collector/tests/test_vllm_collector__multigpu.py @@ -0,0 +1,277 @@ +from typing import List, Tuple +import os +import uuid +from loguru import logger +from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput + + +class VllmActor: + def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> None: + """ + Overview: + Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. + Arguments: + - model_path (str): The path to the language model. + """ + self.free_gpus = free_gpus + self.num_gpus = len(self.free_gpus) + assert self.num_gpus > 0, "No GPUs found" + # Set CUDA_VISIBLE_DEVICES to use only free GPUs + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) + self.model_path = model_path + self.mm_processor_kwargs=mm_processor_kwargs + self._initialize() + + def _initialize(self) -> None: + """ + Overview: + Initialize the vLLM actor with a series of arguments. + """ + logger.info("Initializing vLLM") + # TODO: Try other options in https://docs.vllm.ai/en/stable/models/engine_args.html#engine-args. 
+ engine_args = AsyncEngineArgs( + model=self.model_path, + tensor_parallel_size=self.num_gpus, + max_num_batched_tokens=8192, + max_model_len=8192, + # enable_chunked_prefill=True, + max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs=self.mm_processor_kwargs, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: float = 0) -> RequestOutput: + """ + Overview: + Generate tactics for the current state. + Arguments: + - prompt : The prompt to generate tactics. + - num_samples (int): The number of tactics to generate. + - max_tokens (int): The maximum number of tokens to generate. + - temperature (float): The temperature for the language model, default to 0. + Returns: + - RequestOutput: The generated tactics and their log-probabilities. + """ + sampling_params = SamplingParams( + n=num_samples, + max_tokens=max_tokens, + temperature=temperature, + ) + + # Using async iterator to handle vLLM's generation process + # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs + # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results + # 3. This approach is particularly suitable for LLM inference which can be time-consuming + # 4. The request_id ensures unique identification for each generation request + async for oup in self.engine.generate( + prompt, sampling_params, request_id=str(uuid.uuid4().hex) + ): + final_output = oup + return final_output + + +class HuggingFaceModelGenerator: + """ + Overview: + A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. + """ + + def __init__(self, model_path: str, free_gpus:list, max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }) -> None: + """ + Overview: + Initialize the Hugging Face model generator. + Arguments: + - model_path (str): The path to the language model. + - max_tokens (int): The maximum number of tokens to generate, default to 1024. + - temperature (float): The temperature for the language model, default to 0. + """ + self.vllm_actor = VllmActor(model_path,mm_processor_kwargs,free_gpus) + self.max_tokens = max_tokens + self.temperature = temperature + + async def generate( + self, + prompt, + num_samples: int, + ) -> List[Tuple[str, float]]: + """ + Overview: + Generate tactics for the current state. + Arguments: + - prompt : The prompt to generate tactics. + - num_samples (int): The number of tactics to generate. + Returns: + - List[Tuple[str, float]]: The generated tactics and their log-probabilities. + + .. note:: + This method is asynchronous and returns a coroutine. + """ + response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature) + # Use raw logprobs as confidence scores + confidence_scores = [x.cumulative_logprob for x in response.outputs] + return [ + (x.text.strip(), conf) + for x, conf in zip(response.outputs, confidence_scores) + ] + + +def get_free_gpus() -> List[int]: + """ + Overview: + Get IDs of GPUs with free memory. + Returns: + - List[int]: The IDs of the free GPUs. 
+ """ + try: + # Get GPU memory usage using nvidia-smi + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ + .readlines() + free_gpus = [] + + for gpu_id, stats in enumerate(gpu_stats): + mem_used, mem_total = map(int, stats.strip().split(',')) + # Consider GPU as free if less than 5% memory is used + if mem_used / mem_total < 0.05: + free_gpus.append(gpu_id) + + return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found + except Exception: + logger.warning("Failed to get GPU stats, defaulting to GPU 0") + return [0] + +def chunk_list(original_list, t): + # 使用列表推导式和切片 + new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] + return new_list + + +from typing import List, Tuple, Optional +import os +from loguru import logger +from vllm.assets.image import ImageAsset +from enum import Enum +import concurrent.futures +class Modality(Enum): + IMAGE = "image" + TEXT = "text" + VIDEO = "video" + +def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Optional[List[int]]]: + if modality == Modality.IMAGE: + placeholder = "<|image_pad|>" + elif modality == Modality.VIDEO: + placeholder = "<|video_pad|>" + else: + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + prompts = [ + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) for question in questions + ] + stop_token_ids = None + return prompts,stop_token_ids + + +def get_multi_modal_input(modality: Modality, filenames: list, questions: list) -> dict: + """ + return { + "data": image or video, + "question": question, + } + """ + if modality == Modality.IMAGE: + # Input image and question + ret = {'data': [], 'question': []} + for filename, question in zip(filenames, questions): + image = ImageAsset(filename) \ + .pil_image.convert("RGB") + #img_question = "What is the content of this image?" + img_question = question + ret["data"].append(image) + ret["question"].append(img_question) + else: + msg = f"Modality {modality} is not supported." 
+ raise ValueError(msg) + return ret + + +async def run_vllm_collector(gpu_id, prompts, model_path,temperature): + # 设置当前进程的可用GPU + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + + model = HuggingFaceModelGenerator(model_path,free_gpus=[gpu_id],temperature=temperature) # 实例化模型 + + responses_list = [] + for prompt in prompts: + responses = await model.generate(prompt, num_samples=3) + for response in responses: + responses_list.append(response) + print(f"[GPU {gpu_id}] Response: {response}") + + return responses_list + +import asyncio +import nest_asyncio +def start_collector(gpu_id, prompts, model_path,temperature): + # 在每个进程中运行事件循环 + results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path,temperature)) + return results + +def main(prompts, model_path, free_gpus,temperature): + num_tot=len(prompts) + num_gpu=len(free_gpus) + num_per_gpu=num_tot//num_gpu + prompts_per_gpu=chunk_list(prompts,num_per_gpu) + with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: + futures = [] + for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu): + futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path,temperature)) + + # 收集所有结果 + all_results = [] + for future in concurrent.futures.as_completed(futures): + all_results.extend(future.result()) + + # 保存结果的逻辑 + with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f: + for response in all_results: + f.write(f"{response}\n") + + + + +if __name__ == "__main__": + questions=['Please describe the image.','Please describe the image.', + 'What\'s the text in the image?','What\'s the text in the image?', + 'What is in the image?','What is in the image?','How many people are in the image?','How many people are in the image?', + 'What is the emotion of the main character of the image?','What is the emotion of the main character of the image?', + 'How many animals are in the image?','How many animals are in the image?', + 'What is the place of the image?','What is the place of the image?','What is the peroson doing?','What is the peroson doing?' 
+ ] + img_names=['/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4956)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2212)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3387)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4086)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4384)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5000)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1237)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(766)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6031)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)' + ] + free_gpus=get_free_gpus() + modality = Modality.IMAGE + mm_input = get_multi_modal_input(modality, img_names, questions) + data = mm_input["data"] + question = mm_input["question"] + prompts, stop_token_ids = get_prompts_qwen(question, modality) + model_path='/mnt/afs/share/Qwen2-VL-7B' + temperature=0.5 + main(prompts,model_path,free_gpus,temperature) \ No newline at end of file From bf3415575eeaab6e4df651bd4f64c65a87f3f0c6 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Wed, 22 Jan 2025 12:33:15 +0000 Subject: [PATCH 06/18] Add test_vllm_collector_multigpu --- .../tests/test_vllm_collector__multigpu.py | 103 ++++++++++-------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector__multigpu.py b/ding/worker/collector/tests/test_vllm_collector__multigpu.py index fe0813a7a4..e23058a27a 100644 --- a/ding/worker/collector/tests/test_vllm_collector__multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector__multigpu.py @@ -77,7 +77,8 @@ class HuggingFaceModelGenerator: A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. """ - def __init__(self, model_path: str, free_gpus:list, max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { + def __init__(self, model_path: str, free_gpus:list, + max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, }) -> None: @@ -120,31 +121,31 @@ async def generate( def get_free_gpus() -> List[int]: - """ - Overview: - Get IDs of GPUs with free memory. - Returns: - - List[int]: The IDs of the free GPUs. - """ - try: - # Get GPU memory usage using nvidia-smi - gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ - .readlines() - free_gpus = [] + """ + Overview: + Get IDs of GPUs with free memory. + Returns: + - List[int]: The IDs of the free GPUs. 
+ """ + try: + # Get GPU memory usage using nvidia-smi + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ + .readlines() + free_gpus = [] - for gpu_id, stats in enumerate(gpu_stats): - mem_used, mem_total = map(int, stats.strip().split(',')) - # Consider GPU as free if less than 5% memory is used - if mem_used / mem_total < 0.05: - free_gpus.append(gpu_id) + for gpu_id, stats in enumerate(gpu_stats): + mem_used, mem_total = map(int, stats.strip().split(',')) + # Consider GPU as free if less than 5% memory is used + if mem_used / mem_total < 0.05: + free_gpus.append(gpu_id) - return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found - except Exception: - logger.warning("Failed to get GPU stats, defaulting to GPU 0") - return [0] + return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found + except Exception: + logger.warning("Failed to get GPU stats, defaulting to GPU 0") + return [0] -def chunk_list(original_list, t): - # 使用列表推导式和切片 +def chunk_list(original_list:list, t:int) -> List[list]: + # chunk the list into sub_lists new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] return new_list @@ -204,11 +205,11 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) return ret -async def run_vllm_collector(gpu_id, prompts, model_path,temperature): - # 设置当前进程的可用GPU +async def run_vllm_collector(gpu_id:int, prompts:List, model_path:str,temperature:float) ->List[str]: + # set visible gpu os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - - model = HuggingFaceModelGenerator(model_path,free_gpus=[gpu_id],temperature=temperature) # 实例化模型 + # get a model on a single gpu + model = HuggingFaceModelGenerator(model_path,free_gpus=[gpu_id],temperature=temperature) responses_list = [] for prompt in prompts: @@ -220,13 +221,12 @@ async def run_vllm_collector(gpu_id, prompts, model_path,temperature): return responses_list import asyncio -import nest_asyncio -def start_collector(gpu_id, prompts, model_path,temperature): - # 在每个进程中运行事件循环 +def start_collector(gpu_id:int, prompts:list, model_path:str,temperature:float) ->List[str]: + # event loop in a process results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path,temperature)) return results -def main(prompts, model_path, free_gpus,temperature): +def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) -> None: num_tot=len(prompts) num_gpu=len(free_gpus) num_per_gpu=num_tot//num_gpu @@ -236,12 +236,12 @@ def main(prompts, model_path, free_gpus,temperature): for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu): futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path,temperature)) - # 收集所有结果 + # get all results all_results = [] for future in concurrent.futures.as_completed(futures): all_results.extend(future.result()) - # 保存结果的逻辑 + # save results with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f: for response in all_results: f.write(f"{response}\n") @@ -252,19 +252,32 @@ def main(prompts, model_path, free_gpus,temperature): if __name__ == "__main__": questions=['Please describe the image.','Please describe the image.', 'What\'s the text in the image?','What\'s the text in the image?', - 'What is in the image?','What is in the image?','How many people are in the image?','How many people are in the image?', - 'What is the emotion of the main character of the image?','What is the emotion of the main character of the image?', - 'How many 
animals are in the image?','How many animals are in the image?', - 'What is the place of the image?','What is the place of the image?','What is the peroson doing?','What is the peroson doing?' + 'What is in the image?','What is in the image?', + 'How many people are in the image?','How many people are in the image?', + 'What is the emotion of the main character of the image?', + 'What is the emotion of the main character of the image?', + 'How many animals are in the image?', + 'How many animals are in the image?', + 'What is the place of the image?','What is the place of the image?', + 'What is the peroson doing?','What is the peroson doing?' ] - img_names=['/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4956)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2212)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3387)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4086)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4384)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5000)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1237)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(766)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6031)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)','/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)' + img_names=[ + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4956)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2212)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3387)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4086)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4384)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5000)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1237)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(766)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6031)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)', + '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)' ] free_gpus=get_free_gpus() modality = Modality.IMAGE From 1794b6a6c90c64d622f6ff5632755d291440a293 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Wed, 22 Jan 2025 12:48:48 +0000 Subject: [PATCH 07/18] formatted test_vllm_collector_multigpu --- ...gpu.py => test_vllm_collector_multigpu.py} | 106 ++++++++++-------- 1 file changed, 57 insertions(+), 49 deletions(-) rename ding/worker/collector/tests/{test_vllm_collector__multigpu.py => test_vllm_collector_multigpu.py} (80%) diff --git 
a/ding/worker/collector/tests/test_vllm_collector__multigpu.py b/ding/worker/collector/tests/test_vllm_collector_multigpu.py similarity index 80% rename from ding/worker/collector/tests/test_vllm_collector__multigpu.py rename to ding/worker/collector/tests/test_vllm_collector_multigpu.py index e23058a27a..bb6c977c6b 100644 --- a/ding/worker/collector/tests/test_vllm_collector__multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector_multigpu.py @@ -6,7 +6,8 @@ class VllmActor: - def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> None: + + def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list) -> None: """ Overview: Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. @@ -19,7 +20,7 @@ def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> # Set CUDA_VISIBLE_DEVICES to use only free GPUs os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) self.model_path = model_path - self.mm_processor_kwargs=mm_processor_kwargs + self.mm_processor_kwargs = mm_processor_kwargs self._initialize() def _initialize(self) -> None: @@ -58,7 +59,7 @@ async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: max_tokens=max_tokens, temperature=temperature, ) - + # Using async iterator to handle vLLM's generation process # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results @@ -77,11 +78,17 @@ class HuggingFaceModelGenerator: A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. """ - def __init__(self, model_path: str, free_gpus:list, - max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs:dict = { + def __init__( + self, + model_path: str, + free_gpus: list, + max_tokens: int = 1024, + temperature: float = 0, + mm_processor_kwargs: dict = { "min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28, - }) -> None: + } + ) -> None: """ Overview: Initialize the Hugging Face model generator. @@ -90,14 +97,14 @@ def __init__(self, model_path: str, free_gpus:list, - max_tokens (int): The maximum number of tokens to generate, default to 1024. - temperature (float): The temperature for the language model, default to 0. 
""" - self.vllm_actor = VllmActor(model_path,mm_processor_kwargs,free_gpus) + self.vllm_actor = VllmActor(model_path, mm_processor_kwargs, free_gpus) self.max_tokens = max_tokens self.temperature = temperature async def generate( - self, - prompt, - num_samples: int, + self, + prompt, + num_samples: int, ) -> List[Tuple[str, float]]: """ Overview: @@ -114,11 +121,8 @@ async def generate( response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature) # Use raw logprobs as confidence scores confidence_scores = [x.cumulative_logprob for x in response.outputs] - return [ - (x.text.strip(), conf) - for x, conf in zip(response.outputs, confidence_scores) - ] - + return [(x.text.strip(), conf) for x, conf in zip(response.outputs, confidence_scores)] + def get_free_gpus() -> List[int]: """ @@ -144,7 +148,8 @@ def get_free_gpus() -> List[int]: logger.warning("Failed to get GPU stats, defaulting to GPU 0") return [0] -def chunk_list(original_list:list, t:int) -> List[list]: + +def chunk_list(original_list: list, t: int) -> List[list]: # chunk the list into sub_lists new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] return new_list @@ -156,12 +161,15 @@ def chunk_list(original_list:list, t:int) -> List[list]: from vllm.assets.image import ImageAsset from enum import Enum import concurrent.futures + + class Modality(Enum): IMAGE = "image" TEXT = "text" VIDEO = "video" -def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Optional[List[int]]]: + +def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str], Optional[List[int]]]: if modality == Modality.IMAGE: placeholder = "<|image_pad|>" elif modality == Modality.VIDEO: @@ -179,7 +187,7 @@ def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Opt ) for question in questions ] stop_token_ids = None - return prompts,stop_token_ids + return prompts, stop_token_ids def get_multi_modal_input(modality: Modality, filenames: list, questions: list) -> dict: @@ -205,11 +213,11 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) return ret -async def run_vllm_collector(gpu_id:int, prompts:List, model_path:str,temperature:float) ->List[str]: +async def run_vllm_collector(gpu_id: int, prompts: List, model_path: str, temperature: float) -> List[str]: # set visible gpu os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) # get a model on a single gpu - model = HuggingFaceModelGenerator(model_path,free_gpus=[gpu_id],temperature=temperature) + model = HuggingFaceModelGenerator(model_path, free_gpus=[gpu_id], temperature=temperature) responses_list = [] for prompt in prompts: @@ -220,21 +228,25 @@ async def run_vllm_collector(gpu_id:int, prompts:List, model_path:str,temperatur return responses_list + import asyncio -def start_collector(gpu_id:int, prompts:list, model_path:str,temperature:float) ->List[str]: - # event loop in a process - results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path,temperature)) + + +def start_collector(gpu_id: int, prompts: list, model_path: str, temperature: float) -> List[str]: + # event loop in a process + results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path, temperature)) return results -def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) -> None: - num_tot=len(prompts) - num_gpu=len(free_gpus) - num_per_gpu=num_tot//num_gpu - prompts_per_gpu=chunk_list(prompts,num_per_gpu) + +def main(prompts: list, model_path: str, free_gpus: 
List[int], temperature: float) -> None: + num_tot = len(prompts) + num_gpu = len(free_gpus) + num_per_gpu = num_tot // num_gpu + prompts_per_gpu = chunk_list(prompts, num_per_gpu) with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: futures = [] - for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu): - futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path,temperature)) + for gpu_id, prompts_gpu in zip(free_gpus, prompts_per_gpu): + futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path, temperature)) # get all results all_results = [] @@ -245,23 +257,19 @@ def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) - with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f: for response in all_results: f.write(f"{response}\n") - - if __name__ == "__main__": - questions=['Please describe the image.','Please describe the image.', - 'What\'s the text in the image?','What\'s the text in the image?', - 'What is in the image?','What is in the image?', - 'How many people are in the image?','How many people are in the image?', - 'What is the emotion of the main character of the image?', - 'What is the emotion of the main character of the image?', - 'How many animals are in the image?', - 'How many animals are in the image?', - 'What is the place of the image?','What is the place of the image?', - 'What is the peroson doing?','What is the peroson doing?' - ] - img_names=[ + questions = [ + 'Please describe the image.', 'Please describe the image.', 'What\'s the text in the image?', + 'What\'s the text in the image?', 'What is in the image?', 'What is in the image?', + 'How many people are in the image?', 'How many people are in the image?', + 'What is the emotion of the main character of the image?', + 'What is the emotion of the main character of the image?', 'How many animals are in the image?', + 'How many animals are in the image?', 'What is the place of the image?', 'What is the place of the image?', + 'What is the peroson doing?', 'What is the peroson doing?' 
+ ] + img_names = [ '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)', '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)', '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)', @@ -278,13 +286,13 @@ def main(prompts:list, model_path:str, free_gpus:List[int],temperature:float) - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)', '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)', '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)' - ] - free_gpus=get_free_gpus() + ] + free_gpus = get_free_gpus() modality = Modality.IMAGE mm_input = get_multi_modal_input(modality, img_names, questions) data = mm_input["data"] question = mm_input["question"] prompts, stop_token_ids = get_prompts_qwen(question, modality) - model_path='/mnt/afs/share/Qwen2-VL-7B' - temperature=0.5 - main(prompts,model_path,free_gpus,temperature) \ No newline at end of file + model_path = '/mnt/afs/share/Qwen2-VL-7B' + temperature = 0.5 + main(prompts, model_path, free_gpus, temperature) From f70a942d37492d5c4e3cf65aae1fb22c1a4183b1 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Sun, 26 Jan 2025 07:03:14 +0000 Subject: [PATCH 08/18] formatted --- ding/worker/collector/tests/test_vllm_collector.py | 10 +++++----- .../cartpole/config/cartpole_dqn_ddp_config.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py index 509c733ce1..ef9252405b 100644 --- a/ding/worker/collector/tests/test_vllm_collector.py +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -10,12 +10,15 @@ model = HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B', temperature=0.5) from enum import Enum + + class Modality(Enum): IMAGE = "image" TEXT = "text" VIDEO = "video" -def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Optional[List[int]]]: + +def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str], Optional[List[int]]]: if modality == Modality.IMAGE: placeholder = "<|image_pad|>" elif modality == Modality.VIDEO: @@ -33,7 +36,7 @@ def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str],Opt ) for question in questions ] stop_token_ids = None - return prompts,stop_token_ids + return prompts, stop_token_ids def get_multi_modal_input(modality: Modality, filenames: list, questions: list) -> dict: @@ -71,9 +74,6 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) num_prompts = len(questions) image_repeat_prob = None - - - modality = Modality.IMAGE mm_input = get_multi_modal_input(modality, img_names, questions) diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_ddp_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_ddp_config.py index 82d6c673ec..a80662941a 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_ddp_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_ddp_config.py @@ -63,4 +63,3 @@ from ding.entry import serial_pipeline with DDPContext(): serial_pipeline((main_config, create_config), seed=0) - From 7dae85e8fa0c0bc6db368bac52e2808e2fd747ea Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Sun, 26 Jan 2025 07:16:30 +0000 Subject: [PATCH 09/18] style_fixed --- .../collector/tests/test_vllm_collector.py | 8 +++----- .../tests/test_vllm_collector_multigpu.py | 16 +++++----------- 2 files changed, 8 insertions(+), 16 
deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py index ef9252405b..d8b7beaf93 100644 --- a/ding/worker/collector/tests/test_vllm_collector.py +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -4,13 +4,13 @@ from loguru import logger from ..vllm_collector import HuggingFaceModelGenerator from vllm.assets.image import ImageAsset - +from enum import Enum +import asyncio +import nest_asyncio # set a temperature > 0 to get multiple responses # note that HFModelGenerator has a parameter "mm_processor_kwargs" set to align with the settings of Qwen in default model = HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B', temperature=0.5) -from enum import Enum - class Modality(Enum): IMAGE = "image" @@ -81,8 +81,6 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) question = mm_input["question"] prompts, stop_token_ids = get_prompts_qwen(question, modality) -import asyncio -import nest_asyncio nest_asyncio.apply() diff --git a/ding/worker/collector/tests/test_vllm_collector_multigpu.py b/ding/worker/collector/tests/test_vllm_collector_multigpu.py index bb6c977c6b..fa8d7b0df6 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector_multigpu.py @@ -3,6 +3,11 @@ import uuid from loguru import logger from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from typing import List, Tuple, Optional +from vllm.assets.image import ImageAsset +from enum import Enum +import concurrent.futures +import asyncio class VllmActor: @@ -155,14 +160,6 @@ def chunk_list(original_list: list, t: int) -> List[list]: return new_list -from typing import List, Tuple, Optional -import os -from loguru import logger -from vllm.assets.image import ImageAsset -from enum import Enum -import concurrent.futures - - class Modality(Enum): IMAGE = "image" TEXT = "text" @@ -229,9 +226,6 @@ async def run_vllm_collector(gpu_id: int, prompts: List, model_path: str, temper return responses_list -import asyncio - - def start_collector(gpu_id: int, prompts: list, model_path: str, temperature: float) -> List[str]: # event loop in a process results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path, temperature)) From c642e6e4750c8d23fc097d01d54b6487f1e31a6a Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Sun, 26 Jan 2025 07:26:11 +0000 Subject: [PATCH 10/18] formatted --- ding/worker/collector/tests/test_vllm_collector_multigpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector_multigpu.py b/ding/worker/collector/tests/test_vllm_collector_multigpu.py index fa8d7b0df6..fa0ecbf2fc 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector_multigpu.py @@ -1,9 +1,8 @@ -from typing import List, Tuple +from typing import List, Tuple, Optional import os import uuid from loguru import logger from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput -from typing import List, Tuple, Optional from vllm.assets.image import ImageAsset from enum import Enum import concurrent.futures From c45e4297306f6195884302fd6d9ac96b26a15c9c Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Sun, 26 Jan 2025 08:12:37 +0000 Subject: [PATCH 11/18] formatted --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 
f3d60222f1..3bc8977b46 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,8 @@ 'einops', 'transformers', 'datasets', + 'loguru', + 'vllm' ], extras_require={ 'test': [ From 3b7903a4de595ff39bf48210ebf76d93ba18a5e6 Mon Sep 17 00:00:00 2001 From: PaParaZz1 Date: Fri, 7 Feb 2025 13:10:20 +0800 Subject: [PATCH 12/18] feature(nyz): add vllm collector interface definition --- ding/worker/collector/vllm_collector.py | 163 +++++++++++++++++++++++- 1 file changed, 162 insertions(+), 1 deletion(-) diff --git a/ding/worker/collector/vllm_collector.py b/ding/worker/collector/vllm_collector.py index 3bd138587f..a208a12f1f 100644 --- a/ding/worker/collector/vllm_collector.py +++ b/ding/worker/collector/vllm_collector.py @@ -1,8 +1,16 @@ -from typing import List, Tuple +from typing import List, Tuple, Optional, Any import os import uuid +import asyncio +import numpy as np from loguru import logger +from easydict import EasyDict from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from transformers import AutoTokenizer + +from ding.utils.data.rlhf_online_dataset import OnlineRLDataset +from ding.utils import SERIAL_COLLECTOR_REGISTRY +from .base_serial_collector import ISerialCollector class VllmActor: @@ -145,3 +153,156 @@ async def generate( # Use raw logprobs as confidence scores confidence_scores = [x.cumulative_logprob for x in response.outputs] return [(x.text.strip(), conf) for x, conf in zip(response.outputs, confidence_scores)] + + +@SERIAL_COLLECTOR_REGISTRY.register('vllm') +class VllmCollector(ISerialCollector): + """ + Overview: + Collector implementation for vLLM-based language models (LLM/VLM). + This collector manages the interaction with vLLM models for text generation tasks. + """ + config = dict( + # (str) LLM/VLM model path + model_path='', + # (int) Maximum number of tokens to generate per request + max_tokens=1024, + # (float) Temperature for sampling, 0 means greedy decoding + temperature=0.0, + # (dict) Multimodal processor kwargs for vision-language models + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, + # Dataset related configs + # (str) Key to access the input data in the dataset + input_key='input', + # (bool) Whether to apply a chat template to the input + apply_chat_template=False, + # (str) Template for the input + input_template=None, + # (bool) Whether to shuffle the dataset + shuffle=True, + ) + + def __init__(self, cfg: EasyDict) -> None: + """ + Overview: + Initialize the VllmCollector with configuration. + Arguments: + - cfg (:obj:`EasyDict`): Configuration for the collector including model path, generation parameters, + and dataset configuration + """ + super().__init__() + self._cfg = cfg + self._envstep = 0 + + # Initialize the tokenizer and dataset + self._tokenizer = AutoTokenizer.from_pretrained(cfg.model_path) + self._dataset = OnlineRLDataset( + dataset=cfg.dataset, + tokenizer=self._tokenizer, + input_key=cfg.input_key, + apply_chat_template=cfg.apply_chat_template, + input_template=cfg.input_template, + ) + + self._model = VllmActor(model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs) + self.reset() + + def reset(self) -> None: + """ + Overview: + Reset the collector, including the dataset index. + """ + self._index = np.arange(len(self._dataset)) + if self._cfg.shuffle: + np.random.shuffle(self._index) + + def reset_policy(self, _model: Optional[str] = None) -> None: + """ + Overview: + Since LLM generation does not require a explicit policy and env, this function is empty. 
+ """ + pass + + def reset_env(self, _env: Optional[Any] = None) -> None: + """ + Overview: + Since LLM generation does not require a explicit policy and env, this function is empty. + """ + pass + + def collect( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. + - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. Call `reset` method first.") + + prompt = self._dataset[self._index[:n_samples]] + # recusively update the index + self._index = self._index[n_samples:] + self._index[:n_samples] + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + return loop.run_until_complete( + self._model.generate( + prompt=prompt, + num_samples=num_samples_per_prompt, + max_tokens=self._cfg.max_tokens, + temperature=self._cfg.temperature + ) + ) + + @property + def envstep(self) -> int: + """ + Overview: + Get the current environment step count. + Returns: + - count (:obj:`int`): Current environment step count + """ + return self._envstep + + @envstep.setter + def envstep(self, value: int) -> None: + """ + Overview: + Set the current environment step count. + """ + self._envstep = value + + def close(self) -> None: + """ + Overview: + Close the collector. + """ + pass + + def __del__(self) -> None: + """ + Overview: + Destructor for the collector. 
+ """ + self.close() From 6ca21344cd05d388387c5a82ab48214e7075fd27 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Mon, 10 Feb 2025 07:32:18 +0000 Subject: [PATCH 13/18] added test_vllm_collector_multi_new --- ding/utils/data/rlhf_online_dataset.py | 40 +- .../data/tests/test_rlhf_online_dataset.py | 31 +- .../tests/test_vllm_collector_multi_new.py | 488 ++++++++++++++++++ .../tests/test_vllm_collector_multigpu.py | 7 +- ding/worker/collector/vllm_collector.py | 131 ++++- 5 files changed, 670 insertions(+), 27 deletions(-) create mode 100644 ding/worker/collector/tests/test_vllm_collector_multi_new.py diff --git a/ding/utils/data/rlhf_online_dataset.py b/ding/utils/data/rlhf_online_dataset.py index d307f09a32..00a81cba39 100644 --- a/ding/utils/data/rlhf_online_dataset.py +++ b/ding/utils/data/rlhf_online_dataset.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Union, Callable, Iterable +from typing import Any, Dict, Union, Callable, Iterable,List from tqdm import tqdm from torch.utils.data import Dataset from torch.distributed import get_rank @@ -17,6 +17,7 @@ def __init__( dataset: Iterable[Dict], tokenizer: AutoTokenizer, input_key: str = "input", + extra_input_keys: List[str] = [], apply_chat_template: bool = False, input_template: str = None, ) -> None: @@ -33,18 +34,27 @@ def __init__( super().__init__() self.tokenizer = tokenizer self.input_template = input_template + self.extra_input_keys = extra_input_keys if apply_chat_template: apply_chat_template = self.tokenizer.apply_chat_template self.prompts = [] + for key in extra_input_keys: + setattr(self, key, []) try: rank = get_rank() except ValueError: # not initialized yet, which is the case in unit test rank = 0 for data in tqdm(dataset, desc="Preprocessing data", disable=not rank == 0): - prompt = self._preprocess_data(data, input_template, input_key, apply_chat_template) - self.prompts.append(prompt) + processed_data = self._preprocess_data(data, input_template, input_key,extra_input_keys, apply_chat_template) + self.prompts.append(processed_data['prompt']) + for key in extra_input_keys: + getattr(self, key).append(processed_data[key]) #maybe can be imporved later + # self.prompts=np.array(self.prompts) + # for key in extra_input_keys: + # setattr(self, key, np.array(getattr(self,key))) + def __len__(self) -> int: """ @@ -55,7 +65,7 @@ def __len__(self) -> int: """ return len(self.prompts) - def __getitem__(self, idx: int) -> str: + def __getitem__(self, idx: int) -> str: #can be improved later for list indexing instead of single indexing """ Overview: Get the item at the given index. @@ -64,13 +74,24 @@ def __getitem__(self, idx: int) -> str: Returns: - item (str): The item at the given index. """ - return self.prompts[idx] + # extra inputs: usually image, video, audio, etc. + if self.extra_input_keys: + extra_inputs = {key: getattr(self, key)[idx] for key in self.extra_input_keys} + else: + extra_inputs = {} + return { + "prompt": self.prompts[idx], + "multi_modal_data":{ + **extra_inputs + } + } def _preprocess_data( self, data: Dict[str, Any], input_template: str = None, input_key: str = "input", + extra_input_keys: List[str] = [], apply_chat_template: Union[bool, Callable] = False, ) -> str: """ @@ -86,6 +107,10 @@ def _preprocess_data( Returns: - prompt (str): The formatted prompt. 
""" + if extra_input_keys: + extra_inputs = {key: data[key] for key in extra_input_keys} + else: + extra_inputs = {} if apply_chat_template: chat = data[input_key] if isinstance(chat, str): @@ -96,4 +121,7 @@ def _preprocess_data( prompt = data[input_key] if input_template: prompt = input_template.format(prompt) - return prompt + return { + "prompt": prompt, + **extra_inputs + } diff --git a/ding/utils/data/tests/test_rlhf_online_dataset.py b/ding/utils/data/tests/test_rlhf_online_dataset.py index cba9e7947c..88c2c70afe 100644 --- a/ding/utils/data/tests/test_rlhf_online_dataset.py +++ b/ding/utils/data/tests/test_rlhf_online_dataset.py @@ -1,27 +1,38 @@ import pytest from datasets import load_dataset -from transformers import AutoTokenizer from ding.utils.data import OnlineRLDataset - - +from transformers import AutoTokenizer +IMG_CONTEXT_TOKEN = '' +IMG_START_TOKEN = '' +IMG_END_TOKEN = '' +IMG_CONTEXT_NUM = 10 # user-defined number of image patches in the context @pytest.fixture def dataset(): # Load the dataset - hf_dataset = load_dataset("cat-searcher/minif2f-lean4")['validation'] + hf_dataset = load_dataset("MMInstruction/VL-RewardBench",split='test') + hf_dataset0 = hf_dataset.map( + lambda x: { + "query": f"{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * IMG_CONTEXT_NUM}{IMG_END_TOKEN}\n{x['query']}", + "image": x["image"], + } + ) + # shuffle the dataset + hf_dataset = hf_dataset0.shuffle(seed=42) print(hf_dataset) return hf_dataset @pytest.fixture def tokenizer(): - return AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B") + return AutoTokenizer.from_pretrained("OpenGVLab/InternVL2_5-4B") @pytest.mark.unittest def test_onlinerl_dataset_initialization(dataset, tokenizer): # Initialize OnlineRLDataset online_rl_dataset = OnlineRLDataset( - dataset=dataset, tokenizer=tokenizer, input_key="formal_statement", apply_chat_template=True + dataset=dataset, tokenizer=tokenizer, input_key="query", + extra_input_keys=["image"], apply_chat_template=True ) # Check if the dataset is initialized correctly assert len(online_rl_dataset) == len(dataset) @@ -31,9 +42,13 @@ def test_onlinerl_dataset_initialization(dataset, tokenizer): def test_onlinerl_dataset_getitem(dataset, tokenizer): # Initialize OnlineRLDataset online_rl_dataset = OnlineRLDataset( - dataset=dataset, tokenizer=tokenizer, input_key="formal_statement", apply_chat_template=True + dataset=dataset, tokenizer=tokenizer, input_key="query", + extra_input_keys=["image"], apply_chat_template=True ) # Check if __getitem__ returns the expected formatted prompt item = online_rl_dataset[0] print(item) - assert isinstance(item, str) + assert "prompt" in item + assert "multi_modal_data" in item + assert "image" in item['multi_modal_data'] + assert isinstance(item['prompt'],str) diff --git a/ding/worker/collector/tests/test_vllm_collector_multi_new.py b/ding/worker/collector/tests/test_vllm_collector_multi_new.py new file mode 100644 index 0000000000..114233f54e --- /dev/null +++ b/ding/worker/collector/tests/test_vllm_collector_multi_new.py @@ -0,0 +1,488 @@ +from typing import Any, Dict, Union, Callable, Iterable,List +from tqdm import tqdm +from torch.utils.data import Dataset +from torch.distributed import get_rank +from transformers import AutoTokenizer +from typing import List, Tuple, Optional, Any +import os +import uuid +import asyncio +import numpy as np +from loguru import logger +from easydict import EasyDict +from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from ding.utils import SERIAL_COLLECTOR_REGISTRY 
+from ding.worker.collector.base_serial_collector import ISerialCollector +from datasets import load_dataset +from ding.utils.data import OnlineRLDataset +import copy +import concurrent.futures + + +class VllmActor: + def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> None: + """ + Overview: + Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. + Arguments: + - model_path (str): The path to the language model. + """ + self.free_gpus = free_gpus + self.num_gpus = len(self.free_gpus) + assert self.num_gpus > 0, "No GPUs found" + # Set CUDA_VISIBLE_DEVICES to use only free GPUs + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) + self.model_path = model_path + self.mm_processor_kwargs=mm_processor_kwargs + self._initialize() + + def _initialize(self) -> None: + """ + Overview: + Initialize the vLLM actor with a series of arguments. + """ + logger.info("Initializing vLLM") + # TODO: Try other options in https://docs.vllm.ai/en/stable/models/engine_args.html#engine-args. + engine_args = AsyncEngineArgs( + model=self.model_path, + tensor_parallel_size=self.num_gpus, + max_num_batched_tokens=8192, + max_model_len=8192, + # enable_chunked_prefill=True, + max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs=self.mm_processor_kwargs, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: float = 0) -> RequestOutput: + """ + Overview: + Generate tactics for the current state. + Arguments: + - prompt : The prompt to generate tactics. + - num_samples (int): The number of tactics to generate. + - max_tokens (int): The maximum number of tokens to generate. + - temperature (float): The temperature for the language model, default to 0. + Returns: + - RequestOutput: The generated tactics and their log-probabilities. + """ + sampling_params = SamplingParams( + n=num_samples, + max_tokens=max_tokens, + temperature=temperature, + ) + + # Using async iterator to handle vLLM's generation process + # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs + # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results + # 3. This approach is particularly suitable for LLM inference which can be time-consuming + # 4. The request_id ensures unique identification for each generation request + async for oup in self.engine.generate( + prompt, sampling_params, request_id=str(uuid.uuid4().hex) + ): + final_output = oup + return final_output + + +@SERIAL_COLLECTOR_REGISTRY.register('vllm') +class VllmCollector(ISerialCollector): + """ + Overview: + Collector implementation for vLLM-based language models (LLM/VLM). + This collector manages the interaction with vLLM models for text generation tasks. 
+ """ + config = dict( + # (str) LLM/VLM model path + model_path='', + # (int) Maximum number of tokens to generate per request + max_tokens=1024, + # (float) Temperature for sampling, 0 means greedy decoding + temperature=0.0, + # (dict) Multimodal processor kwargs for vision-language models + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, + # Dataset related configs + # (str) Key to access the input data in the dataset + input_key='input', + # (bool) Whether to apply a chat template to the input + apply_chat_template=False, + # (str) Template for the input + input_template=None, + # (bool) Whether to shuffle the dataset + shuffle=True, + ) + + def __init__(self, cfg: EasyDict) -> None: + """ + Overview: + Initialize the VllmCollector with configuration. + Arguments: + - cfg (:obj:`EasyDict`): Configuration for the collector including model path, generation parameters, + and dataset configuration + """ + super().__init__() + self._cfg = cfg + self._envstep = 0 + + # Initialize the tokenizer and dataset + self._tokenizer = AutoTokenizer.from_pretrained(cfg.model_path) + self._dataset = OnlineRLDataset( + dataset=cfg.dataset, + tokenizer=self._tokenizer, + input_key=cfg.input_key, + apply_chat_template=cfg.apply_chat_template, + input_template=cfg.input_template, + extra_input_keys=cfg.extra_input_keys + ) + + self._model = VllmActor(model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs,free_gpus=cfg.free_gpus) + self.reset() + + def reset(self) -> None: + """ + Overview: + Reset the collector, including the dataset index. + """ + self._index = np.arange(len(self._dataset)) + if self._cfg.shuffle: + np.random.shuffle(self._index) + + def reset_policy(self, _model: Optional[str] = None) -> None: + """ + Overview: + Since LLM generation does not require a explicit policy and env, this function is empty. + """ + pass + + def reset_env(self, _env: Optional[Any] = None) -> None: + """ + Overview: + Since LLM generation does not require a explicit policy and env, this function is empty. + """ + pass + async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: + return await self._model.generate( + prompt=prompt, + num_samples=num_samples_per_prompt, + max_tokens=self._cfg.max_tokens, + temperature=self._cfg.temperature + ) + def collect( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. + - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. 
Call `reset` method first.") + + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) + # recusively update the index + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + # Create a list of tasks for each prompt + tasks = [self._generate_for_prompt(prompt, num_samples_per_prompt) for prompt in prompts] + + # Run all tasks concurrently and collect results + results = loop.run_until_complete(asyncio.gather(*tasks)) + + # Map prompts to their corresponding results + responses = {prompt["prompt"]: result for prompt, result in zip(prompts, results)} + + return responses + + def sync_collect( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. + - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. Call `reset` method first.") + + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) + # recusively update the index + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + results = {} + for prompt in prompts: + # Run the async generate method in the event loop for each prompt + result = loop.run_until_complete( + self._model.generate( + prompt=prompt, + num_samples=num_samples_per_prompt, + max_tokens=self._cfg.max_tokens, + temperature=self._cfg.temperature + ) + ) + results[prompt['prompt']] = result + + return results + + def collect_prompts( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. + - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. 
Call `reset` method first.") + + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) + # recusively update the index + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + results = {} + tasks=[] + for prompt in prompts: + for _ in range(num_samples_per_prompt): + # Run the async generate method in the event loop for each prompt + tasks.append(self._generate_for_prompt(prompt, num_samples_per_prompt=1)) + results_list = loop.run_until_complete(asyncio.gather(*tasks)) + for i,prompt in enumerate(prompts): + results[prompt['prompt']]=[] + for result in results_list[i*num_samples_per_prompt:(i+1)*num_samples_per_prompt]: + results[prompt['prompt']].append(result.outputs[0].text) + return results + + + + @property + def envstep(self) -> int: + """ + Overview: + Get the current environment step count. + Returns: + - count (:obj:`int`): Current environment step count + """ + return self._envstep + + @envstep.setter + def envstep(self, value: int) -> None: + """ + Overview: + Set the current environment step count. + """ + self._envstep = value + + def close(self) -> None: + """ + Overview: + Close the collector. + """ + pass + + def __del__(self) -> None: + """ + Overview: + Destructor for the collector. + """ + self.close() + + + + +def get_free_gpus() -> List[int]: + """ + Overview: + Get IDs of GPUs with free memory. + Returns: + - List[int]: The IDs of the free GPUs. + """ + try: + # Get GPU memory usage using nvidia-smi + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ + .readlines() + free_gpus = [] + + for gpu_id, stats in enumerate(gpu_stats): + mem_used, mem_total = map(int, stats.strip().split(',')) + # Consider GPU as free if less than 5% memory is used + if mem_used / mem_total < 0.05: + free_gpus.append(gpu_id) + + return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found + except Exception: + logger.warning("Failed to get GPU stats, defaulting to GPU 0") + return [0] + +def chunk_list(original_list, t): + # chunk a list into sub_lists + new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] + return new_list + + +# prepare dataset +IMG_START_TOKEN = '<|vision_start|>' +IMG_END_TOKEN = '<|vision_end|>' +PLACE_HOLDER='<|image_pad|>' +def dataset(num=None): + # Load the dataset + hf_dataset = load_dataset("/mnt/afs/wangqijian/data/rlhf_dataset_test/VL-RewardBench",split='test') + hf_dataset0 = hf_dataset.map( + lambda x: { + "query": f"{IMG_START_TOKEN}{PLACE_HOLDER}{IMG_END_TOKEN}{x['query']}", + "image": x["image"], + } + ) + # shuffle the dataset + hf_dataset = hf_dataset0.shuffle(seed=42) + if num is None: + return hf_dataset + else: + ret_data=[] + for i in range(0,num): + ret_data.append(hf_dataset[i]) + return ret_data + + +def run_vllm_collector(config): + # set GPU for current process + gpu_ids = ",".join(map(str, config.free_gpus)) + os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids + collector = VllmCollector(config) # 实例化模型 + #ret=collector.collect(n_samples=2,num_samples_per_prompt=4) + ret=collector.collect(n_samples=2,num_samples_per_prompt=4) + return ret + + +def start_collector(config): + # collect within the process + # results:a dict, basic form: + 
#{"prompt_0":[ans_0,ans_1,...,ans_n],"prompt_1":[ans_0,ans_1,...,ans_n],...} + results = run_vllm_collector(config) + return results + +def main(tot_dataset, free_gpus,config): + num_tot=len(tot_dataset) + num_gpu=len(free_gpus) + num_per_gpu=num_tot//num_gpu + prompts_per_gpu=chunk_list(tot_dataset,num_per_gpu) + with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: + futures = [] + for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu): + config_per_gpu=copy.deepcopy(config) + config_per_gpu.dataset=prompts_gpu + config_per_gpu.free_gpus=[gpu_id] + futures.append(executor.submit(start_collector, config_per_gpu)) + + # collect all results + all_results = [] + for future in concurrent.futures.as_completed(futures): + all_results.append(future.result()) + + # save results + with open(config.save_path, "w") as f: + for response in all_results: + print(response) + for prompt in list(response.keys()): + f.write(f"{prompt}:\n") + for i,output in enumerate(response[prompt].outputs): + f.write(f'output_{i}:\n') + f.write(f"{output.text}\n") + + +test_dataset=dataset(num=96) +free_gpus=get_free_gpus() +config = EasyDict( + # (str) LLM/VLM model path + model_path='/mnt/afs/share/Qwen2-VL-7B', + # (int) Maximum number of tokens to generate per request + max_tokens=4096, + # (float) Temperature for sampling, 0 means greedy decoding + temperature=1.0, + # (dict) Multimodal processor kwargs for vision-language models + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + },# defaul set to align with Qwen2-VL-7B + # Dataset related configs + # dataset=test_dataset, + # dataset is defined for each gpu respectively + # (str) Key to access the input data in the dataset + input_key='query', + # (bool) Whether to apply a chat template to the input + apply_chat_template=True, + # (str) Template for the input + input_template=None, + # (bool) Whether to shuffle the dataset + shuffle=True, + extra_input_keys=['image'], + # free_gpus is defined for each gpu respectively + # save_path is the file to store the output + save_path="your_save_path" + ) + + + + +main(test_dataset,free_gpus,config) \ No newline at end of file diff --git a/ding/worker/collector/tests/test_vllm_collector_multigpu.py b/ding/worker/collector/tests/test_vllm_collector_multigpu.py index fa0ecbf2fc..1c7d79e1bb 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector_multigpu.py @@ -244,12 +244,13 @@ def main(prompts: list, model_path: str, free_gpus: List[int], temperature: floa # get all results all_results = [] for future in concurrent.futures.as_completed(futures): - all_results.extend(future.result()) + all_results.append(future.result()) # save results with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f: - for response in all_results: - f.write(f"{response}\n") + for responses in all_results: + for response in responses: + f.write(f"{response}\n") if __name__ == "__main__": diff --git a/ding/worker/collector/vllm_collector.py b/ding/worker/collector/vllm_collector.py index a208a12f1f..d1c086410c 100644 --- a/ding/worker/collector/vllm_collector.py +++ b/ding/worker/collector/vllm_collector.py @@ -205,6 +205,7 @@ def __init__(self, cfg: EasyDict) -> None: input_key=cfg.input_key, apply_chat_template=cfg.apply_chat_template, input_template=cfg.input_template, + extra_input_keys=cfg.extra_input_keys ) self._model = VllmActor(model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs) 
@@ -232,7 +233,13 @@ def reset_env(self, _env: Optional[Any] = None) -> None: Since LLM generation does not require a explicit policy and env, this function is empty. """ pass - + async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: + return await self._model.generate( + prompt=prompt, + num_samples=num_samples_per_prompt, + max_tokens=self._cfg.max_tokens, + temperature=self._cfg.temperature + ) def collect( self, n_samples: int = 100, @@ -252,9 +259,57 @@ def collect( if self._model is None: raise RuntimeError("Model not initialized. Call `reset` method first.") - prompt = self._dataset[self._index[:n_samples]] + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) + # recusively update the index + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + # Create a list of tasks for each prompt + tasks = [self._generate_for_prompt(prompt, num_samples_per_prompt) for prompt in prompts] + + # Run all tasks concurrently and collect results + results = loop.run_until_complete(asyncio.gather(*tasks)) + + # Map prompts to their corresponding results + responses = {prompt["prompt"]: result for prompt, result in zip(prompts, results)} + + return responses + + def sync_collect( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. + - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. Call `reset` method first.") + + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) # recusively update the index - self._index = self._index[n_samples:] + self._index[:n_samples] + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) self._envstep += n_samples @@ -266,14 +321,70 @@ def collect( asyncio.set_event_loop(loop) # Run the async generate method in the event loop - return loop.run_until_complete( - self._model.generate( - prompt=prompt, - num_samples=num_samples_per_prompt, - max_tokens=self._cfg.max_tokens, - temperature=self._cfg.temperature + results = {} + for prompt in prompts: + # Run the async generate method in the event loop for each prompt + result = loop.run_until_complete( + self._model.generate( + prompt=prompt, + num_samples=num_samples_per_prompt, + max_tokens=self._cfg.max_tokens, + temperature=self._cfg.temperature + ) ) - ) + results[prompt['prompt']] = result + + return results + + def collect_prompts( + self, + n_samples: int = 100, + num_samples_per_prompt: int = 1, + train_iter: int = 0, + ) -> List[Tuple[str, float]]: + """ + Overview: + Collect generated responses from the vLLM model. + Arguments: + - n_samples (:obj:`int`): Number of prompts to generate. + - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. 
+ - train_iter (:obj:`int`): Current training iteration, used for logging. + Returns: + - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs + """ + if self._model is None: + raise RuntimeError("Model not initialized. Call `reset` method first.") + + prompts=[] + for id in self._index[:n_samples]: + prompts.append(self._dataset[id]) + # recusively update the index + self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + + self._envstep += n_samples + + # Get the current event loop or create a new one + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Run the async generate method in the event loop + results = {} + tasks=[] + for prompt in prompts: + for _ in range(num_samples_per_prompt): + # Run the async generate method in the event loop for each prompt + tasks.append(self._generate_for_prompt(prompt, num_samples_per_prompt=1)) + results_list = loop.run_until_complete(asyncio.gather(*tasks)) + for i,prompt in enumerate(prompts): + results[prompt['prompt']]=[] + for result in results_list[i*4:(i+1)*4]: + results[prompt['prompt']].append(result.outputs[0].text) + return results + + @property def envstep(self) -> int: From 606fd55fd8e626907b606ec7b1c3a083968debce Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Mon, 10 Feb 2025 07:34:55 +0000 Subject: [PATCH 14/18] added test_vllm_collector_multi_new --- .../worker/collector/tests/test_vllm_collector_multi_new.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector_multi_new.py b/ding/worker/collector/tests/test_vllm_collector_multi_new.py index 114233f54e..e61ac8fe2e 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multi_new.py +++ b/ding/worker/collector/tests/test_vllm_collector_multi_new.py @@ -1,7 +1,3 @@ -from typing import Any, Dict, Union, Callable, Iterable,List -from tqdm import tqdm -from torch.utils.data import Dataset -from torch.distributed import get_rank from transformers import AutoTokenizer from typing import List, Tuple, Optional, Any import os @@ -451,7 +447,7 @@ def main(tot_dataset, free_gpus,config): f.write(f"{output.text}\n") -test_dataset=dataset(num=96) +test_dataset=dataset(num=16) free_gpus=get_free_gpus() config = EasyDict( # (str) LLM/VLM model path From 45487495a3dda5f0be4646324917a24a6f2ee5e2 Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Mon, 10 Feb 2025 08:05:57 +0000 Subject: [PATCH 15/18] formatted --- .flake8 | 2 +- ding/utils/data/rlhf_online_dataset.py | 25 ++- .../data/tests/test_rlhf_online_dataset.py | 12 +- .../tests/test_vllm_collector_multi_new.py | 158 +++++++++--------- ding/worker/collector/vllm_collector.py | 38 ++--- 5 files changed, 116 insertions(+), 119 deletions(-) diff --git a/.flake8 b/.flake8 index 9d86ca5e8c..8b176d3853 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,4 @@ [flake8] ignore=F401,F841,F403,E226,E126,W504,E265,E722,W503,W605,E741,E122,E731 max-line-length=120 -statistics + diff --git a/ding/utils/data/rlhf_online_dataset.py b/ding/utils/data/rlhf_online_dataset.py index 00a81cba39..08f6838d61 100644 --- a/ding/utils/data/rlhf_online_dataset.py +++ b/ding/utils/data/rlhf_online_dataset.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Union, Callable, Iterable,List +from typing import Any, Dict, Union, Callable, Iterable, List from tqdm import tqdm from torch.utils.data import Dataset from torch.distributed import 
get_rank @@ -47,14 +47,16 @@ def __init__( except ValueError: # not initialized yet, which is the case in unit test rank = 0 for data in tqdm(dataset, desc="Preprocessing data", disable=not rank == 0): - processed_data = self._preprocess_data(data, input_template, input_key,extra_input_keys, apply_chat_template) + processed_data = self._preprocess_data( + data, input_template, input_key, extra_input_keys, apply_chat_template + ) self.prompts.append(processed_data['prompt']) + #maybe can be imporved later for key in extra_input_keys: - getattr(self, key).append(processed_data[key]) #maybe can be imporved later + getattr(self, key).append(processed_data[key]) # self.prompts=np.array(self.prompts) # for key in extra_input_keys: # setattr(self, key, np.array(getattr(self,key))) - def __len__(self) -> int: """ @@ -65,7 +67,8 @@ def __len__(self) -> int: """ return len(self.prompts) - def __getitem__(self, idx: int) -> str: #can be improved later for list indexing instead of single indexing + def __getitem__(self, idx: int) -> str: + #can be improved later for list indexing instead of single indexing """ Overview: Get the item at the given index. @@ -79,12 +82,7 @@ def __getitem__(self, idx: int) -> str: #can be improved later for list indexing extra_inputs = {key: getattr(self, key)[idx] for key in self.extra_input_keys} else: extra_inputs = {} - return { - "prompt": self.prompts[idx], - "multi_modal_data":{ - **extra_inputs - } - } + return {"prompt": self.prompts[idx], "multi_modal_data": {**extra_inputs}} def _preprocess_data( self, @@ -121,7 +119,4 @@ def _preprocess_data( prompt = data[input_key] if input_template: prompt = input_template.format(prompt) - return { - "prompt": prompt, - **extra_inputs - } + return {"prompt": prompt, **extra_inputs} diff --git a/ding/utils/data/tests/test_rlhf_online_dataset.py b/ding/utils/data/tests/test_rlhf_online_dataset.py index 88c2c70afe..1e12a777dd 100644 --- a/ding/utils/data/tests/test_rlhf_online_dataset.py +++ b/ding/utils/data/tests/test_rlhf_online_dataset.py @@ -6,10 +6,12 @@ IMG_START_TOKEN = '' IMG_END_TOKEN = '' IMG_CONTEXT_NUM = 10 # user-defined number of image patches in the context + + @pytest.fixture def dataset(): # Load the dataset - hf_dataset = load_dataset("MMInstruction/VL-RewardBench",split='test') + hf_dataset = load_dataset("MMInstruction/VL-RewardBench", split='test') hf_dataset0 = hf_dataset.map( lambda x: { "query": f"{IMG_START_TOKEN}{IMG_CONTEXT_TOKEN * IMG_CONTEXT_NUM}{IMG_END_TOKEN}\n{x['query']}", @@ -31,8 +33,7 @@ def tokenizer(): def test_onlinerl_dataset_initialization(dataset, tokenizer): # Initialize OnlineRLDataset online_rl_dataset = OnlineRLDataset( - dataset=dataset, tokenizer=tokenizer, input_key="query", - extra_input_keys=["image"], apply_chat_template=True + dataset=dataset, tokenizer=tokenizer, input_key="query", extra_input_keys=["image"], apply_chat_template=True ) # Check if the dataset is initialized correctly assert len(online_rl_dataset) == len(dataset) @@ -42,8 +43,7 @@ def test_onlinerl_dataset_initialization(dataset, tokenizer): def test_onlinerl_dataset_getitem(dataset, tokenizer): # Initialize OnlineRLDataset online_rl_dataset = OnlineRLDataset( - dataset=dataset, tokenizer=tokenizer, input_key="query", - extra_input_keys=["image"], apply_chat_template=True + dataset=dataset, tokenizer=tokenizer, input_key="query", extra_input_keys=["image"], apply_chat_template=True ) # Check if __getitem__ returns the expected formatted prompt item = online_rl_dataset[0] @@ -51,4 +51,4 @@ def 
test_onlinerl_dataset_getitem(dataset, tokenizer): assert "prompt" in item assert "multi_modal_data" in item assert "image" in item['multi_modal_data'] - assert isinstance(item['prompt'],str) + assert isinstance(item['prompt'], str) diff --git a/ding/worker/collector/tests/test_vllm_collector_multi_new.py b/ding/worker/collector/tests/test_vllm_collector_multi_new.py index e61ac8fe2e..affb35a5e9 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multi_new.py +++ b/ding/worker/collector/tests/test_vllm_collector_multi_new.py @@ -16,7 +16,8 @@ class VllmActor: - def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> None: + + def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list) -> None: """ Overview: Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. @@ -29,7 +30,7 @@ def __init__(self, model_path: str,mm_processor_kwargs: dict,free_gpus:list) -> # Set CUDA_VISIBLE_DEVICES to use only free GPUs os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) self.model_path = model_path - self.mm_processor_kwargs=mm_processor_kwargs + self.mm_processor_kwargs = mm_processor_kwargs self._initialize() def _initialize(self) -> None: @@ -68,7 +69,7 @@ async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: max_tokens=max_tokens, temperature=temperature, ) - + # Using async iterator to handle vLLM's generation process # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results @@ -134,7 +135,9 @@ def __init__(self, cfg: EasyDict) -> None: extra_input_keys=cfg.extra_input_keys ) - self._model = VllmActor(model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs,free_gpus=cfg.free_gpus) + self._model = VllmActor( + model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs, free_gpus=cfg.free_gpus + ) self.reset() def reset(self) -> None: @@ -159,6 +162,7 @@ def reset_env(self, _env: Optional[Any] = None) -> None: Since LLM generation does not require a explicit policy and env, this function is empty. """ pass + async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: return await self._model.generate( prompt=prompt, @@ -166,6 +170,7 @@ async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) - max_tokens=self._cfg.max_tokens, temperature=self._cfg.temperature ) + def collect( self, n_samples: int = 100, @@ -185,11 +190,11 @@ def collect( if self._model is None: raise RuntimeError("Model not initialized. Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -211,7 +216,7 @@ def collect( responses = {prompt["prompt"]: result for prompt, result in zip(prompts, results)} return responses - + def sync_collect( self, n_samples: int = 100, @@ -231,11 +236,11 @@ def sync_collect( if self._model is None: raise RuntimeError("Model not initialized. 
Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -249,7 +254,7 @@ def sync_collect( # Run the async generate method in the event loop results = {} for prompt in prompts: - # Run the async generate method in the event loop for each prompt + # Run the async generate method in the event loop for each prompt result = loop.run_until_complete( self._model.generate( prompt=prompt, @@ -260,8 +265,8 @@ def sync_collect( ) results[prompt['prompt']] = result - return results - + return results + def collect_prompts( self, n_samples: int = 100, @@ -281,11 +286,11 @@ def collect_prompts( if self._model is None: raise RuntimeError("Model not initialized. Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -298,19 +303,17 @@ def collect_prompts( # Run the async generate method in the event loop results = {} - tasks=[] + tasks = [] for prompt in prompts: - for _ in range(num_samples_per_prompt): - # Run the async generate method in the event loop for each prompt + for _ in range(num_samples_per_prompt): + # Run the async generate method in the event loop for each prompt tasks.append(self._generate_for_prompt(prompt, num_samples_per_prompt=1)) results_list = loop.run_until_complete(asyncio.gather(*tasks)) - for i,prompt in enumerate(prompts): - results[prompt['prompt']]=[] - for result in results_list[i*num_samples_per_prompt:(i+1)*num_samples_per_prompt]: + for i, prompt in enumerate(prompts): + results[prompt['prompt']] = [] + for result in results_list[i * num_samples_per_prompt:(i + 1) * num_samples_per_prompt]: results[prompt['prompt']].append(result.outputs[0].text) - return results - - + return results @property def envstep(self) -> int: @@ -343,10 +346,8 @@ def __del__(self) -> None: Destructor for the collector. 
""" self.close() - - - - + + def get_free_gpus() -> List[int]: """ Overview: @@ -370,7 +371,8 @@ def get_free_gpus() -> List[int]: except Exception: logger.warning("Failed to get GPU stats, defaulting to GPU 0") return [0] - + + def chunk_list(original_list, t): # chunk a list into sub_lists new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] @@ -380,10 +382,12 @@ def chunk_list(original_list, t): # prepare dataset IMG_START_TOKEN = '<|vision_start|>' IMG_END_TOKEN = '<|vision_end|>' -PLACE_HOLDER='<|image_pad|>' +PLACE_HOLDER = '<|image_pad|>' + + def dataset(num=None): # Load the dataset - hf_dataset = load_dataset("/mnt/afs/wangqijian/data/rlhf_dataset_test/VL-RewardBench",split='test') + hf_dataset = load_dataset("/mnt/afs/wangqijian/data/rlhf_dataset_test/VL-RewardBench", split='test') hf_dataset0 = hf_dataset.map( lambda x: { "query": f"{IMG_START_TOKEN}{PLACE_HOLDER}{IMG_END_TOKEN}{x['query']}", @@ -395,8 +399,8 @@ def dataset(num=None): if num is None: return hf_dataset else: - ret_data=[] - for i in range(0,num): + ret_data = [] + for i in range(0, num): ret_data.append(hf_dataset[i]) return ret_data @@ -407,7 +411,7 @@ def run_vllm_collector(config): os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids collector = VllmCollector(config) # 实例化模型 #ret=collector.collect(n_samples=2,num_samples_per_prompt=4) - ret=collector.collect(n_samples=2,num_samples_per_prompt=4) + ret = collector.collect(n_samples=2, num_samples_per_prompt=4) return ret @@ -418,17 +422,18 @@ def start_collector(config): results = run_vllm_collector(config) return results -def main(tot_dataset, free_gpus,config): - num_tot=len(tot_dataset) - num_gpu=len(free_gpus) - num_per_gpu=num_tot//num_gpu - prompts_per_gpu=chunk_list(tot_dataset,num_per_gpu) + +def main(tot_dataset, free_gpus, config): + num_tot = len(tot_dataset) + num_gpu = len(free_gpus) + num_per_gpu = num_tot // num_gpu + prompts_per_gpu = chunk_list(tot_dataset, num_per_gpu) with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: futures = [] - for gpu_id,prompts_gpu in zip(free_gpus,prompts_per_gpu): - config_per_gpu=copy.deepcopy(config) - config_per_gpu.dataset=prompts_gpu - config_per_gpu.free_gpus=[gpu_id] + for gpu_id, prompts_gpu in zip(free_gpus, prompts_per_gpu): + config_per_gpu = copy.deepcopy(config) + config_per_gpu.dataset = prompts_gpu + config_per_gpu.free_gpus = [gpu_id] futures.append(executor.submit(start_collector, config_per_gpu)) # collect all results @@ -442,43 +447,40 @@ def main(tot_dataset, free_gpus,config): print(response) for prompt in list(response.keys()): f.write(f"{prompt}:\n") - for i,output in enumerate(response[prompt].outputs): + for i, output in enumerate(response[prompt].outputs): f.write(f'output_{i}:\n') f.write(f"{output.text}\n") - - -test_dataset=dataset(num=16) -free_gpus=get_free_gpus() -config = EasyDict( - # (str) LLM/VLM model path - model_path='/mnt/afs/share/Qwen2-VL-7B', - # (int) Maximum number of tokens to generate per request - max_tokens=4096, - # (float) Temperature for sampling, 0 means greedy decoding - temperature=1.0, - # (dict) Multimodal processor kwargs for vision-language models - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - },# defaul set to align with Qwen2-VL-7B - # Dataset related configs - # dataset=test_dataset, - # dataset is defined for each gpu respectively - # (str) Key to access the input data in the dataset - input_key='query', - # (bool) Whether to apply a chat template to the input - 
apply_chat_template=True, - # (str) Template for the input - input_template=None, - # (bool) Whether to shuffle the dataset - shuffle=True, - extra_input_keys=['image'], - # free_gpus is defined for each gpu respectively - # save_path is the file to store the output - save_path="your_save_path" - ) - - -main(test_dataset,free_gpus,config) \ No newline at end of file +test_dataset = dataset(num=16) +free_gpus = get_free_gpus() +config = EasyDict( + # (str) LLM/VLM model path + model_path='/mnt/afs/share/Qwen2-VL-7B', + # (int) Maximum number of tokens to generate per request + max_tokens=4096, + # (float) Temperature for sampling, 0 means greedy decoding + temperature=1.0, + # (dict) Multimodal processor kwargs for vision-language models + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, # defaul set to align with Qwen2-VL-7B + # Dataset related configs + # dataset=test_dataset, + # dataset is defined for each gpu respectively + # (str) Key to access the input data in the dataset + input_key='query', + # (bool) Whether to apply a chat template to the input + apply_chat_template=True, + # (str) Template for the input + input_template=None, + # (bool) Whether to shuffle the dataset + shuffle=True, + extra_input_keys=['image'], + # free_gpus is defined for each gpu respectively + # save_path is the file to store the output + save_path="your_save_path" +) + +main(test_dataset, free_gpus, config) diff --git a/ding/worker/collector/vllm_collector.py b/ding/worker/collector/vllm_collector.py index d1c086410c..ca37a74039 100644 --- a/ding/worker/collector/vllm_collector.py +++ b/ding/worker/collector/vllm_collector.py @@ -233,6 +233,7 @@ def reset_env(self, _env: Optional[Any] = None) -> None: Since LLM generation does not require a explicit policy and env, this function is empty. """ pass + async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: return await self._model.generate( prompt=prompt, @@ -240,6 +241,7 @@ async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) - max_tokens=self._cfg.max_tokens, temperature=self._cfg.temperature ) + def collect( self, n_samples: int = 100, @@ -259,11 +261,11 @@ def collect( if self._model is None: raise RuntimeError("Model not initialized. Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -285,7 +287,7 @@ def collect( responses = {prompt["prompt"]: result for prompt, result in zip(prompts, results)} return responses - + def sync_collect( self, n_samples: int = 100, @@ -305,11 +307,11 @@ def sync_collect( if self._model is None: raise RuntimeError("Model not initialized. 
Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -323,7 +325,7 @@ def sync_collect( # Run the async generate method in the event loop results = {} for prompt in prompts: - # Run the async generate method in the event loop for each prompt + # Run the async generate method in the event loop for each prompt result = loop.run_until_complete( self._model.generate( prompt=prompt, @@ -334,8 +336,8 @@ def sync_collect( ) results[prompt['prompt']] = result - return results - + return results + def collect_prompts( self, n_samples: int = 100, @@ -355,11 +357,11 @@ def collect_prompts( if self._model is None: raise RuntimeError("Model not initialized. Call `reset` method first.") - prompts=[] + prompts = [] for id in self._index[:n_samples]: prompts.append(self._dataset[id]) # recusively update the index - self._index = np.concatenate((self._index[n_samples:],self._index[:n_samples])) + self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) self._envstep += n_samples @@ -372,19 +374,17 @@ def collect_prompts( # Run the async generate method in the event loop results = {} - tasks=[] + tasks = [] for prompt in prompts: - for _ in range(num_samples_per_prompt): - # Run the async generate method in the event loop for each prompt + for _ in range(num_samples_per_prompt): + # Run the async generate method in the event loop for each prompt tasks.append(self._generate_for_prompt(prompt, num_samples_per_prompt=1)) results_list = loop.run_until_complete(asyncio.gather(*tasks)) - for i,prompt in enumerate(prompts): - results[prompt['prompt']]=[] - for result in results_list[i*4:(i+1)*4]: + for i, prompt in enumerate(prompts): + results[prompt['prompt']] = [] + for result in results_list[i * 4:(i + 1) * 4]: results[prompt['prompt']].append(result.outputs[0].text) - return results - - + return results @property def envstep(self) -> int: From 092e5145627b32eb767c57a1efab7470f01e626d Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Mon, 10 Feb 2025 08:07:00 +0000 Subject: [PATCH 16/18] formatted --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 8b176d3853..9d86ca5e8c 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,4 @@ [flake8] ignore=F401,F841,F403,E226,E126,W504,E265,E722,W503,W605,E741,E122,E731 max-line-length=120 - +statistics From eda18c5c88df39d4e4e2b0c5af87c73aa4488cda Mon Sep 17 00:00:00 2001 From: wqj2004 <2285705435@qq.com> Date: Mon, 10 Feb 2025 08:16:14 +0000 Subject: [PATCH 17/18] formatted --- ding/utils/data/rlhf_online_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/utils/data/rlhf_online_dataset.py b/ding/utils/data/rlhf_online_dataset.py index 08f6838d61..0386fca534 100644 --- a/ding/utils/data/rlhf_online_dataset.py +++ b/ding/utils/data/rlhf_online_dataset.py @@ -53,7 +53,7 @@ def __init__( self.prompts.append(processed_data['prompt']) #maybe can be imporved later for key in extra_input_keys: - getattr(self, key).append(processed_data[key]) + getattr(self, key).append(processed_data[key]) # self.prompts=np.array(self.prompts) # for key in extra_input_keys: # setattr(self, key, np.array(getattr(self,key))) From 81a1016454cad1b3f74e32f86bab0a954ed772d9 Mon Sep 17 00:00:00 2001 
From: wqj2004 <2285705435@qq.com> Date: Tue, 18 Feb 2025 09:08:35 +0000 Subject: [PATCH 18/18] added pytest --- .../collector/tests/test_vllm_collector.py | 187 +++++-- .../tests/test_vllm_collector_multi_new.py | 529 ++++-------------- .../tests/test_vllm_collector_multigpu.py | 276 +++------ ding/worker/collector/vllm_collector.py | 77 ++- .../halfcheetah_medium_expert_iql_config.py | 1 - .../config/halfcheetah_medium_iql_config.py | 1 - .../halfcheetah_medium_replay_iql_config.py | 1 - .../config/hopper_medium_expert_iql_config.py | 1 - dizoo/d4rl/config/hopper_medium_iql_config.py | 1 - .../config/hopper_medium_replay_iql_config.py | 1 - 10 files changed, 373 insertions(+), 702 deletions(-) diff --git a/ding/worker/collector/tests/test_vllm_collector.py b/ding/worker/collector/tests/test_vllm_collector.py index d8b7beaf93..ca210bdae7 100644 --- a/ding/worker/collector/tests/test_vllm_collector.py +++ b/ding/worker/collector/tests/test_vllm_collector.py @@ -1,15 +1,13 @@ from typing import List, Tuple, Optional -import os -import uuid -from loguru import logger -from ..vllm_collector import HuggingFaceModelGenerator +from ding.worker.collector.vllm_collector import HuggingFaceModelGenerator, get_free_gpus from vllm.assets.image import ImageAsset from enum import Enum +from datasets import load_dataset import asyncio -import nest_asyncio -# set a temperature > 0 to get multiple responses -# note that HFModelGenerator has a parameter "mm_processor_kwargs" set to align with the settings of Qwen in default -model = HuggingFaceModelGenerator('/mnt/afs/share/Qwen2-VL-7B', temperature=0.5) +from PIL import Image +import os +import concurrent.futures +import pytest class Modality(Enum): @@ -18,6 +16,24 @@ class Modality(Enum): VIDEO = "video" +def chunk_list(original_list: List, t: int): + # chunk a list into sub_lists + # base length of sublists + base_length = len(original_list) // t + # remaind length of some sub_lists + remainder = len(original_list) % t + new_list = [] + index = 0 + for i in range(t): + if i < remainder: + sublist_length = base_length + 1 + else: + sublist_length = base_length + new_list.append(original_list[index:index + sublist_length]) + index += sublist_length + return new_list + + def get_prompts_qwen(questions: list, modality: Modality) -> Tuple[List[str], Optional[List[int]]]: if modality == Modality.IMAGE: placeholder = "<|image_pad|>" @@ -50,9 +66,14 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) # Input image and question ret = {'data': [], 'question': []} for filename, question in zip(filenames, questions): - image = ImageAsset(filename) \ + if isinstance(filename, str): + image = ImageAsset(filename) \ .pil_image.convert("RGB") #img_question = "What is the content of this image?" + elif isinstance(filename, Image.Image): + image = filename + else: + raise ValueError(f"Unsupported type in filenames: {type(filename)}") img_question = question ret["data"].append(image) ret["question"].append(img_question) @@ -62,38 +83,132 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) return ret -questions = [ - "What is the content of this image?", "Please describe the image.", - "How many people are there in the image? What are they doing?" 
-] -img_names = [ - '/mnt/afs/niuyazhe/data/meme/data/Eimages/Eimages/Eimages/image_ (2)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3127)', '/mnt/afs/wangqijian/data/test/test' -] - -num_prompts = len(questions) -image_repeat_prob = None - -modality = Modality.IMAGE - -mm_input = get_multi_modal_input(modality, img_names, questions) -data = mm_input["data"] -question = mm_input["question"] -prompts, stop_token_ids = get_prompts_qwen(question, modality) - -nest_asyncio.apply() - - -async def main(): - inputs = [{"prompt": prompt, "multi_modal_data": {modality.value: data}} for prompt, data in zip(prompts, data)] +# -----------------testing single gpu vllm_actor -------------------------------- +async def single_main(model_path: str, gpu: list, temperature: float, modality: str, prompts: list, data: list): + # note that HFModelGenerator has a parameter + # "mm_processor_kwargs" set to align with the settings of Qwen in default + model = HuggingFaceModelGenerator(model_path=model_path, free_gpus=gpu, temperature=temperature) + inputs = [{"prompt": prompt, "multi_modal_data": {modality: data}} for prompt, data in zip(prompts, data)] # generate responses + response_ret = [] for in_data in inputs: responses = await model.generate(prompt=in_data, num_samples=3) # print response + response_per_prompt = [] for response, confidence in responses: - print(f"Response: {response}") + response_per_prompt.append(response) + response_ret.append(response_per_prompt) + return response_ret # run main -if __name__ == "__main__": - asyncio.run(main()) +@pytest.mark.unittest +def test_single_main(): + # set a temperature > 0 to get multiple responses + free_gpus = get_free_gpus() + model_path = 'Qwen/Qwen2-VL-7B' + temperature = 0.5 + questions = [] + img_names = [] + sample_num = 4 + hf_dataset = load_dataset("MMInstruction/VL-RewardBench", split='test') + for i in range(sample_num): + img_names.append(hf_dataset[i]["image"]) + questions.append(hf_dataset[i]["query"]) + assert len(img_names) == len(questions) + modality = Modality.IMAGE + mm_input = get_multi_modal_input(modality, img_names, questions) + data = mm_input["data"] + question = mm_input["question"] + prompts, stop_token_ids = get_prompts_qwen(question, modality) + responses = asyncio.run( + single_main( + model_path=model_path, + gpu=[free_gpus[0]], + temperature=temperature, + modality=modality.value, + prompts=prompts, + data=data + ) + ) + assert len(responses) == len(questions) + + +# -----------------testing multi gpu vllm_actor -------------------------------- +async def run_vllm_collector(gpu_list: list, prompts: List, model_path: str, temperature: float) -> List[str]: + # set visible gpu + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_list)) + # get a model on a single gpu + model = HuggingFaceModelGenerator(model_path, free_gpus=gpu_list, temperature=temperature) + + # get response for each prompts (can be improved later using async generation) + responses_list = [] + for prompt in prompts: + responses = await model.generate(prompt, num_samples=3) + for response in responses: + responses_list.append(response) + #print(f"[GPU {gpu_list}] Response: {response}") + + return responses_list + + +def start_collector(gpu_list: list, prompts: list, model_path: str, temperature: float) -> List[str]: + # event loop in a process + results = asyncio.run(run_vllm_collector(gpu_list, prompts, model_path, temperature)) + return results + + +def multi_main( + prompts: list, model_path: str, free_gpus: List[int], 
temperature: float, num_per_gpus_collector: int +) -> None: + # solve how mant collectors to use + num_collector = len(free_gpus) // num_per_gpus_collector + # slove how many gpus a collector should use + gpus_per_collector = chunk_list(free_gpus, num_collector) + # split input_prompts to collectors equally + prompts_per_gpu = chunk_list(prompts, num_collector) + with concurrent.futures.ProcessPoolExecutor(max_workers=num_collector) as executor: + futures = [] + for gpu_list, prompts_gpu in zip(gpus_per_collector, prompts_per_gpu): + futures.append(executor.submit(start_collector, gpu_list, prompts_gpu, model_path, temperature)) + + # get all results + all_results = [] + for future in concurrent.futures.as_completed(futures): + all_results.append(future.result()) + + return all_results + + +@pytest.mark.unittest +def test_multi_main(): + # get dataset + hf_dataset = load_dataset("MMInstruction/VL-RewardBench", split='test') + img_names = [] + questions = [] + num = 16 + for i in range(num): + img_names.append(hf_dataset[i]["image"]) + questions.append(hf_dataset[i]["query"]) + assert len(img_names) == len(questions) + #get gpus + free_gpus = get_free_gpus() + # set modality + modality = Modality.IMAGE + # get input + mm_input = get_multi_modal_input(modality, img_names, questions) + data = mm_input["data"] + question = mm_input["question"] + # get prompts + prompts, stop_token_ids = get_prompts_qwen(question, modality) + # set necessary parameters + model_path = 'Qwen/Qwen2-VL-7B' + temperature = 0.5 + num_gpus_per_collector = 1 + assert len(free_gpus) >= num_gpus_per_collector + # set inputs + inputs = [{"prompt": prompt, "multi_modal_data": {modality.value: data}} for prompt, data in zip(prompts, data)] + # get results + result = multi_main(inputs, model_path, free_gpus, temperature, num_gpus_per_collector) + # default num_smaples is 3, can be modified in line 93 + assert len(result) == len(questions) diff --git a/ding/worker/collector/tests/test_vllm_collector_multi_new.py b/ding/worker/collector/tests/test_vllm_collector_multi_new.py index affb35a5e9..0a255d0624 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multi_new.py +++ b/ding/worker/collector/tests/test_vllm_collector_multi_new.py @@ -1,381 +1,29 @@ from transformers import AutoTokenizer from typing import List, Tuple, Optional, Any import os -import uuid -import asyncio -import numpy as np -from loguru import logger from easydict import EasyDict -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput -from ding.utils import SERIAL_COLLECTOR_REGISTRY -from ding.worker.collector.base_serial_collector import ISerialCollector from datasets import load_dataset -from ding.utils.data import OnlineRLDataset +from ding.worker.collector.vllm_collector import VllmCollector, get_free_gpus import copy import concurrent.futures +import pytest -class VllmActor: - - def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list) -> None: - """ - Overview: - Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. - Arguments: - - model_path (str): The path to the language model. 
- """ - self.free_gpus = free_gpus - self.num_gpus = len(self.free_gpus) - assert self.num_gpus > 0, "No GPUs found" - # Set CUDA_VISIBLE_DEVICES to use only free GPUs - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) - self.model_path = model_path - self.mm_processor_kwargs = mm_processor_kwargs - self._initialize() - - def _initialize(self) -> None: - """ - Overview: - Initialize the vLLM actor with a series of arguments. - """ - logger.info("Initializing vLLM") - # TODO: Try other options in https://docs.vllm.ai/en/stable/models/engine_args.html#engine-args. - engine_args = AsyncEngineArgs( - model=self.model_path, - tensor_parallel_size=self.num_gpus, - max_num_batched_tokens=8192, - max_model_len=8192, - # enable_chunked_prefill=True, - max_num_seqs=5, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs=self.mm_processor_kwargs, - ) - self.engine = AsyncLLMEngine.from_engine_args(engine_args) - - async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: float = 0) -> RequestOutput: - """ - Overview: - Generate tactics for the current state. - Arguments: - - prompt : The prompt to generate tactics. - - num_samples (int): The number of tactics to generate. - - max_tokens (int): The maximum number of tokens to generate. - - temperature (float): The temperature for the language model, default to 0. - Returns: - - RequestOutput: The generated tactics and their log-probabilities. - """ - sampling_params = SamplingParams( - n=num_samples, - max_tokens=max_tokens, - temperature=temperature, - ) - - # Using async iterator to handle vLLM's generation process - # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs - # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results - # 3. This approach is particularly suitable for LLM inference which can be time-consuming - # 4. The request_id ensures unique identification for each generation request - async for oup in self.engine.generate( - prompt, sampling_params, request_id=str(uuid.uuid4().hex) - ): - final_output = oup - return final_output - - -@SERIAL_COLLECTOR_REGISTRY.register('vllm') -class VllmCollector(ISerialCollector): - """ - Overview: - Collector implementation for vLLM-based language models (LLM/VLM). - This collector manages the interaction with vLLM models for text generation tasks. - """ - config = dict( - # (str) LLM/VLM model path - model_path='', - # (int) Maximum number of tokens to generate per request - max_tokens=1024, - # (float) Temperature for sampling, 0 means greedy decoding - temperature=0.0, - # (dict) Multimodal processor kwargs for vision-language models - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - }, - # Dataset related configs - # (str) Key to access the input data in the dataset - input_key='input', - # (bool) Whether to apply a chat template to the input - apply_chat_template=False, - # (str) Template for the input - input_template=None, - # (bool) Whether to shuffle the dataset - shuffle=True, - ) - - def __init__(self, cfg: EasyDict) -> None: - """ - Overview: - Initialize the VllmCollector with configuration. 
- Arguments: - - cfg (:obj:`EasyDict`): Configuration for the collector including model path, generation parameters, - and dataset configuration - """ - super().__init__() - self._cfg = cfg - self._envstep = 0 - - # Initialize the tokenizer and dataset - self._tokenizer = AutoTokenizer.from_pretrained(cfg.model_path) - self._dataset = OnlineRLDataset( - dataset=cfg.dataset, - tokenizer=self._tokenizer, - input_key=cfg.input_key, - apply_chat_template=cfg.apply_chat_template, - input_template=cfg.input_template, - extra_input_keys=cfg.extra_input_keys - ) - - self._model = VllmActor( - model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs, free_gpus=cfg.free_gpus - ) - self.reset() - - def reset(self) -> None: - """ - Overview: - Reset the collector, including the dataset index. - """ - self._index = np.arange(len(self._dataset)) - if self._cfg.shuffle: - np.random.shuffle(self._index) - - def reset_policy(self, _model: Optional[str] = None) -> None: - """ - Overview: - Since LLM generation does not require a explicit policy and env, this function is empty. - """ - pass - - def reset_env(self, _env: Optional[Any] = None) -> None: - """ - Overview: - Since LLM generation does not require a explicit policy and env, this function is empty. - """ - pass - - async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: - return await self._model.generate( - prompt=prompt, - num_samples=num_samples_per_prompt, - max_tokens=self._cfg.max_tokens, - temperature=self._cfg.temperature - ) - - def collect( - self, - n_samples: int = 100, - num_samples_per_prompt: int = 1, - train_iter: int = 0, - ) -> List[Tuple[str, float]]: - """ - Overview: - Collect generated responses from the vLLM model. - Arguments: - - n_samples (:obj:`int`): Number of prompts to generate. - - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. - - train_iter (:obj:`int`): Current training iteration, used for logging. - Returns: - - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs - """ - if self._model is None: - raise RuntimeError("Model not initialized. Call `reset` method first.") - - prompts = [] - for id in self._index[:n_samples]: - prompts.append(self._dataset[id]) - # recusively update the index - self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) - - self._envstep += n_samples - - # Get the current event loop or create a new one - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - # Run the async generate method in the event loop - # Create a list of tasks for each prompt - tasks = [self._generate_for_prompt(prompt, num_samples_per_prompt) for prompt in prompts] - - # Run all tasks concurrently and collect results - results = loop.run_until_complete(asyncio.gather(*tasks)) - - # Map prompts to their corresponding results - responses = {prompt["prompt"]: result for prompt, result in zip(prompts, results)} - - return responses - - def sync_collect( - self, - n_samples: int = 100, - num_samples_per_prompt: int = 1, - train_iter: int = 0, - ) -> List[Tuple[str, float]]: - """ - Overview: - Collect generated responses from the vLLM model. - Arguments: - - n_samples (:obj:`int`): Number of prompts to generate. - - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. - - train_iter (:obj:`int`): Current training iteration, used for logging. 
- Returns: - - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs - """ - if self._model is None: - raise RuntimeError("Model not initialized. Call `reset` method first.") - - prompts = [] - for id in self._index[:n_samples]: - prompts.append(self._dataset[id]) - # recusively update the index - self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) - - self._envstep += n_samples - - # Get the current event loop or create a new one - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - # Run the async generate method in the event loop - results = {} - for prompt in prompts: - # Run the async generate method in the event loop for each prompt - result = loop.run_until_complete( - self._model.generate( - prompt=prompt, - num_samples=num_samples_per_prompt, - max_tokens=self._cfg.max_tokens, - temperature=self._cfg.temperature - ) - ) - results[prompt['prompt']] = result - - return results - - def collect_prompts( - self, - n_samples: int = 100, - num_samples_per_prompt: int = 1, - train_iter: int = 0, - ) -> List[Tuple[str, float]]: - """ - Overview: - Collect generated responses from the vLLM model. - Arguments: - - n_samples (:obj:`int`): Number of prompts to generate. - - num_samples_per_prompt (:obj:`int`): Number of samples to generate per prompt. - - train_iter (:obj:`int`): Current training iteration, used for logging. - Returns: - - responses (:obj:`List[Tuple[str, float]]`): List of (generated_text, confidence_score) pairs - """ - if self._model is None: - raise RuntimeError("Model not initialized. Call `reset` method first.") - - prompts = [] - for id in self._index[:n_samples]: - prompts.append(self._dataset[id]) - # recusively update the index - self._index = np.concatenate((self._index[n_samples:], self._index[:n_samples])) - - self._envstep += n_samples - - # Get the current event loop or create a new one - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - # Run the async generate method in the event loop - results = {} - tasks = [] - for prompt in prompts: - for _ in range(num_samples_per_prompt): - # Run the async generate method in the event loop for each prompt - tasks.append(self._generate_for_prompt(prompt, num_samples_per_prompt=1)) - results_list = loop.run_until_complete(asyncio.gather(*tasks)) - for i, prompt in enumerate(prompts): - results[prompt['prompt']] = [] - for result in results_list[i * num_samples_per_prompt:(i + 1) * num_samples_per_prompt]: - results[prompt['prompt']].append(result.outputs[0].text) - return results - - @property - def envstep(self) -> int: - """ - Overview: - Get the current environment step count. - Returns: - - count (:obj:`int`): Current environment step count - """ - return self._envstep - - @envstep.setter - def envstep(self, value: int) -> None: - """ - Overview: - Set the current environment step count. - """ - self._envstep = value - - def close(self) -> None: - """ - Overview: - Close the collector. - """ - pass - - def __del__(self) -> None: - """ - Overview: - Destructor for the collector. - """ - self.close() - - -def get_free_gpus() -> List[int]: - """ - Overview: - Get IDs of GPUs with free memory. - Returns: - - List[int]: The IDs of the free GPUs. 
- """ - try: - # Get GPU memory usage using nvidia-smi - gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ - .readlines() - free_gpus = [] - - for gpu_id, stats in enumerate(gpu_stats): - mem_used, mem_total = map(int, stats.strip().split(',')) - # Consider GPU as free if less than 5% memory is used - if mem_used / mem_total < 0.05: - free_gpus.append(gpu_id) - - return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found - except Exception: - logger.warning("Failed to get GPU stats, defaulting to GPU 0") - return [0] - - -def chunk_list(original_list, t): +def chunk_list(original_list: List, t: int) -> List[List]: # chunk a list into sub_lists - new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] + # base length of sublists + base_length = len(original_list) // t + # remaind length of some sub_lists + remainder = len(original_list) % t + new_list = [] + index = 0 + for i in range(t): + if i < remainder: + sublist_length = base_length + 1 + else: + sublist_length = base_length + new_list.append(original_list[index:index + sublist_length]) + index += sublist_length return new_list @@ -385,9 +33,9 @@ def chunk_list(original_list, t): PLACE_HOLDER = '<|image_pad|>' -def dataset(num=None): +def dataset(num: int = None) -> List: # Load the dataset - hf_dataset = load_dataset("/mnt/afs/wangqijian/data/rlhf_dataset_test/VL-RewardBench", split='test') + hf_dataset = load_dataset("MMInstruction/VL-RewardBench", split='test') hf_dataset0 = hf_dataset.map( lambda x: { "query": f"{IMG_START_TOKEN}{PLACE_HOLDER}{IMG_END_TOKEN}{x['query']}", @@ -405,17 +53,24 @@ def dataset(num=None): return ret_data -def run_vllm_collector(config): +def run_vllm_collector(config: EasyDict) -> List[dict]: + ''' + ret:[ + { + "prompt_i":output([output_text_0,output_text_1,...,]) + } + ] + ''' # set GPU for current process gpu_ids = ",".join(map(str, config.free_gpus)) os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids - collector = VllmCollector(config) # 实例化模型 + collector = VllmCollector(config) #ret=collector.collect(n_samples=2,num_samples_per_prompt=4) - ret = collector.collect(n_samples=2, num_samples_per_prompt=4) + ret = collector.collect(n_samples=config.n_samples, num_samples_per_prompt=config.num_samples_per_prompt) return ret -def start_collector(config): +def start_collector(config: EasyDict): # collect within the process # results:a dict, basic form: #{"prompt_0":[ans_0,ans_1,...,ans_n],"prompt_1":[ans_0,ans_1,...,ans_n],...} @@ -423,64 +78,84 @@ def start_collector(config): return results -def main(tot_dataset, free_gpus, config): - num_tot = len(tot_dataset) - num_gpu = len(free_gpus) - num_per_gpu = num_tot // num_gpu - prompts_per_gpu = chunk_list(tot_dataset, num_per_gpu) - with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: +def multi_vllm_main(tot_dataset, free_gpus: list, config: EasyDict): + ''' + tot_dataset: the total dataset to process + free_gpus: list of total gpus available for the task + config: user defined config about how to do the task + ''' + num_gpu_per_collector = config.num_gpus_per_collector + # how many collector to use + num_collector = len(free_gpus) // num_gpu_per_collector + # list of list, each list contains the gpus the collecor can use + gpu_per_collector = chunk_list(free_gpus, num_collector) + prompts_per_gpu = chunk_list(tot_dataset, num_collector) + with concurrent.futures.ProcessPoolExecutor(max_workers=num_collector) as executor: futures = [] - 
for gpu_id, prompts_gpu in zip(free_gpus, prompts_per_gpu): + for gpu_list, prompts_per_collector in zip(gpu_per_collector, prompts_per_gpu): config_per_gpu = copy.deepcopy(config) - config_per_gpu.dataset = prompts_gpu - config_per_gpu.free_gpus = [gpu_id] + config_per_gpu.dataset = prompts_per_collector + config_per_gpu.free_gpus = gpu_list + #config_per_gpu.n_samples = len(prompts_per_collector) + config_per_gpu.n_samples = 2 futures.append(executor.submit(start_collector, config_per_gpu)) # collect all results all_results = [] for future in concurrent.futures.as_completed(futures): all_results.append(future.result()) - - # save results - with open(config.save_path, "w") as f: - for response in all_results: - print(response) - for prompt in list(response.keys()): - f.write(f"{prompt}:\n") - for i, output in enumerate(response[prompt].outputs): - f.write(f'output_{i}:\n') - f.write(f"{output.text}\n") - - -test_dataset = dataset(num=16) -free_gpus = get_free_gpus() -config = EasyDict( - # (str) LLM/VLM model path - model_path='/mnt/afs/share/Qwen2-VL-7B', - # (int) Maximum number of tokens to generate per request - max_tokens=4096, - # (float) Temperature for sampling, 0 means greedy decoding - temperature=1.0, - # (dict) Multimodal processor kwargs for vision-language models - mm_processor_kwargs={ - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - }, # defaul set to align with Qwen2-VL-7B - # Dataset related configs - # dataset=test_dataset, - # dataset is defined for each gpu respectively - # (str) Key to access the input data in the dataset - input_key='query', - # (bool) Whether to apply a chat template to the input - apply_chat_template=True, - # (str) Template for the input - input_template=None, - # (bool) Whether to shuffle the dataset - shuffle=True, - extra_input_keys=['image'], - # free_gpus is defined for each gpu respectively - # save_path is the file to store the output - save_path="your_save_path" -) - -main(test_dataset, free_gpus, config) + return all_results + + # # save results + # with open(config.save_path, "w") as f: + # for response in all_results: + # #print(response) + # for prompt in list(response.keys()): + # f.write(f"{prompt}:\n") + # for i, output in enumerate(response[prompt].outputs): + # f.write(f'output_{i}:\n') + # f.write(f"{output.text}\n") + + +@pytest.mark.unittest +def test_multi_vllm(): + test_dataset = dataset(num=16) + free_gpus = get_free_gpus() + config = EasyDict( + # (str) LLM/VLM model path + model_path='Qwen/Qwen2-VL-7B', + # (int) Maximum number of tokens to generate per request + max_tokens=4096, + # (float) Temperature for sampling, 0 means greedy decoding + temperature=1.0, + # (dict) Multimodal processor kwargs for vision-language models + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, # defaul set to align with Qwen2-VL-7B + # Dataset related configs + # dataset=test_dataset, + # dataset is defined for each gpu respectively + # (str) Key to access the input data in the dataset + input_key='query', + # (bool) Whether to apply a chat template to the input + apply_chat_template=True, + # (str) Template for the input + input_template=None, + # (bool) Whether to shuffle the dataset + shuffle=True, + extra_input_keys=['image'], + # free_gpus is defined for each gpu respectively + # save_path is the file to store the output + save_path="your_path", + # how many gpus a collector can use + num_gpus_per_collector=1, + num_samples_per_prompt=4 + ) + result = multi_vllm_main(test_dataset, free_gpus, 
config) + collector_num = len(free_gpus) // config.num_gpus_per_collector + assert len(result) == collector_num + for response in result: + prompts = list(response.keys()) + for prompt in prompts: + assert config.num_samples_per_prompt == len(response[prompt].outputs) diff --git a/ding/worker/collector/tests/test_vllm_collector_multigpu.py b/ding/worker/collector/tests/test_vllm_collector_multigpu.py index 1c7d79e1bb..966171f523 100644 --- a/ding/worker/collector/tests/test_vllm_collector_multigpu.py +++ b/ding/worker/collector/tests/test_vllm_collector_multigpu.py @@ -1,161 +1,30 @@ from typing import List, Tuple, Optional import os -import uuid -from loguru import logger -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput from vllm.assets.image import ImageAsset from enum import Enum +from ding.worker.collector.vllm_collector import HuggingFaceModelGenerator, get_free_gpus +from PIL import Image +from datasets import load_dataset import concurrent.futures import asyncio - - -class VllmActor: - - def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list) -> None: - """ - Overview: - Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. - Arguments: - - model_path (str): The path to the language model. - """ - self.free_gpus = free_gpus - self.num_gpus = len(self.free_gpus) - assert self.num_gpus > 0, "No GPUs found" - # Set CUDA_VISIBLE_DEVICES to use only free GPUs - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.free_gpus)) - self.model_path = model_path - self.mm_processor_kwargs = mm_processor_kwargs - self._initialize() - - def _initialize(self) -> None: - """ - Overview: - Initialize the vLLM actor with a series of arguments. - """ - logger.info("Initializing vLLM") - # TODO: Try other options in https://docs.vllm.ai/en/stable/models/engine_args.html#engine-args. - engine_args = AsyncEngineArgs( - model=self.model_path, - tensor_parallel_size=self.num_gpus, - max_num_batched_tokens=8192, - max_model_len=8192, - # enable_chunked_prefill=True, - max_num_seqs=5, - # Note - mm_processor_kwargs can also be passed to generate/chat calls - mm_processor_kwargs=self.mm_processor_kwargs, - ) - self.engine = AsyncLLMEngine.from_engine_args(engine_args) - - async def generate(self, prompt, num_samples: int, max_tokens: int, temperature: float = 0) -> RequestOutput: - """ - Overview: - Generate tactics for the current state. - Arguments: - - prompt : The prompt to generate tactics. - - num_samples (int): The number of tactics to generate. - - max_tokens (int): The maximum number of tokens to generate. - - temperature (float): The temperature for the language model, default to 0. - Returns: - - RequestOutput: The generated tactics and their log-probabilities. - """ - sampling_params = SamplingParams( - n=num_samples, - max_tokens=max_tokens, - temperature=temperature, - ) - - # Using async iterator to handle vLLM's generation process - # 1. vLLM's generate method is asynchronous to prevent blocking while waiting for model outputs - # 2. async for allows streaming the generated outputs incrementally instead of waiting for all results - # 3. This approach is particularly suitable for LLM inference which can be time-consuming - # 4. 
The request_id ensures unique identification for each generation request - async for oup in self.engine.generate( - prompt, sampling_params, request_id=str(uuid.uuid4().hex) - ): - final_output = oup - return final_output - - -class HuggingFaceModelGenerator: - """ - Overview: - A LLM/VLM generator that uses Hugging Face models with vLLM as the backend. - """ - - def __init__( - self, - model_path: str, - free_gpus: list, - max_tokens: int = 1024, - temperature: float = 0, - mm_processor_kwargs: dict = { - "min_pixels": 28 * 28, - "max_pixels": 1280 * 28 * 28, - } - ) -> None: - """ - Overview: - Initialize the Hugging Face model generator. - Arguments: - - model_path (str): The path to the language model. - - max_tokens (int): The maximum number of tokens to generate, default to 1024. - - temperature (float): The temperature for the language model, default to 0. - """ - self.vllm_actor = VllmActor(model_path, mm_processor_kwargs, free_gpus) - self.max_tokens = max_tokens - self.temperature = temperature - - async def generate( - self, - prompt, - num_samples: int, - ) -> List[Tuple[str, float]]: - """ - Overview: - Generate tactics for the current state. - Arguments: - - prompt : The prompt to generate tactics. - - num_samples (int): The number of tactics to generate. - Returns: - - List[Tuple[str, float]]: The generated tactics and their log-probabilities. - - .. note:: - This method is asynchronous and returns a coroutine. - """ - response = await self.vllm_actor.generate(prompt, num_samples, self.max_tokens, self.temperature) - # Use raw logprobs as confidence scores - confidence_scores = [x.cumulative_logprob for x in response.outputs] - return [(x.text.strip(), conf) for x, conf in zip(response.outputs, confidence_scores)] - - -def get_free_gpus() -> List[int]: - """ - Overview: - Get IDs of GPUs with free memory. - Returns: - - List[int]: The IDs of the free GPUs. 
- """ - try: - # Get GPU memory usage using nvidia-smi - gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ - .readlines() - free_gpus = [] - - for gpu_id, stats in enumerate(gpu_stats): - mem_used, mem_total = map(int, stats.strip().split(',')) - # Consider GPU as free if less than 5% memory is used - if mem_used / mem_total < 0.05: - free_gpus.append(gpu_id) - - return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found - except Exception: - logger.warning("Failed to get GPU stats, defaulting to GPU 0") - return [0] - - -def chunk_list(original_list: list, t: int) -> List[list]: - # chunk the list into sub_lists - new_list = [original_list[i:i + t] for i in range(0, len(original_list), t)] +import pytest + + +def chunk_list(original_list: List, t: int): + # chunk a list into sub_lists + # base length of sublists + base_length = len(original_list) // t + # remaind length of some sub_lists + remainder = len(original_list) % t + new_list = [] + index = 0 + for i in range(t): + if i < remainder: + sublist_length = base_length + 1 + else: + sublist_length = base_length + new_list.append(original_list[index:index + sublist_length]) + index += sublist_length return new_list @@ -197,9 +66,14 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) # Input image and question ret = {'data': [], 'question': []} for filename, question in zip(filenames, questions): - image = ImageAsset(filename) \ + if isinstance(filename, str): + image = ImageAsset(filename) \ .pil_image.convert("RGB") #img_question = "What is the content of this image?" + elif isinstance(filename, Image.Image): + image = filename + else: + raise ValueError(f"Unsupported type in filenames: {type(filename)}") img_question = question ret["data"].append(image) ret["question"].append(img_question) @@ -209,84 +83,78 @@ def get_multi_modal_input(modality: Modality, filenames: list, questions: list) return ret -async def run_vllm_collector(gpu_id: int, prompts: List, model_path: str, temperature: float) -> List[str]: +async def run_vllm_collector(gpu_list: list, prompts: List, model_path: str, temperature: float) -> List[str]: # set visible gpu - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_list)) # get a model on a single gpu - model = HuggingFaceModelGenerator(model_path, free_gpus=[gpu_id], temperature=temperature) + model = HuggingFaceModelGenerator(model_path, free_gpus=gpu_list, temperature=temperature) + # get response for each prompts (can be improved later using async generation) responses_list = [] for prompt in prompts: responses = await model.generate(prompt, num_samples=3) for response in responses: responses_list.append(response) - print(f"[GPU {gpu_id}] Response: {response}") + #print(f"[GPU {gpu_list}] Response: {response}") return responses_list -def start_collector(gpu_id: int, prompts: list, model_path: str, temperature: float) -> List[str]: +def start_collector(gpu_list: list, prompts: list, model_path: str, temperature: float) -> List[str]: # event loop in a process - results = asyncio.run(run_vllm_collector(gpu_id, prompts, model_path, temperature)) + results = asyncio.run(run_vllm_collector(gpu_list, prompts, model_path, temperature)) return results -def main(prompts: list, model_path: str, free_gpus: List[int], temperature: float) -> None: - num_tot = len(prompts) - num_gpu = len(free_gpus) - num_per_gpu = num_tot // num_gpu - prompts_per_gpu = 
chunk_list(prompts, num_per_gpu) - with concurrent.futures.ProcessPoolExecutor(max_workers=len(free_gpus)) as executor: +def main(prompts: list, model_path: str, free_gpus: List[int], temperature: float, num_per_gpus_collector: int) -> None: + # solve how mant collectors to use + num_collector = len(free_gpus) // num_per_gpus_collector + # slove how many gpus a collector should use + gpus_per_collector = chunk_list(free_gpus, num_collector) + # split input_prompts to collectors equally + prompts_per_gpu = chunk_list(prompts, num_collector) + with concurrent.futures.ProcessPoolExecutor(max_workers=num_collector) as executor: futures = [] - for gpu_id, prompts_gpu in zip(free_gpus, prompts_per_gpu): - futures.append(executor.submit(start_collector, gpu_id, prompts_gpu, model_path, temperature)) + for gpu_list, prompts_gpu in zip(gpus_per_collector, prompts_per_gpu): + futures.append(executor.submit(start_collector, gpu_list, prompts_gpu, model_path, temperature)) # get all results all_results = [] for future in concurrent.futures.as_completed(futures): all_results.append(future.result()) - # save results - with open("/mnt/afs/wangqijian/tests/vllm_multi_gpu.txt", "w") as f: - for responses in all_results: - for response in responses: - f.write(f"{response}\n") - - -if __name__ == "__main__": - questions = [ - 'Please describe the image.', 'Please describe the image.', 'What\'s the text in the image?', - 'What\'s the text in the image?', 'What is in the image?', 'What is in the image?', - 'How many people are in the image?', 'How many people are in the image?', - 'What is the emotion of the main character of the image?', - 'What is the emotion of the main character of the image?', 'How many animals are in the image?', - 'How many animals are in the image?', 'What is the place of the image?', 'What is the place of the image?', - 'What is the peroson doing?', 'What is the peroson doing?' 
- ] - img_names = [ - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2127)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5394)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1160)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4956)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2212)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(3387)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4086)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4384)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5000)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(1237)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(766)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6031)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(6)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(2284)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(4533)', - '/mnt/afs/niuyazhe/data/meme/data/Cimages/Cimages/Cimages/Image_(5495)' - ] + return all_results + + +@pytest.mark.unittest +def test_main(): + # get dataset + hf_dataset = load_dataset("MMInstruction/VL-RewardBench", split='test') + img_names = [] + questions = [] + num = 16 + for i in range(num): + img_names.append(hf_dataset[i]["image"]) + questions.append(hf_dataset[i]["query"]) + assert len(img_names) == len(questions) + #get gpus free_gpus = get_free_gpus() + # set modality modality = Modality.IMAGE + # get input mm_input = get_multi_modal_input(modality, img_names, questions) data = mm_input["data"] question = mm_input["question"] + # get prompts prompts, stop_token_ids = get_prompts_qwen(question, modality) - model_path = '/mnt/afs/share/Qwen2-VL-7B' + # set necessary parameters + model_path = 'Qwen/Qwen2-VL-7B' temperature = 0.5 - main(prompts, model_path, free_gpus, temperature) + num_gpus_per_collector = 1 + assert len(free_gpus) >= num_gpus_per_collector + # set inputs + inputs = [{"prompt": prompt, "multi_modal_data": {modality.value: data}} for prompt, data in zip(prompts, data)] + # get results + result = main(inputs, model_path, free_gpus, temperature, num_gpus_per_collector) + # default num_smaples is 3, can be modified in line 93 + assert len(result) == len(questions) diff --git a/ding/worker/collector/vllm_collector.py b/ding/worker/collector/vllm_collector.py index ca37a74039..eefe35c33b 100644 --- a/ding/worker/collector/vllm_collector.py +++ b/ding/worker/collector/vllm_collector.py @@ -13,16 +13,46 @@ from .base_serial_collector import ISerialCollector +def get_free_gpus() -> List[int]: + """ + Overview: + Get IDs of GPUs with free memory. + Returns: + - List[int]: The IDs of the free GPUs. 
+ """ + try: + # Get GPU memory usage using nvidia-smi + gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ + .readlines() + free_gpus = [] + + for gpu_id, stats in enumerate(gpu_stats): + mem_used, mem_total = map(int, stats.strip().split(',')) + # Consider GPU as free if less than 5% memory is used + if mem_used / mem_total < 0.05: + free_gpus.append(gpu_id) + + return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found + except Exception: + logger.warning("Failed to get GPU stats, defaulting to GPU 0") + return [0] + + class VllmActor: - def __init__(self, model_path: str, mm_processor_kwargs: dict) -> None: + def __init__(self, model_path: str, mm_processor_kwargs: dict, free_gpus: list = None) -> None: """ Overview: Initialize the vLLM actor. For more details, please refer to https://docs.vllm.ai/en/stable. Arguments: - model_path (str): The path to the language model. + - mm_processor_kwargs(dict): Multimodal processor kwargs for vision-language models + - free_gpus(list): gpus for the model """ - self.free_gpus = self.get_free_gpus() + if free_gpus is None: + self.free_gpus = get_free_gpus() + else: + self.free_gpus = free_gpus self.num_gpus = len(self.free_gpus) assert self.num_gpus > 0, "No GPUs found" # Set CUDA_VISIBLE_DEVICES to use only free GPUs @@ -31,30 +61,6 @@ def __init__(self, model_path: str, mm_processor_kwargs: dict) -> None: self.mm_processor_kwargs = mm_processor_kwargs self._initialize() - def get_free_gpus(self) -> List[int]: - """ - Overview: - Get IDs of GPUs with free memory. - Returns: - - List[int]: The IDs of the free GPUs. - """ - try: - # Get GPU memory usage using nvidia-smi - gpu_stats = os.popen('nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader')\ - .readlines() - free_gpus = [] - - for gpu_id, stats in enumerate(gpu_stats): - mem_used, mem_total = map(int, stats.strip().split(',')) - # Consider GPU as free if less than 5% memory is used - if mem_used / mem_total < 0.05: - free_gpus.append(gpu_id) - - return free_gpus if free_gpus else [0] # Default to GPU 0 if no free GPUs found - except Exception: - logger.warning("Failed to get GPU stats, defaulting to GPU 0") - return [0] - def _initialize(self) -> None: """ Overview: @@ -113,6 +119,7 @@ class HuggingFaceModelGenerator: def __init__( self, model_path: str, + free_gpus: list, max_tokens: int = 1024, temperature: float = 0, mm_processor_kwargs: dict = { @@ -128,7 +135,7 @@ def __init__( - max_tokens (int): The maximum number of tokens to generate, default to 1024. - temperature (float): The temperature for the language model, default to 0. """ - self.vllm_actor = VllmActor(model_path, mm_processor_kwargs) + self.vllm_actor = VllmActor(model_path, mm_processor_kwargs, free_gpus) self.max_tokens = max_tokens self.temperature = temperature @@ -208,7 +215,9 @@ def __init__(self, cfg: EasyDict) -> None: extra_input_keys=cfg.extra_input_keys ) - self._model = VllmActor(model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs) + self._model = VllmActor( + model_path=cfg.model_path, mm_processor_kwargs=cfg.mm_processor_kwargs, free_gpus=cfg.free_gpus + ) self.reset() def reset(self) -> None: @@ -235,6 +244,16 @@ def reset_env(self, _env: Optional[Any] = None) -> None: pass async def _generate_for_prompt(self, prompt: str, num_samples_per_prompt: int) -> List[Tuple[str, float]]: + """ + Overview: + Generate response for the prompt. + Arguments: + - prompt(str) : The prompt to generate tactics. 
+ - num_samples_per_prompt (int): The number of tactics to generate. + Returns: + - List[Tuple[str, float]]: The generated tactics and their log-probabilities. + + """ return await self._model.generate( prompt=prompt, num_samples=num_samples_per_prompt, @@ -382,7 +401,7 @@ def collect_prompts( results_list = loop.run_until_complete(asyncio.gather(*tasks)) for i, prompt in enumerate(prompts): results[prompt['prompt']] = [] - for result in results_list[i * 4:(i + 1) * 4]: + for result in results_list[i * num_samples_per_prompt:(i + 1) * num_samples_per_prompt]: results[prompt['prompt']].append(result.outputs[0].text) return results diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py index 144feac1dd..e3aa855afe 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=17, action_shape=6, - ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_medium_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_iql_config.py index 545ecf970b..440525a320 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_iql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=17, action_shape=6, - ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py index d48a1fb472..0974735b72 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=17, action_shape=6, - ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/hopper_medium_expert_iql_config.py b/dizoo/d4rl/config/hopper_medium_expert_iql_config.py index 6aef029c5e..2eebce2771 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_iql_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=11, action_shape=3, - ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/hopper_medium_iql_config.py b/dizoo/d4rl/config/hopper_medium_iql_config.py index 8f429be268..61dbb5fac3 100644 --- a/dizoo/d4rl/config/hopper_medium_iql_config.py +++ b/dizoo/d4rl/config/hopper_medium_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=11, action_shape=3, - ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/hopper_medium_replay_iql_config.py b/dizoo/d4rl/config/hopper_medium_replay_iql_config.py index ad1b222843..df96a84aea 100644 --- a/dizoo/d4rl/config/hopper_medium_replay_iql_config.py +++ b/dizoo/d4rl/config/hopper_medium_replay_iql_config.py @@ -18,7 +18,6 @@ model=dict( obs_shape=11, action_shape=3, - ), learn=dict( data_path=None,
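
A minimal, self-contained sketch of the multi-collector pattern exercised by the new tests: partition free GPUs and prompts with a chunk_list helper, start one collector per GPU group in its own process, and run async generation inside each process with asyncio.run. This is an illustration under stated assumptions, not part of the patch: FakeGenerator, run_collector, start_collector and multi_collect are hypothetical stand-ins for HuggingFaceModelGenerator / VllmCollector and the per-GPU startup shown above, so the sketch runs with the standard library alone, without vLLM or GPUs.

import asyncio
import concurrent.futures
from typing import List


def chunk_list(original_list: List, t: int) -> List[List]:
    # split a list into t nearly equal sublists; the first len(original_list) % t get one extra item
    base, remainder = divmod(len(original_list), t)
    chunks, index = [], 0
    for i in range(t):
        size = base + 1 if i < remainder else base
        chunks.append(original_list[index:index + size])
        index += size
    return chunks


class FakeGenerator:
    # stand-in for HuggingFaceModelGenerator: echoes the prompt num_samples times

    def __init__(self, gpu_ids: List[int]) -> None:
        self.gpu_ids = gpu_ids

    async def generate(self, prompt: str, num_samples: int) -> List[str]:
        await asyncio.sleep(0)  # pretend to await the inference engine
        return [f"[gpus {self.gpu_ids}] answer {i} to: {prompt}" for i in range(num_samples)]


async def run_collector(gpu_ids: List[int], prompts: List[str]) -> List[str]:
    # one generator per collector, responses gathered prompt by prompt
    model = FakeGenerator(gpu_ids)
    outputs = []
    for prompt in prompts:
        outputs.extend(await model.generate(prompt, num_samples=3))
    return outputs


def start_collector(gpu_ids: List[int], prompts: List[str]) -> List[str]:
    # one event loop per worker process, mirroring start_collector in the tests
    return asyncio.run(run_collector(gpu_ids, prompts))


def multi_collect(prompts: List[str], free_gpus: List[int], gpus_per_collector: int = 1) -> List[List[str]]:
    # decide how many collectors fit on the available GPUs, then split GPUs and prompts evenly
    num_collectors = len(free_gpus) // gpus_per_collector
    gpu_groups = chunk_list(free_gpus, num_collectors)
    prompt_groups = chunk_list(prompts, num_collectors)
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_collectors) as executor:
        futures = [
            executor.submit(start_collector, gpus, chunk)
            for gpus, chunk in zip(gpu_groups, prompt_groups)
        ]
        return [f.result() for f in concurrent.futures.as_completed(futures)]


if __name__ == "__main__":
    results = multi_collect([f"prompt {i}" for i in range(8)], free_gpus=[0, 1], gpus_per_collector=1)
    assert len(results) == 2  # one result list per collector process
    for batch in results:
        print(len(batch), "responses")

In the actual tests, free_gpus comes from the nvidia-smi based get_free_gpus, each collector sets CUDA_VISIBLE_DEVICES to its GPU group before constructing the model, and num_samples per prompt is configurable; the process-pool and chunking logic is otherwise the same as sketched here.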