Commit ccafe5e

Merge branch 'main' into gemma3n-lora
2 parents dec277b + 5438967

50 files changed, +1884 −300 lines

.buildkite/test-pipeline.yaml

Lines changed: 6 additions & 2 deletions
@@ -566,8 +566,7 @@ steps:
     - tests/models/multimodal
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
-    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+    - pytest -v -s models/multimodal/processing

 - label: Multi-Modal Models Test (Standard)
   mirror_hardwares: [amdexperimental]
@@ -770,6 +769,11 @@ steps:
     - pytest -v -s plugins_tests/test_platform_plugins.py
     - pip uninstall vllm_add_dummy_platform -y
     # end platform plugin tests
+    # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+    - pip install -e ./plugins/prithvi_io_processor_plugin
+    - pytest -v -s plugins_tests/test_io_processor_plugins.py
+    - pip uninstall prithvi_io_processor_plugin -y
+    # end io_processor plugins test
     # other tests continue here:
     - pytest -v -s plugins_tests/test_scheduler_plugins.py
     - pip install -e ./plugins/vllm_add_dummy_model
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# IO Processor Plugins

IO Processor plugins allow pre- and post-processing of the model input and output for pooling models. The idea is that users can pass a custom input to vLLM, which is converted into one or more model prompts and fed to the model's `encode` method. One potential use case for such plugins is generating multi-modal data with vLLM: for example, a user feeds an image to vLLM and gets an image back as output.

When performing inference with IO Processor plugins, the prompt type is defined by the plugin, and the same applies to the final request output. vLLM does not perform any validation of input/output data; it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now, these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
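
As a condensed preview of the offline example added later in this commit, an `encode` call with a plugin loaded looks roughly like the sketch below. The `prithvi_to_tiff_india` plugin name and the prompt fields are defined by that example plugin, not by vLLM itself:

```python
from vllm import LLM
from vllm.pooling_params import PoolingParams

# The plugin name and prompt format below come from the example Prithvi
# plugin; they are plugin-specific, not part of vLLM's API.
llm = LLM(
    model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    skip_tokenizer_init=True,
    trust_remote_code=True,
    io_processor_plugin="prithvi_to_tiff_india",
)

# The plugin decides what a valid prompt looks like; here it is a dict
# pointing at a GeoTIFF image by URL.
img_prompt = dict(
    data="https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif",
    data_format="url",
    image_format="tiff",
    out_data_format="b64_json",
)

# The result carried by the output is also plugin-defined
# (a base64-encoded TIFF in this case).
outputs = llm.encode(img_prompt, pooling_params=PoolingParams(task="encode", softmax=False))
```

The full runnable version, including decoding the base64 output back into a `.tiff` file, is in the offline inference example included in this commit.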

## Writing an IO Processor Plugin

IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):

```python
IOProcessorInput = TypeVar('IOProcessorInput')
IOProcessorOutput = TypeVar('IOProcessorOutput')

class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):

    def __init__(self, vllm_config: VllmConfig):
        self.vllm_config = vllm_config

    @abstractmethod
    def pre_process(
        self,
        prompt: IOProcessorInput,
        request_id: Optional[str] = None,
        **kwargs,
    ) -> Union[PromptType, Sequence[PromptType]]:
        raise NotImplementedError

    async def pre_process_async(
        self,
        prompt: IOProcessorInput,
        request_id: Optional[str] = None,
        **kwargs,
    ) -> Union[PromptType, Sequence[PromptType]]:
        return self.pre_process(prompt, request_id, **kwargs)

    @abstractmethod
    def post_process(self,
                     model_output: Sequence[PoolingRequestOutput],
                     request_id: Optional[str] = None,
                     **kwargs) -> IOProcessorOutput:
        raise NotImplementedError

    async def post_process_async(
        self,
        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
        request_id: Optional[str] = None,
        **kwargs,
    ) -> IOProcessorOutput:
        collected_output = [item async for i, item in model_output]
        return self.post_process(collected_output, request_id, **kwargs)

    @abstractmethod
    def parse_request(self, request: Any) -> IOProcessorInput:
        raise NotImplementedError

    @abstractmethod
    def output_to_response(
            self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
        raise NotImplementedError
```

The `parse_request` method validates the user prompt and converts it into the input expected by the `pre_process`/`pre_process_async` methods.
The `pre_process*` methods take the validated plugin input and generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.

The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API server. The implementation of the `/io_processor_pooling` serving endpoint is [here](../../vllm/entrypoints/openai/serving_pooling_with_io_plugin.py).

An example implementation of a plugin that generates GeoTIFF images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please also refer to our [online](../../examples/online_serving/prithvi_geospatial_mae.py) and [offline](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py) inference examples.
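
For orientation, a minimal skeleton of such a plugin might look like the sketch below. The class, module, and field names are hypothetical, and the import paths are assumed from the interface file referenced above; the linked Prithvi plugin remains the authoritative reference for a complete implementation:

```python
from collections.abc import Sequence
from dataclasses import dataclass
from typing import Any, Optional, Union

from vllm.inputs import PromptType
from vllm.outputs import PoolingRequestOutput
from vllm.plugins.io_processors.interface import IOProcessor


@dataclass
class MyImageInput:
    # Validated plugin input, e.g. a URL pointing at the input image.
    image_url: str


@dataclass
class MyImageOutput:
    # Plugin output, e.g. a base64-encoded image.
    data: str


class MyImagePlugin(IOProcessor[MyImageInput, MyImageOutput]):

    def parse_request(self, request: Any) -> MyImageInput:
        # Validate the raw user request and convert it to the plugin input.
        return MyImageInput(image_url=request["image_url"])

    def pre_process(
        self,
        prompt: MyImageInput,
        request_id: Optional[str] = None,
        **kwargs,
    ) -> Union[PromptType, Sequence[PromptType]]:
        # Turn the plugin input into one or more vLLM model prompts,
        # e.g. by fetching the image and building multi-modal data.
        ...

    def post_process(
        self,
        model_output: Sequence[PoolingRequestOutput],
        request_id: Optional[str] = None,
        **kwargs,
    ) -> MyImageOutput:
        # Convert the pooling output back into the plugin output type.
        ...

    def output_to_response(self, plugin_output: MyImageOutput):
        # Wrap the plugin output into an IOProcessorResponse for online serving.
        ...
```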

## Using an IO Processor plugin

IO Processor plugins are loaded at engine startup, and there are two methods for specifying the name of the plugin to be loaded:

1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode.
2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json).

The order above also determines the priority: i.e., setting the plugin name via `EngineArgs` overrides any plugin name specified in the model HF config (config.json).
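
Before a plugin can be loaded by name, it has to be discoverable through vLLM's general plugin mechanism (see the `docs/design/plugin_system.md` change later in this commit): the plugin package exposes an entry point in the `vllm.io_processor_plugins` group, and the entry-point function returns the fully qualified name of the `IOProcessor` subclass. A rough sketch, with hypothetical package and module names:

```python
# setup.py of a hypothetical plugin package
from setuptools import setup

setup(
    name="my_image_plugin",
    version="0.1",
    packages=["my_image_plugin"],
    entry_points={
        "vllm.io_processor_plugins": [
            "my_image_plugin = my_image_plugin:register",
        ],
    },
)
```

```python
# my_image_plugin/__init__.py (hypothetical)
def register() -> str:
    # Return the fully qualified name of the IOProcessor subclass.
    return "my_image_plugin.processor.MyImagePlugin"
```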

docs/design/plugin_system.md

Lines changed: 2 additions & 0 deletions
@@ -49,6 +49,8 @@ Every plugin has three parts:
 
 - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
 
+- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor class's fully qualified name.
+
 ## Guidelines for Writing Plugins
 
 - **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
examples/offline_inference/prithvi_geospatial_mae_io_processor.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os

import torch

from vllm import LLM
from vllm.pooling_params import PoolingParams

# This example shows how to perform an offline inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirement - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin


def main():
    torch.set_default_dtype(torch.float16)
    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501

    img_prompt = dict(
        data=image_url,
        data_format="url",
        image_format="tiff",
        out_data_format="b64_json",
    )

    llm = LLM(
        model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        skip_tokenizer_init=True,
        trust_remote_code=True,
        enforce_eager=True,
        # Limit the maximum number of parallel requests
        # to avoid the model going OOM.
        # The maximum number depends on the available GPU memory.
        max_num_seqs=32,
        io_processor_plugin="prithvi_to_tiff_india",
    )

    pooling_params = PoolingParams(task="encode", softmax=False)
    pooler_output = llm.encode(
        img_prompt,
        pooling_params=pooling_params,
    )
    output = pooler_output[0].outputs

    print(output)
    decoded_data = base64.b64decode(output.data)

    file_path = os.path.join(os.getcwd(), "offline_prediction.tiff")
    with open(file_path, "wb") as f:
        f.write(decoded_data)

    print(f"Output file path: {file_path}")


if __name__ == "__main__":
    main()

examples/online_serving/kv_events_subscriber.py

Lines changed: 2 additions & 0 deletions
@@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
     token_ids: list[int]
     block_size: int
     lora_id: Optional[int]
+    medium: Optional[str]
 
 
 class BlockRemoved(KVCacheEvent):
     block_hashes: list[int]
+    medium: Optional[str]
 
 
 class AllBlocksCleared(KVCacheEvent):
examples/online_serving/prithvi_geospatial_mae.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import base64
import os

import requests

# This example shows how to perform an online inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirements:
# - install plugin at:
#   https://github.com/christian-pinto/prithvi_io_processor_plugin
# - start vllm in serving mode with the below args
#   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
#   --task embed --trust-remote-code
#   --skip-tokenizer-init --enforce-eager
#   --io-processor-plugin prithvi_to_tiff_india


def main():
    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
    server_endpoint = "http://localhost:8000/pooling"

    request_payload_url = {
        "data": {
            "data": image_url,
            "data_format": "url",
            "image_format": "tiff",
            "out_data_format": "b64_json",
        },
        "priority": 0,
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    }

    ret = requests.post(server_endpoint, json=request_payload_url)

    print(f"response.status_code: {ret.status_code}")
    print(f"response.reason: {ret.reason}")

    response = ret.json()

    decoded_image = base64.b64decode(response["data"]["data"])

    out_path = os.path.join(os.getcwd(), "online_prediction.tiff")

    with open(out_path, "wb") as f:
        f.write(decoded_image)


if __name__ == "__main__":
    main()

tests/conftest.py

Lines changed: 3 additions & 0 deletions
@@ -1120,6 +1120,9 @@ def _apply_model(self):
 
         return self.llm.llm_engine.collective_rpc(_apply_model)
 
+    def get_llm(self) -> LLM:
+        return self.llm
+
     def __enter__(self):
         return self