[python] Fix formatting (#2997)

xyang16 · web-flow · commit 99f6441be7dc · 2026-02-04T19:26:15.000-08:00
diff --git a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
@@ -162,10 +162,12 @@ async def initialize(self, properties: dict):
             self.session_manager: SessionManager = SessionManager(properties)
         self.initialized = True
 
-    def _get_custom_formatter(self, adapter_name: Optional[str] = None) -> bool:
+    def _get_custom_formatter(self,
+                              adapter_name: Optional[str] = None) -> bool:
         """Check if a custom output formatter exists for the adapter or base model."""
         if adapter_name:
-            adapter_formatter = self.get_adapter_formatter_handler(adapter_name)
+            adapter_formatter = self.get_adapter_formatter_handler(
+                adapter_name)
             if adapter_formatter and adapter_formatter.output_formatter:
                 return True
         return self.output_formatter is not None
@@ -263,7 +265,9 @@ async def check_health(self):
             logger.fatal("vLLM engine is dead, terminating process")
             kill_process_tree(os.getpid())
 
-    async def inference(self, inputs: Input) -> Union[Output, AsyncGenerator[Output, None]]:
+    async def inference(
+            self,
+            inputs: Input) -> Union[Output, AsyncGenerator[Output, None]]:
         await self.check_health()
         try:
             processed_request = self.preprocess_request(inputs)
@@ -281,10 +285,12 @@ async def inference(self, inputs: Input) -> Union[Output, AsyncGenerator[Output,
             processed_request.vllm_request)
 
         # Check if custom formatter exists (applies to both streaming and non-streaming)
-        custom_formatter = self._get_custom_formatter(processed_request.adapter_name)
+        custom_formatter = self._get_custom_formatter(
+            processed_request.adapter_name)
 
         if isinstance(response, types.AsyncGeneratorType):
-            return self._handle_streaming_response(response, processed_request, custom_formatter)
+            return self._handle_streaming_response(response, processed_request,
+                                                   custom_formatter)
 
         # Non-streaming response
         if custom_formatter:
@@ -296,32 +302,34 @@ async def inference(self, inputs: Input) -> Union[Output, AsyncGenerator[Output,
             elif hasattr(formatted_response, 'model_dump'):
                 formatted_response = formatted_response.model_dump()
             return create_non_stream_output(formatted_response)
-        
+
         # LMI formatter for non-streaming
         return processed_request.non_stream_output_formatter(
             response,
             request=processed_request.vllm_request,
             tokenizer=self.tokenizer,
         )
 
-    async def _handle_streaming_response(self, response, processed_request, custom_formatter):
+    async def _handle_streaming_response(self, response, processed_request,
+                                         custom_formatter):
         """Handle streaming responses as an async generator"""
         if custom_formatter:
             # Custom formatter: apply to each chunk and yield directly
             async for chunk in response:
                 formatted_chunk = self.apply_output_formatter(
                     chunk, adapter_name=processed_request.adapter_name)
-                yield create_stream_chunk_output(formatted_chunk, last_chunk=False)
+                yield create_stream_chunk_output(formatted_chunk,
+                                                 last_chunk=False)
             yield create_stream_chunk_output("", last_chunk=True)
         else:
             # LMI formatter for streaming
             async for output in handle_streaming_response(
-                response,
-                processed_request.stream_output_formatter,
-                request=processed_request.vllm_request,
-                accumulate_chunks=processed_request.accumulate_chunks,
-                include_prompt=processed_request.include_prompt,
-                tokenizer=self.tokenizer,
+                    response,
+                    processed_request.stream_output_formatter,
+                    request=processed_request.vllm_request,
+                    accumulate_chunks=processed_request.accumulate_chunks,
+                    include_prompt=processed_request.include_prompt,
+                    tokenizer=self.tokenizer,
             ):
                 yield output
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -639,7 +639,8 @@
         "s3://djl-llm/llama-3-8b-instruct-hf/",
         "option.tensor_parallel_degree":
         1,
-        "option.max_model_len": 8192,
+        "option.max_model_len":
+        8192,
         "option.lmcache_config_file":
         "lmcache_local_storage.yaml",
         "option.kv_transfer_config":
@@ -1407,7 +1408,8 @@ def build_stateful_model(model):
     'vllm_async': build_vllm_async_model,
     'vllm_async_custom_formatters': build_vllm_async_model_custom_formatters,
     'vllm_async_custom_handler': build_vllm_async_model_with_custom_handler,
-    'vllm_async_example_formatter': build_vllm_async_model_with_example_formatter
+    'vllm_async_example_formatter':
+    build_vllm_async_model_with_example_formatter
 }
 
 if __name__ == '__main__':