diff --git a/README.md b/README.md
index a6e87f2..9c79bfe 100644
--- a/README.md
+++ b/README.md
@@ -139,6 +139,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
 | `LONG_LORA_SCALING_FACTORS` | None | `tuple` | Specify multiple scaling factors for LoRA adapters. |
 | `MAX_CPU_LORAS` | None | `int` | Maximum number of LoRAs to store in CPU memory. |
 | `FULLY_SHARDED_LORAS` | False | `bool` | Enable fully sharded LoRA layers. |
+| `LORA_MODULES` | `[]` | `list[dict]` | Add LoRA adapters from Hugging Face, e.g. `[{"name": "xx", "path": "xxx/xxxx", "base_model_name": "xxx/xxxx"}]`. |
 | `SCHEDULER_DELAY_FACTOR` | 0.0 | `float` | Apply a delay before scheduling next prompt. |
 | `ENABLE_CHUNKED_PREFILL` | False | `bool` | Enable chunked prefill requests. |
 | `SPECULATIVE_MODEL` | None | `str` | The name of the draft model to be used in speculative decoding. |
diff --git a/src/engine.py b/src/engine.py
index 88a0101..1f81c4c 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -122,31 +122,40 @@
     def __init__(self, vllm_engine):
         super().__init__(vllm_engine)
         self.served_model_name = os.getenv("OPENAI_SERVED_MODEL_NAME_OVERRIDE") or self.engine_args.model
         self.response_role = os.getenv("OPENAI_RESPONSE_ROLE") or "assistant"
+        self.lora_adapters = self._load_lora_adapters()
         asyncio.run(self._initialize_engines())
         self.raw_openai_output = bool(int(os.getenv("RAW_OPENAI_OUTPUT", 1)))
-
+
+    def _load_lora_adapters(self):
+        adapters = []
+        try:
+            adapters = json.loads(os.getenv("LORA_MODULES", "[]"))
+        except Exception as e:
+            logging.info(f"Failed to parse LORA_MODULES as JSON: {e}")
+
+        for i, adapter in enumerate(adapters):
+            try:
+                adapters[i] = LoRAModulePath(**adapter)
+                logging.info(f"Initialized LoRA adapter: {adapter}")
+            except Exception as e:
+                logging.info(f"Could not initialize LoRA adapter {adapter}: {e}")
+                continue
+        return adapters
+
     async def _initialize_engines(self):
         self.model_config = await self.llm.get_model_config()
         self.base_model_paths = [
             BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model)
         ]
-        lora_modules = os.getenv('LORA_MODULES', None)
-        if lora_modules is not None:
-            try:
-                lora_modules = json.loads(lora_modules)
-                lora_modules = [LoRAModulePath(**lora_modules)]
-            except:
-                lora_modules = None
-
         self.serving_models = OpenAIServingModels(
             engine_client=self.llm,
             model_config=self.model_config,
             base_model_paths=self.base_model_paths,
-            lora_modules=None,
+            lora_modules=self.lora_adapters,
             prompt_adapters=None,
         )
-
+        await self.serving_models.init_static_loras()
         self.chat_engine = OpenAIServingChat(
             engine_client=self.llm,
             model_config=self.model_config,
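
For reference, a minimal sketch of the kind of value `LORA_MODULES` expects and how `_load_lora_adapters` consumes it; the adapter name and both repository paths below are made-up placeholders, not real models:

```python
import json
import os

# Hypothetical example value: "sql-lora" and both repo paths are placeholders.
os.environ["LORA_MODULES"] = json.dumps([
    {"name": "sql-lora", "path": "someorg/sql-lora-adapter", "base_model_name": "someorg/base-model"}
])

# Same parsing steps as _load_lora_adapters(): decode the JSON list,
# then build one adapter entry per dict.
adapters = json.loads(os.getenv("LORA_MODULES", "[]"))
for adapter in adapters:
    print(adapter["name"], "->", adapter["path"])
```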