Skip to content

Commit 0edde69

Browse files
committed
model loading is 10s faster
1 parent 46ff443 commit 0edde69

File tree

3 files changed

+715
-0
lines changed

3 files changed

+715
-0
lines changed

server/Dockerfile.cuda

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ RUN pip install -r requirements.txt
 # RUN pip install .
 ENV ACCELERATOR=CUDA
 # Copy our application source code
+COPY vllm_patched /usr/local/lib/python3.12/dist-packages/vllm
 COPY ./homl_server ./homl_server
 WORKDIR /app/homl_server
 # The base image exposes port 8000, so we don't need to do it again.

server/homl_server/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,14 @@
 MODEL_LOAD_TIMEOUT = int(os.environ.get("HOML_MODEL_LOAD_TIMEOUT", 180))  # seconds
 # # This is the time after which a model will be unloaded if it is idle
 MODEL_UNLOAD_IDLE_TIME = int(os.environ.get("HOML_MODEL_UNLOAD_IDLE_TIME", 600))  # 10 minutes default
+module_info_cache = os.path.join(MODEL_HOME, "module_info_cache")

 os.makedirs(os.path.join(MODEL_HOME, "home"), exist_ok=True)
 os.makedirs(MODEL_LIB, exist_ok=True)
 os.makedirs(TORCH_CACHE, exist_ok=True)
+os.makedirs(module_info_cache, exist_ok=True)
 os.environ["TORCHINDUCTOR_CACHE_DIR"] = TORCH_CACHE
+os.environ["VLLM_LAZY_LOAD_MODULE_INFO_CACHE"] = module_info_cache
 # Ensure cache and lib directories exist


0 commit comments

Comments
 (0)