Skip to content

Commit 0edde69

Browse files
committed
model loading is 10s faster
1 parent 46ff443 commit 0edde69

File tree

3 files changed

+715
-0
lines changed

3 files changed

+715
-0
lines changed

server/Dockerfile.cuda

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ RUN pip install -r requirements.txt
 # RUN pip install .
 ENV ACCELERATOR=CUDA
 # Copy our application source code
+COPY vllm_patched /usr/local/lib/python3.12/dist-packages/vllm
 COPY ./homl_server ./homl_server
 WORKDIR /app/homl_server
 # The base image exposes port 8000, so we don't need to do it again.

server/homl_server/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,14 @@
 MODEL_LOAD_TIMEOUT = int(os.environ.get("HOML_MODEL_LOAD_TIMEOUT", 180))  # seconds
 # # This is the time after which a model will be unloaded if it is idle
 MODEL_UNLOAD_IDLE_TIME = int(os.environ.get("HOML_MODEL_UNLOAD_IDLE_TIME", 600))  # 10 minutes default
+module_info_cache = os.path.join(MODEL_HOME, "module_info_cache")

 os.makedirs(os.path.join(MODEL_HOME, "home"), exist_ok=True)
 os.makedirs(MODEL_LIB, exist_ok=True)
 os.makedirs(TORCH_CACHE, exist_ok=True)
+os.makedirs(module_info_cache, exist_ok=True)
 os.environ["TORCHINDUCTOR_CACHE_DIR"] = TORCH_CACHE
+os.environ["VLLM_LAZY_LOAD_MODULE_INFO_CACHE"] = module_info_cache
 # Ensure cache and lib directories exist


0 commit comments

Comments
 (0)