Skip to content

Commit cfa10e3

Browse files
joerunde authored and njhill committed
Install AutoGPTQ from cache
This adds the Dockerfile changes for both: (1) compiling the auto_gptq wheel, which will be pushed in a cache image from the auto-gptq-cache branch, and (2) installing auto_gptq from the cached wheel in the main builds. Signed-off-by: Joe Runde <[email protected]>
1 parent 6e9ca9a commit cfa10e3

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

Dockerfile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,15 @@ WORKDIR /usr/src
221221
COPY server/Makefile-flash-att Makefile
222222
RUN make build-flash-attention
223223

224+
## Install auto-gptq ###########################################################
225+
# Builder stage that produces the auto-gptq wheel. The wheel ends up in the
# WORKDIR set below, which the auto-gptq-cache stage copies from this stage.
FROM python-builder as auto-gptq-installer
226+
# Git ref of AutoGPTQ to build; interpolated into the pip URL in the
# `pip wheel` step below. Overridable at build time via --build-arg.
ARG AUTO_GPTQ_REF=ccb6386ebfde63c17c45807d38779a93cd25846f
227+
228+
# `pip wheel` below is run without --wheel-dir, so the built wheel is written
# into this working directory.
WORKDIR /usr/src/auto-gptq-wheel
229+
230+
# numpy is required to run auto-gptq's setup.py
231+
RUN pip install numpy
232+
# Build a wheel for auto-gptq only (--no-deps) from the pinned git ref,
# bypassing pip's cache (--no-cache-dir) and logging the full build (--verbose).
# NOTE(review): DISABLE_QIGEN=1 presumably switches off the QiGen extension in
# auto-gptq's setup.py -- confirm against the AutoGPTQ sources.
RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose
224233

225234
## Build libraries #############################################################
226235
FROM python-builder as build
@@ -257,6 +266,11 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build /usr/sr
257266
# Cache stage: carries only the /usr/src/flash-attention-v2 tree copied from
# the flash-att-v2-builder stage, at the same path.
FROM base as flash-att-v2-cache
258267
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2 /usr/src/flash-attention-v2
259268

269+
## Auto gptq cached build image
270+
# Stage that carries the wheel directory produced by the auto-gptq-installer
# stage, at the same path. The final server image bind-mounts this directory
# to install the wheel.
# NOTE(review): per the commit message this stage is meant to be published as
# a separate cache image -- confirm how the CI build targets it.
FROM base as auto-gptq-cache
271+
272+
# Cache just the wheel we built for auto-gptq
273+
COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel
260274

261275
## Final Inference Server image ################################################
262276
FROM cuda-runtime as server-release
@@ -290,6 +304,10 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${
290304
# Copy build artifacts from exllamav2 kernels builder
291305
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
292306

307+
# Copy over the auto-gptq wheel and install it
308+
# The wheel directory is bind-mounted from the auto-gptq-cache stage for the
# duration of this RUN only, so the .whl file itself is not added to a layer
# of this image; only the result of `pip install` is. The glob picks up
# whatever wheel file(s) are present in that directory.
RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
309+
pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
310+
293311
# Install server
294312
COPY proto proto
295313
COPY server server

0 commit comments

Comments
 (0)