Skip to content

Commit 98584d3

Browse files
committed
Update to python 3.11 and flash-attention 2.3.2
and ninja 1.11.1.1
1 parent 30c33bb commit 98584d3

File tree

8 files changed

+25
-83
lines changed

8 files changed

+25
-83
lines changed

Dockerfile

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ WORKDIR /app
1212
RUN dnf remove -y --disableplugin=subscription-manager \
1313
subscription-manager \
1414
# we install newer version of requests via pip
15-
python3-requests \
15+
python3.11-requests \
1616
&& dnf install -y make \
1717
# to help with debugging
1818
procps \
@@ -128,9 +128,10 @@ RUN cargo install --path .
128128
## Tests base ##################################################################
129129
FROM base as test-base
130130

131-
RUN dnf install -y make unzip python39 python3-pip gcc openssl-devel gcc-c++ && \
131+
RUN dnf install -y make unzip python3.11 python3.11-pip gcc openssl-devel gcc-c++ && \
132132
dnf clean all && \
133-
ln -s /usr/bin/python3 /usr/local/bin/python && ln -s /usr/bin/pip3 /usr/local/bin/pip
133+
ln -fs /usr/bin/python3.11 /usr/bin/python3 && \
134+
ln -s /usr/bin/python3.11 /usr/local/bin/python && ln -s /usr/bin/pip3.11 /usr/local/bin/pip
134135

135136
RUN pip install --upgrade pip && pip install pytest && pip install pytest-asyncio
136137

@@ -141,6 +142,7 @@ ENV CUDA_VISIBLE_DEVICES=""
141142
FROM test-base as cpu-tests
142143
ARG PYTORCH_INDEX
143144
ARG PYTORCH_VERSION
145+
ARG SITE_PACKAGES=/usr/local/lib/python3.11/site-packages
144146

145147
WORKDIR /usr/src
146148

@@ -157,8 +159,7 @@ RUN cd server && \
157159
pip install ".[accelerate]" --no-cache-dir
158160

159161
# Patch codegen model changes into transformers 4.34
160-
RUN cp server/transformers_patch/modeling_codegen.py \
161-
/usr/local/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
162+
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
162163

163164
# Install router
164165
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -177,7 +178,7 @@ ARG PYTORCH_VERSION
177178
RUN dnf install -y unzip git ninja-build && dnf clean all
178179

179180
RUN cd ~ && \
180-
curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh && \
181+
curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh && \
181182
chmod +x Miniconda3-*-Linux-x86_64.sh && \
182183
bash ./Miniconda3-*-Linux-x86_64.sh -bf -p /opt/miniconda
183184

@@ -187,7 +188,7 @@ RUN rm -r /opt/miniconda/pkgs/conda-content-trust-*/info/test/tests
187188
ENV PATH=/opt/miniconda/bin:$PATH
188189

189190
# Install specific version of torch
190-
RUN pip install ninja==1.11.1 --no-cache-dir
191+
RUN pip install ninja==1.11.1.1 --no-cache-dir
191192
RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
192193

193194

@@ -229,6 +230,7 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build /usr/src/flas
229230

230231
## Final Inference Server image ################################################
231232
FROM cuda-runtime as server-release
233+
ARG SITE_PACKAGES=/opt/miniconda/lib/python3.11/site-packages
232234

233235
# Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
234236
RUN dnf install -y gcc-c++ && dnf clean all \
@@ -243,21 +245,20 @@ ENV PATH=/opt/miniconda/bin:$PATH
243245
# These could instead come from explicitly cached images
244246

245247
# Copy build artifacts from flash attention builder
246-
COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
247-
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
248-
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
248+
COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
249+
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
250+
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
249251

250252
# Copy build artifacts from flash attention v2 builder
251-
COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
253+
COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
252254

253255
# Install server
254256
COPY proto proto
255257
COPY server server
256258
RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu]" --no-cache-dir
257259

258-
# Patch codegen model changes into transformers 4.34
259-
RUN cp server/transformers_patch/modeling_codegen.py \
260-
/opt/miniconda/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
260+
# Patch codegen model changes into transformers 4.34.0
261+
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
261262

262263
# Install router
263264
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -272,7 +273,7 @@ ENV PORT=3000 \
272273
RUN chmod -R g+rwx ${HOME}
273274

274275
# Temporary for dev
275-
RUN chmod -R g+w /opt/miniconda/lib/python3.*/site-packages/text_generation_server /usr/src /usr/local/bin
276+
RUN chmod -R g+w ${SITE_PACKAGES}/text_generation_server /usr/src /usr/local/bin
276277

277278
# Run as non-root user by default
278279
USER tgis

integration_tests/poetry.lock

Lines changed: 2 additions & 31 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

integration_tests/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ description = "Integration Tests for FMaaS inference Server"
55
authors = ["Nick Hill"]
66

77
[tool.poetry.dependencies]
8-
python = "^3.9"
8+
python = "^3.11"
99

1010
[tool.poetry.group.dev.dependencies]
1111
protobuf = "^4.24.4"

server/Makefile-flash-att

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
#flash_att_commit := 6d48e14a6c2f551db96f0badc658a6279a929df3
21
flash_att_commit := v1.0.9
32

43
flash-attention:

server/Makefile-flash-att-v2

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
#flash_att_v2_commit := d30f2e1cd50185c98ed88c0684b4a603f15bee37
2-
flash_att_v2_commit := v2.0.4
1+
flash_att_v2_commit := v2.3.2
32

43
flash-attention-v2:
54
# Clone flash attention

server/poetry.lock

Lines changed: 2 additions & 32 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ authors = ["Olivier Dehaene <[email protected]>"]
88
text-generation-server = 'text_generation_server.cli:app'
99

1010
[tool.poetry.dependencies]
11-
python = ">=3.9.0,<3.13"
11+
python = ">=3.11.0,<3.13"
1212
protobuf = "^4.24.4"
1313
grpcio = "^1.59.0"
1414
grpcio-reflection = "^1.59.0"

server/text_generation_server/utils/flash_attn.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def attention(
7070
softmax_scale,
7171
False,
7272
causal,
73+
-1,
74+
-1,
7375
False,
7476
None,
7577
)

0 commit comments

Comments (0)