
Commit f22f686

Merge pull request IBM#61 from IBM/main
[pull] main from IBM:main
2 parents 20fa70e + 545bbf6 commit f22f686


11 files changed, +472 -97 lines changed


Dockerfile

Lines changed: 49 additions & 37 deletions
@@ -3,12 +3,13 @@ ARG BASE_UBI_IMAGE_TAG=9.3-1552
 ARG PROTOC_VERSION=25.2
 ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
 # ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
+ARG AUTO_GPTQ_VERSION=0.7.1

 # match PyTorch version that was used to compile flash-attention v2 pre-built wheels
 # e.g. flash-attn v2.5.2 => torch ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.0', '2.3.0.dev20240126']
 # https://github.com/Dao-AILab/flash-attention/blob/v2.5.2/.github/workflows/publish.yml#L47
 # use nightly build index for torch .dev pre-release versions
-ARG PYTORCH_VERSION=2.2.0
+ARG PYTORCH_VERSION=2.2.1

 ARG PYTHON_VERSION=3.11

@@ -35,18 +36,19 @@ ENV LANG=C.UTF-8 \
 ## CUDA Base ###################################################################
 FROM base as cuda-base

-ENV CUDA_VERSION=11.8.0 \
-    NV_CUDA_LIB_VERSION=11.8.0-1 \
+# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
+ENV CUDA_VERSION=12.1.0 \
+    NV_CUDA_LIB_VERSION=12.1.0-1 \
     NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    NV_CUDA_CUDART_VERSION=11.8.89-1 \
-    NV_CUDA_COMPAT_VERSION=520.61.05-1
+    NV_CUDA_CUDART_VERSION=12.1.55-1 \
+    NV_CUDA_COMPAT_VERSION=530.30.02-1

 RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
     && dnf install -y \
-        cuda-cudart-11-8-${NV_CUDA_CUDART_VERSION} \
-        cuda-compat-11-8-${NV_CUDA_COMPAT_VERSION} \
+        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
+        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
     && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
     && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
     && dnf clean all
@@ -59,22 +61,23 @@ ENV CUDA_HOME="/usr/local/cuda" \
 ## CUDA Development ############################################################
 FROM cuda-base as cuda-devel

-ENV NV_CUDA_CUDART_DEV_VERSION=11.8.89-1 \
-    NV_NVML_DEV_VERSION=11.8.86-1 \
-    NV_LIBCUBLAS_DEV_VERSION=11.11.3.6-1 \
-    NV_LIBNPP_DEV_VERSION=11.8.0.86-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.15.5-1+cuda11.8
+# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
+ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
+    NV_NVML_DEV_VERSION=12.1.55-1 \
+    NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
+    NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1

 RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
     && dnf install -y \
-        cuda-command-line-tools-11-8-${NV_CUDA_LIB_VERSION} \
-        cuda-libraries-devel-11-8-${NV_CUDA_LIB_VERSION} \
-        cuda-minimal-build-11-8-${NV_CUDA_LIB_VERSION} \
-        cuda-cudart-devel-11-8-${NV_CUDA_CUDART_DEV_VERSION} \
-        cuda-nvml-devel-11-8-${NV_NVML_DEV_VERSION} \
-        libcublas-devel-11-8-${NV_LIBCUBLAS_DEV_VERSION} \
-        libnpp-devel-11-8-${NV_LIBNPP_DEV_VERSION} \
+        cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
+        cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
+        cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
+        cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
+        cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
+        libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
+        libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
         libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
     && dnf clean all

@@ -199,12 +202,12 @@ ENV PATH=/opt/tgis/bin/:$PATH
 # Install specific version of torch
 RUN pip install ninja==1.11.1.1 --no-cache-dir
 RUN pip install packaging --no-cache-dir
-RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
+RUN pip install torch==$PYTORCH_VERSION+cu121 --index-url "${PYTORCH_INDEX}/cu121" --no-cache-dir


 ## Build flash attention v2 ####################################################
 FROM python-builder as flash-att-v2-builder
-ARG FLASH_ATT_VERSION=v2.5.2
+ARG FLASH_ATT_VERSION=v2.5.6

 WORKDIR /usr/src/flash-attention-v2

@@ -218,14 +221,15 @@ RUN MAX_JOBS=2 pip --verbose wheel --no-deps flash-attn==${FLASH_ATT_VERSION} \


 ## Install auto-gptq ###########################################################
-FROM python-builder as auto-gptq-installer
-ARG AUTO_GPTQ_REF=ccb6386ebfde63c17c45807d38779a93cd25846f
-
-WORKDIR /usr/src/auto-gptq-wheel
-
-# numpy is required to run auto-gptq's setup.py
-RUN pip install numpy
-RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose
+## Uncomment if a custom autogptq build is required
+#FROM python-builder as auto-gptq-installer
+#ARG AUTO_GPTQ_REF=896d8204bc89a7cfbda42bf3314e13cf4ce20b02
+#
+#WORKDIR /usr/src/auto-gptq-wheel
+#
+## numpy is required to run auto-gptq's setup.py
+#RUN pip install numpy
+#RUN DISABLE_QIGEN=1 pip wheel git+https://github.com/AutoGPTQ/AutoGPTQ@${AUTO_GPTQ_REF} --no-cache-dir --no-deps --verbose

 ## Build libraries #############################################################
 FROM python-builder as build
@@ -242,18 +246,20 @@ FROM base as flash-att-v2-cache
 COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2 /usr/src/flash-attention-v2


-## Auto gptq cached build image
-FROM base as auto-gptq-cache
-
-# Copy just the wheel we built for auto-gptq
-COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel
+## Auto gptq cached build image ################################################
+## Uncomment if a custom autogptq build is required
+#FROM base as auto-gptq-cache
+#
+## Copy just the wheel we built for auto-gptq
+#COPY --from=auto-gptq-installer /usr/src/auto-gptq-wheel /usr/src/auto-gptq-wheel


 ## Full set of python installations for server release #########################

 FROM python-builder as python-installations

 ARG PYTHON_VERSION
+ARG AUTO_GPTQ_VERSION
 ARG SITE_PACKAGES=/opt/tgis/lib/python${PYTHON_VERSION}/site-packages

 COPY --from=build /opt/tgis /opt/tgis
@@ -266,15 +272,21 @@ RUN --mount=type=bind,from=flash-att-v2-cache,src=/usr/src/flash-attention-v2,ta
     pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

 # Copy over the auto-gptq wheel and install it
-RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
-    pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+#RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \
+#    pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir
+
+# We only need to install a custom-built auto-gptq version if we need a pre-release
+# or are using a PyTorch nightly version
+RUN pip install auto-gptq=="${AUTO_GPTQ_VERSION}" --no-cache-dir

 # Install server
 # git is required to pull the fms-extras dependency
 RUN dnf install -y git && dnf clean all
 COPY proto proto
 COPY server server
-RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir
+# Extra url is required to install cuda-12 version of onnxruntime-gpu
+# Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
+RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/

 # Patch codegen model changes into transformers 4.35
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
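
Taken together, the Dockerfile changes above move the image from CUDA 11.8 / torch 2.2.0 to CUDA 12.1 / torch 2.2.1, install auto-gptq 0.7.1 from PyPI instead of building a wheel from source, and pull the CUDA 12 build of onnxruntime-gpu from the extra index. A minimal smoke-test sketch, not part of the commit, that could be run inside the built image to confirm the toolchain lines up; module and attribute names are assumptions based on the packages installed above:

# Hypothetical post-build check; run with the image's /opt/tgis python.
import torch
import auto_gptq            # installed from PyPI via AUTO_GPTQ_VERSION=0.7.1
import onnxruntime as ort   # onnx-gpu extra, CUDA 12 build from the extra index

assert torch.__version__.startswith("2.2.1"), torch.__version__
assert torch.version.cuda == "12.1", torch.version.cuda  # cu121 wheel

# On a GPU host the CUDA execution provider should be listed.
print(auto_gptq.__version__)
print(ort.get_available_providers())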

integration_tests/test_cases_bloom560m.yaml

Lines changed: 19 additions & 1 deletion
@@ -1254,7 +1254,7 @@


 # Length penalty
-- name: Length penalty
+- name: Length penalty with repetition penalty
   request:
     params:
       decoding:
@@ -1273,6 +1273,24 @@
         stopReason: EOS_TOKEN
         text: The first time I saw the movie, it was in

+# Length penalty
+- name: Length penalty
+  request:
+    params:
+      decoding:
+        length_penalty:
+          start_index: 8
+          decay_factor: 1.01
+      stopping:
+        maxNewTokens: 20
+    requests:
+      - {"text": "A very long story:\n"}
+  response:
+    responses:
+      - generatedTokenCount: 12
+        inputTokenCount: 6
+        stopReason: EOS_TOKEN
+        text: The first time I saw the movie, I was a

 # Multiple inputs
 - name: Multiple inputs
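
The new "Length penalty" case above exercises the exponential length decay on its own (without a repetition penalty): once the generated length passes start_index, the end-of-sequence score is boosted by decay_factor per extra token, so the 20-token cap is never reached and generation stops after 12 tokens. A rough sketch of that mechanic, assuming semantics similar to the exponential-decay length penalty in Hugging Face transformers; the server's actual logits processor is not shown in this diff:

import math

def eos_boost(eos_score: float, cur_len: int, start_index: int, decay_factor: float) -> float:
    """Sketch only: scale the EOS score once cur_len exceeds start_index."""
    if cur_len <= start_index:
        return eos_score
    return eos_score * math.pow(decay_factor, cur_len - start_index)

# decay_factor=1.01 (bloom-560m case above): gentle pressure, EOS arrives at 12 tokens.
# decay_factor=4.0 (mt0-small case below): strong pressure, EOS arrives at 7 tokens.
print(eos_boost(1.0, cur_len=12, start_index=8, decay_factor=1.01))
print(eos_boost(1.0, cur_len=7, start_index=3, decay_factor=4.0))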

integration_tests/test_cases_mt0small.yaml

Lines changed: 113 additions & 1 deletion
@@ -50,6 +50,100 @@
           - </s>


+# Tokenize with offsets
+- name: Tokenize with offsets
+  request_type: tokenize
+  request:
+    return_offsets: true
+    requests:
+      - {"text": "The very long story is written"}
+  response:
+    responses:
+      - offsets:
+          - end: 3
+          - end: 4
+            start: 3
+          - end: 8
+            start: 4
+          - end: 13
+            start: 8
+          - end: 19
+            start: 13
+          - end: 22
+            start: 19
+          - end: 23
+            start: 22
+          - end: 30
+            start: 23
+          - {}
+        tokenCount: 9
+
+
+# Tokenize with tokens and offsets
+- name: Tokenize with tokens and offsets
+  request_type: tokenize
+  request:
+    return_tokens: true
+    return_offsets: true
+    requests:
+      - { "text": "The very long story is written" }
+  response:
+    responses:
+      - offsets:
+          - end: 3
+          - end: 4
+            start: 3
+          - end: 8
+            start: 4
+          - end: 13
+            start: 8
+          - end: 19
+            start: 13
+          - end: 22
+            start: 19
+          - end: 23
+            start: 22
+          - end: 30
+            start: 23
+          - {}
+        tokenCount: 9
+        tokens:
+          - "\u2581The"
+          - "\u2581"
+          - very
+          - "\u2581long"
+          - "\u2581story"
+          - "\u2581is"
+          - "\u2581"
+          - written
+          - </s>
+
+
+# Tokenize with truncate
+- name: Tokenize with tokens and truncation
+  request_type: tokenize
+  request:
+    return_tokens: true
+    truncate_input_tokens: 10
+    requests:
+      - {"text": "The very long story is written by a very long story"}
+  response:
+    responses:
+      - tokenCount: 10
+        # Truncation happens on the left
+        tokens:
+          - "\u2581"
+          - written
+          - "\u2581by"
+          - "\u2581"
+          - a
+          - "\u2581"
+          - very
+          - "\u2581long"
+          - "\u2581story"
+          - </s>
+
+
 # Basic Greedy (implicit)
 - name: Basic Greedy, max new tokens (implicit)
   request:
@@ -1072,7 +1166,7 @@


 # Length penalty
-- name: Length penalty
+- name: Length penalty with repetition penalty
   request:
     params:
       decoding:
@@ -1092,6 +1186,24 @@
         text: The very long story is


+# Length penalty
+- name: Length penalty
+  request:
+    params:
+      decoding:
+        length_penalty:
+          start_index: 3
+          decay_factor: 4.0
+      stopping:
+        maxNewTokens: 20
+    requests:
+      - {"text": "A very long story:\n"}
+  response:
+    responses:
+      - generatedTokenCount: 7
+        inputTokenCount: 8
+        stopReason: EOS_TOKEN
+        text: The very long story is

 # Multiple inputs
 - name: Multiple inputs
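
The "Tokenize with offsets" cases added above return, for each token, its start and end character positions in the original input (a missing start means 0; the final {} entry is the zero-length </s> token). A small sketch of how a client might slice the input text back out of those offsets; the data below is copied from the test case, and the dict shape is only an illustration of the returned start/end pairs:

text = "The very long story is written"
# Offsets as returned in the test case above ({} -> start=0, end=0 for </s>).
offsets = [
    {"start": 0, "end": 3}, {"start": 3, "end": 4}, {"start": 4, "end": 8},
    {"start": 8, "end": 13}, {"start": 13, "end": 19}, {"start": 19, "end": 22},
    {"start": 22, "end": 23}, {"start": 23, "end": 30}, {"start": 0, "end": 0},
]

# Slice each token's surface form back out of the input string;
# the "\u2581" tokens map to the leading spaces.
for o in offsets:
    print(repr(text[o["start"]:o["end"]]))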

proto/generation.proto

Lines changed: 14 additions & 3 deletions
@@ -197,7 +197,11 @@ message TokenInfo {
 message BatchedTokenizeRequest {
     string model_id = 1;
     repeated TokenizeRequest requests = 2;
-    bool return_tokens = 3; //TBD
+    bool return_tokens = 3;
+    bool return_offsets = 4;
+
+    // Zero means don't truncate.
+    uint32 truncate_input_tokens = 5;
 }

 message BatchedTokenizeResponse {
@@ -209,10 +213,17 @@ message TokenizeRequest {
 }

 message TokenizeResponse {
+    message Offset {
+        uint32 start = 1;
+        uint32 end = 2;
+    }
+
     uint32 token_count = 1;
-    repeated string tokens = 2; // if include_tokens = true

-    // We'll possibly add more later
+    // if return_tokens = true
+    repeated string tokens = 2;
+    // if return_tokens = true
+    repeated Offset offsets = 3;
 }

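The new request fields (return_offsets, truncate_input_tokens) and the nested Offset message are what the integration tests above exercise. A client-side sketch of a tokenize call using them, assuming Python stubs generated from this proto as generation_pb2 / generation_pb2_grpc and a GenerationService that exposes a Tokenize RPC (the service definition is outside this diff); the server address and model_id are hypothetical examples:

import grpc

# Assumed stub module names from running protoc / `make gen-server` on proto/generation.proto.
import generation_pb2
import generation_pb2_grpc

channel = grpc.insecure_channel("localhost:8033")  # hypothetical TGIS gRPC address
stub = generation_pb2_grpc.GenerationServiceStub(channel)

request = generation_pb2.BatchedTokenizeRequest(
    model_id="bigscience/mt0-small",
    requests=[generation_pb2.TokenizeRequest(text="The very long story is written")],
    return_tokens=True,
    return_offsets=True,          # new field 4
    truncate_input_tokens=10,     # new field 5; zero means don't truncate
)

response = stub.Tokenize(request)
for resp in response.responses:
    print(resp.token_count, list(resp.tokens))
    for off in resp.offsets:      # new repeated Offset field
        print(off.start, off.end)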