Commit 5ac2dd1

Merge branch 'main' into triton-kernel
Signed-off-by: chichun-charlie-liu <[email protected]>
2 parents 92a539e + 38d4b36 commit 5ac2dd1

22 files changed: +1082 -42 lines changed

.gitignore

Lines changed: 8 additions & 1 deletion
@@ -1,8 +1,10 @@
 # Distribution / packaging
-build/
 dist/
 *.egg-info/
 
+# Build output
+/build/lib/
+
 # Unit test / coverage reports
 __pycache__
 htmlcov/
@@ -12,6 +14,8 @@ htmlcov/
 durations/*
 coverage*.xml
 qcfg.json
+models
+configs
 
 # IDEs
 .vscode/
@@ -33,6 +37,9 @@ venv/
 # Generated by spelling check
 dictionary.dic
 
+# Generated error log
+error.log
+
 # Files generated from running examples
 fms_mo.log
 data_train/
build/Dockerfile

Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
# Copyright The FMS Model Optimizer Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=latest
ARG USER=optimizer
ARG USER_UID=1000
ARG PYTHON_VERSION=3.11
ARG WHEEL_VERSION=""
## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base

ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# Note this works for 3.9, 3.11, 3.12
RUN dnf remove -y --disableplugin=subscription-manager \
        subscription-manager \
    && dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
    && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
    && python -m ensurepip --upgrade \
    && python -m pip install --upgrade pip \
    && python -m pip install --upgrade setuptools \
    && dnf update -y \
    && dnf clean all

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
    chmod g+rx /home/${USER}

## Used as the base of the Release stage to remove unrelated packages and CVEs
FROM base AS release-base

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts

## CUDA Base ###################################################################
FROM base AS cuda-base

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
    NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.1.55-1 \
    NV_CUDA_COMPAT_VERSION=530.30.02-1

RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## CUDA Development ############################################################
FROM cuda-base AS cuda-devel

# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
    NV_NVML_DEV_VERSION=12.1.55-1 \
    NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
    NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1

RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
        cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
        libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
        libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
    && dnf clean all

ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

## Full set of python installations for final image release #########################
FROM cuda-devel AS python-installations

ARG WHEEL_VERSION
ARG USER
ARG USER_UID
ARG ENABLE_AIM

# consistent arch support anywhere we compile CUDA code
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX;8.9"

RUN dnf install -y git && \
    # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
    # Twistlock detects it as H severity: Private keys stored in image
    rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \
    dnf clean all
USER ${USER}
WORKDIR /tmp
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    python -m pip install --user build
COPY --chown=${USER}:root fms_mo fms_mo
COPY .git .git
COPY pyproject.toml pyproject.toml

# Build a wheel if WHEEL_VERSION is empty, else download that wheel version from PyPI
RUN if [[ -z "${WHEEL_VERSION}" ]]; \
    then python -m build --wheel --outdir /tmp; \
    else pip download fms-model-optimizer==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \
    fi && \
    ls /tmp/*.whl >/tmp/bdist_name

# Install from the wheel
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    python -m pip install --user wheel && \
    python -m pip install --user "$(head bdist_name)[opt]" && \
    python -m pip install --user "$(head bdist_name)" && \
    python -m pip install --user "$(head bdist_name)[opt]"

RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
        python -m pip install --user "$(head bdist_name)[aim]"; \
    fi

# Clean up the wheel module. It's only needed by flash-attn install
RUN python -m pip uninstall wheel build -y && \
    # Cleanup the bdist whl file
    rm $(head bdist_name) /tmp/bdist_name

## Final image ################################################
FROM release-base AS release
ARG USER
ARG PYTHON_VERSION
ARG ENABLE_AIM

RUN mkdir -p /licenses
COPY LICENSE /licenses/

RUN mkdir /app && \
    chown -R $USER:0 /app /tmp && \
    chmod -R g+rwX /app /tmp

# Need a better way to address these hacks
RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
        touch /.aim_profile && \
        chmod -R 777 /.aim_profile; \
    fi
RUN mkdir /.cache && \
    chmod -R 777 /.cache

# Copy scripts and default configs
COPY build/accelerate_launch.py /app/
COPY build/utils.py /app/build/
RUN chmod +x /app/accelerate_launch.py

ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"

WORKDIR /app
USER ${USER}
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"

CMD [ "python", "/app/accelerate_launch.py" ]
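The global ARGs above (`PYTHON_VERSION`, `ENABLE_AIM`, `WHEEL_VERSION`, etc.) can be overridden at build time. A minimal sketch, assuming the command is run from the repository root and using an arbitrary image tag:

```sh
# Build with Aim enabled and a pinned Python version; WHEEL_VERSION is left
# empty so the wheel is built from the local source tree rather than PyPI.
docker build . -f build/Dockerfile -t fms-model-optimizer:aim \
  --build-arg PYTHON_VERSION=3.11 \
  --build-arg ENABLE_AIM=true \
  --build-arg WHEEL_VERSION=""
```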

build/README.md

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
# Building fms-model-optimizer as an Image

The Dockerfile provides a way of running FMS Model Optimizer (FMS MO). It installs the needed dependencies and adds two additional scripts that help parse the arguments passed to FMS MO. The `accelerate_launch.py` script runs by default when the image is started; it triggers FMS MO for single- or multi-GPU use by parsing the arguments and running `accelerate launch fms_mo.run_quant.py`.

## Configuration

The scripts accept a JSON-formatted config, which is set via environment variables. `FMS_MO_CONFIG_JSON_PATH` can be set to the mounted path of the JSON config. Alternatively, `FMS_MO_CONFIG_JSON_ENV_VAR` can be set to the encoded JSON config produced by the function below:

```py
import base64

def encode_json(my_json_string):
    base64_bytes = base64.b64encode(my_json_string.encode("ascii"))
    txt = base64_bytes.decode("ascii")
    return txt

with open("test_config.json") as f:
    contents = f.read()

encode_json(contents)
```
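For instance, the encoded string can be exported so the container picks it up. This is a hypothetical shell equivalent of the function above; the config filename is an assumption:

```sh
# Base64-encode test_config.json and expose it to the container via the env var.
export FMS_MO_CONFIG_JSON_ENV_VAR=$(python -c "import base64; print(base64.b64encode(open('test_config.json','rb').read()).decode('ascii'))")
```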
The keys for the JSON config are all of the flags available to use with [FMS Model Optimizer](fms_mo/training_args.py).

For configuring `accelerate launch`, use the key `accelerate_launch_args` and pass the set of flags accepted by [accelerate launch](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch). Since these flags are passed via the JSON config, each key matches the long-form flag name. For example, to enable the flag `--quiet`, use the JSON key `"quiet"`; using the short form `"q"` will fail.

For example, the config below is used to create a GPTQ checkpoint of LLAMA-3-8B with two GPUs:

```json
{
  "accelerate_launch_args": {
    "main_process_port": 1234
  },
  "model_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "training_data_path": "data_train",
  "quant_method": "gptq",
  "bits": 4,
  "group_size": 128,
  "output_dir": "/output/Meta-Llama-3-8B-GPTQ-MULTIGPU"
}
```
`num_processes` defaults to the number of GPUs allocated for optimization, unless the user sets `SET_NUM_PROCESSES_TO_NUM_GPUS` to `False`. Note that `num_processes`, the total number of processes to be launched in parallel, should match the number of GPUs to run on. The number of GPUs used can also be set via the environment variable `CUDA_VISIBLE_DEVICES`. If `num_processes=1`, the script will assume a single GPU.

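As an illustration, the sketch below restricts a run to two GPUs; it assumes the image tag and config mount used in the examples later in this document:

```sh
# Expose the host GPUs and restrict the job to GPUs 0 and 1; num_processes then
# follows the visible GPU count because SET_NUM_PROCESSES_TO_NUM_GPUS defaults
# to True in the image.
docker run --gpus all \
  --env CUDA_VISIBLE_DEVICES=0,1 \
  --env FMS_MO_CONFIG_JSON_PATH=/app/config.json \
  -v $(pwd)/config.json:/app/config.json \
  -v $MODEL_PATH:/models \
  fms-model-optimizer:mytag
```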
## Building the Image

With docker, build the image at the top level with:

```sh
docker build . -t fms-model-optimizer:mytag -f build/Dockerfile
```

## Running the Image

Run the fms-model-optimizer image with the JSON env var and mounts set up:

```sh
docker run -v config.json:/app/config.json -v $MODEL_PATH:/models --env FMS_MO_CONFIG_JSON_PATH=/app/config.json fms-model-optimizer:mytag
```

This will run `accelerate_launch.py` with the JSON config passed in.
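If you prefer passing the config inline instead of mounting a file, a hypothetical variant (assuming `FMS_MO_CONFIG_JSON_ENV_VAR` was exported as shown in the Configuration section) is:

```sh
# Pass the base64-encoded config through the environment rather than a mount.
docker run -v $MODEL_PATH:/models \
  --env FMS_MO_CONFIG_JSON_ENV_VAR="$FMS_MO_CONFIG_JSON_ENV_VAR" \
  fms-model-optimizer:mytag
```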

Below is an example Kubernetes Pod for deploying fms-model-optimizer. It requires creating PVCs with the model and input dataset, plus any mounts needed for the output quantized model:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: fms-model-optimizer-config
data:
  config.json: |
    {
      "accelerate_launch_args": {
        "main_process_port": 1234
      },
      "model_name_or_path": "meta-llama/Meta-Llama-3-8B",
      "training_data_path": "data_train",
      "quant_method": "gptq",
      "bits": 4,
      "group_size": 128,
      "output_dir": "/output/Meta-Llama-3-8B-GPTQ-MULTIGPU"
    }
---
apiVersion: v1
kind: Pod
metadata:
  name: fms-model-optimizer-test
spec:
  containers:
    - env:
        - name: FMS_MO_CONFIG_JSON_PATH
          value: /config/config.json
      image: fms-model-optimizer:mytag
      imagePullPolicy: IfNotPresent
      name: fms-mo-test
      resources:
        limits:
          nvidia.com/gpu: "2"
          memory: 200Gi
          cpu: "10"
          ephemeral-storage: 2Ti
        requests:
          memory: 80Gi
          cpu: "5"
          ephemeral-storage: 1600Gi
      volumeMounts:
        - mountPath: /data/input
          name: input-data
        - mountPath: /data/output
          name: output-data
        - mountPath: /config
          name: fms-model-optimizer-config
  restartPolicy: Never
  terminationGracePeriodSeconds: 30
  volumes:
    - name: input-data
      persistentVolumeClaim:
        claimName: input-pvc
    - name: output-data
      persistentVolumeClaim:
        claimName: output-pvc
    - name: fms-model-optimizer-config
      configMap:
        name: fms-model-optimizer-config
```
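To deploy it, save the manifest (for example as `fms-mo-pod.yaml`, a filename assumed here) and apply it:

```sh
kubectl apply -f fms-mo-pod.yaml
# Follow the optimization logs once the pod is running.
kubectl logs -f fms-model-optimizer-test
```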

The above kube resource values are not hard requirements, but they are useful when running some models (such as the LLaMA-13b model). If ephemeral storage is not defined, you will likely hit the error `The node was low on resource: ephemeral-storage. Container was using 1498072868Ki, which exceeds its request of 0.`, where the pod runs low on storage while the model is being optimized.

Note that additional `accelerate launch` arguments can be passed; however, FSDP defaults are already set, so no `accelerate_launch_args` need to be passed.

Another good example can be found [here](../examples/kfto-kueue-fms-model-optimizer.yaml): it launches a Kubernetes-native `PyTorchJob` using the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator/) with [Kueue](https://github.com/kubernetes-sigs/kueue) for queue management of optimization jobs. The KFTO example runs GPTQ quantization of LLAMA-3-8B with two GPUs.
