Skip to content

Commit 5bc12ca

Browse files
Initial commit for optimizer image
Signed-off-by: Thara Palanivel <[email protected]>
1 parent cea5bc7 commit 5bc12ca

File tree

4 files changed

+453
-2
lines changed

4 files changed

+453
-2
lines changed

build/Dockerfile

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# Copyright The FMS Model Optimizer Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
## Global Args #################################################################
# These are declared ahead of the first FROM; every stage that needs one of
# them re-declares it with a bare `ARG <name>`.

# Tag of the ubi9/ubi image that the base stage is built from.
ARG BASE_UBI_IMAGE_TAG=latest

# Python minor version installed in the base stage and symlinked to `python`.
ARG PYTHON_VERSION=3.11

# Name and numeric UID of the non-root account created in the base stage.
ARG USER=optimizer
ARG USER_UID=1000

# Empty: build the wheel from the copied sources.
# Non-empty: download that fms-model-optimizer version as a wheel instead.
ARG WHEEL_VERSION=""

## Enable Aimstack if requested via ENABLE_AIM set to "true"
ARG ENABLE_AIM=false
24+
25+
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base

# Re-declare the global args this stage uses.
ARG PYTHON_VERSION
ARG USER
ARG USER_UID

# Note this works for 3.9, 3.11, 3.12
# Installs the requested interpreter, exposes it as `python`, bootstraps pip
# through ensurepip, then refreshes pip/setuptools and the OS packages.
# --no-cache-dir keeps pip's download cache out of this image layer.
RUN dnf remove -y --disableplugin=subscription-manager \
        subscription-manager \
    && dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
    && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \
    && python -m ensurepip --upgrade \
    && python -m pip install --no-cache-dir --upgrade pip \
    && python -m pip install --no-cache-dir --upgrade setuptools \
    && dnf update -y \
    && dnf clean all

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Non-root account with primary group 0 (root); the home directory is made
# readable and traversable for that group.
RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
    chmod g+rx /home/${USER}
48+
49+
## Used as base of the release stage to remove unrelated packages and CVEs
FROM base AS release-base

# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf.
# The glob is quoted so the shell hands it to `dnf repoquery` verbatim instead
# of expanding it against files in the working directory. The $(...) itself
# stays unquoted on purpose: each package name must become its own argument.
RUN rpm -e $(dnf repoquery 'python3-*' -q --installed) dnf python3 yum crypto-policies-scripts
54+
55+
56+
## CUDA Base ###################################################################
FROM base AS cuda-base

# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
ENV CUDA_VERSION=12.1.0 \
    NV_CUDA_LIB_VERSION=12.1.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.1.55-1 \
    NV_CUDA_COMPAT_VERSION=530.30.02-1

# Register NVIDIA's RHEL9 repo, install the CUDA runtime and compat packages,
# and add the /usr/local/nvidia library directories to the linker config.
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-cudart-12-1-${NV_CUDA_CUDART_VERSION} \
        cuda-compat-12-1-${NV_CUDA_COMPAT_VERSION} \
    && echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \
    && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf \
    && dnf clean all

# CUDA_HOME gets its own ENV instruction. Variable substitution inside one
# instruction uses the values from *before* that instruction, so setting
# CUDA_HOME and referencing it in the same ENV would expand it to an empty
# string and produce "/bin", "/lib64" and "/extras/CUPTI/lib64" entries.
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
79+
80+
## CUDA Development ############################################################
FROM cuda-base AS cuda-devel

# Versions of the CUDA 12.1 development packages installed below.
# Ref: https://developer.nvidia.com/nccl/nccl-legacy-downloads
ENV NV_CUDA_CUDART_DEV_VERSION=12.1.55-1 \
    NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1 \
    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.3-1+cuda12.1 \
    NV_LIBNPP_DEV_VERSION=12.0.2.50-1 \
    NV_NVML_DEV_VERSION=12.1.55-1

# Development packages, one per line in alphabetical order.
RUN dnf config-manager \
        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
    && dnf install -y \
        cuda-command-line-tools-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-cudart-devel-12-1-${NV_CUDA_CUDART_DEV_VERSION} \
        cuda-libraries-devel-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-minimal-build-12-1-${NV_CUDA_LIB_VERSION} \
        cuda-nvml-devel-12-1-${NV_NVML_DEV_VERSION} \
        libcublas-devel-12-1-${NV_LIBCUBLAS_DEV_VERSION} \
        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
        libnpp-devel-12-1-${NV_LIBNPP_DEV_VERSION} \
    && dnf clean all

# Point link-time library lookup at the CUDA stub libraries.
ENV LIBRARY_PATH="${CUDA_HOME}/lib64/stubs"
104+
105+
## Full set of python installations for final image release #########################
FROM cuda-devel AS python-installations

# Re-declare the global args this stage uses.
ARG WHEEL_VERSION
ARG USER
ARG USER_UID
ARG ENABLE_AIM

# consistent arch support anywhere we compile CUDA code
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX;8.9"

RUN dnf install -y git && \
    # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
    # Twistlock detects it as H severity: Private keys stored in image
    rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \
    dnf clean all

# Everything below runs as the unprivileged user and installs with `--user`,
# so the result lands in /home/${USER}/.local.
USER ${USER}
WORKDIR /tmp
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    python -m pip install --user build

# All build inputs are owned by ${USER}, matching the user that runs the wheel
# build. NOTE(review): a root-owned .git may trip git's repository-ownership
# check if the build backend derives the version from git - confirm.
COPY --chown=${USER}:root fms_mo fms_mo
COPY --chown=${USER}:root .git .git
COPY --chown=${USER}:root pyproject.toml pyproject.toml

# Build a wheel if WHEEL_VERSION is empty, else download that wheel from PyPI.
# The wheel's path is recorded in /tmp/bdist_name for the steps that follow.
RUN if [[ -z "${WHEEL_VERSION}" ]]; \
    then python -m build --wheel --outdir /tmp; \
    else python -m pip download fms-model-optimizer==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \
    fi && \
    ls /tmp/*.whl >/tmp/bdist_name

# Install from the wheel. A single install with the [opt] extra pulls in the
# package, its dependencies and the optional ones; repeating the install of the
# same wheel afterwards adds nothing.
RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
    python -m pip install --user wheel && \
    python -m pip install --user "$(head bdist_name)[opt]"

# Optional Aimstack support.
RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
        python -m pip install --user "$(head bdist_name)[aim]"; \
    fi

# Remove the build-only helpers before ~/.local is copied into the release image.
# NOTE(review): dropping the PyPA `build` package presumably also keeps it from
# shadowing the /app/build directory the launcher imports from - confirm.
RUN python -m pip uninstall wheel build -y && \
    # Cleanup the bdist whl file
    rm $(head bdist_name) /tmp/bdist_name
152+
153+
## Final image ################################################
FROM release-base AS release

# Re-declare the global args this stage uses.
ARG ENABLE_AIM
ARG PYTHON_VERSION
ARG USER

# Ship the license text inside the image.
RUN mkdir -p /licenses
COPY LICENSE /licenses/

# /app and /tmp belong to the runtime user; group 0 gets read/write access
# (and execute/traverse where it already applies).
RUN mkdir /app \
    && chown -R ${USER}:0 /app /tmp \
    && chmod -R g+rwX /app /tmp

# Need a better way to address these hacks
# World-writable Aim profile file (only when Aim is enabled) and /.cache dir.
RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
        touch /.aim_profile \
        && chmod -R 777 /.aim_profile; \
    fi
RUN mkdir /.cache \
    && chmod -R 777 /.cache

# Copy scripts and default configs
COPY build/accelerate_launch.py /app/
COPY build/utils.py /app/build/
RUN chmod +x /app/accelerate_launch.py

ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"

WORKDIR /app
USER ${USER}

# Bring over the user-site packages installed in the python-installations
# stage and put them on the interpreter's module search path.
COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"

CMD [ "python", "/app/accelerate_launch.py" ]

build/accelerate_launch.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Copyright The FMS Model Optimizer Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Script wraps fms_mo to run with accelerate for multi and single GPU cases.
15+
Read accelerate_launch_args configuration via environment variable `FMS_MO_CONFIG_JSON_PATH`
16+
for the path to the JSON config file with parameters or `FMS_MO_CONFIG_JSON_ENV_VAR`
17+
for the encoded config string to parse.
18+
"""
19+
20+
# Standard
21+
import os
22+
import logging
23+
import subprocess
24+
import sys
25+
import traceback
26+
from pathlib import Path
27+
28+
# Third Party
29+
from accelerate.commands.launch import launch_command
30+
31+
# Local
32+
from build.utils import (
33+
process_accelerate_launch_args,
34+
)
35+
36+
from fms_mo.utils.config_utils import get_json_config
37+
from fms_mo.utils.error_logging import (
38+
write_termination_log,
39+
USER_ERROR_EXIT_CODE,
40+
INTERNAL_ERROR_EXIT_CODE,
41+
)
42+
43+
ERROR_LOG = "/dev/termination-log"
44+
45+
46+
def main():
    """Load the job config and run the optimization through accelerate's launcher.

    The config comes from ``get_json_config()``. It is turned into launch
    arguments by ``process_accelerate_launch_args`` and handed to
    ``launch_command``. On success a ``.complete`` marker file is created in
    the configured ``output_dir``.

    Every failure path logs the traceback and ends the process with
    ``sys.exit``; all but an already-classified subprocess failure also write a
    message via ``write_termination_log``:

    * ``USER_ERROR_EXIT_CODE`` for a missing file, a missing config, a missing
      ``output_dir``, or a ``TypeError``/``ValueError``/``EnvironmentError``
      raised while parsing.
    * ``INTERNAL_ERROR_EXIT_CODE`` for any other exception, including failure
      to create ``output_dir``.
    * For a ``subprocess.CalledProcessError`` the child's return code is reused
      when it is one of the two codes above, otherwise
      ``INTERNAL_ERROR_EXIT_CODE``.

    Returns:
        int: 0 when the launch completed.
    """
    # Default the termination log location unless the environment provides one.
    if not os.getenv("TERMINATION_LOG_FILE"):
        os.environ["TERMINATION_LOG_FILE"] = ERROR_LOG

    ##########
    #
    # Parse arguments
    #
    ##########
    try:
        job_config = get_json_config()
        if not job_config:
            # Adjacent literals instead of a backslash continuation, so the
            # source indentation cannot leak into the message.
            raise ValueError(
                "Must set environment variable 'FMS_MO_CONFIG_JSON_PATH' "
                "or 'FMS_MO_CONFIG_JSON_ENV_VAR'."
            )

        # Configure log_level of python native logger.
        # The JSON config value takes precedence over the LOG_LEVEL env var;
        # if neither is set, "WARNING" is used.
        log_level = job_config.get("log_level") or os.environ.get(
            "LOG_LEVEL", "WARNING"
        )
        logging.basicConfig(level=log_level.upper())

        logging.debug("About to parse args")

        args = process_accelerate_launch_args(job_config)
        logging.debug("accelerate launch parsed args: %s", args)

        # Validate here so a missing value is reported as a user error with a
        # termination-log entry instead of escaping as an unhandled exception
        # once the launch phase touches the path.
        output_dir = job_config.get("output_dir")
        if not output_dir:
            raise ValueError("'output_dir' must be set in the job config.")
    except FileNotFoundError as e:
        logging.error(traceback.format_exc())
        write_termination_log(f"Unable to load file: {e}")
        sys.exit(USER_ERROR_EXIT_CODE)
    except (TypeError, ValueError, EnvironmentError) as e:
        logging.error(traceback.format_exc())
        write_termination_log(
            f"Exception raised during optimization. This may be a problem with your input: {e}"
        )
        sys.exit(USER_ERROR_EXIT_CODE)
    except Exception as e:  # pylint: disable=broad-except
        logging.error(traceback.format_exc())
        write_termination_log(f"Unhandled exception during optimization. {e}")
        sys.exit(INTERNAL_ERROR_EXIT_CODE)

    ##########
    #
    # Launch optimization
    #
    ##########
    try:
        # Created inside the try so a failure here is logged and reported like
        # any other launch failure; exist_ok avoids the check-then-create race.
        os.makedirs(output_dir, exist_ok=True)
        # checkpoints outputted to tempdir, only final checkpoint copied to output dir
        launch_command(args)
    except subprocess.CalledProcessError as e:
        # If the subprocess throws an exception, the base exception is hidden in the
        # subprocess call and is difficult to access at this level. However, that is not
        # an issue because run_quant.py would have already written the exception
        # message to termination log.
        logging.error(traceback.format_exc())
        # The exit code that run_quant.py threw is captured in e.returncode

        return_code = e.returncode
        if return_code not in [INTERNAL_ERROR_EXIT_CODE, USER_ERROR_EXIT_CODE]:
            return_code = INTERNAL_ERROR_EXIT_CODE
            write_termination_log(f"Unhandled exception during optimization. {e}")
        sys.exit(return_code)
    except Exception as e:  # pylint: disable=broad-except
        logging.error(traceback.format_exc())
        write_termination_log(f"Unhandled exception during optimization. {e}")
        sys.exit(INTERNAL_ERROR_EXIT_CODE)

    # The .complete file will signal to users that we are finished copying
    # files over
    if os.path.exists(output_dir):
        Path(os.path.join(output_dir, ".complete")).touch()

    return 0
128+
129+
130+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status instead of dropping it.
    sys.exit(main())

0 commit comments

Comments
 (0)