diff --git a/.github/scripts/runner_setup.sh b/.github/scripts/runner_setup.sh index bb1b7d0976b4..e6eb3aecd88e 100755 --- a/.github/scripts/runner_setup.sh +++ b/.github/scripts/runner_setup.sh @@ -1,6 +1,8 @@ #!/bin/bash set -e -curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR="/usr/local/bin" sh -uv self update +if ! command -v uv &> /dev/null; then + curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR="/usr/local/bin" sh + uv self update +fi docker --version diff --git a/.github/workflows/pr-example.yml b/.github/workflows/pr-example.yml index e69804295944..f9f7637197aa 100644 --- a/.github/workflows/pr-example.yml +++ b/.github/workflows/pr-example.yml @@ -41,7 +41,21 @@ jobs: - uses: actions/checkout@v5 - run: .github/scripts/runner_setup.sh - run: .github/scripts/buildkitd.sh - + - name: build vllm-rayserve-ec2 image + shell: bash + run: | + aws ecr get-login-password --region ${{ secrets.AWS_REGION }} | docker login --username AWS --password-stdin ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com + IMAGE_TAG=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/ci:vllm-0.10.2-gpu-py312-cu128-ubuntu22.04-rayserve-ec2-pr-${{ github.event.pull_request.number }} + docker buildx build --progress plain \ + --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \ + --cache-to=type=inline \ + --cache-from=type=registry,ref="$IMAGE_TAG" \ + --tag "$IMAGE_TAG" \ + --target vllm-rayserve-ec2 \ + -f docker/vllm/Dockerfile.rayserve . 
+ docker push "$IMAGE_TAG" + docker rmi "$IMAGE_TAG" + example-on-g6xl-runner-1: needs: [example-on-build-runner] runs-on: diff --git a/.gitignore b/.gitignore index 18b67f20119c..126c4416f381 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__ .idea *.pyc .venv +.ruff_cache \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3ec28eba9d20..63f60f47f387 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,23 +15,14 @@ repos: # optional: add additional arguments here - --indent=2 - --write - stages: [manual] # run in CI - - repo: https://github.com/rhysd/actionlint - rev: v1.7.7 - hooks: - - id: actionlint - stages: [manual] # run in CI + stages: [pre-commit] - repo: https://github.com/scop/pre-commit-shfmt rev: v3.12.0-2 # Use the latest stable revision hooks: - id: shfmt # Optional: Add arguments to shfmt if needed, e.g., to enable "simplify" mode args: ["-s"] - - repo: https://github.com/crate-ci/typos - rev: v1.38.1 - hooks: - - id: typos - args: [--force-exclude] + stages: [pre-commit] - repo: https://github.com/hukkin/mdformat rev: 1.0.0 # Use the ref you want to point at hooks: @@ -40,17 +31,28 @@ repos: additional_dependencies: - mdformat-gfm - mdformat-black + stages: [pre-commit] - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.45.0 hooks: - id: markdownlint args: [--fix] + stages: [pre-commit] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.3 hooks: - - id: ruff-check - args: [ --fix, --output-format=github ] - id: ruff-format + stages: [pre-commit] + - id: ruff-check + - repo: https://github.com/rhysd/actionlint + rev: v1.7.7 + hooks: + - id: actionlint + - repo: https://github.com/crate-ci/typos + rev: v1.38.1 + hooks: + - id: typos + args: [--force-exclude] - repo: local hooks: - id: signoff-commit diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 97ec98b254b1..55fefbd3a911 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -28,6 +28,13 
@@
 uv pip install pre-commit
 pre-commit install
 ```
+Install go using [homebrew](https://brew.sh/); the example below assumes macOS.
+
+```bash
+brew install go
+go env -w GOPROXY=direct
+```
+
 To manually run all linters:
 
 ```bash
diff --git a/docker/vllm/Dockerfile.rayserve b/docker/vllm/Dockerfile.rayserve
new file mode 100644
index 000000000000..3eeff6528797
--- /dev/null
+++ b/docker/vllm/Dockerfile.rayserve
@@ -0,0 +1,68 @@
+FROM docker.io/vllm/vllm-openai:v0.10.2 AS base
+ARG PYTHON="python3"
+LABEL maintainer="Amazon AI"
+ARG EFA_VERSION="1.43.3"
+LABEL dlc_major_version="1"
+ENV DEBIAN_FRONTEND=noninteractive \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8 \
+    DLC_CONTAINER_TYPE=base \
+    # Python won't try to write .pyc or .pyo files on the import of source modules
+    # Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONIOENCODING=UTF-8 \
+    LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
+    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
+
+WORKDIR /
+
+COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py
+COPY ./scripts/telemetry/bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
+COPY ./scripts/setup_oss_compliance.sh setup_oss_compliance.sh
+
+RUN chmod +x /usr/local/bin/deep_learning_container.py \
+    && chmod +x /usr/local/bin/bash_telemetry.sh \
+    && echo 'source /usr/local/bin/bash_telemetry.sh' >>/etc/bash.bashrc \
+    && bash setup_oss_compliance.sh ${PYTHON} && rm setup_oss_compliance.sh \
+    # create symlink for python
+    && ln -s /usr/bin/python3 /usr/bin/python \
+    # clean up (HOME_DIR is not defined in this Dockerfile, so use /root explicitly)
+    && rm -rf /root/oss_compliance* \
+    && rm -rf /tmp/tmp* \
+    && rm -rf /tmp/uv* \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /root/.cache || true
+
+COPY ./scripts/install_efa.sh install_efa.sh
+RUN bash install_efa.sh 
${EFA_VERSION} \
+    && rm install_efa.sh \
+    && mkdir -p /tmp/nvjpeg \
+    && cd /tmp/nvjpeg \
+    && wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
+    && tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
+    && rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
+    && rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
+    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/lib64/ \
+    && cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/include/ \
+    && rm -rf /tmp/nvjpeg \
+    # remove cuobjdump and nvdisasm
+    && rm -rf /usr/local/cuda/bin/cuobjdump* \
+    && rm -rf /usr/local/cuda/bin/nvdisasm*
+
+# ====================== ray serve =========================================
+FROM base AS vllm-rayserve-ec2
+
+RUN uv pip install --system "ray[serve]==2.49.0" \
+    && uv cache clean
+
+ARG CACHE_REFRESH=0
+RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold \
+    && apt-get update \
+    && apt-get upgrade -y \
+    && apt-get clean
+
+COPY ./scripts/dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
+RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/dockerd_entrypoint.sh"]
\ No newline at end of file
diff --git a/scripts/dockerd_entrypoint.sh b/scripts/dockerd_entrypoint.sh
new file mode 100755
index 000000000000..c05dab13dfa1
--- /dev/null
+++ b/scripts/dockerd_entrypoint.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Container entrypoint: run telemetry (best effort), then launch the vLLM server.
+# exec replaces the shell so the server runs as PID 1 and receives docker stop signals.
+bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
+
+exec python3 -m vllm.entrypoints.openai.api_server "$@"
\ No newline at end of file
diff --git a/scripts/install_efa.sh b/scripts/install_efa.sh
new file mode 100755
index 000000000000..75cbc6e93116
--- /dev/null
+++ b/scripts/install_efa.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+set -ex
+
+ARCH=$(uname -m) +case $ARCH in + x86_64) + ARCH_DIR="x86_64-linux-gnu" + ;; + aarch64) + ARCH_DIR="aarch64-linux-gnu" + ;; + *) + echo "Unsupported architecture: $ARCH" + exit 1 + ;; +esac + +function check_libnccl_net_so { + OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib/${ARCH_DIR}" + NCCL_NET_SO="$OFI_LIB_DIR/libnccl-net.so" + + # Check if file exists + if [ ! -f "$NCCL_NET_SO" ]; then + echo "ERROR: $NCCL_NET_SO does not exist" + return 1 + fi +} + +function install_efa { + EFA_VERSION=$1 + OPEN_MPI_PATH="/opt/amazon/openmpi" + + # Install build time tools + apt-get update + apt-get install -y --allow-change-held-packages --no-install-recommends \ + curl \ + build-essential \ + cmake \ + git + + # Install EFA + mkdir /tmp/efa + cd /tmp/efa + curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz + tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz + cd aws-efa-installer + ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify + rm -rf /tmp/efa + # Configure Open MPI and configure NCCL parameters + mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real + echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun + echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun + chmod a+x ${OPEN_MPI_PATH}/bin/mpirun + echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf + echo NCCL_DEBUG=INFO >> /etc/nccl.conf + echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + + # Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation + apt-get install -y --no-install-recommends \ + openssh-client \ + openssh-server + mkdir -p /var/run/sshd + cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new + mv 
/etc/ssh/ssh_config.new /etc/ssh/ssh_config + # Configure OpenSSH so that nodes can communicate with each other + mkdir -p /var/run/sshd + sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + rm -rf /root/.ssh/ + mkdir -p /root/.ssh/ + ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa + cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys + printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + + # Remove build time tools + # apt-get remove -y + # curl + # build-essential + # cmake + # git + + # Cleanup + apt-get clean + apt-get autoremove -y + rm -rf /var/lib/apt/lists/* + ldconfig + check_libnccl_net_so +} + +# idiomatic parameter and option handling in sh +while test $# -gt 0 +do + case "$1" in + [0-9].[0-9]*.[0-9]*) install_efa $1; + ;; + *) echo "bad argument $1"; exit 1 + ;; + esac + shift +done diff --git a/scripts/setup_oss_compliance.sh b/scripts/setup_oss_compliance.sh new file mode 100755 index 000000000000..426f8fb52f63 --- /dev/null +++ b/scripts/setup_oss_compliance.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -ex + +function install_oss_compliance { + HOME_DIR="/root" + PYTHON=$1 + + if [ -z "$PYTHON" ]; then + echo "Python version not specified. Using default Python." 
+        PYTHON="python3"
+    fi
+    curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip
+    ${PYTHON} -c "import zipfile, os; zipfile.ZipFile('/root/oss_compliance.zip').extractall('/root/'); os.remove('/root/oss_compliance.zip')"
+    cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance
+    chmod +x /usr/local/bin/testOSSCompliance
+    chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh
+    ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON}
+    rm -rf ${HOME_DIR}/oss_compliance*
+    rm -rf /tmp/tmp*
+    # Remove the cache: removing it is required for security verification
+    rm -rf /root/.cache || true
+}
+
+while test $# -gt 0
+do
+    case "$1" in
+        python*) install_oss_compliance $1;
+            ;;
+        *) echo "bad argument $1"; exit 1
+            ;;
+    esac
+    shift
+done
\ No newline at end of file
diff --git a/scripts/telemetry/bash_telemetry.sh b/scripts/telemetry/bash_telemetry.sh
new file mode 100755
index 000000000000..390000bacfca
--- /dev/null
+++ b/scripts/telemetry/bash_telemetry.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# telemetry.sh
+if [ -f /usr/local/bin/deep_learning_container.py ] && [[ -z "${OPT_OUT_TRACKING}" || "${OPT_OUT_TRACKING,,}" != "true" ]]; then
+    (
+        python /usr/local/bin/deep_learning_container.py \
+            --framework "${FRAMEWORK}" \
+            --framework-version "${FRAMEWORK_VERSION}" \
+            --container-type "${CONTAINER_TYPE}" \
+            &>/dev/null &
+    )
+fi
diff --git a/scripts/telemetry/deep_learning_container.py b/scripts/telemetry/deep_learning_container.py
new file mode 100755
index 000000000000..35e730d745d6
--- /dev/null
+++ b/scripts/telemetry/deep_learning_container.py
@@ -0,0 +1,395 @@
+# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. 
A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +import argparse +import json +import logging +import multiprocessing +import os +import re +import signal +import sys + +import botocore.session +import requests + +TIMEOUT_SECS = 5 +REGION_MAPPING = { + "ap-northeast-1": "ddce303c", + "ap-northeast-2": "528c8d92", + "ap-southeast-1": "c35f9f00", + "ap-southeast-2": "d2add9c0", + "ap-south-1": "9deb4123", + "ca-central-1": "b95e2bf4", + "eu-central-1": "bfec3957", + "eu-north-1": "b453c092", + "eu-west-1": "d763c260", + "eu-west-2": "ea20d193", + "eu-west-3": "1894043c", + "sa-east-1": "030b4357", + "us-east-1": "487d6534", + "us-east-2": "72252b46", + "us-west-1": "d02c1125", + "us-west-2": "d8c0d063", + "af-south-1": "08ea8dc5", + "eu-south-1": "29566eac", + "me-south-1": "7ea07793", + "ap-southeast-7": "1699f14f", + "ap-southeast-3": "be0a3174", + "me-central-1": "6e06aaeb", + "ap-east-1": "5e1fbf92", + "ap-south-2": "50209442", + "ap-northeast-3": "fa298003", + "ap-southeast-5": "5852cd87", + "us-northeast-1": "bbf9e961", + "ap-southeast-4": "dc6f76ce", + "mx-central-1": "ed0da79c", + "il-central-1": "2fb2448e", + "ap-east-2": "8947749e", + "ca-west-1": "ea83ea06", + "eu-south-2": "df2c9d70", + "eu-central-2": "aa7aabcc", +} + + +def requests_helper(url, headers=None, timeout=0.1): + """ + Requests to get instance metadata using imdsv1 and imdsv2 + :param url: str, url to get the request + :param headers: str, headers needed to make a request + :param timeout: float, timeout value for a request + """ + response = None + try: + if headers: + response = requests.get(url, headers=headers, timeout=timeout) + else: + response = requests.get(url, timeout=timeout) 
+ + except requests.exceptions.RequestException as e: + logging.error("Request exception: {}".format(e)) + + return response + + +def requests_helper_imds(url, token=None): + """ + Requests to get instance metadata using imdsv1 and imdsv2 + :param url: str, url to get the request + :param token: str, token is needed to use imdsv2 + """ + response_text = None + response = None + headers = None + if token: + headers = {"X-aws-ec2-metadata-token": token} + timeout = 1 + try: + while timeout <= 3: + if headers: + response = requests.get(url, headers=headers, timeout=timeout) + else: + response = requests.get(url, timeout=timeout) + if response: + break + timeout += 1 + + except requests.exceptions.RequestException as e: + logging.error("Request exception: {}".format(e)) + + if response is not None and not (400 <= response.status_code < 600): + response_text = response.text + + return response_text + + +def get_imdsv2_token(): + """ + Retrieve token using imdsv2 service + """ + response = None + token = None + headers = {"X-aws-ec2-metadata-token-ttl-seconds": "600"} + url = "http://169.254.169.254/latest/api/token" + timeout = 1 + + try: + while timeout <= 3: + response = requests.put(url, headers=headers, timeout=timeout) + if response: + break + timeout += 1 + except requests.exceptions.RequestException as e: + logging.error("Request exception: {}".format(e)) + + if response is not None and not (400 <= response.status_code < 600): + token = response.text + + return token + + +def _validate_instance_id(instance_id): + """ + Validate instance ID + """ + instance_id_regex = r"^(i-\S{17})" + compiled_regex = re.compile(instance_id_regex) + match = compiled_regex.match(instance_id) + + if not match: + return None + + return match.group(1) + + +def _retrieve_instance_id(token=None): + """ + Retrieve instance ID from instance metadata service + """ + instance_id = None + instance_url = "http://169.254.169.254/latest/meta-data/instance-id" + + if token: + instance_id = 
requests_helper_imds(instance_url, token) + else: + instance_id = requests_helper_imds(instance_url) + + if instance_id: + instance_id = _validate_instance_id(instance_id) + + return instance_id + + +def _retrieve_instance_region(token=None): + """ + Retrieve instance region from instance metadata service + """ + region = None + response_json = None + + region_url = "http://169.254.169.254/latest/dynamic/instance-identity/document" + + if token: + response_text = requests_helper_imds(region_url, token) + else: + response_text = requests_helper_imds(region_url) + + if response_text: + response_json = json.loads(response_text) + + if response_json["region"] in REGION_MAPPING: + region = response_json["region"] + + return region + + +def _retrieve_device(): + return ( + "gpu" + if os.path.isdir("/usr/local/cuda") + else ( + "eia" + if os.path.isdir("/opt/ei_tools") + else ( + "neuron" + if os.path.exists("/usr/local/bin/tensorflow_model_server_neuron") + else "cpu" + ) + ) + ) + + +def _retrieve_cuda(): + cuda_version = "" + try: + cuda_path = os.path.basename(os.readlink("/usr/local/cuda")) + cuda_version_search = re.search(r"\d+\.\d+", cuda_path) + cuda_version = "" if not cuda_version_search else cuda_version_search.group() + except Exception as e: + logging.error(f"Failed to get cuda path: {e}") + return cuda_version + + +def _retrieve_os(): + version = "" + name = "" + with open("/etc/os-release", "r") as f: + for line in f.readlines(): + if re.match(r"^ID=\w+$", line): + name = re.search(r"^ID=(\w+)$", line).group(1) + if re.match(r'^VERSION_ID="\d+\.\d+"$', line): + version = re.search(r'^VERSION_ID="(\d+\.\d+)"$', line).group(1) + return name + version + + +def parse_args(): + """ + Parsing function to parse input arguments. + Return: args, which containers parsed input arguments. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--framework", + choices=["tensorflow", "mxnet", "pytorch", "base", "vllm"], + help="framework of container image.", + required=True, + ) + parser.add_argument( + "--framework-version", help="framework version of container image.", required=True + ) + parser.add_argument( + "--container-type", + choices=["training", "inference", "general"], + help="What kind of jobs you want to run on container. Either training or inference.", + required=True, + ) + + args, _unknown = parser.parse_known_args() + + fw_version_pattern = r"\d+(\.\d+){1,2}(-rc\d)?" + + # PT 1.10 and above has +cpu or +cu113 string, so handle accordingly + if args.framework == "pytorch": + pt_fw_version_pattern = r"(\d+(\.\d+){1,2}(-rc\d)?)((\+cpu)|(\+cu\d{3})|(a0\+git\w{7}))" + pt_fw_version_match = re.fullmatch(pt_fw_version_pattern, args.framework_version) + if pt_fw_version_match: + args.framework_version = pt_fw_version_match.group(1) + assert re.fullmatch(fw_version_pattern, args.framework_version), ( + f"args.framework_version = {args.framework_version} does not match {fw_version_pattern}\n" + f"Please specify framework version as X.Y.Z or X.Y." + ) + # TFS 2.12.1 still uses TF 2.12.0 and breaks the telemetry check as it is checking TF version + # instead of TFS version. WE are forcing the version we want. 
+ if ( + args.framework == "tensorflow" + and args.container_type == "inference" + and args.framework_version == "2.12.0" + ): + args.framework_version = "2.12.1" + + return args + + +def query_bucket(instance_id, region): + """ + GET request on an empty object from an Amazon S3 bucket + """ + + response = None + args = parse_args() + framework, framework_version, container_type = ( + args.framework, + args.framework_version, + args.container_type, + ) + + py_version = sys.version.split(" ")[0] + + if instance_id is not None and region is not None: + url = ( + "https://aws-deep-learning-containers-{0}.s3.{1}.amazonaws.com" + "/dlc-containers-{2}.txt?x-instance-id={2}&x-framework={3}&x-framework_version={4}&x-py_version={5}&x-container_type={6}".format( + REGION_MAPPING[region], + region, + instance_id, + framework, + framework_version, + py_version, + container_type, + ) + ) + response = requests_helper(url, timeout=0.2) + if os.environ.get("TEST_MODE") == str(1): + with open(os.path.join(os.sep, "tmp", "test_request.txt"), "w+") as rf: + rf.write(url) + + logging.debug("Query bucket finished: {}".format(response)) + + return response + + +def tag_instance(instance_id, region): + """ + Apply instance tag on the instance that is running the container using botocore + """ + args = parse_args() + framework, framework_version, container_type = ( + args.framework, + args.framework_version, + args.container_type, + ) + py_version = sys.version.split(" ")[0] + device = _retrieve_device() + cuda_version = f"_cuda{_retrieve_cuda()}" if device == "gpu" else "" + os_version = _retrieve_os() + + tag = f"{framework}_{container_type}_{framework_version}_python{py_version}_{device}{cuda_version}_{os_version}" + tag_struct = {"Key": "aws-dlc-autogenerated-tag-do-not-delete", "Value": tag} + + request_status = None + if instance_id and region: + try: + session = botocore.session.get_session() + ec2_client = session.create_client("ec2", region_name=region) + response = 
ec2_client.create_tags(Resources=[instance_id], Tags=[tag_struct]) + request_status = response.get("ResponseMetadata").get("HTTPStatusCode") + if os.environ.get("TEST_MODE") == str(1): + with open(os.path.join(os.sep, "tmp", "test_tag_request.txt"), "w+") as rf: + rf.write(json.dumps(tag_struct, indent=4)) + except Exception as e: + logging.error(f"Error. {e}") + logging.debug("Instance tagged successfully: {}".format(request_status)) + else: + logging.error("Failed to retrieve instance_id or region") + + return request_status + + +def main(): + """ + Invoke bucket query + """ + # Logs are not necessary for normal run. Remove this line while debugging. + logging.getLogger().disabled = True + + logging.basicConfig(level=logging.ERROR) + + token = None + instance_id = None + region = None + token = get_imdsv2_token() + if token: + instance_id = _retrieve_instance_id(token) + region = _retrieve_instance_region(token) + else: + instance_id = _retrieve_instance_id() + region = _retrieve_instance_region() + + bucket_process = multiprocessing.Process(target=query_bucket, args=(instance_id, region)) + tag_process = multiprocessing.Process(target=tag_instance, args=(instance_id, region)) + + bucket_process.start() + tag_process.start() + + tag_process.join(TIMEOUT_SECS) + bucket_process.join(TIMEOUT_SECS) + + if tag_process.is_alive(): + os.kill(tag_process.pid, signal.SIGKILL) + tag_process.join() + if bucket_process.is_alive(): + os.kill(bucket_process.pid, signal.SIGKILL) + bucket_process.join() + + +if __name__ == "__main__": + main()