Skip to content

Commit bfa8b7b

Browse files
raoakarshaBordaakihironitta
authored
Create hpu-ci-runner Dockerfile (Lightning-AI#13239)
* Create hpu-ci-runner Dockerfile * Add ENTRYPOINT script 'start.sh' to hpu-ci-runner * rename dirs * ci * add docker * Fix build failure * Fix build failure * Fix title of nightly ci runner build * Fix comments * Fix comments Co-authored-by: Jirka <[email protected]> Co-authored-by: Akihiro Nitta <[email protected]>
1 parent 56cd883 commit bfa8b7b

File tree

6 files changed

+196
-5
lines changed

6 files changed

+196
-5
lines changed

.github/workflows/ci_dockers.yml

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ jobs:
130130
strategy:
131131
fail-fast: false
132132
matrix:
133-
# the config used in 'dockers/ipu-ci-runner/Dockerfile'
133+
# the config used in 'dockers/ci-runner-ipu/Dockerfile'
134134
python_version: ["3.9"] # latest
135135
pytorch_version: ["1.9"]
136136
steps:
@@ -154,6 +154,29 @@ jobs:
154154
build-args: |
155155
PYTHON_VERSION=${{ matrix.python_version }}
156156
PYTORCH_VERSION=${{ matrix.pytorch_version }}
157-
file: dockers/ipu-ci-runner/Dockerfile
157+
file: dockers/ci-runner-ipu/Dockerfile
158+
push: false
159+
timeout-minutes: 60
160+
161+
build-hpu:
162+
runs-on: ubuntu-20.04
163+
strategy:
164+
fail-fast: false
165+
matrix:
166+
# the config used in 'dockers/ci-runner-hpu/Dockerfile'
167+
gaudi_version: ["1.4.0"]
168+
pytorch_version: ["1.10.2"]
169+
steps:
170+
- name: Checkout
171+
uses: actions/checkout@v2
172+
173+
- name: Build HPU CI runner Docker
174+
uses: docker/build-push-action@v2
175+
with:
176+
build-args: |
177+
DIST=latest
178+
GAUDI_VERSION=${{ matrix.gaudi_version }}
179+
PYTORCH_VERSION=${{ matrix.pytorch_version }}
180+
file: dockers/ci-runner-hpu/Dockerfile
158181
push: false
159182
timeout-minutes: 60

.github/workflows/events-nightly.yml

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ jobs:
217217
strategy:
218218
fail-fast: false
219219
matrix:
220-
# the config used in 'dockers/ipu-ci-runner/Dockerfile'
220+
# the config used in 'dockers/ci-runner-ipu/Dockerfile'
221221
include:
222222
- {python_version: "3.9", pytorch_version: "1.9"}
223223

@@ -253,7 +253,7 @@ jobs:
253253
build-args: |
254254
PYTHON_VERSION=${{ matrix.python_version }}
255255
PYTORCH_VERSION=${{ matrix.pytorch_version }}
256-
file: dockers/ipu-ci-runner/Dockerfile
256+
file: dockers/ci-runner-ipu/Dockerfile
257257
push: ${{ env.PUSH_TO_HUB }}
258258
tags: pytorchlightning/pytorch_lightning:ipu-ci-runner-py${{ matrix.python_version }}
259259
timeout-minutes: 55
@@ -269,3 +269,51 @@ jobs:
269269
message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' #SeanNaren
270270
env:
271271
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
272+
273+
docker-HPU:
274+
if: ${{ github.repository_owner == 'PyTorchLightning' }}
275+
runs-on: ubuntu-20.04
276+
strategy:
277+
fail-fast: false
278+
matrix:
279+
# the config used in 'dockers/ci-runner-hpu/Dockerfile'
280+
include:
281+
- {gaudi_version: "1.4.0", pytorch_version: "1.10.2"}
282+
283+
steps:
284+
- name: Checkout
285+
uses: actions/checkout@v2
286+
287+
# https://github.com/docker/setup-buildx-action
288+
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
289+
- uses: docker/setup-buildx-action@v1
290+
- name: Login to DockerHub
291+
uses: docker/login-action@v1
292+
with:
293+
username: ${{ secrets.DOCKER_USERNAME }}
294+
password: ${{ secrets.DOCKER_PASSWORD }}
295+
296+
- name: Publish HPU CI runner to Docker Hub
297+
# publish master/release
298+
uses: docker/build-push-action@v2
299+
with:
300+
build-args: |
301+
DIST=latest
302+
GAUDI_VERSION=${{ matrix.gaudi_version }}
303+
PYTORCH_VERSION=${{ matrix.pytorch_version }}
304+
file: dockers/ci-runner-hpu/Dockerfile
305+
push: ${{ env.PUSH_TO_HUB }}
306+
tags: pytorchlightning/pytorch_lightning:hpu-ci-runner-gaudi${{ matrix.gaudi_version }}
307+
timeout-minutes: 55
308+
309+
# report failure to Slack
310+
- name: Slack notification
311+
if: failure() && github.event_name == 'schedule'
312+
uses: ravsamhq/notify-slack-action@v1
313+
with:
314+
status: ${{ job.status }}
315+
token: ${{ secrets.GITHUB_TOKEN }}
316+
# {1} is matrix.gaudi_version, so label it "gaudi" (as the image tag does) rather than "py"
notification_title: ${{ format('HPU; {0} gaudi{1} for *{2}*', runner.os, matrix.gaudi_version, matrix.pytorch_version) }}
317+
message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U02PV6CL144> <@U0355SJN6HK>' # cc'd users: arao & Mythravarun N R
318+
env:
319+
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

dockers/ci-runner-hpu/Dockerfile

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
ARG DIST="latest"
2+
ARG GAUDI_VERSION="1.4.0"
3+
ARG PYTORCH_VERSION="1.10.2"
4+
5+
FROM vault.habana.ai/gaudi-docker/${GAUDI_VERSION}/ubuntu20.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:${DIST}
6+
7+
LABEL maintainer="https://vault.habana.ai/"
8+
9+
RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
10+
11+
WORKDIR /azp
12+
13+
COPY ./dockers/ci-runner-hpu/start.sh /usr/local/bin/
14+
15+
RUN chmod +x /usr/local/bin/start.sh
16+
17+
RUN curl -fsSL https://get.docker.com -o get-docker.sh && \
18+
sh get-docker.sh && \
19+
rm get-docker.sh
20+
21+
#RUN docker --help
22+
23+
ENTRYPOINT ["/usr/local/bin/start.sh"]
24+
CMD ["bash"]
File renamed without changes.

dockers/ipu-ci-runner/Dockerfile renamed to dockers/ci-runner-ipu/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ RUN echo "ALL ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
2323

2424
WORKDIR /azp
2525

26-
COPY ./dockers/ipu-ci-runner/start.sh /usr/local/bin/
26+
COPY ./dockers/ci-runner-ipu/start.sh /usr/local/bin/
2727

2828
RUN curl -o /usr/local/bin/installdependencies.sh \
2929
"https://raw.githubusercontent.com/microsoft/azure-pipelines-agent/d2acd5f77c6b3914cdb6ed0e5fbea672929c7da9/src/Misc/layoutbin/installdependencies.sh" && \

dockers/ci-runner-ipu/start.sh

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/bash
2+
3+
# This is a slightly modified version of the script from
4+
# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker
5+
6+
set -e
7+
8+
if [ -z "$AZP_URL" ]; then
9+
echo 1>&2 "error: missing AZP_URL environment variable"
10+
exit 1
11+
fi
12+
13+
if [ -z "$AZP_TOKEN_FILE" ]; then
14+
if [ -z "$AZP_TOKEN" ]; then
15+
echo 1>&2 "error: missing AZP_TOKEN environment variable"
16+
exit 1
17+
fi
18+
19+
AZP_TOKEN_FILE=/azp/.token
20+
# Write the token verbatim: quoting prevents word splitting/globbing, and printf avoids echo's option parsing
printf '%s' "$AZP_TOKEN" > "$AZP_TOKEN_FILE"
21+
fi
22+
23+
unset AZP_TOKEN
24+
25+
if [ -n "$AZP_WORK" ]; then
26+
mkdir -p "$AZP_WORK"
27+
fi
28+
29+
rm -rf /azp/agent
30+
mkdir /azp/agent
31+
cd /azp/agent
32+
33+
export AGENT_ALLOW_RUNASROOT="1"
34+
35+
cleanup() {
36+
if [ -e config.sh ]; then
37+
print_header "Cleanup. Removing Azure Pipelines agent..."
38+
39+
./config.sh remove --unattended \
40+
--auth PAT \
41+
--token "$(cat "$AZP_TOKEN_FILE")"
42+
fi
43+
}
44+
45+
print_header() {
46+
lightcyan='\033[1;36m'
47+
nocolor='\033[0m'
48+
echo -e "${lightcyan}$1${nocolor}"
49+
}
50+
51+
# Let the agent ignore the token env variables
52+
export VSO_AGENT_IGNORE=AZP_TOKEN,AZP_TOKEN_FILE
53+
54+
print_header "1. Determining matching Azure Pipelines agent..."
55+
56+
AZP_AGENT_RESPONSE=$(curl -LsS \
57+
-u "user:$(cat "$AZP_TOKEN_FILE")" \
58+
-H 'Accept:application/json;api-version=3.0-preview' \
59+
"$AZP_URL/_apis/distributedtask/packages/agent?platform=linux-x64")
60+
61+
if echo "$AZP_AGENT_RESPONSE" | jq . >/dev/null 2>&1; then
62+
AZP_AGENTPACKAGE_URL=$(echo "$AZP_AGENT_RESPONSE" \
63+
| jq -r '.value | map([.version.major,.version.minor,.version.patch,.downloadUrl]) | sort | .[length-1] | .[3]')
64+
fi
65+
66+
# Two separate tests instead of the obsolescent, ambiguity-prone `-o` operator; `=` is the portable string comparison
if [ -z "$AZP_AGENTPACKAGE_URL" ] || [ "$AZP_AGENTPACKAGE_URL" = "null" ]; then
67+
echo 1>&2 "error: could not determine a matching Azure Pipelines agent - check that account '$AZP_URL' is correct and the token is valid for that account"
68+
exit 1
69+
fi
70+
71+
print_header "2. Downloading and installing Azure Pipelines agent..."
72+
73+
# Quote the URL so characters in it are not word-split or glob-expanded by the shell
curl -LsS "$AZP_AGENTPACKAGE_URL" | tar -xz & wait $!
74+
75+
source ./env.sh
76+
77+
print_header "3. Configuring Azure Pipelines agent..."
78+
79+
./config.sh --unattended \
80+
--agent "${AZP_AGENT_NAME:-$(hostname)}" \
81+
--url "$AZP_URL" \
82+
--auth PAT \
83+
--token "$(cat "$AZP_TOKEN_FILE")" \
84+
--pool "${AZP_POOL:-Default}" \
85+
--work "${AZP_WORK:-_work}" \
86+
--replace \
87+
--acceptTeeEula & wait $!
88+
89+
print_header "4. Running Azure Pipelines agent..."
90+
91+
trap 'cleanup; exit 130' INT
92+
trap 'cleanup; exit 143' TERM
93+
94+
# To be aware of TERM and INT signals call run.sh
95+
# Running it with the --once flag at the end will shut down the agent after the build is executed
96+
./run.sh --once & wait $!

0 commit comments

Comments
 (0)