Skip to content

Commit d6cea4b

Browse files
authored
Merge pull request #87 from coreweave/es/actions
feat!: Update to new runners, update PyTorch, add ARM builds, add Blackwell support, add `sglang` image
2 parents f575d1b + 68fbfd1 commit d6cea4b

File tree

17 files changed

+738
-248
lines changed
.github/configurations/torch-base.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
cuda: [ 12.6.1, 12.4.1, 12.2.2 ]
2-
os: [ ubuntu22.04, ubuntu20.04 ]
1+
cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
2+
os: [ ubuntu22.04 ]
3+
abi: [ 1, 0 ]
34
include:
4-
- torch: 2.5.0
5-
vision: 0.20.0
6-
audio: 2.5.0
5+
- torch: 2.6.0
6+
vision: 0.21.0
7+
audio: 2.6.0
.github/configurations/torch-nccl.yml

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,9 @@
1-
image:
2-
# Ubuntu 22.04
3-
- cuda: 12.6.1
4-
cudnn: cudnn
5-
os: ubuntu22.04
6-
nccl: 2.23.4-1
7-
nccl-tests-hash: 2ff05b2
8-
- cuda: 12.4.1
9-
cudnn: cudnn
10-
os: ubuntu22.04
11-
nccl: 2.23.4-1
12-
nccl-tests-hash: 2ff05b2
13-
- cuda: 12.2.2
14-
cudnn: cudnn8
15-
os: ubuntu22.04
16-
nccl: 2.23.4-1
17-
nccl-tests-hash: 2ff05b2
18-
# Ubuntu 20.04
19-
- cuda: 12.6.1
20-
cudnn: cudnn
21-
os: ubuntu20.04
22-
nccl: 2.23.4-1
23-
nccl-tests-hash: 2ff05b2
24-
- cuda: 12.4.1
25-
cudnn: cudnn
26-
os: ubuntu20.04
27-
nccl: 2.23.4-1
28-
nccl-tests-hash: 2ff05b2
29-
- cuda: 12.2.2
30-
cudnn: cudnn8
31-
os: ubuntu20.04
32-
nccl: 2.21.5-1
33-
nccl-tests-hash: 2ff05b2
1+
cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
2+
os: [ ubuntu22.04 ]
3+
abi: [ 1, 0 ]
344
include:
35-
- torch: 2.5.0
36-
vision: 0.20.0
37-
audio: 2.5.0
5+
- torch: 2.6.0
6+
vision: 0.21.0
7+
audio: 2.6.0
8+
nccl: 2.25.1-1
9+
nccl-tests-hash: 57fa979

.github/workflows/build.yml

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ on:
1919
required: false
2020
description: "Optional sub-key to append to the image name for build layer caching"
2121
type: string
22+
platforms:
23+
required: false
24+
description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
25+
type: string
26+
default: linux/amd64,linux/arm64
2227
outputs:
2328
outcome:
2429
description: "The outcome of the build"
@@ -33,26 +38,42 @@ on:
3338
jobs:
3439
build:
3540
name: Build Images
36-
runs-on: [ self-hosted, Linux ]
41+
runs-on: [ cw ]
42+
container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
43+
timeout-minutes: 960
44+
defaults:
45+
run:
46+
shell: bash
3747
outputs:
3848
outcome: ${{ steps.docker-build.outcome }}
3949
tags: ${{ steps.meta.outputs.tags }}
4050
version: ${{ steps.meta.outputs.version }}
4151
steps:
42-
- uses: actions/checkout@v3
43-
- name: Set up Docker Buildx
44-
uses: docker/setup-buildx-action@v2.2.1
45-
- name: Login to GitHub container registry
46-
uses: docker/login-action@v2.2.0
52+
- uses: actions/checkout@v4
53+
- name: Fetch BuildKit Client Certs
54+
uses: dopplerhq/secrets-fetch-action@v1.2.0
55+
id: client-certs
4756
with:
48-
registry: ghcr.io
49-
username: ${{ github.actor }}
50-
password: ${{ secrets.GITHUB_TOKEN }}
51-
- name: Login to DockerHub container registry
52-
uses: docker/login-action@v2.2.0
57+
doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }}
58+
doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }}
59+
doppler-config: prod
60+
inject-env-vars: false
61+
- name: Set up Docker Buildx
62+
uses: docker/setup-buildx-action@v3.7.1
5363
with:
54-
username: ${{ secrets.DOCKERHUB_USERNAME }}
55-
password: ${{ secrets.DOCKERHUB_PASSWORD }}
64+
driver: remote
65+
endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
66+
platforms: linux/amd64
67+
append: |
68+
- endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
69+
platforms: linux/arm64
70+
env:
71+
BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
72+
BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
73+
BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
74+
BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
75+
BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
76+
BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
5677
- name: Get base registry
5778
run: |
5879
echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -70,14 +91,21 @@ jobs:
7091
echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV
7192
- name: Extract metadata (tags, labels) for Docker
7293
id: meta
73-
uses: docker/metadata-action@v4.1.1
94+
uses: docker/metadata-action@v5.5.1
7495
with:
7596
images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
7697
tags: |
7798
type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
99+
- name: Initialize registry credentials file
100+
env:
101+
USER: ${{ github.actor }}
102+
PASS: ${{ secrets.GITHUB_TOKEN }}
103+
run: |
104+
jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \
105+
| install -m400 /dev/stdin ~/.docker/config.json
78106
- name: Build and push Docker image
79107
id: docker-build
80-
uses: docker/build-push-action@v3.2.0
108+
uses: docker/build-push-action@v6.9.0
81109
with:
82110
context: ${{ inputs.folder }}
83111
build-args: |-
@@ -87,6 +115,11 @@ jobs:
87115
labels: ${{ steps.meta.outputs.labels }}
88116
cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
89117
cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
118+
platforms: ${{ inputs.platforms }}
119+
- name: Clear registry credentials
120+
if: always()
121+
run: |
122+
rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ]
90123
- uses: 8BitJonny/gh-get-current-pr@2.1.3
91124
id: PR
92125
with:

.github/workflows/read-configuration.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,16 @@ on:
1717
jobs:
1818
read-file:
1919
name: Read Configuration File
20-
runs-on: ["self-hosted", "Linux"]
20+
runs-on: [ cw ]
21+
container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
22+
defaults:
23+
run:
24+
shell: bash
2125
permissions: {}
2226
outputs:
2327
config: ${{ steps.read.outputs.contents }}
2428
steps:
25-
- uses: actions/checkout@v3
29+
- uses: actions/checkout@v4
2630
- name: Read configuration
2731
id: read
2832
env:

.github/workflows/sglang.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
on:
2+
workflow_dispatch:
3+
inputs:
4+
tag:
5+
description: 'Tag for the build'
6+
required: true
7+
base-image:
8+
description: 'Base image from which to build'
9+
required: true
10+
builder-image:
11+
description: 'Image to use to compile wheels, if different from the base image'
12+
required: false
13+
push:
14+
paths:
15+
- "sglang/**"
16+
- ".github/workflows/sglang.yml"
17+
- ".github/workflows/build.yml"
18+
19+
20+
jobs:
21+
build:
22+
uses: ./.github/workflows/build.yml
23+
secrets: inherit
24+
with:
25+
image-name: sglang
26+
folder: sglang
27+
tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }}
28+
build-args: |
29+
BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}}
30+
${{ inputs.base-image && 'BASE_IMAGE=' }}${{ inputs.base-image}}

.github/workflows/torch-base.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,12 @@ jobs:
3535
secrets: inherit
3636
with:
3737
image-name: ${{ inputs.image-name }}
38-
tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
38+
tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
3939
builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
4040
base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
4141
torch-version: ${{ matrix.torch }}
4242
torchvision-version: ${{ matrix.vision }}
4343
torchaudio-version: ${{ matrix.audio }}
44+
additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
4445
cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
4546
build-extras: true

.github/workflows/torch-extras.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,17 @@ jobs:
5151
get-required-bases:
5252
name: Get Latest Required Base Images
5353
if: inputs.skip-bases-check != true
54-
runs-on: ["self-hosted", "Linux"]
54+
runs-on: [ cw ]
55+
container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
56+
defaults:
57+
run:
58+
shell: bash
5559
permissions:
5660
packages: read
5761
outputs:
5862
bases-list: ${{ steps.choose-bases.outputs.list }}
5963
steps:
60-
- uses: actions/checkout@v3
64+
- uses: actions/checkout@v4
6165
with:
6266
fetch-depth: 0
6367
- name: Check if torch-extras needs to be rebuilt from previous bases

.github/workflows/torch-nccl.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,12 @@ jobs:
4343
secrets: inherit
4444
with:
4545
image-name: ${{ inputs.image-name }}
46-
tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
47-
builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
48-
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
46+
tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
47+
builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
48+
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
4949
torch-version: ${{ matrix.torch }}
5050
torchvision-version: ${{ matrix.vision }}
5151
torchaudio-version: ${{ matrix.audio }}
52-
cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
52+
additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
53+
cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
5354
build-extras: true

.github/workflows/torch-nightly.yml

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ jobs:
1919
get-nightly-info:
2020
name:
2121
Get Nightly Info
22-
runs-on: [ self-hosted, Linux ]
22+
runs-on: [ cw ]
23+
container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
24+
defaults:
25+
run:
26+
shell: bash
2327
outputs:
2428
pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
2529
triton-commit: ${{ steps.get-hash.outputs.triton-commit }}
@@ -89,13 +93,13 @@ jobs:
8993
uses: ./.github/workflows/read-configuration.yml
9094
with:
9195
path: ./.github/configurations/torch-base.yml
92-
filter: del(.include)
96+
filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]'
9397
get-nccl-config:
9498
name: Get torch:nccl Config
9599
uses: ./.github/workflows/read-configuration.yml
96100
with:
97101
path: ./.github/configurations/torch-nccl.yml
98-
filter: del(.include)
102+
filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"abi": "0"}]'
99103

100104
build-base:
101105
name: Build Nightly torch:base
@@ -115,7 +119,7 @@ jobs:
115119
torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
116120
torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
117121
torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
118-
triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
122+
additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
119123
cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
120124
build-extras: true
121125
build-nccl:
@@ -130,12 +134,12 @@ jobs:
130134
secrets: inherit
131135
with:
132136
image-name: nightly-torch
133-
tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
134-
builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
135-
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
137+
tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }}
138+
builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
139+
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
136140
torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
137141
torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
138142
torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
139-
triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
140-
cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
143+
additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
144+
cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
141145
build-extras: true

.github/workflows/torch.yml

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,9 @@ on:
1919
torchaudio-version:
2020
required: true
2121
type: string
22-
triton-version:
22+
additional-build-args:
2323
required: false
2424
type: string
25-
cuda-arch-support:
26-
required: false
27-
type: string
28-
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
2925
image-name:
3026
required: false
3127
type: string
@@ -63,15 +59,10 @@ on:
6359
required: true
6460
description: "Tagged version number from pytorch/audio to build"
6561
type: string
66-
triton-version:
67-
required: false
68-
description: "Tagged version number from openai/triton to build"
69-
type: string
70-
cuda-arch-support:
62+
additional-build-args:
7163
required: false
72-
description: "Space-separated list of CUDA architectures to support"
64+
description: "Further --build-arg parameters for the build"
7365
type: string
74-
default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
7566
image-name:
7667
required: false
7768
description: "Custom name under which to publish the resulting container"
@@ -99,8 +90,7 @@ jobs:
9990
BUILD_TORCH_VERSION=${{ inputs.torch-version }}
10091
BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
10192
BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
102-
${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
103-
${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
93+
${{ inputs.additional-build-args }}
10494
build-extras:
10595
name: Build torch-extras
10696
if: inputs.build-extras

0 commit comments

Comments
 (0)