Skip to content

Commit d95ae13

Browse files
authored
[Infra] - Always use x86 image for the Jenkins agent and a few clean-ups (NVIDIA#5753)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 6bddaf6 commit d95ae13

File tree

4 files changed

+18
-45
lines changed

4 files changed

+18
-45
lines changed

jenkins/Build.groovy

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ AARCH64_TRIPLE = "aarch64-linux-gnu"
1616

1717
LLM_DOCKER_IMAGE = env.dockerImage
1818

19-
AGENT_IMAGE = env.dockerImage
19+
// Always use x86_64 image for agent
20+
AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")
2021

2122
POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
2223
POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"

jenkins/L0_MergeRequest.groovy

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,6 @@ def getContainerURIs()
4444
return uris
4545
}
4646

47-
// TODO: Move common variables to a unified location
48-
BUILD_CORES_REQUEST = "8"
49-
BUILD_CORES_LIMIT = "8"
50-
BUILD_MEMORY_REQUEST = "48Gi"
51-
BUILD_MEMORY_LIMIT = "48Gi"
52-
5347
// Stage choices
5448
STAGE_CHOICE_NORMAL = "normal"
5549
STAGE_CHOICE_SKIP = "skip"
@@ -214,37 +208,15 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
214208
resources:
215209
requests:
216210
cpu: '2'
217-
memory: 10Gi
211+
memory: 5Gi
218212
ephemeral-storage: 25Gi
219213
limits:
220214
cpu: '2'
221-
memory: 10Gi
215+
memory: 5Gi
222216
ephemeral-storage: 25Gi
223217
imagePullPolicy: Always"""
224218
nodeLabelPrefix = "cpu"
225219
break
226-
case "build":
227-
containerConfig = """
228-
- name: trt-llm
229-
image: ${image}
230-
command: ['cat']
231-
volumeMounts:
232-
- name: sw-tensorrt-pvc
233-
mountPath: "/mnt/sw-tensorrt-pvc"
234-
readOnly: false
235-
tty: true
236-
resources:
237-
requests:
238-
cpu: ${BUILD_CORES_REQUEST}
239-
memory: ${BUILD_MEMORY_REQUEST}
240-
ephemeral-storage: 200Gi
241-
limits:
242-
cpu: ${BUILD_CORES_LIMIT}
243-
memory: ${BUILD_MEMORY_LIMIT}
244-
ephemeral-storage: 200Gi
245-
imagePullPolicy: Always"""
246-
nodeLabelPrefix = "cpu"
247-
break
248220
case "package":
249221
containerConfig = """
250222
- name: trt-llm
@@ -254,11 +226,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
254226
resources:
255227
requests:
256228
cpu: '2'
257-
memory: 10Gi
229+
memory: 5Gi
258230
ephemeral-storage: 25Gi
259231
limits:
260232
cpu: '2'
261-
memory: 10Gi
233+
memory: 5Gi
262234
ephemeral-storage: 25Gi
263235
imagePullPolicy: Always"""
264236
nodeLabelPrefix = "cpu"
@@ -299,11 +271,11 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
299271
resources:
300272
requests:
301273
cpu: '2'
302-
memory: 10Gi
274+
memory: 5Gi
303275
ephemeral-storage: 25Gi
304276
limits:
305277
cpu: '2'
306-
memory: 10Gi
278+
memory: 5Gi
307279
ephemeral-storage: 25Gi
308280
qosClass: Guaranteed
309281
volumes:
@@ -327,7 +299,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
327299
def setupPipelineEnvironment(pipeline, testFilter, globalVars)
328300
{
329301
image = "urm.nvidia.com/docker/golang:1.22"
330-
setupPipelineSpec = createKubernetesPodConfig(image, "build")
302+
setupPipelineSpec = createKubernetesPodConfig(image, "package")
331303
trtllm_utils.launchKubernetesPod(pipeline, setupPipelineSpec, "trt-llm", {
332304
sh "env | sort"
333305
updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: 'running'
@@ -413,7 +385,7 @@ def launchReleaseCheck(pipeline)
413385

414386
def image = "urm.nvidia.com/docker/golang:1.22"
415387
stageName = "Release Check"
416-
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build"), "trt-llm", {
388+
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
417389
stage("[${stageName}] Run") {
418390
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
419391
echo "Release Check job is skipped due to Jenkins configuration"

jenkins/controlCCache.groovy

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
3434
resources:
3535
requests:
3636
cpu: 2
37-
memory: 10Gi
37+
memory: 5Gi
3838
ephemeral-storage: 25Gi
3939
limits:
4040
cpu: 2
41-
memory: 10Gi
41+
memory: 5Gi
4242
ephemeral-storage: 25Gi
4343
imagePullPolicy: Always
4444
- name: jnlp
@@ -47,11 +47,11 @@ def createKubernetesPodConfig(image, arch = "amd64")
4747
resources:
4848
requests:
4949
cpu: '2'
50-
memory: 10Gi
50+
memory: 5Gi
5151
ephemeral-storage: 25Gi
5252
limits:
5353
cpu: '2'
54-
memory: 10Gi
54+
memory: 5Gi
5555
ephemeral-storage: 25Gi
5656
qosClass: Guaranteed
5757
volumes:
@@ -119,10 +119,10 @@ pipeline {
119119
case "Reset":
120120
sh "rm -rf ${CCACHE_DIR}"
121121
sh "mkdir -p ${CCACHE_DIR}"
122-
sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
122+
sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
123123
break
124124
case "Config":
125-
sh "printf \"max_size=300G\ntemporary_dir=/tmp/ccache\ncompression = true\n\" > ${CCACHE_DIR}/ccache.conf"
125+
sh "printf 'max_size=500G\ntemporary_dir=/tmp/ccache\ncompression=true\nbase_dir=/home/jenkins/agent/workspace/LLM\nsloppiness=file_macro,time_macros,pch_defines\n' > ${CCACHE_DIR}/ccache.conf"
126126
break
127127
case "Stats":
128128
sh "ccache -sv"

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ onnx_graphsurgeon>=0.5.2
1313
openai
1414
polygraphy
1515
psutil
16-
nvidia-ml-py>=12
16+
nvidia-ml-py>=12,<13
1717
# Just a wrapper since nvidia-modelopt requires pynvml
18-
pynvml>=12.0.0
18+
pynvml==12.0.0
1919
pulp
2020
pandas
2121
h5py==3.12.1

0 commit comments

Comments
 (0)