Skip to content

Commit b3011c6

Browse files
authored
CI: Workaround for docker pull (#2100)
Workaround
1 parent fa1136f commit b3011c6

File tree

1 file changed

+112
-14
lines changed

1 file changed

+112
-14
lines changed

mlir/utils/jenkins/Jenkinsfile

Lines changed: 112 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,98 @@ def withHealthyNode(String baseLabel, Closure<?> healthChecks, Closure<?> body,
701701
}
702702
}
703703

704+
// We should remove this func once the problem with docker pull on gfx950/mfma is fixed
/**
 * Makes sure the docker image {@code imageName} is available on the current node
 * and returns the handle created by {@code docker.image(imageName)}.
 *
 * Works around failing docker pulls on the gfx950 and mfma legs by sharing the
 * image between parallel legs through a Jenkins stash:
 *   - "Waiter" legs (matrixKey 'gfx950' or 'mfma') try a normal pull first; if
 *     that fails they poll for the stash published by the primary puller and
 *     `docker load` the tarball it contains.
 *   - The "primary puller" leg ('navi3x' or 'gfx908' when params.canXdlops is
 *     set, 'vanilla' otherwise) pulls the image (up to 3 attempts), saves it to
 *     a tarball and stashes it.
 *   - Every other leg performs a plain pull.
 *
 * @param imageName  full docker image reference to pull
 * @param matrixKey  key of the matrix leg calling this function; selects the role above
 * @return the docker image handle for imageName
 */
def getDockerImage(String imageName, String matrixKey) {
    // Single definition of the tarball exchanged through the stash, so the
    // save / stash / load / cleanup commands cannot drift apart.
    def imageTarball = 'rocm-mlir-image.tar'

    // Pre-calculate the sanitized image name to avoid GString parsing issues
    def sanitizedImageName = imageName.replace('/', '-').replace(':', '-')

    // Create a unique stash name per build and image type
    // 'CIMIGRAPHX-' prefix added for the migraphx-ci image stash
    def stashNamePrefix = (imageName == dockerImageCIMIGraphX()) ? 'CIMIGRAPHX-' : ''
    def stashName = "${env.BUILD_ID}-${stashNamePrefix}img-${sanitizedImageName}"
    def img = docker.image(imageName)
    boolean isPrimaryPuller = (params.canXdlops && (matrixKey == 'navi3x' || matrixKey == 'gfx908')) ||
                              (!params.canXdlops && (matrixKey == 'vanilla'))

    // The failing branches (gfx950, mfma) are "Waiters"
    if (matrixKey == 'gfx950' || matrixKey == 'mfma') {
        stage("Wait/Load Image (${imageName}) from Stash") {
            echo "Branch '${matrixKey}' is WAITER. First, attempting normal docker pull..."

            try {
                echo "Attempting to pull ${imageName}..."
                img.pull()
                echo "Docker pull succeeded on ${matrixKey}"
            } catch (Exception pullError) {
                echo "Docker pull failed on ${matrixKey}: ${pullError.message}"
                echo "Falling back to loading from stash '${stashName}'..."

                try {
                    timeout(time: 20, unit: 'MINUTES') {
                        retry(120) { // Poll every 10 seconds for 20 minutes
                            try {
                                echo "Attempting to unstash ${stashName}..."
                                unstash stashName
                                echo "Unstash successful."
                            } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException interruption) {
                                // Timeout expiry or build abort: stop polling right away
                                // instead of sleeping and disguising it as "stash not ready".
                                throw interruption
                            } catch (Exception e) {
                                echo "Stash not ready. Waiting 10 seconds..."
                                sleep(10)
                                // Rethrow the original failure so retry() reports the real cause.
                                throw e
                            }
                        }
                    }
                } catch (Exception unstashError) {
                    // Timeout or all retries failed
                    echo "Failed to unstash image after 20 minutes."
                    error "Failed to pull OR unstash image '${imageName}' from stash '${stashName}'. Aborting '${matrixKey}' leg."
                }

                echo "Loading ${imageName} from tarball..."
                try {
                    sh "docker load -i ${imageTarball}"
                } finally {
                    // Remove the tarball even when the load fails so it does not
                    // linger in the workspace.
                    sh "rm -f ${imageTarball}"
                }
                echo "Image load complete"
            }
        }
    }
    // The "Puller" branch pulls AND stashes
    else if (isPrimaryPuller) {
        stage("Pull and Stash Image (${imageName})") {
            echo "Branch '${matrixKey}' is PRIMARY PULLER."
            retry(3) {
                try {
                    echo "Attempting to pull ${imageName}..."
                    img.pull()
                    echo "Pull successful."
                } catch (Exception e) {
                    echo "Docker pull failed. Retrying... Error: ${e.message}"
                    sleep(time: 30, unit: 'SECONDS')
                    throw e
                }
            }

            try {
                echo "Saving ${imageName} to tarball..."
                // Quote the image reference so the shell passes it as a single word.
                sh "docker save -o ${imageTarball} '${imageName}'"
                echo "Stashing tarball as ${stashName}..."
                // Only waiters should unstash it
                stash name: stashName, includes: imageTarball
            } finally {
                // Clean up workspace, also when save or stash fails
                sh "rm -f ${imageTarball}"
            }
            echo "Stash complete."
        }
    }
    // Other branches should do a normal pull
    else {
        stage("Pull Image (${imageName})") {
            echo "Branch '${matrixKey}' pulling image normally"
            echo "Attempting to pull ${imageName}..."
            img.pull()
            echo "Pull successful"
        }
    }

    return img
}
795+
704796
pipeline {
705797
agent none
706798
options { parallelsAlwaysFailFast() }
@@ -822,8 +914,9 @@ pipeline {
822914
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
823915
// Check these args
824916
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
825-
img = docker.image(dockerImage())
826-
img?.pull()
917+
918+
// Workaround for gfx950 issue with docker pull
919+
img = getDockerImage(dockerImage(), CODEPATH)
827920
}
828921
// Spin up ONE container and stay in it for all substages
829922
img.inside(args) {
@@ -965,8 +1058,9 @@ pipeline {
9651058
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
9661059
// Check these args
9671060
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
968-
img = docker.image(dockerImage())
969-
img?.pull()
1061+
1062+
// Workaround for gfx950 issue with docker pull
1063+
img = getDockerImage(dockerImage(), CODEPATH)
9701064
}
9711065
// Spin up ONE container and stay in it for all substages
9721066
img.inside(args) {
@@ -1052,9 +1146,10 @@ pipeline {
10521146

10531147
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
10541148
// Check these args
1055-
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
1056-
img = docker.image(dockerImage())
1057-
img?.pull()
1149+
echo "Running ${CHIP} on ${env.NODE_NAME} with: ${args}"
1150+
1151+
// Workaround for gfx950 issue with docker pull
1152+
img = getDockerImage(dockerImage(), CHIP)
10581153
}
10591154
// Spin up ONE container and stay in it for all substages
10601155
img.inside(args) {
@@ -1227,9 +1322,10 @@ pipeline {
12271322

12281323
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
12291324
// Check these args
1230-
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
1231-
img = docker.image(dockerImage())
1232-
img?.pull()
1325+
echo "Running ${CHIP} on ${env.NODE_NAME} with: ${args}"
1326+
1327+
// Workaround for gfx950 issue with docker pull
1328+
img = getDockerImage(dockerImage(), CHIP)
12331329
}
12341330
// Spin up ONE container and stay in it for all substages
12351331
img.inside(args) {
@@ -1438,8 +1534,9 @@ pipeline {
14381534
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
14391535
// Check these args
14401536
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
1441-
img = docker.image(dockerImageCIMIGraphX())
1442-
img?.pull()
1537+
1538+
// Workaround for gfx950 issue with docker pull
1539+
img = getDockerImage(dockerImageCIMIGraphX(), CODEPATH)
14431540
}
14441541
// Spin up ONE container and stay in it for all substages
14451542
img.inside(args) {
@@ -1556,8 +1653,9 @@ pipeline {
15561653
args = DOCKER_ARGS_BY_NODE[env.NODE_NAME]
15571654
// Check these args
15581655
echo "Running ${CODEPATH} on ${env.NODE_NAME} with: ${args}"
1559-
img = docker.image(dockerImage())
1560-
img?.pull()
1656+
1657+
// Workaround for gfx950 issue with docker pull
1658+
img = getDockerImage(dockerImage(), CODEPATH)
15611659
}
15621660
// Spin up ONE container and stay in it for all substages
15631661
img.inside(args) {

0 commit comments

Comments
 (0)