@@ -701,6 +701,98 @@ def withHealthyNode(String baseLabel, Closure<?> healthChecks, Closure<?> body,
701701 }
702702}
703703
// TODO: remove this function once the problem with `docker pull` on gfx950/mfma is fixed.
//
// Makes the docker image `imageName` available on the current node and returns
// the `docker.image(imageName)` handle, using a stash-based workaround for the
// matrix legs on which `docker pull` is known to fail.
//
// Depending on `matrixKey` the calling leg takes one of three roles:
//   * Waiter  (gfx950, mfma): tries a normal pull first; if that fails, polls
//     for a stash holding an image tarball and `docker load`s it.
//   * Primary puller (navi3x/gfx908 when params.canXdlops is set, vanilla
//     otherwise): pulls the image (up to 3 attempts), `docker save`s it and
//     stashes the tarball for the waiters.
//   * Everyone else: plain pull.
//
// Parameters:
//   imageName - full image reference passed to `docker.image(...)`
//   matrixKey - matrix axis value (CODEPATH or CHIP at the call sites) that
//               selects the role described above
// Returns:
//   the image handle created by `docker.image(imageName)`
// Fails the leg (via `error` or a propagated exception) when the image can be
// neither pulled nor loaded from the stash.
def getDockerImage(String imageName, String matrixKey) {
    // Single name for the tarball exchanged between puller and waiters
    def tarball = 'rocm-mlir-image.tar'

    // Pre-calculate the sanitized image name to avoid GString parsing issues
    def sanitizedImageName = imageName.replace('/', '-').replace(':', '-')

    // Create a unique stash name per build and image type;
    // the 'CIMIGRAPHX-' prefix marks the stash of the migraphx-ci image
    def stashNamePrefix = (imageName == dockerImageCIMIGraphX()) ? 'CIMIGRAPHX-' : ''
    def stashName = "${env.BUILD_ID}-${stashNamePrefix}img-${sanitizedImageName}"
    def img = docker.image(imageName)
    boolean isPrimaryPuller = (params.canXdlops && (matrixKey == 'navi3x' || matrixKey == 'gfx908')) ||
                              (!params.canXdlops && (matrixKey == 'vanilla'))

    // The failing branches (gfx950, mfma) are "Waiters"
    if (matrixKey == 'gfx950' || matrixKey == 'mfma') {
        stage("Wait/Load Image (${imageName}) from Stash") {
            echo "Branch '${matrixKey}' is WAITER. First, attempting normal docker pull..."

            try {
                echo "Attempting to pull ${imageName}..."
                img.pull()
                echo "Docker pull succeeded on ${matrixKey}"
            } catch (Exception pullError) {
                echo "Docker pull failed on ${matrixKey}: ${pullError.message}"
                echo "Falling back to loading from stash '${stashName}'..."

                try {
                    timeout(time: 20, unit: 'MINUTES') {
                        retry(120) { // Poll every 10 seconds for 20 minutes
                            try {
                                echo "Attempting to unstash ${stashName}..."
                                unstash stashName
                                echo "Unstash successful."
                            } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException interrupted) {
                                // Timeout expiry or abort: propagate as-is so it is not
                                // mistaken for "stash not ready" and retried past the deadline
                                throw interrupted
                            } catch (e) {
                                echo "Stash not ready. Waiting 10 seconds..."
                                sleep(10)
                                // Rethrow so that retry() schedules the next polling attempt
                                throw new Exception("Stash not found, retrying.")
                            }
                        }
                    }
                } catch (Exception unstashError) {
                    // Timeout or all retries failed
                    echo "Failed to unstash image after 20 minutes."
                    error "Failed to pull OR unstash image '${imageName}' from stash '${stashName}'. Aborting '${matrixKey}' leg."
                }

                echo "Loading ${imageName} from tarball..."
                try {
                    sh "docker load -i ${tarball}"
                } finally {
                    // Remove the tarball even when the load fails
                    sh "rm -f ${tarball}"
                }
                echo "Image load complete"
            }
        }
    }
    // The "Puller" branch pulls AND stashes
    else if (isPrimaryPuller) {
        stage("Pull and Stash Image (${imageName})") {
            echo "Branch '${matrixKey}' is PRIMARY PULLER."
            retry(3) {
                try {
                    echo "Attempting to pull ${imageName}..."
                    img.pull()
                    echo "Pull successful."
                } catch (Exception e) {
                    echo "Docker pull failed. Retrying... Error: ${e.message}"
                    // Back off before retry() starts the next attempt
                    sleep(time: 30, unit: 'SECONDS')
                    throw e
                }
            }

            try {
                echo "Saving ${imageName} to tarball..."
                // Quote the image reference so the shell passes it as a single word
                sh "docker save -o ${tarball} '${imageName}'"
                echo "Stashing tarball as ${stashName}..."
                // Only waiters should unstash it
                stash name: stashName, includes: tarball
            } finally {
                // Clean up workspace, also when save or stash fails
                sh "rm -f ${tarball}"
            }
            echo "Stash complete."
        }
    }
    // Other branches should do a normal pull
    else {
        stage("Pull Image (${imageName})") {
            echo "Branch '${matrixKey}' pulling image normally"
            echo "Attempting to pull ${imageName}..."
            img.pull()
            echo "Pull successful"
        }
    }

    return img
}
795+
704796pipeline {
705797 agent none
706798 options { parallelsAlwaysFailFast() }
@@ -822,8 +914,9 @@ pipeline {
822914 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
823915 // Check these args
824916 echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
825- img = docker. image(dockerImage())
826- img?. pull()
917+
918+ // Workaround for gfx950 issue with docker pull
919+ img = getDockerImage(dockerImage(), CODEPATH )
827920 }
828921 // Spin up ONE container and stay in it for all substages
829922 img. inside(args) {
@@ -965,8 +1058,9 @@ pipeline {
9651058 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
9661059 // Check these args
9671060 echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
968- img = docker. image(dockerImage())
969- img?. pull()
1061+
1062+ // Workaround for gfx950 issue with docker pull
1063+ img = getDockerImage(dockerImage(), CODEPATH )
9701064 }
9711065 // Spin up ONE container and stay in it for all substages
9721066 img. inside(args) {
@@ -1052,9 +1146,10 @@ pipeline {
10521146
10531147 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
10541148 // Check these args
1055- echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
1056- img = docker. image(dockerImage())
1057- img?. pull()
1149+ echo " Running ${ CHIP} on ${ env.NODE_NAME} with: ${ args} "
1150+
1151+ // Workaround for gfx950 issue with docker pull
1152+ img = getDockerImage(dockerImage(), CHIP )
10581153 }
10591154 // Spin up ONE container and stay in it for all substages
10601155 img. inside(args) {
@@ -1227,9 +1322,10 @@ pipeline {
12271322
12281323 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
12291324 // Check these args
1230- echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
1231- img = docker. image(dockerImage())
1232- img?. pull()
1325+ echo " Running ${ CHIP} on ${ env.NODE_NAME} with: ${ args} "
1326+
1327+ // Workaround for gfx950 issue with docker pull
1328+ img = getDockerImage(dockerImage(), CHIP )
12331329 }
12341330 // Spin up ONE container and stay in it for all substages
12351331 img. inside(args) {
@@ -1438,8 +1534,9 @@ pipeline {
14381534 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
14391535 // Check these args
14401536 echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
1441- img = docker. image(dockerImageCIMIGraphX())
1442- img?. pull()
1537+
1538+ // Workaround for gfx950 issue with docker pull
1539+ img = getDockerImage(dockerImageCIMIGraphX(), CODEPATH )
14431540 }
14441541 // Spin up ONE container and stay in it for all substages
14451542 img. inside(args) {
@@ -1556,8 +1653,9 @@ pipeline {
15561653 args = DOCKER_ARGS_BY_NODE [env. NODE_NAME ]
15571654 // Check these args
15581655 echo " Running ${ CODEPATH} on ${ env.NODE_NAME} with: ${ args} "
1559- img = docker. image(dockerImage())
1560- img?. pull()
1656+
1657+ // Workaround for gfx950 issue with docker pull
1658+ img = getDockerImage(dockerImage(), CODEPATH )
15611659 }
15621660 // Spin up ONE container and stay in it for all substages
15631661 img. inside(args) {
0 commit comments