From ed688a2145651a30425faa59e11bfaaf1b735810 Mon Sep 17 00:00:00 2001
From: Justin Pierce
Date: Tue, 19 Aug 2025 10:45:14 -0400
Subject: [PATCH 1/2] Add retries for external dependencies

---
Review notes (text between "---" and the diff is not part of the commit):
- NOTE(review): leading indentation throughout this series was reconstructed
  from a whitespace-collapsed copy; verify context lines against the tree
  before applying.
- NOTE(review): in extract_command, the final failed attempt returns 1
  without removing or registering the last extract_dir, so that temporary
  directory is left behind. Left unchanged here to keep hunk counts intact.

 01_install_requirements.sh | 25 ++++++++++++++++++++++++-
 ocp_install_env.sh         | 24 +++++++++++++++++++++---
 utils.sh                   | 19 +++++++++++++++++++
 3 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/01_install_requirements.sh b/01_install_requirements.sh
index 8fe354c05..3f5eaad5e 100755
--- a/01_install_requirements.sh
+++ b/01_install_requirements.sh
@@ -40,7 +40,30 @@ sudo dnf -y clean all
 old_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 
 # Update to latest packages first
-sudo dnf -y upgrade --nobest
+# Number of attempts
+MAX_RETRIES=5
+# Delay between attempts (in seconds)
+_YUM_RETRY_BACKOFF=15
+
+attempt=1
+while (( attempt <= MAX_RETRIES )); do
+    if sudo dnf -y upgrade --nobest; then
+        echo "System upgraded successfully."
+        break
+    else
+        echo "Upgrade failed (attempt $attempt). Cleaning cache and retrying..."
+        sudo dnf clean all
+        sudo rm -rf /var/cache/dnf/*
+        sleep $(( _YUM_RETRY_BACKOFF * attempt ))
+    fi
+
+    (( attempt++ ))
+done
+
+if (( attempt > MAX_RETRIES )); then
+    echo "ERROR: Failed to upgrade system after $MAX_RETRIES attempts."
+    exit 1
+fi
 
 new_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 # If NetworkManager was upgraded it needs to be restarted
diff --git a/ocp_install_env.sh b/ocp_install_env.sh
index 241c62054..521ca000d 100644
--- a/ocp_install_env.sh
+++ b/ocp_install_env.sh
@@ -19,15 +19,33 @@ function extract_command() {
     local cmd
     local outdir
     local extract_dir
+    local MAX_RETRIES=5
+    local SLEEP_BETWEEN=10
 
     cmd="$1"
     release_image="$2"
     outdir="$3"
 
-    extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
-    _tmpfiles="$_tmpfiles $extract_dir"
+    # Retry loop for oc adm release extract to handle quay.io blips
+    for attempt in $(seq 1 $MAX_RETRIES); do
+        extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
 
-    oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command=$cmd --to "${extract_dir}" ${release_image}
+        if oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command="$cmd" --to "${extract_dir}" "${release_image}"; then
+            echo "Successfully extracted $cmd"
+            break
+        fi
+
+        if [[ $attempt -lt $MAX_RETRIES ]]; then
+            echo "Extraction failed, retrying in ${SLEEP_BETWEEN}s..."
+            rm -rf "${extract_dir}"
+            sleep "${SLEEP_BETWEEN}"
+        else
+            echo "Failed to extract $cmd from ${release_image} after $MAX_RETRIES attempts"
+            return 1
+        fi
+    done
+
+    _tmpfiles="$_tmpfiles $extract_dir"
 
     if [[ $cmd == "oc.rhel8" ]]; then
         cmd="oc"
diff --git a/utils.sh b/utils.sh
index 454c28cf2..73382956b 100755
--- a/utils.sh
+++ b/utils.sh
@@ -617,6 +617,25 @@ EOF
     if [[ "$reg_state" != "running" || $restart_registry -eq 1 ]]; then
         sudo podman rm registry -f || true
 
+        MAX_RETRIES=5
+        _PULL_RETRY_DELAY=10
+
+        # Try pulling the image first to tolerate quay.io errors like 504s.
+        for attempt in $(seq 1 $MAX_RETRIES); do
+            if sudo podman pull "${DOCKER_REGISTRY_IMAGE}"; then
+                echo "Successfully pulled ${DOCKER_REGISTRY_IMAGE}"
+                break
+            fi
+
+            if [[ $attempt -lt $MAX_RETRIES ]]; then
+                echo "Pull failed, retrying in ${_PULL_RETRY_DELAY}s..."
+                sleep "${_PULL_RETRY_DELAY}"
+            else
+                echo "Failed to pull ${DOCKER_REGISTRY_IMAGE} after $MAX_RETRIES attempts"
+                exit 1
+            fi
+        done
+
         sudo podman run -d --name registry --net=host --privileged \
             -v ${REGISTRY_DIR}/data:/var/lib/registry:z \
             -v ${REGISTRY_DIR}/auth:/auth:z \

From fe17b3654c99e9bd24139ec2db6064842bb7085e Mon Sep 17 00:00:00 2001
From: Justin Pierce
Date: Tue, 19 Aug 2025 16:07:11 -0400
Subject: [PATCH 2/2] Retry DNF operations

---
Review notes (text between "---" and the diff is not part of the commit):
- passlib fallback: call dnf_with_retries directly instead of through sudo.
  dnf_with_retries is a shell function (added to utils.sh below) that already
  runs "sudo dnf", and sudo cannot execute a function of the calling shell,
  so "sudo dnf_with_retries ..." could never install python3-pip.
- rhel8 EPEL install: dropped the stray "dnf" argument; the helper would
  otherwise have run "sudo dnf dnf -y install ...".
- NOTE(review): leading indentation was reconstructed from a
  whitespace-collapsed copy; verify context lines against the tree.

 01_install_requirements.sh     | 51 ++++++++++------------------------
 agent/01_agent_requirements.sh |  4 +--
 utils.sh                       | 29 +++++++++++++++++++
 3 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/01_install_requirements.sh b/01_install_requirements.sh
index 3f5eaad5e..714f6051e 100755
--- a/01_install_requirements.sh
+++ b/01_install_requirements.sh
@@ -39,31 +39,8 @@ sudo dnf -y clean all
 
 old_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 
-# Update to latest packages first
-# Number of attempts
-MAX_RETRIES=5
-# Delay between attempts (in seconds)
-_YUM_RETRY_BACKOFF=15
-
-attempt=1
-while (( attempt <= MAX_RETRIES )); do
-    if sudo dnf -y upgrade --nobest; then
-        echo "System upgraded successfully."
-        break
-    else
-        echo "Upgrade failed (attempt $attempt). Cleaning cache and retrying..."
-        sudo dnf clean all
-        sudo rm -rf /var/cache/dnf/*
-        sleep $(( _YUM_RETRY_BACKOFF * attempt ))
-    fi
-
-    (( attempt++ ))
-done
-
-if (( attempt > MAX_RETRIES )); then
-    echo "ERROR: Failed to upgrade system after $MAX_RETRIES attempts."
-    exit 1
-fi
+dnf_with_retries -y upgrade --nobest
+echo "System upgraded successfully."
 
 new_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 # If NetworkManager was upgraded it needs to be restarted
@@ -80,7 +57,7 @@ source /etc/os-release
 # All of those are needed because we're still behind for OS support.
 # passlib needs to be installed as system dependency
 if [[ -x "/usr/libexec/platform-python" ]]; then
-  sudo /usr/libexec/platform-python -m pip install passlib || sudo dnf -y install python3-pip && sudo /usr/libexec/platform-python -m pip install passlib
+  sudo /usr/libexec/platform-python -m pip install passlib || dnf_with_retries -y install --nobest python3-pip && sudo /usr/libexec/platform-python -m pip install passlib
 fi
 
 # Install ansible, other packages are installed via
@@ -88,16 +65,16 @@ fi
 case $DISTRO in
   "centos8"|"rhel8"|"almalinux8"|"rocky8")
     # install network-scripts package to be able to use legacy network commands
-    sudo dnf install -y network-scripts
+    dnf_with_retries install -y --nobest network-scripts
     if [[ $DISTRO == "centos8" ]] && [[ "$NAME" != *"Stream"* ]]; then
       echo "CentOS is not supported, please switch to CentOS Stream / RHEL / Rocky / Alma"
       exit 1
     fi
     if [[ $DISTRO == "centos8" || $DISTRO == "almalinux8" || $DISTRO == "rocky8" ]]; then
-      sudo dnf -y install epel-release dnf --enablerepo=extras
+      dnf_with_retries -y install --nobest epel-release dnf --enablerepo=extras
     elif [[ $DISTRO == "rhel8" ]]; then
       # Enable EPEL for python3-passlib and python3-bcrypt required by metal3-dev-env
-      sudo dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+      dnf_with_retries -y install --nobest https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
       if sudo subscription-manager repos --list-enabled 2>&1 | grep "ansible-2-for-rhel-8-$(uname -m)-rpms"; then
         # The packaged 2.x ansible is too old for compatibility with metal3-dev-env
         sudo dnf erase -y ansible
@@ -105,17 +82,17 @@ case $DISTRO in
       fi
     fi
     # Note recent ansible needs python >= 3.8 so we install 3.9 here
-    sudo dnf -y install python39
+    dnf_with_retries -y install --nobest python39
     sudo alternatives --set python /usr/bin/python3.9
     sudo alternatives --set python3 /usr/bin/python3.9
     sudo update-alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.9 1
     PYTHON_DEVEL="python39-devel"
     ;;
   "centos9"|"rhel9"|"almalinux9"|"rocky9")
-    sudo dnf -y install python3-pip
+    dnf_with_retries -y install --nobest python3-pip
     if [[ $DISTRO == "centos9" || $DISTRO == "almalinux9" || $DISTRO == "rocky9" ]] ; then
       sudo dnf config-manager --set-enabled crb
-      sudo dnf -y install epel-release
+      dnf_with_retries -y install --nobest epel-release
     elif [[ $DISTRO == "rhel9" ]]; then
       # NOTE(raukadah): If a system is subscribed to RHEL subscription then
       # sudo subscription-manager identity will return exit 0 else 1.
@@ -124,7 +101,7 @@ case $DISTRO in
         # enable the CRB repository
         sudo subscription-manager repos --enable codeready-builder-for-rhel-9-$(arch)-rpms
       fi
-      sudo dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
+      dnf_with_retries -y install --nobest https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
     fi
     sudo ln -s /usr/bin/python3 /usr/bin/python || true
     PYTHON_DEVEL="python3-devel"
@@ -150,7 +127,7 @@ GO_VERSION=${GO_VERSION:-1.22.3}
 GOARCH=$(uname -m)
 if [[ $GOARCH == "aarch64" ]]; then
   GOARCH="arm64"
-  sudo dnf -y install $PYTHON_DEVEL libxml2-devel libxslt-devel
+  dnf_with_retries -y install --nobest $PYTHON_DEVEL libxml2-devel libxslt-devel
 elif [[ $GOARCH == "x86_64" ]]; then
   GOARCH="amd64"
 fi
@@ -178,16 +155,16 @@ popd
 
 if [ -n "${KNI_INSTALL_FROM_GIT}" ]; then
   # zip is required for building the installer from source
-  sudo dnf -y install zip
+  dnf_with_retries -y --nobest install zip
 fi
 
 # Install nfs for persistent volumes
 if [ "${PERSISTENT_IMAGEREG}" == true ] ; then
-  sudo dnf -y install nfs-utils
+  dnf_with_retries -y --nobest install nfs-utils
 fi
 
 if [[ "${NODES_PLATFORM}" == "baremetal" ]] ; then
-  sudo dnf -y install ipmitool
+  dnf_with_retries -y --nobest install ipmitool
 fi
 
 # needed if we are using locally built images
diff --git a/agent/01_agent_requirements.sh b/agent/01_agent_requirements.sh
index 1206e8d43..a2db79149 100755
--- a/agent/01_agent_requirements.sh
+++ b/agent/01_agent_requirements.sh
@@ -51,9 +51,9 @@ fi
 
 if [[ "${AGENT_E2E_TEST_BOOT_MODE}" == "ISCSI" ]]; then
   # Install shell to administer local storage
-  sudo dnf -y install targetcli
+  dnf_with_retries -y --nobest install targetcli
 fi
 
 if [[ "${AGENT_E2E_TEST_BOOT_MODE}" == "ISO_NO_REGISTRY" ]]; then
-  sudo dnf -y install xorriso coreos-installer syslinux skopeo
+  dnf_with_retries -y --nobest install xorriso coreos-installer syslinux skopeo
 fi
diff --git a/utils.sh b/utils.sh
index 73382956b..2c5cd6e4f 100755
--- a/utils.sh
+++ b/utils.sh
@@ -28,6 +28,35 @@ function retry_with_timeout() {
   return $(( exit_code ))
 }
 
+# Run a dnf command with retries and cache cleaning
+dnf_with_retries() {
+    local max_retries=5
+    local delay=15
+    local attempt=1
+
+    while (( attempt <= max_retries )); do
+        echo "Attempt $attempt of $max_retries: sudo dnf $*"
+
+        if sudo dnf "$@"; then
+            echo "sudo dnf $* succeeded."
+            return 0
+        fi
+
+        echo "sudo dnf $* failed on attempt $attempt."
+        if (( attempt < max_retries )); then
+            echo "Cleaning DNF cache and retrying after $delay seconds..."
+            sudo dnf clean all || true
+            sudo rm -rf /var/cache/dnf/* || true
+            sleep "$delay"
+        fi
+
+        (( attempt++ ))
+    done
+
+    echo "ERROR: sudo dnf $* failed after $max_retries attempts."
+    return 1
+}
+
 function generate_assets() {
   rm -rf assets/generated && mkdir assets/generated
   for file in $(find assets/templates/ -iname '*.yaml' -type f -printf "%P\n"); do