Skip to content

Commit 16bcf51

Browse files
committed
Add retry mechanism for registry push commands
This commit handles two types of commands: 1. Bash commands executed in the Tekton tasks 2. Commands executed in the Python scripts. Both of these now use a retry mechanism. The bash commands use a generic script that controls the execution and retries the command if needed. This commit "fixes" the tasks and commands that failed most often and improves stability. JIRA: ISV-6114 Signed-off-by: Ales Raszka <[email protected]>
1 parent 089934b commit 16bcf51

File tree

12 files changed

+132
-66
lines changed

12 files changed

+132
-66
lines changed

ansible/roles/index_signature_verification/files/tasks/verify-index-signatures.yml

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,9 @@ spec:
1010
params:
1111
- name: organization
1212
description: Index organization
13-
- name: jq_image
14-
description: Operator pipeline default image
13+
- name: pipeline_image
14+
description: Pipeline image
1515
default: "quay.io/redhat-isv/operator-pipelines-images:released"
16-
- name: podman_image
17-
description: Podman image
18-
default: "registry.redhat.io/ubi9/podman:9.5"
19-
- name: skopeo_image
20-
description: Skopeo image
21-
default: "registry.redhat.io/ubi9/skopeo:9.5"
2216
- name: indices_api
2317
description: Pyxis API endpoint for Operator index images
2418
default: https://catalog.redhat.com/api/containers/v1/operators/indices
@@ -50,7 +44,7 @@ spec:
5044

5145
steps:
5246
- name: get-index-images
53-
image: "$(params.jq_image)"
47+
image: "$(params.pipeline_image)"
5448
script: |
5549
#!/usr/bin/env bash
5650
set +x -e -o pipefail
@@ -65,19 +59,19 @@ spec:
6559
| tee image-tags.txt
6660
6761
- name: inspect-images
68-
image: "$(params.skopeo_image)"
62+
image: "$(params.pipeline_image)"
6963
script: |
7064
#!/usr/bin/env bash
7165
set +x -e -o pipefail
7266
7367
for image in $(cat image-tags.txt); do
7468
dest=$(echo -n $image | base64)
7569
echo "Inspecting $image"
76-
skopeo inspect --retry-times 5 --raw docker://$image > $dest.json
70+
retry 5 skopeo inspect --raw docker://$image > $dest.json
7771
done
7872
7973
- name: get-digest-pull-specs
80-
image: "$(params.jq_image)"
74+
image: "$(params.pipeline_image)"
8175
script: |
8276
#!/usr/bin/env bash
8377
set +x -e -o pipefail
@@ -93,7 +87,7 @@ spec:
9387
- name: pull-images-by-digest
9488
securityContext:
9589
privileged: true
96-
image: "$(params.podman_image)"
90+
image: "$(params.pipeline_image)"
9791
env:
9892
- name: STORAGE_DRIVER
9993
value: vfs
@@ -104,22 +98,6 @@ spec:
10498
cp /etc/containers/policy.json /tmp/
10599
podman image trust set --policypath=/tmp/policy.json -f /mnt/keys/pub.gpg registry.redhat.io
106100
107-
max_retries=5
108-
109101
for pull_spec in $(cat image-digests.txt); do
110-
wait_time=1
111-
for ((i=1; i<=max_retries; i++)); do
112-
podman pull --retry 0 --signature-policy=/tmp/policy.json $pull_spec
113-
if [ $? -eq 0 ]; then
114-
break
115-
fi
116-
117-
sleep $wait_time
118-
wait_time=$((wait_time * 2))
119-
120-
if [ $i -eq $max_retries ]; then
121-
echo "ERROR: Podman pull failed after $max_retries attempts."
122-
exit 1
123-
fi
124-
done
102+
retry 5 podman pull --retry 0 --signature-policy=/tmp/policy.json $pull_spec
125103
done

ansible/roles/operator-pipeline/templates/openshift/tasks/buildah.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,23 @@ spec:
110110
EXTRA_ARGS+=" --authfile $(workspaces.dockerconfig.path)/.dockerconfigjson"
111111
fi
112112
113+
retry() {
114+
local max_retries=$1
115+
shift
116+
117+
local attempt=1
118+
119+
until "$@"; do
120+
if [ $attempt -ge $max_retries ]; then
121+
echo "Command failed after $max_retries attempts" >&2
122+
return 1
123+
fi
124+
attempt=$((attempt + 1))
125+
echo "Attempt $attempt failed. Retrying..." >&2
126+
sleep 1
127+
done
128+
}
129+
113130
echo "Building $(params.IMAGE)"
114131
BUILD_ARGS=()
115132
for buildarg in "$@"
@@ -125,7 +142,7 @@ spec:
125142
[ "${PARAM_SKIP_PUSH}" = "true" ] && echo "Push skipped" && exit 0
126143
# push the image (CERT_DIR_FLAG should be omitted if empty and PUSH_EXTRA_ARGS can contain multiple args)
127144
# shellcheck disable=SC2046,SC2086
128-
buildah ${CERT_DIR_FLAG} "--storage-driver=${PARAM_STORAGE_DRIVER}" push \
145+
retry 5 buildah ${CERT_DIR_FLAG} "--storage-driver=${PARAM_STORAGE_DRIVER}" push \
129146
"--tls-verify=${PARAM_TLSVERIFY}" --digestfile /tmp/image-digest ${PARAM_PUSH_EXTRA_ARGS} ${EXTRA_ARGS} \
130147
"${PARAM_IMAGE}" "docker://${PARAM_IMAGE}"
131148
tee "$(results.IMAGE_DIGEST.path)" < /tmp/image-digest

ansible/roles/operator-pipeline/templates/openshift/tasks/copy-image.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,22 +58,22 @@ spec:
5858
DEST_AUTHFILE=$(workspaces.dest-registry-credentials.path)/.dockerconfigjson
5959
CONNECT_REPO_PATH="$(params.connect_registry)/$(params.vendor_label)/$(params.repo_name)"
6060
61-
skopeo copy \
61+
retry 5 skopeo copy \
6262
--retry-times 5 \
6363
--src-authfile $SRC_AUTHFILE \
6464
--dest-authfile $DEST_AUTHFILE \
6565
docker://$(params.src_image) \
6666
docker://"$(params.dest_image_registry_namespace_certproject):$(params.dest_image_tag)"
6767
6868
69-
skopeo copy \
69+
retry 5 skopeo copy \
7070
--retry-times 5 \
7171
--src-authfile $SRC_AUTHFILE \
7272
--dest-authfile $DEST_AUTHFILE \
7373
docker://$(params.src_image) \
7474
docker://"$(params.dest_image_registry_namespace_certproject):latest"
7575
76-
DIGEST=$(skopeo inspect --retry-times 5 --authfile $DEST_AUTHFILE docker://$(params.dest_image_registry_namespace_certproject):$(params.dest_image_tag) | jq -r .Digest)
76+
DIGEST=$(retry 5 skopeo inspect --retry-times 5 --authfile $DEST_AUTHFILE docker://$(params.dest_image_registry_namespace_certproject):$(params.dest_image_tag) | jq -r .Digest)
7777
echo -n $DIGEST | tee $(results.container_digest.path)
7878
echo "- $CONNECT_REPO_PATH:$(params.dest_image_tag)" | tee "$RELEASE_INFO_DIR_PATH/released_bundle.txt"
7979
echo -n "$CONNECT_REPO_PATH@${DIGEST}" > $(results.image_pullspec.path)

ansible/roles/operator-pipeline/templates/openshift/tasks/publish-to-index.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ spec:
116116
echo " - $DEST_IMAGE_PERMANENT_TAG"
117117
118118
# Add version tag to an index
119-
skopeo \
119+
retry 5 skopeo \
120120
copy \
121121
--retry-times 5 \
122122
--format v2s2 --all \
@@ -126,7 +126,7 @@ spec:
126126
docker://$DEST_IMAGE_VERSION_TAG
127127
128128
# Add permanent tag to an index
129-
skopeo \
129+
retry 5 skopeo \
130130
copy \
131131
--retry-times 5 \
132132
--format v2s2 --all \

operator-pipeline-images/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ RUN dnf update -y && \
4444
dnf clean all
4545

4646
COPY operator-pipeline-images/config/krb5.conf /etc/krb5.conf
47+
COPY operator-pipeline-images/hacks/retry-command.sh /usr/local/bin/retry
4748

4849
# Install oc, opm and operator-sdk CLI
4950
RUN curl -LO https://github.com/operator-framework/operator-registry/releases/download/v1.46.0/linux-${ARCH}-opm && \
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/sh
2+
3+
max_retries=$1
4+
shift
5+
6+
if [ -z "$1" ]; then
7+
echo "Usage: retry <max_retries> <command> [args...]" >&2
8+
exit 1
9+
fi
10+
11+
attempt=1
12+
13+
until "$@"; do
14+
if [ "$attempt" -ge "$max_retries" ]; then
15+
echo "Command failed after $max_retries attempts" >&2
16+
exit 1
17+
fi
18+
attempt=$((attempt + 1))
19+
echo "Attempt $attempt failed. Retrying..." >&2
20+
21+
done

operator-pipeline-images/operatorcert/buildah.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def build_image(dockerfile_path: str, context: str, output_image: str) -> Any:
3333
context,
3434
]
3535
LOGGER.info("Building image: %s", output_image)
36-
return run_command(cmd)
36+
return run_command(cmd, retries=2)
3737

3838

3939
def push_image(image: str, authfile: str) -> Any:
@@ -58,4 +58,4 @@ def push_image(image: str, authfile: str) -> Any:
5858
]
5959

6060
LOGGER.info("Pushing image: %s", image)
61-
return run_command(cmd)
61+
return run_command(cmd, retries=5)

operator-pipeline-images/operatorcert/bundle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def _copy_image(self) -> None:
116116
if self.auth_file_path:
117117
command.extend(["--authfile", self.auth_file_path])
118118

119-
utils.run_command(command)
119+
utils.run_command(command, retries=5)
120120

121121
def _extract_content(self) -> None:
122122
"""

operator-pipeline-images/operatorcert/utils.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -134,33 +134,43 @@ def __call__(
134134

135135

136136
def run_command(
    cmd: List[str], check: bool = True, cwd: Optional[str] = None, retries: int = 1
) -> subprocess.CompletedProcess[bytes]:
    """
    Run a command and return its result, re-running it when it fails.

    Args:
        cmd (List[str]): Command to run
        check (bool): Whether to check the command exit code. A non-zero
            exit code only raises (and therefore only triggers another
            attempt) when this is True.
        cwd (Optional[str]): Current working directory
        retries (int): Maximum number of attempts, including the first one.
            Values lower than 1 are treated as 1.

    Returns:
        CompletedProcess: Command output

    Raises:
        subprocess.CalledProcessError: When `check` is True and the command
            still fails on the last attempt.
    """
    # Always run at least once; otherwise `output` below would be unbound.
    attempts = max(1, retries)

    LOGGER.debug("Running command: %s", cmd)
    for attempt in range(1, attempts + 1):
        try:
            output = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=check,
                cwd=cwd,
            )
            break
        except subprocess.CalledProcessError as e:
            LOGGER.error(
                "Error running command: \nstdout: %s\nstderr: %s",
                e.stdout,
                e.stderr,
            )
            if attempt >= attempts:
                # Out of attempts: propagate the last failure to the caller.
                raise
            LOGGER.warning(
                "Command failed, retrying... (attempt %d of %d)", attempt, attempts
            )

    LOGGER.debug("Command output: %s", output.stdout.decode("utf-8"))
    return output
166176

@@ -227,7 +237,8 @@ def copy_images_to_destination(
227237
cmd.extend(["--authfile", auth_file])
228238

229239
LOGGER.info("Copying image to destination: %s", cmd)
230-
subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
240+
241+
run_command(cmd, retries=5)
231242

232243

233244
def sort_versions(version_list: list[Any]) -> list[Any]:

operator-pipeline-images/tests/test_buildah.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ def test_build_image(mock_run_command: MagicMock) -> None:
1919
"-t",
2020
"image",
2121
"context",
22-
]
22+
],
23+
retries=2,
2324
)
2425

2526

@@ -29,5 +30,6 @@ def test_push_image(mock_run_command: MagicMock) -> None:
2930
assert result == mock_run_command.return_value
3031

3132
mock_run_command.assert_called_once_with(
32-
["buildah", "push", "--authfile", "authfile", "image", "docker://image"]
33+
["buildah", "push", "--authfile", "authfile", "image", "docker://image"],
34+
retries=5,
3335
)

0 commit comments

Comments
 (0)