Commit 7f6d6c2

Merge pull request #3155 from mboersma/cherry-pick-3148-release-1.7
[release-1.7] overcome transient errors in ci-entrypoint.sh
2 parents: c009aa1 + f34cfa7

2 files changed: 124 additions, 77 deletions


Makefile (7 additions, 1 deletion)

@@ -290,6 +290,9 @@ create-management-cluster: $(KUSTOMIZE) $(ENVSUBST) $(KUBECTL) $(KIND) ## Create
 	$(KUBECTL) wait --for=condition=Available --timeout=5m -n capi-kubeadm-bootstrap-system deployment -l cluster.x-k8s.io/provider=bootstrap-kubeadm
 	$(KUBECTL) wait --for=condition=Available --timeout=5m -n capi-kubeadm-control-plane-system deployment -l cluster.x-k8s.io/provider=control-plane-kubeadm
 
+	# Wait for the ClusterResourceSet CRD resource to be "installed" onto the mgmt cluster before installing CRS addons
+	timeout --foreground 300 bash -c "until $(KUBECTL) get clusterresourcesets -A; do sleep 3; done"
+
 	# install Windows Calico cluster resource set
 	$(KUBECTL) create configmap calico-windows-addon --from-file="$(ADDONS_DIR)/windows/calico" --dry-run=client -o yaml | kubectl apply -f -
 	$(KUBECTL) apply -f templates/addons/windows/calico-resource-set.yaml
@@ -298,7 +301,10 @@ create-management-cluster: $(KUSTOMIZE) $(ENVSUBST) $(KUBECTL) $(KIND) ## Create
 	$(KUBECTL) wait --for=condition=Available --timeout=5m -n capz-system deployment -l cluster.x-k8s.io/provider=infrastructure-azure
 
 	# required sleep for when creating management and workload cluster simultaneously
-	sleep 10
+	# Wait for the core CRD resources to be "installed" onto the mgmt cluster before returning control
+	timeout --foreground 300 bash -c "until $(KUBECTL) get clusters -A; do sleep 3; done"
+	timeout --foreground 300 bash -c "until $(KUBECTL) get azureclusters -A; do sleep 3; done"
+	timeout --foreground 300 bash -c "until $(KUBECTL) get kubeadmcontrolplanes -A; do sleep 3; done"
 	@echo 'Set kubectl context to the kind management cluster by running "$(KUBECTL) config set-context kind-capz"'
 
 .PHONY: create-workload-cluster
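The fixed `sleep 10` this commit removes only papered over a race: `kubectl get <resource>` fails until the API server actually serves the CRD-backed resource, so polling it until success waits exactly as long as needed, and `timeout` bounds the wait. A minimal standalone sketch of the same idiom, assuming a hypothetical `widgets` resource:

    # Poll every 3s until the resource is served; give up after 5 minutes.
    # timeout exits 124 on deadline; --foreground lets the child bash receive
    # TTY signals (e.g. Ctrl-C) when run from a terminal.
    if timeout --foreground 300 bash -c "until kubectl get widgets -A; do sleep 3; done"; then
        echo "widgets API is available"
    else
        echo "timed out waiting for the widgets API" >&2
        exit 1
    fi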

scripts/ci-entrypoint.sh (117 additions, 76 deletions)

@@ -131,36 +131,42 @@ create_cluster() {
     "${REPO_ROOT}/hack/create-dev-cluster.sh"
 }
 
-install_addons() {
+# get_cidrs derives the CIDR from the Cluster's '.spec.clusterNetwork.pods.cidrBlocks' metadata
+# any retryable operation in this function must return a non-zero exit code on failure so that we can
+# retry it using a `until get_cidrs; do sleep 5; done` pattern;
+# and any statement must be idempotent so that subsequent retry attempts can make forward progress.
+get_cidrs() {
     # Get cluster CIDRs from Cluster object
     CIDR0=$(${KUBECTL} get cluster "${CLUSTER_NAME}" -o=jsonpath='{.spec.clusterNetwork.pods.cidrBlocks[0]}')
-    CIDR1=$(${KUBECTL} get cluster "${CLUSTER_NAME}" -o=jsonpath='{.spec.clusterNetwork.pods.cidrBlocks[1]}' 2> /dev/null) || true
-
-    # export the target cluster KUBECONFIG if not already set
-    export KUBECONFIG="${KUBECONFIG:-${PWD}/kubeconfig}"
-
-    # wait for the apiserver pod to be Ready.
-    APISERVER_POD=$("${KUBECTL}" get pods -n kube-system -o name | grep apiserver)
-    "${KUBECTL}" wait --for=condition=Ready -n kube-system "${APISERVER_POD}" --timeout=5m
-
-    # Copy the kubeadm configmap to the calico-system namespace. This is a workaround needed for the calico-node-windows daemonset to be able to run in the calico-system namespace.
-    "${KUBECTL}" create ns calico-system
-    until "${KUBECTL}" get configmap kubeadm-config --namespace=kube-system
-    do
-        # Wait for the kubeadm-config configmap to exist.
-        sleep 2
-    done
-    "${KUBECTL}" get configmap kubeadm-config --namespace=kube-system -o yaml \
-        | sed 's/namespace: kube-system/namespace: calico-system/' \
-        | "${KUBECTL}" create -f -
+    export CIDR0
+    CIDR_LENGTH=$(${KUBECTL} get cluster "${CLUSTER_NAME}" -o=jsonpath='{.spec.clusterNetwork.pods.cidrBlocks}' | jq '. | length')
+    if [[ "${CIDR_LENGTH}" == "2" ]]; then
+        CIDR1=$(${KUBECTL} get cluster "${CLUSTER_NAME}" -o=jsonpath='{.spec.clusterNetwork.pods.cidrBlocks[1]}')
+        export CIDR1
+    fi
+}
 
+# install_calico installs Calico CNI componentry onto the Cluster
+# any retryable operation in this function must return a non-zero exit code on failure so that we can
+# retry it using a `until install_calico; do sleep 5; done` pattern;
+# and any statement must be idempotent so that subsequent retry attempts can make forward progress.
+install_calico() {
+    # Copy the kubeadm configmap to the calico-system namespace.
+    # This is a workaround needed for the calico-node-windows daemonset
+    # to be able to run in the calico-system namespace.
+    "${KUBECTL}" create namespace calico-system --dry-run=client -o yaml | kubectl apply -f -
+    if ! "${KUBECTL}" get configmap kubeadm-config --namespace=calico-system; then
+        "${KUBECTL}" get configmap kubeadm-config --namespace=kube-system -o yaml > kubeadm-config-kube-system
+        sed 's/namespace: kube-system/namespace: calico-system/' kubeadm-config-kube-system | "${KUBECTL}" apply -f -
+        rm kubeadm-config-kube-system
+    fi
     # install Calico CNI
     echo "Installing Calico CNI via helm"
-    if [[ "${CIDR0}" =~ .*:.* ]]; then
+    if [[ "${CIDR0:-}" =~ .*:.* ]]; then
         echo "Cluster CIDR is IPv6"
         CALICO_VALUES_FILE="${REPO_ROOT}/templates/addons/calico-ipv6/values.yaml"
         CIDR_STRING_VALUES="installation.calicoNetwork.ipPools[0].cidr=${CIDR0}"
-    elif [[ "${CIDR1}" =~ .*:.* ]]; then
+    elif [[ "${CIDR1:-}" =~ .*:.* ]]; then
         echo "Cluster CIDR is dual-stack"
         CALICO_VALUES_FILE="${REPO_ROOT}/templates/addons/calico-dual-stack/values.yaml"
         CIDR_STRING_VALUES="installation.calicoNetwork.ipPools[0].cidr=${CIDR0},installation.calicoNetwork.ipPools[1].cidr=${CIDR1}"
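For context, the new `get_cidrs` counts pod CIDR blocks with `jq` because kubectl's jsonpath prints the whole `cidrBlocks` list as a JSON array; roughly, with an illustrative cluster name and CIDR values:

    $ kubectl get cluster my-cluster -o jsonpath='{.spec.clusterNetwork.pods.cidrBlocks}'
    ["192.168.0.0/16","2001:1234:5678:9a40::/58"]
    $ kubectl get cluster my-cluster -o jsonpath='{.spec.clusterNetwork.pods.cidrBlocks}' | jq '. | length'
    2

Unlike the old `2> /dev/null) || true` probe for a second block, this never swallows a transient API failure, so a failed lookup makes `get_cidrs` return non-zero and the caller's retry loop runs again.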
@@ -169,60 +175,49 @@ install_addons() {
         CALICO_VALUES_FILE="${REPO_ROOT}/templates/addons/calico/values.yaml"
         CIDR_STRING_VALUES="installation.calicoNetwork.ipPools[0].cidr=${CIDR0}"
     fi
+    "${HELM}" upgrade calico --install --repo https://projectcalico.docs.tigera.io/charts tigera-operator -f "${CALICO_VALUES_FILE}" --set-string "${CIDR_STRING_VALUES}" --namespace calico-system
+}
 
-    "${HELM}" repo add projectcalico https://projectcalico.docs.tigera.io/charts
-    "${HELM}" install calico projectcalico/tigera-operator -f "${CALICO_VALUES_FILE}" --set-string "${CIDR_STRING_VALUES}" --namespace tigera-operator --create-namespace
-
-    # Add FeatureOverride for ChecksumOffloadBroken in FelixConfiguration.
-    # This is the recommended workaround for https://github.com/projectcalico/calico/issues/3145.
-    "${KUBECTL}" apply -f "${REPO_ROOT}"/templates/addons/calico/felix-override.yaml
-
-    # install cloud-provider-azure components, if using out-of-tree
-    if [[ -n "${TEST_CCM:-}" ]]; then
-        CLOUD_CONFIG="/etc/kubernetes/azure.json"
-        CONFIG_SECRET_NAME=""
-        ENABLE_DYNAMIC_RELOADING=false
-        if [[ -n "${LOAD_CLOUD_CONFIG_FROM_SECRET:-}" ]]; then
-            CLOUD_CONFIG=""
-            CONFIG_SECRET_NAME="azure-cloud-provider"
-            ENABLE_DYNAMIC_RELOADING=true
-            copy_secret
-        fi
-
-        CCM_CLUSTER_CIDR="${CIDR0}"
-        if [[ -n "${CIDR1}" ]]; then
-            CCM_CLUSTER_CIDR="${CIDR0}\,${CIDR1}"
-        fi
-        echo "CCM cluster CIDR: ${CCM_CLUSTER_CIDR:-}"
-
-        export CCM_LOG_VERBOSITY="${CCM_LOG_VERBOSITY:-4}"
-        echo "Installing cloud-provider-azure components via helm"
-        "${HELM}" install --repo https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo cloud-provider-azure --generate-name \
-            --set infra.clusterName="${CLUSTER_NAME}" \
-            --set cloudControllerManager.imageRepository="${IMAGE_REGISTRY}" \
-            --set cloudNodeManager.imageRepository="${IMAGE_REGISTRY}" \
-            --set cloudControllerManager.imageName="${CCM_IMAGE_NAME}" \
-            --set cloudNodeManager.imageName="${CNM_IMAGE_NAME}" \
-            --set-string cloudControllerManager.imageTag="${IMAGE_TAG}" \
-            --set-string cloudNodeManager.imageTag="${IMAGE_TAG}" \
-            --set cloudControllerManager.replicas="${CCM_COUNT}" \
-            --set cloudControllerManager.enableDynamicReloading="${ENABLE_DYNAMIC_RELOADING}" \
-            --set cloudControllerManager.cloudConfig="${CLOUD_CONFIG}" \
-            --set cloudControllerManager.cloudConfigSecretName="${CONFIG_SECRET_NAME}" \
-            --set cloudControllerManager.logVerbosity="${CCM_LOG_VERBOSITY}" \
-            --set-string cloudControllerManager.clusterCIDR="${CCM_CLUSTER_CIDR}"
+# install_cloud_provider_azure installs OOT cloud-provider-azure componentry onto the Cluster.
+# Any retryable operation in this function must return a non-zero exit code on failure so that we can
+# retry it using a `until install_cloud_provider_azure; do sleep 5; done` pattern;
+# and any statement must be idempotent so that subsequent retry attempts can make forward progress.
+install_cloud_provider_azure() {
+    CLOUD_CONFIG="/etc/kubernetes/azure.json"
+    CONFIG_SECRET_NAME=""
+    ENABLE_DYNAMIC_RELOADING=false
+    if [[ -n "${LOAD_CLOUD_CONFIG_FROM_SECRET:-}" ]]; then
+        CLOUD_CONFIG=""
+        CONFIG_SECRET_NAME="azure-cloud-provider"
+        ENABLE_DYNAMIC_RELOADING=true
+        copy_secret
     fi
 
-    export -f wait_for_nodes
-    timeout --foreground 1800 bash -c wait_for_nodes
-
-    echo "Waiting for all calico-system pods to be ready"
-    "${KUBECTL}" wait --for=condition=Ready pod -n calico-system --all --timeout=10m
-
-    echo "Waiting for all kube-system pods to be ready"
-    "${KUBECTL}" wait --for=condition=Ready pod -n kube-system --all --timeout=10m
+    CCM_CLUSTER_CIDR="${CIDR0}"
+    if [[ -n "${CIDR1}" ]]; then
+        CCM_CLUSTER_CIDR="${CIDR0}\,${CIDR1}"
+    fi
+    echo "CCM cluster CIDR: ${CCM_CLUSTER_CIDR:-}"
 
+    export CCM_LOG_VERBOSITY="${CCM_LOG_VERBOSITY:-4}"
+    echo "Installing cloud-provider-azure components via helm"
+    "${HELM}" upgrade cloud-provider-azure --install --repo https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo cloud-provider-azure \
+        --set infra.clusterName="${CLUSTER_NAME}" \
+        --set cloudControllerManager.imageRepository="${IMAGE_REGISTRY}" \
+        --set cloudNodeManager.imageRepository="${IMAGE_REGISTRY}" \
+        --set cloudControllerManager.imageName="${CCM_IMAGE_NAME}" \
+        --set cloudNodeManager.imageName="${CNM_IMAGE_NAME}" \
+        --set-string cloudControllerManager.imageTag="${IMAGE_TAG}" \
+        --set-string cloudNodeManager.imageTag="${IMAGE_TAG}" \
+        --set cloudControllerManager.replicas="${CCM_COUNT}" \
+        --set cloudControllerManager.enableDynamicReloading="${ENABLE_DYNAMIC_RELOADING}" \
+        --set cloudControllerManager.cloudConfig="${CLOUD_CONFIG}" \
+        --set cloudControllerManager.cloudConfigSecretName="${CONFIG_SECRET_NAME}" \
+        --set cloudControllerManager.logVerbosity="${CCM_LOG_VERBOSITY}" \
+        --set-string cloudControllerManager.clusterCIDR="${CCM_CLUSTER_CIDR}"
 }
 
+# wait_for_nodes returns when all nodes in the workload cluster are Ready.
 wait_for_nodes() {
     echo "Waiting for ${CONTROL_PLANE_MACHINE_COUNT} control plane machine(s), ${WORKER_MACHINE_COUNT} worker machine(s), and ${WINDOWS_WORKER_MACHINE_COUNT:-0} windows machine(s) to become Ready"
 
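The switch from `helm install` to `helm upgrade --install` is what makes these functions safe to re-run: `helm install` fails on a second attempt because the release name already exists, while `helm upgrade --install` installs the release if absent and upgrades (or no-ops) otherwise. A minimal sketch of the contrast, reusing the calico release from the diff:

    # Not idempotent; a re-run fails with
    # "Error: INSTALLATION FAILED: cannot re-use a name that is still in use"
    helm install calico --repo https://projectcalico.docs.tigera.io/charts tigera-operator

    # Converges on every run, so it can sit inside an `until ...; do sleep 5; done` loop
    helm upgrade calico --install --repo https://projectcalico.docs.tigera.io/charts tigera-operator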
@@ -232,8 +227,54 @@ wait_for_nodes() {
         sleep 10
     done
 
-    "${KUBECTL}" wait --for=condition=Ready node --all --timeout=5m
-    "${KUBECTL}" get nodes -owide
+    until "${KUBECTL}" wait --for=condition=Ready node --all --timeout=15m; do
+        sleep 5
+    done
+    until "${KUBECTL}" get nodes -o wide; do
+        sleep 5
+    done
+}
+
+# wait_for_pods returns when all pods on the workload cluster are Running.
+wait_for_pods() {
+    echo "Waiting for all pod init containers scheduled in the cluster to be ready"
+    while "${KUBECTL}" get pods --all-namespaces -o jsonpath="{.items[*].status.initContainerStatuses[*].ready}" | grep -q false; do
+        echo "Not all pod init containers are Ready...."
+        sleep 5
+    done
+
+    echo "Waiting for all pod containers scheduled in the cluster to be ready"
+    while "${KUBECTL}" get pods --all-namespaces -o jsonpath="{.items[*].status.containerStatuses[*].ready}" | grep -q false; do
+        echo "Not all pod containers are Ready...."
+        sleep 5
+    done
+    until "${KUBECTL}" get pods --all-namespaces -o wide; do
+        sleep 5
+    done
+}
+
+install_addons() {
+    until get_cidrs; do
+        sleep 5
+    done
+    # export the target cluster KUBECONFIG if not already set
+    export KUBECONFIG="${KUBECONFIG:-${PWD}/kubeconfig}"
+    until install_calico; do
+        sleep 5
+    done
+    # install cloud-provider-azure components, if using out-of-tree
+    if [[ -n "${TEST_CCM:-}" ]]; then
+        until install_cloud_provider_azure; do
+            sleep 5
+        done
+    fi
+    # In order to determine the successful outcome of CNI and cloud-provider-azure,
+    # we need to wait a little bit for nodes and pods terminal state,
+    # so we block successful return upon the cluster being fully operational.
+    export -f wait_for_nodes
+    timeout --foreground 1800 bash -c wait_for_nodes
+    export -f wait_for_pods
+    timeout --foreground 1800 bash -c wait_for_pods
 }
 
 copy_secret() {
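The `wait_for_pods` checks work because the jsonpath flattens every container's `ready` flag across all pods into one space-separated line, so a single `grep -q false` detects any container that is not yet ready. For example, with illustrative output:

    $ kubectl get pods --all-namespaces -o jsonpath="{.items[*].status.containerStatuses[*].ready}"
    true true false true
    $ kubectl get pods --all-namespaces -o jsonpath="{.items[*].status.containerStatuses[*].ready}" | grep -q false; echo $?
    0

`grep -q` exits 0 while any `false` remains, keeping the `while` loop spinning; once every flag reads `true`, it exits 1 and the loop ends.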
@@ -256,10 +297,9 @@ cleanup() {
 
 on_exit() {
     if [[ -n ${KUBECONFIG:-} ]]; then
-        "${KUBECTL}" get nodes -owide || echo "Unable to get nodes"
-        "${KUBECTL}" get pods -A -owide || echo "Unable to get pods"
+        "${KUBECTL}" get nodes -o wide || echo "Unable to get nodes"
+        "${KUBECTL}" get pods -A -o wide || echo "Unable to get pods"
     fi
-
     # unset kubeconfig which is currently pointing at workload cluster.
     # we want to be pointing at the management cluster (kind in this case)
     unset KUBECONFIG
@@ -282,6 +322,7 @@ create_cluster
 
 # install CNI and CCM
 install_addons
+echo "Cluster ${CLUSTER_NAME} created and fully operational"
 
 if [[ "${#}" -gt 0 ]]; then
     # disable error exit so we can run post-command cleanup
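Taken together, the script changes apply one idiom throughout: factor each addon step into a function that is idempotent and returns non-zero on any transient failure, drive it with an `until` retry loop, and wrap open-ended waits in `timeout` so CI still fails within a deadline. A condensed sketch of that shape, with hypothetical `install_thing`/`wait_for_thing` steps:

    #!/usr/bin/env bash
    set -o errexit -o nounset -o pipefail

    # Idempotent step: `apply` (not `create`) succeeds on re-runs, and any
    # failed kubectl call makes the function return non-zero.
    install_thing() {
        kubectl apply -f thing.yaml
    }

    # Retry transient failures until the step succeeds...
    until install_thing; do
        sleep 5
    done

    # ...and bound open-ended waits. `export -f` makes the function visible
    # to the child bash that `timeout` spawns.
    wait_for_thing() {
        until kubectl get things -A; do sleep 3; done
    }
    export -f wait_for_thing
    timeout --foreground 1800 bash -c wait_for_thing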
