Skip to content

Commit c96f870

Browse files
authored
Merge pull request #89 from wunderio/feature/sts-dp-events
statefulset and deployment events for failed releases
2 parents f67de8f + 699c4ab commit c96f870

File tree

9 files changed

+569
-242
lines changed

9 files changed

+569
-242
lines changed

cmd/ciReleaseDebugFailed.go

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

cmd/ciReleaseDeploy.go

Lines changed: 32 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ var ciReleaseDeployCmd = &cobra.Command{
213213
SILTA_ENVIRONMENT_NAME='%s'
214214
BRANCHNAME='%s'
215215
NGINX_IMAGE_URL='%s'
216-
CLUSTER_DOMAIN='%s'
216+
CLUSTER_DOMAIN='%s'
217217
EXTRA_NOAUTHIPS='%s'
218218
EXTRA_VPCNATIVE='%s'
219219
EXTRA_CLUSTERTYPE='%s'
@@ -222,37 +222,16 @@ var ciReleaseDeployCmd = &cobra.Command{
222222
EXTRA_HELM_FLAGS='%s'
223223
DEPLOYMENT_TIMEOUT='%s'
224224
225-
# Detect pods in FAILED state
226-
function show_failing_pods() {
227-
echo ""
228-
failed_pods=$(kubectl get pod -l "release=$RELEASE_NAME,cronjob!=true" -n "$NAMESPACE" -o custom-columns="POD:metadata.name,STATE:status.containerStatuses[*].ready" --no-headers | grep -E "<none>|false" | grep -Eo '^[^ ]+')
229-
if [[ ! -z "$failed_pods" ]] ; then
230-
echo "Failing pods:"
231-
while IFS= read -r pod; do
232-
echo "---- ${NAMESPACE} / ${pod} ----"
233-
echo "* Events"
234-
kubectl get events --field-selector involvedObject.name=${pod},type!=Normal --show-kind=true --ignore-not-found=true --namespace ${NAMESPACE}
235-
echo ""
236-
echo "* Logs"
237-
containers=$(kubectl get pods "${pod}" --namespace "${NAMESPACE}" -o json | jq -r 'try .status | .containerStatuses[] | select(.ready == false).name')
238-
if [[ ! -z "$containers" ]] ; then
239-
for container in ${containers}; do
240-
kubectl logs "${pod}" --prefix=true --since="${DEPLOYMENT_TIMEOUT}" --namespace "${NAMESPACE}" -c "${container}"
241-
done
242-
else
243-
echo "no logs found"
244-
fi
245-
246-
echo "----"
247-
done <<< "$failed_pods"
225+
function deployment_failed() {
226+
debug_failed_deployment
227+
rm -f helm-output.log || true
228+
}
248229
249-
false
250-
else
251-
true
252-
fi
230+
function debug_failed_deployment() {
231+
silta ci release debug-failed --release-name="${RELEASE_NAME}" --namespace="${NAMESPACE}"
253232
}
254233
255-
trap show_failing_pods ERR
234+
trap deployment_failed ERR
256235
257236
helm upgrade --install "${RELEASE_NAME}" "${CHART_NAME}" \
258237
--repo "${CHART_REPOSITORY}" \
@@ -298,7 +277,7 @@ var ciReleaseDeployCmd = &cobra.Command{
298277
GIT_REPOSITORY_URL='%s'
299278
GITAUTH_USERNAME='%s'
300279
GITAUTH_PASSWORD='%s'
301-
CLUSTER_DOMAIN='%s'
280+
CLUSTER_DOMAIN='%s'
302281
NAMESPACE='%s'
303282
EXTRA_NOAUTHIPS='%s'
304283
EXTRA_VPCNATIVE='%s'
@@ -310,38 +289,16 @@ var ciReleaseDeployCmd = &cobra.Command{
310289
DEPLOYMENT_TIMEOUT='%s'
311290
DEPLOYMENT_TIMEOUT_SECONDS='%d'
312291
313-
# Detect pods in FAILED state
314-
function show_failing_pods() {
315-
echo ""
316-
failed_pods=$(kubectl get pod -l "release=$RELEASE_NAME,cronjob!=true" -n "$NAMESPACE" -o custom-columns="POD:metadata.name,STATE:status.containerStatuses[*].ready" --no-headers | grep -E "<none>|false" | grep -Eo '^[^ ]+')
317-
if [[ ! -z "$failed_pods" ]] ; then
318-
echo "Failing pods:"
319-
while IFS= read -r pod; do
320-
echo "---- ${NAMESPACE} / ${pod} ----"
321-
echo "* Events"
322-
kubectl get events --field-selector involvedObject.name=${pod},type!=Normal --show-kind=true --ignore-not-found=true --namespace ${NAMESPACE}
323-
echo ""
324-
echo "* Logs"
325-
containers=$(kubectl get pods "${pod}" --namespace "${NAMESPACE}" -o json | jq -r 'try .status | .containerStatuses[] | select(.ready == false).name')
326-
if [[ ! -z "$containers" ]] ; then
327-
for container in ${containers}; do
328-
kubectl logs "${pod}" --prefix=true --since="${DEPLOYMENT_TIMEOUT}" --namespace "${NAMESPACE}" -c "${container}"
329-
done
330-
else
331-
echo "no logs found"
332-
fi
333-
334-
echo "----"
335-
done <<< "$failed_pods"
292+
function deployment_failed() {
293+
debug_failed_deployment
294+
rm -f helm-output.log || true
295+
}
336296
337-
false
338-
else
339-
true
340-
fi
341-
rm -f helm-output.log
297+
function debug_failed_deployment() {
298+
silta ci release debug-failed --release-name="${RELEASE_NAME}" --namespace="${NAMESPACE}"
342299
}
343300
344-
trap show_failing_pods ERR
301+
trap deployment_failed ERR
345302
346303
helm upgrade --install "${RELEASE_NAME}" "${CHART_NAME}" \
347304
--repo "${CHART_REPOSITORY}" \
@@ -381,17 +338,17 @@ var ciReleaseDeployCmd = &cobra.Command{
381338
# Helm command is complete.
382339
if ! ps -p "$pid" > /dev/null; then
383340
echo "Helm output:"
384-
cat helm-output.log
341+
cat helm-output.log || true
385342
wait $pid
386343
if grep -q "UPGRADE FAILED" helm-output.log ; then
387-
show_failing_pods
344+
debug_failed_deployment
388345
fi
389346
break
390347
fi
391348
392349
if [ $TIME_WAITING -gt ${DEPLOYMENT_TIMEOUT_SECONDS} ]; then
393350
echo "Timeout waiting for resources."
394-
show_failing_pods
351+
debug_failed_deployment
395352
exit 1
396353
fi
397354
@@ -407,7 +364,7 @@ var ciReleaseDeployCmd = &cobra.Command{
407364
echo "$statefulsets" | xargs -n 1 kubectl rollout status statefulset -n "$NAMESPACE" --timeout 5m
408365
fi
409366
kubectl get deployment -n "$NAMESPACE" -l "release=${RELEASE_NAME}" -o name | xargs -n 1 kubectl rollout status -n "$NAMESPACE" --timeout 5m
410-
rm -f helm-output.log
367+
rm -f helm-output.log || true
411368
`,
412369
releaseName, chartName, chartRepository, chartVersionOverride,
413370
siltaEnvironmentName, branchname,
@@ -513,7 +470,7 @@ var ciReleaseDeployCmd = &cobra.Command{
513470
GIT_REPOSITORY_URL='%s'
514471
GITAUTH_USERNAME='%s'
515472
GITAUTH_PASSWORD='%s'
516-
CLUSTER_DOMAIN='%s'
473+
CLUSTER_DOMAIN='%s'
517474
EXTRA_NOAUTHIPS='%s'
518475
EXTRA_VPCNATIVE='%s'
519476
EXTRA_CLUSTERTYPE='%s'
@@ -526,38 +483,16 @@ var ciReleaseDeployCmd = &cobra.Command{
526483
DEPLOYMENT_TIMEOUT='%s'
527484
DEPLOYMENT_TIMEOUT_SECONDS='%d'
528485
529-
# Detect pods in FAILED state
530-
function show_failing_pods() {
531-
echo ""
532-
failed_pods=$(kubectl get pod -l "release=$RELEASE_NAME,cronjob!=true" -n "$NAMESPACE" -o custom-columns="POD:metadata.name,STATE:status.containerStatuses[*].ready" --no-headers | grep -E "<none>|false" | grep -Eo '^[^ ]+')
533-
if [[ ! -z "$failed_pods" ]] ; then
534-
echo "Failing pods:"
535-
while IFS= read -r pod; do
536-
echo "---- ${NAMESPACE} / ${pod} ----"
537-
echo "* Events"
538-
kubectl get events --field-selector involvedObject.name=${pod},type!=Normal --show-kind=true --ignore-not-found=true --namespace ${NAMESPACE}
539-
echo ""
540-
echo "* Logs"
541-
containers=$(kubectl get pods "${pod}" --namespace "${NAMESPACE}" -o json | jq -r 'try .status | .containerStatuses[] | select(.ready == false).name')
542-
if [[ ! -z "$containers" ]] ; then
543-
for container in ${containers}; do
544-
kubectl logs "${pod}" --prefix=true --since="${DEPLOYMENT_TIMEOUT}" --namespace "${NAMESPACE}" -c "${container}"
545-
done
546-
else
547-
echo "no logs found"
548-
fi
549-
550-
echo "----"
551-
done <<< "$failed_pods"
486+
function deployment_failed() {
487+
debug_failed_deployment
488+
rm -f helm-output.log || true
489+
}
552490
553-
false
554-
else
555-
true
556-
fi
557-
rm -f helm-output.log
491+
function debug_failed_deployment() {
492+
silta ci release debug-failed --release-name="${RELEASE_NAME}" --namespace="${NAMESPACE}"
558493
}
559494
560-
trap show_failing_pods ERR
495+
trap deployment_failed ERR
561496
562497
helm upgrade --install "${RELEASE_NAME}" "${CHART_NAME}" \
563498
--repo "${CHART_REPOSITORY}" \
@@ -601,17 +536,17 @@ var ciReleaseDeployCmd = &cobra.Command{
601536
# Helm command is complete.
602537
if ! ps -p "$pid" > /dev/null; then
603538
echo "Helm output:"
604-
cat helm-output.log
539+
cat helm-output.log || true
605540
wait $pid
606541
if grep -q "UPGRADE FAILED" helm-output.log ; then
607-
show_failing_pods
542+
debug_failed_deployment
608543
fi
609544
break
610545
fi
611546
612547
if [ $TIME_WAITING -gt ${DEPLOYMENT_TIMEOUT_SECONDS} ]; then
613548
echo "Timeout waiting for resources."
614-
show_failing_pods
549+
debug_failed_deployment
615550
exit 1
616551
fi
617552
@@ -627,7 +562,7 @@ var ciReleaseDeployCmd = &cobra.Command{
627562
echo "$statefulsets" | xargs -n 1 kubectl rollout status statefulset -n "$NAMESPACE" --timeout 5m
628563
fi
629564
kubectl get deployment -n "$NAMESPACE" -l "release=${RELEASE_NAME}" -o name | xargs -n 1 kubectl rollout status -n "$NAMESPACE" --timeout 5m
630-
rm -f helm-output.log
565+
rm -f helm-output.log || true
631566
`,
632567
releaseName, chartName, chartRepository, chartVersionOverride,
633568
siltaEnvironmentName, branchname,

docs/silta_ci_release.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ silta ci release [flags]
2323

2424
* [silta ci](silta_ci.md) - Silta CI Commands
2525
* [silta ci release clean-failed](silta_ci_release_clean-failed.md) - Clean failed releases
26+
* [silta ci release debug-failed](silta_ci_release_debug-failed.md) - Debug failed deployment resources
2627
* [silta ci release delete](silta_ci_release_delete.md) - Delete a release
27-
* [silta ci release delete-resources](silta_ci_release_delete-resources.md) - Delete orphhaned release resources
28+
* [silta ci release delete-resources](silta_ci_release_delete-resources.md) - Delete orphaned release resources
2829
* [silta ci release deploy](silta_ci_release_deploy.md) - Deploy release
2930
* [silta ci release diff](silta_ci_release_diff.md) - Diff release resources
3031
* [silta ci release environmentname](silta_ci_release_environmentname.md) - Return environment name
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
## silta ci release debug-failed
2+
3+
Debug failed deployment resources
4+
5+
### Synopsis
6+
7+
Debug failed deployment by checking:
8+
- OOMKilled containers
9+
- Failed pods with their events and logs
10+
- Not-ready statefulsets with their events
11+
- Not-ready deployments with their events
12+
13+
This command is typically called when a deployment fails to help diagnose the issue.
14+
15+
```
16+
silta ci release debug-failed [flags]
17+
```
18+
19+
### Options
20+
21+
```
22+
-h, --help help for debug-failed
23+
--namespace string Namespace
24+
--release-name string Release name
25+
```
26+
27+
### Options inherited from parent commands
28+
29+
```
30+
--debug Print variables, do not execute external commands, rather print them
31+
--use-env Use environment variables for value assignment (default true)
32+
```
33+
34+
### SEE ALSO
35+
36+
* [silta ci release](silta_ci_release.md) - CI release related commands
37+

docs/silta_ci_release_delete-resources.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
## silta ci release delete-resources
22

3-
Delete orphhaned release resources
3+
Delete orphaned release resources
4+
5+
### Synopsis
6+
7+
Deletes release resources based on labels ("release", "app.kubernetes.io/instance" and "app=<release-name>-es" (for Elasticsearch storage)).
8+
This command can be used to clean up resources when helm release configmaps are absent.
9+
410

511
```
612
silta ci release delete-resources [flags]

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ require (
9898
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
9999
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
100100
github.com/pkg/errors v0.9.1 // indirect
101+
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
101102
github.com/prometheus/client_model v0.6.2 // indirect
102103
github.com/prometheus/common v0.65.0 // indirect
103104
github.com/prometheus/procfs v0.16.1 // indirect

internal/common/ciReleaseFunctions.go

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,9 @@ import (
1818
// UninstallHelmRelease removes a Helm release and related resources
1919
func UninstallHelmRelease(kubernetesClient *kubernetes.Clientset, helmClient *helmAction.Configuration, namespace string, releaseName string, deletePVCs bool) error {
2020

21-
// Do not bail when release removal fails, remove related resources anyway.
2221
log.Printf("Removing release: %s", releaseName)
23-
uninstall := helmAction.NewUninstall(helmClient)
24-
uninstall.KeepHistory = false // Remove release secrets as well
25-
uninstall.DisableHooks = false
26-
uninstall.Timeout = 300 // seconds, adjust as needed
27-
uninstall.Wait = true // Wait for resources to be deleted
2822

29-
resp, err := uninstall.Run(releaseName)
30-
if err != nil {
31-
log.Printf("Failed to remove helm release: %s", err)
32-
} else {
33-
if resp != nil && resp.Info != "" {
34-
log.Printf("Helm uninstall info: %s", resp.Info)
35-
}
36-
}
37-
38-
// Delete related jobs
23+
// Delete related jobs (mainly post-release jobs)
3924
selectorLabels := []string{
4025
"release",
4126
"app.kubernetes.io/instance",
@@ -53,6 +38,22 @@ func UninstallHelmRelease(kubernetesClient *kubernetes.Clientset, helmClient *he
5338
}
5439
}
5540

41+
// Delete helm release
42+
uninstall := helmAction.NewUninstall(helmClient)
43+
uninstall.KeepHistory = false // Remove release secrets as well
44+
uninstall.DisableHooks = false
45+
uninstall.Timeout = 300 * time.Second // seconds, adjust as needed
46+
uninstall.Wait = true // Wait for resources to be deleted
47+
48+
resp, err := uninstall.Run(releaseName)
49+
if err != nil {
50+
log.Printf("Failed to remove helm release: %s", err)
51+
} else {
52+
if resp != nil && resp.Info != "" {
53+
log.Printf("Helm uninstall info: %s", resp.Info)
54+
}
55+
}
56+
5657
if deletePVCs {
5758

5859
// Find and remove related PVCs by release name label

tests/cmd_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"os/exec"
77
"strings"
88
"testing"
9+
10+
"github.com/pmezard/go-difflib/difflib"
911
)
1012

1113
var cliBinaryName = "./silta"
@@ -38,13 +40,31 @@ func CliExecTest(t *testing.T, command string, environment []string, testString
3840
} else {
3941
t.Logf("Error: %s", err.String())
4042
t.Errorf("Expected :\n '%s' \n Received: \n '%s'\n'%s'", testString, out.String(), err.String())
43+
d := diffOutput(t, testString, out.String())
44+
t.Errorf("Diff:\n %s", d)
4145
}
4246

4347
} else {
4448
if strings.Contains(out.String(), testString) || strings.Contains(err.String(), testString) {
4549
} else {
4650
t.Logf("Error: %s", err.String())
4751
t.Errorf("Expected :\n '%s' \n Received: \n '%s'\n'%s'", testString, out.String(), err.String())
52+
d := diffOutput(t, testString, out.String())
53+
t.Errorf("Diff:\n %s", d)
4854
}
4955
}
5056
}
57+
58+
func diffOutput(t *testing.T, expected string, received string) string {
59+
diff := difflib.UnifiedDiff{
60+
A: difflib.SplitLines(expected),
61+
B: difflib.SplitLines(received),
62+
FromFile: "Expected",
63+
FromDate: "",
64+
ToFile: "Received",
65+
ToDate: "",
66+
Context: 1,
67+
}
68+
text, _ := difflib.GetUnifiedDiffString(diff)
69+
return text
70+
}

0 commit comments

Comments
 (0)