Skip to content

Commit 59d88f9

Browse files
committed
fix: address CodeRabbitAI critical review comments
Critical fixes: - Fix IFS=':::' delimiter parsing to use proper parameter expansion - Fix IFS=: parsing for resource_conflicts.txt to handle colons in resource types - Add prerequisite checks for jq (required) and kubelogin (optional) - Handle kubelogin failures gracefully with || true - Track credential failures in CREDENTIAL_ISSUES and report as critical issues - Fix inconsistent severity/CRITICAL_ISSUES logic (remove increment from WARNING branches) - Remove unused variables: NAMESPACES_TO_CHECK, max_count - Replace single-iteration loop with direct namespace check - Sensitive data exposure already mitigated (no kubectl get -o yaml) Details: - Replace 'while IFS=":::"' with proper line reading and parameter expansion - Use awk to properly parse colon-delimited fields with embedded colons - Add jq check (exit if missing) and kubelogin check (warn if missing) - Initialize CREDENTIAL_ISSUES=0 and increment on az aks failures - Report credential failures as critical issues in diagnostic report - Remove CRITICAL_ISSUES increment from partial pod failure (WARNING only) - Remove CRITICAL_ISSUES increment from timeout pattern (WARNING only) - Clean up shellcheck warnings (SC2034, SC2043) All actionable CodeRabbitAI comments addressed.
1 parent 1acf5f9 commit 59d88f9

File tree

1 file changed

+86
-36
lines changed
  • .claude/skills/diagnose-maestro-deployment/scripts

1 file changed

+86
-36
lines changed

.claude/skills/diagnose-maestro-deployment/scripts/diagnose.sh

Lines changed: 86 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,23 @@ if ! command -v helm &> /dev/null; then
100100
exit 1
101101
fi
102102

103+
if ! command -v jq &> /dev/null; then
104+
echo "ERROR: jq not installed (required for JSON parsing)"
105+
echo "Install with: brew install jq (macOS) or apt-get install jq (Linux)"
106+
exit 1
107+
fi
108+
103109
if ! az account show &> /dev/null; then
104110
echo "ERROR: Not logged into Azure"
105111
exit 1
106112
fi
107113

114+
# kubelogin is optional but recommended for Azure AD authentication
115+
if ! command -v kubelogin &> /dev/null; then
116+
echo "WARNING: kubelogin not installed (Azure AD authentication may fail)"
117+
echo "Install with: brew install Azure/kubelogin/kubelogin (macOS)"
118+
fi
119+
108120
echo "✓ All prerequisites met"
109121
echo ""
110122

@@ -118,16 +130,23 @@ MGMT_KUBECONFIG="$TEMP_DIR/mgmt.kubeconfig"
118130
# Get cluster credentials
119131
echo "Step 2: Retrieving cluster credentials..."
120132

133+
# Initialize issue tracking early (will be used if credentials fail)
134+
CREDENTIAL_ISSUES=0
135+
121136
if az aks get-credentials \
122137
--resource-group "$SVC_RESOURCE_GROUP" \
123138
--name "$SVC_CLUSTER_NAME" \
124139
--overwrite-existing \
125140
-f "$SVC_KUBECONFIG" 2>/dev/null; then
126141
echo "✓ Service cluster credentials retrieved"
127-
kubelogin convert-kubeconfig -l azurecli --kubeconfig "$SVC_KUBECONFIG" 2>/dev/null
142+
# kubelogin may fail but shouldn't stop the script
143+
if command -v kubelogin &> /dev/null; then
144+
kubelogin convert-kubeconfig -l azurecli --kubeconfig "$SVC_KUBECONFIG" 2>/dev/null || true
145+
fi
128146
else
129147
echo "✗ Failed to get service cluster credentials"
130148
SVC_KUBECONFIG=""
149+
CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1))
131150
fi
132151

133152
if az aks get-credentials \
@@ -136,10 +155,14 @@ if az aks get-credentials \
136155
--overwrite-existing \
137156
-f "$MGMT_KUBECONFIG" 2>/dev/null; then
138157
echo "✓ Management cluster credentials retrieved"
139-
kubelogin convert-kubeconfig -l azurecli --kubeconfig "$MGMT_KUBECONFIG" 2>/dev/null
158+
# kubelogin may fail but shouldn't stop the script
159+
if command -v kubelogin &> /dev/null; then
160+
kubelogin convert-kubeconfig -l azurecli --kubeconfig "$MGMT_KUBECONFIG" 2>/dev/null || true
161+
fi
140162
else
141163
echo "✗ Failed to get management cluster credentials"
142164
MGMT_KUBECONFIG=""
165+
CREDENTIAL_ISSUES=$((CREDENTIAL_ISSUES + 1))
143166
fi
144167

145168
echo ""
@@ -194,7 +217,6 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
194217

195218
# Determine what to check based on log analysis
196219
FAILED_RELEASES=""
197-
NAMESPACES_TO_CHECK=""
198220

199221
if [ -f "$LOG_ANALYSIS_DIR/failed_helm_releases.txt" ]; then
200222
# Use log analysis results to identify failed releases
@@ -240,8 +262,17 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
240262

241263
# Check for resource conflicts if indicated in log analysis
242264
if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then
243-
while IFS=: read -r conflict_type resource_name resource_type manager fields; do
265+
while IFS= read -r line; do
266+
# Parse CONFLICT:resource_name:resource_type:manager:fields format
267+
# Use awk to properly split on first 4 colons only
268+
conflict_type=$(echo "$line" | awk -F: '{print $1}')
244269
if [ "$conflict_type" = "CONFLICT" ]; then
270+
resource_name=$(echo "$line" | awk -F: '{print $2}')
271+
# Resource type may contain colons, extract everything between 2nd and 3rd-to-last colon
272+
resource_type=$(echo "$line" | awk -F: '{for(i=3;i<NF-1;i++) printf "%s%s", $i, (i<NF-2?":":"")}')
273+
manager=$(echo "$line" | awk -F: '{print $(NF-1)}')
274+
fields=$(echo "$line" | awk -F: '{print $NF}')
275+
245276
echo "Resource Conflict Detected:" >> "$REPORT_FILE"
246277
echo " Resource: $resource_name (type: $resource_type)" >> "$REPORT_FILE"
247278
echo " Managed by: $manager" >> "$REPORT_FILE"
@@ -250,13 +281,6 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
250281
echo "$fields" | tr '|' '\n' | sed 's/^/ - /' >> "$REPORT_FILE"
251282
fi
252283
echo "" >> "$REPORT_FILE"
253-
254-
# Try to get the actual resource
255-
if kubectl --kubeconfig "$MGMT_KUBECONFIG" get "$resource_type" "$resource_name" &>/dev/null; then
256-
echo "Resource details:" >> "$REPORT_FILE"
257-
kubectl --kubeconfig "$MGMT_KUBECONFIG" get "$resource_type" "$resource_name" -o yaml >> "$REPORT_FILE" 2>/dev/null
258-
echo "" >> "$REPORT_FILE"
259-
fi
260284
fi
261285
done < "$LOG_ANALYSIS_DIR/resource_conflicts.txt"
262286
fi
@@ -310,7 +334,10 @@ if [ -d "$LOG_ANALYSIS_DIR" ]; then
310334
# Include error patterns
311335
if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/error_patterns.txt" ]; then
312336
echo "Identified Error Patterns:" >> "$REPORT_FILE"
313-
while IFS=':::' read -r pattern context; do
337+
while IFS= read -r line; do
338+
# Split on literal ':::' delimiter
339+
pattern="${line%%:::*}"
340+
context="${line#*:::}"
314341
echo " • Pattern: $pattern" >> "$REPORT_FILE"
315342
echo " Context: $(echo "$context" | head -c 200)..." >> "$REPORT_FILE"
316343
echo "" >> "$REPORT_FILE"
@@ -343,12 +370,10 @@ if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/erro
343370

344371
# Determine primary failure based on most common pattern
345372
primary_pattern=""
346-
max_count=0
347373
if [ -s "$LOG_ANALYSIS_DIR/pattern_counts.txt" ]; then
348374
# Get the first line (highest count)
349375
read -r count pattern_name < "$LOG_ANALYSIS_DIR/pattern_counts.txt"
350376
primary_pattern="$pattern_name"
351-
max_count="$count"
352377
fi
353378

354379
if [ -n "$primary_pattern" ]; then
@@ -388,8 +413,16 @@ fi
388413
# Analyze resource conflicts
389414
if [ -f "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ] && [ -s "$LOG_ANALYSIS_DIR/resource_conflicts.txt" ]; then
390415
echo "Resource Conflicts Detected:" >> "$REPORT_FILE"
391-
while IFS=: read -r conflict_type resource_name resource_type manager fields; do
416+
while IFS= read -r line; do
417+
# Parse CONFLICT:resource_name:resource_type:manager:fields format
418+
conflict_type=$(echo "$line" | awk -F: '{print $1}')
392419
if [ "$conflict_type" = "CONFLICT" ]; then
420+
resource_name=$(echo "$line" | awk -F: '{print $2}')
421+
# Resource type may contain colons
422+
resource_type=$(echo "$line" | awk -F: '{for(i=3;i<NF-1;i++) printf "%s%s", $i, (i<NF-2?":":"")}')
423+
manager=$(echo "$line" | awk -F: '{print $(NF-1)}')
424+
fields=$(echo "$line" | awk -F: '{print $NF}')
425+
393426
echo " • Resource: $resource_name ($resource_type)" >> "$REPORT_FILE"
394427
echo " Conflicting manager: $manager" >> "$REPORT_FILE"
395428
if [ -n "$fields" ] && [ "$fields" != "" ]; then
@@ -410,7 +443,26 @@ echo "" >> "$REPORT_FILE"
410443
ISSUES_FOUND=0
411444
CRITICAL_ISSUES=0
412445

413-
# Issue 1: Failed Helm Releases (dynamic)
446+
# Issue: Credential failures (if any)
447+
if [ "$CREDENTIAL_ISSUES" -gt 0 ]; then
448+
ISSUES_FOUND=$((ISSUES_FOUND + 1))
449+
CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
450+
echo "[$ISSUES_FOUND] Failed to Retrieve Cluster Credentials" >> "$REPORT_FILE"
451+
echo " Severity: CRITICAL" >> "$REPORT_FILE"
452+
if [ -z "$SVC_KUBECONFIG" ]; then
453+
echo " Failed: Service cluster ($SVC_CLUSTER_NAME)" >> "$REPORT_FILE"
454+
fi
455+
if [ -z "$MGMT_KUBECONFIG" ]; then
456+
echo " Failed: Management cluster ($MGMT_CLUSTER_NAME)" >> "$REPORT_FILE"
457+
fi
458+
echo " " >> "$REPORT_FILE"
459+
echo " Recommendation:" >> "$REPORT_FILE"
460+
echo " Verify Azure credentials and cluster access permissions" >> "$REPORT_FILE"
461+
echo " Check that resource groups and cluster names are correct" >> "$REPORT_FILE"
462+
echo "" >> "$REPORT_FILE"
463+
fi
464+
465+
# Issue: Failed Helm Releases (dynamic)
414466
if [ -n "$FAILED_RELEASES" ]; then
415467
for release_info in $FAILED_RELEASES; do
416468
if [[ "$release_info" == *":"* ]]; then
@@ -443,8 +495,8 @@ if [ -n "$FAILED_RELEASES" ]; then
443495
echo " Severity: CRITICAL" >> "$REPORT_FILE"
444496
echo " Actual Status: ✗ No pods found in namespace" >> "$REPORT_FILE"
445497
else
498+
# Partial failure: some pods running, some not - this is WARNING not CRITICAL
446499
severity="WARNING"
447-
CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
448500
echo " Severity: WARNING" >> "$REPORT_FILE"
449501
echo " Actual Status: ⚠ $running_pods/$total_pods pods Running" >> "$REPORT_FILE"
450502
fi
@@ -466,24 +518,22 @@ fi
466518

467519
# Issue 2: Missing deployments in service cluster
468520
if [ -n "$SVC_KUBECONFIG" ]; then
469-
# Check for expected namespaces
470-
for ns in maestro; do
471-
if ! kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace "$ns" &>/dev/null; then
472-
ISSUES_FOUND=$((ISSUES_FOUND + 1))
473-
CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
474-
echo "[$ISSUES_FOUND] Missing Deployment: $ns in Service Cluster" >> "$REPORT_FILE"
475-
echo " Severity: CRITICAL" >> "$REPORT_FILE"
476-
echo " Status: Namespace does not exist" >> "$REPORT_FILE"
477-
echo " " >> "$REPORT_FILE"
478-
echo " Likely Cause:" >> "$REPORT_FILE"
479-
echo " Deployment pipeline may have halted before service cluster setup" >> "$REPORT_FILE"
480-
echo " " >> "$REPORT_FILE"
481-
echo " Recommendation:" >> "$REPORT_FILE"
482-
echo " Option 1: Continue deployment to service cluster" >> "$REPORT_FILE"
483-
echo " Option 2: Re-run complete deployment" >> "$REPORT_FILE"
484-
echo "" >> "$REPORT_FILE"
485-
fi
486-
done
521+
# Check for maestro namespace
522+
if ! kubectl --kubeconfig "$SVC_KUBECONFIG" get namespace maestro &>/dev/null; then
523+
ISSUES_FOUND=$((ISSUES_FOUND + 1))
524+
CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
525+
echo "[$ISSUES_FOUND] Missing Deployment: maestro in Service Cluster" >> "$REPORT_FILE"
526+
echo " Severity: CRITICAL" >> "$REPORT_FILE"
527+
echo " Status: Namespace does not exist" >> "$REPORT_FILE"
528+
echo " " >> "$REPORT_FILE"
529+
echo " Likely Cause:" >> "$REPORT_FILE"
530+
echo " Deployment pipeline may have halted before service cluster setup" >> "$REPORT_FILE"
531+
echo " " >> "$REPORT_FILE"
532+
echo " Recommendation:" >> "$REPORT_FILE"
533+
echo " Option 1: Continue deployment to service cluster" >> "$REPORT_FILE"
534+
echo " Option 2: Re-run complete deployment" >> "$REPORT_FILE"
535+
echo "" >> "$REPORT_FILE"
536+
fi
487537
fi
488538

489539
# Issue 3: Error patterns from logs
@@ -506,8 +556,8 @@ if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/erro
506556
echo " Description: Resource timing conflict detected" >> "$REPORT_FILE"
507557
;;
508558
timeout)
559+
# Timeouts are warnings unless they prevent critical operations
509560
echo " Severity: WARNING" >> "$REPORT_FILE"
510-
CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1))
511561
echo " Description: Operation timed out" >> "$REPORT_FILE"
512562
;;
513563
authentication)

0 commit comments

Comments
 (0)