@@ -100,11 +100,23 @@ if ! command -v helm &> /dev/null; then
100100 exit 1
101101fi
102102
103+ if ! command -v jq & > /dev/null; then
104+ echo " ERROR: jq not installed (required for JSON parsing)"
105+ echo " Install with: brew install jq (macOS) or apt-get install jq (Linux)"
106+ exit 1
107+ fi
108+
103109if ! az account show & > /dev/null; then
104110 echo " ERROR: Not logged into Azure"
105111 exit 1
106112fi
107113
114+ # kubelogin is optional but recommended for Azure AD authentication
115+ if ! command -v kubelogin & > /dev/null; then
116+ echo " WARNING: kubelogin not installed (Azure AD authentication may fail)"
117+ echo " Install with: brew install Azure/kubelogin/kubelogin (macOS)"
118+ fi
119+
108120echo " ✓ All prerequisites met"
109121echo " "
110122
@@ -118,16 +130,23 @@ MGMT_KUBECONFIG="$TEMP_DIR/mgmt.kubeconfig"
118130# Get cluster credentials
119131echo " Step 2: Retrieving cluster credentials..."
120132
133+ # Initialize issue tracking early (will be used if credentials fail)
134+ CREDENTIAL_ISSUES=0
135+
121136if az aks get-credentials \
122137 --resource-group " $SVC_RESOURCE_GROUP " \
123138 --name " $SVC_CLUSTER_NAME " \
124139 --overwrite-existing \
125140 -f " $SVC_KUBECONFIG " 2> /dev/null; then
126141 echo " ✓ Service cluster credentials retrieved"
127- kubelogin convert-kubeconfig -l azurecli --kubeconfig " $SVC_KUBECONFIG " 2> /dev/null
142+ # kubelogin may fail but shouldn't stop the script
143+ if command -v kubelogin & > /dev/null; then
144+ kubelogin convert-kubeconfig -l azurecli --kubeconfig " $SVC_KUBECONFIG " 2> /dev/null || true
145+ fi
128146else
129147 echo " ✗ Failed to get service cluster credentials"
130148 SVC_KUBECONFIG=" "
149+ CREDENTIAL_ISSUES=$(( CREDENTIAL_ISSUES + 1 ))
131150fi
132151
133152if az aks get-credentials \
@@ -136,10 +155,14 @@ if az aks get-credentials \
136155 --overwrite-existing \
137156 -f " $MGMT_KUBECONFIG " 2> /dev/null; then
138157 echo " ✓ Management cluster credentials retrieved"
139- kubelogin convert-kubeconfig -l azurecli --kubeconfig " $MGMT_KUBECONFIG " 2> /dev/null
158+ # kubelogin may fail but shouldn't stop the script
159+ if command -v kubelogin & > /dev/null; then
160+ kubelogin convert-kubeconfig -l azurecli --kubeconfig " $MGMT_KUBECONFIG " 2> /dev/null || true
161+ fi
140162else
141163 echo " ✗ Failed to get management cluster credentials"
142164 MGMT_KUBECONFIG=" "
165+ CREDENTIAL_ISSUES=$(( CREDENTIAL_ISSUES + 1 ))
143166fi
144167
145168echo " "
@@ -194,7 +217,6 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
194217
195218 # Determine what to check based on log analysis
196219 FAILED_RELEASES=" "
197- NAMESPACES_TO_CHECK=" "
198220
199221 if [ -f " $LOG_ANALYSIS_DIR /failed_helm_releases.txt" ]; then
200222 # Use log analysis results to identify failed releases
@@ -240,8 +262,17 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
240262
241263 # Check for resource conflicts if indicated in log analysis
242264 if [ -f " $LOG_ANALYSIS_DIR /resource_conflicts.txt" ]; then
243- while IFS=: read -r conflict_type resource_name resource_type manager fields; do
265+ while IFS= read -r line; do
266+ # Parse CONFLICT:resource_name:resource_type:manager:fields format
267+ # Use awk to properly split on first 4 colons only
268+ conflict_type=$( echo " $line " | awk -F: ' {print $1}' )
244269 if [ " $conflict_type " = " CONFLICT" ]; then
270+ resource_name=$( echo " $line " | awk -F: ' {print $2}' )
271+ # Resource type may contain colons, extract everything between 2nd and 3rd-to-last colon
272+ resource_type=$( echo " $line " | awk -F: ' {for(i=3;i<NF-1;i++) printf "%s%s", $i, (i<NF-2?":":"")}' )
273+ manager=$( echo " $line " | awk -F: ' {print $(NF-1)}' )
274+ fields=$( echo " $line " | awk -F: ' {print $NF}' )
275+
245276 echo " Resource Conflict Detected:" >> " $REPORT_FILE "
246277 echo " Resource: $resource_name (type: $resource_type )" >> " $REPORT_FILE "
247278 echo " Managed by: $manager " >> " $REPORT_FILE "
@@ -250,13 +281,6 @@ if [ -n "$MGMT_KUBECONFIG" ]; then
250281 echo " $fields " | tr ' |' ' \n' | sed ' s/^/ - /' >> " $REPORT_FILE "
251282 fi
252283 echo " " >> " $REPORT_FILE "
253-
254- # Try to get the actual resource
255- if kubectl --kubeconfig " $MGMT_KUBECONFIG " get " $resource_type " " $resource_name " & > /dev/null; then
256- echo " Resource details:" >> " $REPORT_FILE "
257- kubectl --kubeconfig " $MGMT_KUBECONFIG " get " $resource_type " " $resource_name " -o yaml >> " $REPORT_FILE " 2> /dev/null
258- echo " " >> " $REPORT_FILE "
259- fi
260284 fi
261285 done < " $LOG_ANALYSIS_DIR /resource_conflicts.txt"
262286 fi
@@ -310,7 +334,10 @@ if [ -d "$LOG_ANALYSIS_DIR" ]; then
310334 # Include error patterns
311335 if [ -f " $LOG_ANALYSIS_DIR /error_patterns.txt" ] && [ -s " $LOG_ANALYSIS_DIR /error_patterns.txt" ]; then
312336 echo " Identified Error Patterns:" >> " $REPORT_FILE "
313- while IFS=' :::' read -r pattern context; do
337+ while IFS= read -r line; do
338+ # Split on literal ':::' delimiter
339+ pattern=" ${line%%:::* } "
340+ context=" ${line#*::: } "
314341 echo " • Pattern: $pattern " >> " $REPORT_FILE "
315342 echo " Context: $( echo " $context " | head -c 200) ..." >> " $REPORT_FILE "
316343 echo " " >> " $REPORT_FILE "
@@ -343,12 +370,10 @@ if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/erro
343370
344371 # Determine primary failure based on most common pattern
345372 primary_pattern=" "
346- max_count=0
347373 if [ -s " $LOG_ANALYSIS_DIR /pattern_counts.txt" ]; then
348374 # Get the first line (highest count)
349375 read -r count pattern_name < " $LOG_ANALYSIS_DIR /pattern_counts.txt"
350376 primary_pattern=" $pattern_name "
351- max_count=" $count "
352377 fi
353378
354379 if [ -n " $primary_pattern " ]; then
388413# Analyze resource conflicts
389414if [ -f " $LOG_ANALYSIS_DIR /resource_conflicts.txt" ] && [ -s " $LOG_ANALYSIS_DIR /resource_conflicts.txt" ]; then
390415 echo " Resource Conflicts Detected:" >> " $REPORT_FILE "
391- while IFS=: read -r conflict_type resource_name resource_type manager fields; do
416+ while IFS= read -r line; do
417+ # Parse CONFLICT:resource_name:resource_type:manager:fields format
418+ conflict_type=$( echo " $line " | awk -F: ' {print $1}' )
392419 if [ " $conflict_type " = " CONFLICT" ]; then
420+ resource_name=$( echo " $line " | awk -F: ' {print $2}' )
421+ # Resource type may contain colons
422+ resource_type=$( echo " $line " | awk -F: ' {for(i=3;i<NF-1;i++) printf "%s%s", $i, (i<NF-2?":":"")}' )
423+ manager=$( echo " $line " | awk -F: ' {print $(NF-1)}' )
424+ fields=$( echo " $line " | awk -F: ' {print $NF}' )
425+
393426 echo " • Resource: $resource_name ($resource_type )" >> " $REPORT_FILE "
394427 echo " Conflicting manager: $manager " >> " $REPORT_FILE "
395428 if [ -n " $fields " ] && [ " $fields " != " " ]; then
@@ -410,7 +443,26 @@ echo "" >> "$REPORT_FILE"
410443ISSUES_FOUND=0
411444CRITICAL_ISSUES=0
412445
413- # Issue 1: Failed Helm Releases (dynamic)
446+ # Issue: Credential failures (if any)
447+ if [ " $CREDENTIAL_ISSUES " -gt 0 ]; then
448+ ISSUES_FOUND=$(( ISSUES_FOUND + 1 ))
449+ CRITICAL_ISSUES=$(( CRITICAL_ISSUES + 1 ))
450+ echo " [$ISSUES_FOUND ] Failed to Retrieve Cluster Credentials" >> " $REPORT_FILE "
451+ echo " Severity: CRITICAL" >> " $REPORT_FILE "
452+ if [ -z " $SVC_KUBECONFIG " ]; then
453+ echo " Failed: Service cluster ($SVC_CLUSTER_NAME )" >> " $REPORT_FILE "
454+ fi
455+ if [ -z " $MGMT_KUBECONFIG " ]; then
456+ echo " Failed: Management cluster ($MGMT_CLUSTER_NAME )" >> " $REPORT_FILE "
457+ fi
458+ echo " " >> " $REPORT_FILE "
459+ echo " Recommendation:" >> " $REPORT_FILE "
460+ echo " Verify Azure credentials and cluster access permissions" >> " $REPORT_FILE "
461+ echo " Check that resource groups and cluster names are correct" >> " $REPORT_FILE "
462+ echo " " >> " $REPORT_FILE "
463+ fi
464+
465+ # Issue: Failed Helm Releases (dynamic)
414466if [ -n " $FAILED_RELEASES " ]; then
415467 for release_info in $FAILED_RELEASES ; do
416468 if [[ " $release_info " == * " :" * ]]; then
@@ -443,8 +495,8 @@ if [ -n "$FAILED_RELEASES" ]; then
443495 echo " Severity: CRITICAL" >> " $REPORT_FILE "
444496 echo " Actual Status: ✗ No pods found in namespace" >> " $REPORT_FILE "
445497 else
498+ # Partial failure: some pods running, some not - this is WARNING not CRITICAL
446499 severity=" WARNING"
447- CRITICAL_ISSUES=$(( CRITICAL_ISSUES + 1 ))
448500 echo " Severity: WARNING" >> " $REPORT_FILE "
449501 echo " Actual Status: ⚠ $running_pods /$total_pods pods Running" >> " $REPORT_FILE "
450502 fi
466518
467519# Issue 2: Missing deployments in service cluster
468520if [ -n " $SVC_KUBECONFIG " ]; then
469- # Check for expected namespaces
470- for ns in maestro; do
471- if ! kubectl --kubeconfig " $SVC_KUBECONFIG " get namespace " $ns " & > /dev/null; then
472- ISSUES_FOUND=$(( ISSUES_FOUND + 1 ))
473- CRITICAL_ISSUES=$(( CRITICAL_ISSUES + 1 ))
474- echo " [$ISSUES_FOUND ] Missing Deployment: $ns in Service Cluster" >> " $REPORT_FILE "
475- echo " Severity: CRITICAL" >> " $REPORT_FILE "
476- echo " Status: Namespace does not exist" >> " $REPORT_FILE "
477- echo " " >> " $REPORT_FILE "
478- echo " Likely Cause:" >> " $REPORT_FILE "
479- echo " Deployment pipeline may have halted before service cluster setup" >> " $REPORT_FILE "
480- echo " " >> " $REPORT_FILE "
481- echo " Recommendation:" >> " $REPORT_FILE "
482- echo " Option 1: Continue deployment to service cluster" >> " $REPORT_FILE "
483- echo " Option 2: Re-run complete deployment" >> " $REPORT_FILE "
484- echo " " >> " $REPORT_FILE "
485- fi
486- done
521+ # Check for maestro namespace
522+ if ! kubectl --kubeconfig " $SVC_KUBECONFIG " get namespace maestro & > /dev/null; then
523+ ISSUES_FOUND=$(( ISSUES_FOUND + 1 ))
524+ CRITICAL_ISSUES=$(( CRITICAL_ISSUES + 1 ))
525+ echo " [$ISSUES_FOUND ] Missing Deployment: maestro in Service Cluster" >> " $REPORT_FILE "
526+ echo " Severity: CRITICAL" >> " $REPORT_FILE "
527+ echo " Status: Namespace does not exist" >> " $REPORT_FILE "
528+ echo " " >> " $REPORT_FILE "
529+ echo " Likely Cause:" >> " $REPORT_FILE "
530+ echo " Deployment pipeline may have halted before service cluster setup" >> " $REPORT_FILE "
531+ echo " " >> " $REPORT_FILE "
532+ echo " Recommendation:" >> " $REPORT_FILE "
533+ echo " Option 1: Continue deployment to service cluster" >> " $REPORT_FILE "
534+ echo " Option 2: Re-run complete deployment" >> " $REPORT_FILE "
535+ echo " " >> " $REPORT_FILE "
536+ fi
487537fi
488538
489539# Issue 3: Error patterns from logs
@@ -506,8 +556,8 @@ if [ -f "$LOG_ANALYSIS_DIR/error_patterns.txt" ] && [ -s "$LOG_ANALYSIS_DIR/erro
506556 echo " Description: Resource timing conflict detected" >> " $REPORT_FILE "
507557 ;;
508558 timeout)
559+ # Timeouts are warnings unless they prevent critical operations
509560 echo " Severity: WARNING" >> " $REPORT_FILE "
510- CRITICAL_ISSUES=$(( CRITICAL_ISSUES + 1 ))
511561 echo " Description: Operation timed out" >> " $REPORT_FILE "
512562 ;;
513563 authentication)
0 commit comments