fix isAppAlreadyInstalled error handling (#1608)

patrykw-splunk · Patryk Wasielewski · web-flow · commit 44f74f60a54f · 2025-11-24T11:44:13.000+01:00
* fix isAppAlreadyInstalled error handling

---------

Co-authored-by: Patryk Wasielewski &lt;pwasiele@splunk.com&gt;
diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go
@@ -766,8 +766,15 @@ func installApp(rctx context.Context, localCtx *localScopePlaybookContext, cr sp
 	streamOptions := splutil.NewStreamOptionsObject(command)
 
 	stdOut, stdErr, err := localCtx.podExecClient.RunPodExecCommand(rctx, streamOptions, []string{"/bin/sh"})
-	// if the app was already installed previously, then just mark it for install complete
-	if stdErr != "" || err != nil {
+
+	// TODO(patrykw-splunk): remove this once we have confirm that we are not using stderr for error detection at all
+	// Log stderr content for debugging but don't use it for error detection
+	if stdErr != "" {
+		scopedLog.Info("App install command stderr output (informational only)", "stderr", stdErr)
+	}
+
+	// Check only the actual command execution error, not stderr content
+	if err != nil {
 		phaseInfo.FailCount++
 		scopedLog.Error(err, "local scoped app package install failed", "stdout", stdOut, "stderr", stdErr, "app pkg path", appPkgPathOnPod, "failCount", phaseInfo.FailCount)
 		return fmt.Errorf("local scoped app package install failed. stdOut: %s, stdErr: %s, app pkg path: %s, failCount: %d", stdOut, stdErr, appPkgPathOnPod, phaseInfo.FailCount)
@@ -785,28 +792,51 @@ func isAppAlreadyInstalled(ctx context.Context, cr splcommon.MetaObject, podExec
 
 	scopedLog.Info("check app's installation state")
 
-	command := fmt.Sprintf("/opt/splunk/bin/splunk list app %s -auth admin:`cat /mnt/splunk-secrets/password`| grep ENABLED; echo -n $?", appTopFolder)
+	command := fmt.Sprintf("/opt/splunk/bin/splunk list app %s -auth admin:`cat /mnt/splunk-secrets/password`| grep ENABLED", appTopFolder)
 
 	streamOptions := splutil.NewStreamOptionsObject(command)
 
 	stdOut, stdErr, err := podExecClient.RunPodExecCommand(ctx, streamOptions, []string{"/bin/sh"})
 
+	// Handle specific stderr cases first
 	if strings.Contains(stdErr, "Could not find object") {
 		// when app is not installed you will see something like on StdErr:
 		// "Could not find object id=<app_name>"
 		// which mean app is not installed (no need to check enabled at this time)
 		return false, nil
 	}
 
-	if stdErr != "" || err != nil {
-		return false, fmt.Errorf("could not get installed app status stdOut: %s, stdErr: %s, command: %s", stdOut, stdErr, command)
+	// Log any other stderr content for debugging but don't use it for error detection
+	if stdErr != "" {
+		scopedLog.Info("Command stderr output (informational only)", "stderr", stdErr)
 	}
 
-	appInstallCheck, _ := strconv.Atoi(stdOut)
+	// Now check the actual command result
+	if err != nil {
+		// The command pipeline ends with 'grep ENABLED', so exit codes follow grep semantics:
+		// For grep: exit code 1 = pattern not found, exit code 2+ = actual error
+		errMsg := err.Error()
+
+		// Check for grep exit code 1 (pattern not found)
+		if strings.Contains(errMsg, "exit status 1") || strings.Contains(errMsg, "command terminated with exit code 1") {
+			// grep exit code 1 means "ENABLED" pattern not found - app exists but is not enabled
+			scopedLog.Info("App not enabled - grep pattern not found", "stdout", stdOut, "stderr", stdErr)
+			return false, nil
+		}
+
+		// Any other exit code indicates a real error (splunk command failed, etc.)
+		return false, fmt.Errorf("could not get installed app status stdOut: %s, stdErr: %s, error: %v, command: %s", stdOut, stdErr, err, command)
+	}
 
-	scopedLog.Info("Apps installation state", stdOut, stdOut)
+	// If we reach here, grep found "ENABLED" (exit code 0)
+	// stdOut should contain the app status line with "ENABLED"
+	if stdOut == "" {
+		// This shouldn't happen if grep succeeded, but let's be safe
+		return false, fmt.Errorf("command succeeded but no output received, command: %s", command)
+	}
 
-	return appInstallCheck == 0, nil
+	scopedLog.Info("App installation state check successful - app is enabled", "appStatus", strings.TrimSpace(stdOut))
+	return true, nil
 }
 
 // get the name of top folder from the package.
@@ -1320,6 +1350,15 @@ installPhase:
 				phaseInfo := getPhaseInfoByPhaseType(ctx, installWorker, enterpriseApi.PhaseInstall)
 				if isPhaseMaxRetriesReached(ctx, phaseInfo, installWorker.afwConfig) {
 					phaseInfo.Status = enterpriseApi.AppPkgInstallError
+
+					// For fanout CRs, also update the main PhaseInfo to reflect the failure
+					if isFanOutApplicableToCR(installWorker.cr) {
+						scopedLog.Info("Max retries reached for fanout CR - updating main phase info", "app", installWorker.appDeployInfo.AppName, "failCount", phaseInfo.FailCount)
+						installWorker.appDeployInfo.PhaseInfo.Phase = enterpriseApi.PhaseInstall
+						installWorker.appDeployInfo.PhaseInfo.Status = enterpriseApi.AppPkgInstallError
+						installWorker.appDeployInfo.DeployStatus = enterpriseApi.DeployStatusError
+					}
+
 					ppln.deleteWorkerFromPipelinePhase(ctx, phaseInfo.Phase, installWorker)
 				} else if isPhaseStatusComplete(phaseInfo) {
 					ppln.deleteWorkerFromPipelinePhase(ctx, phaseInfo.Phase, installWorker)
diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go
@@ -3061,18 +3061,23 @@ func TestRunLocalScopedPlaybook(t *testing.T) {
 		t.Errorf("Failed to detect that steps to get installed app failed")
 	}
 
-	// Test3: get installed app name passes but getting installed app name failed
+	// Test3: get installed app name passes but isAppAlreadyInstalled fails with real error (not "Could not find object")
 	mockPodExecReturnContexts[1].StdErr = ""
+	mockPodExecReturnContexts[2].StdErr = "Some other real error message" // Real error, not "Could not find object"
+	mockPodExecReturnContexts[2].Err = fmt.Errorf("exit status 2") // Real error, not grep exit code 1
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
 	err = localInstallCtxt.runPlaybook(ctx)
 	if err == nil {
-		t.Errorf("Failed to detect not able to get installed app name: err")
+		t.Errorf("Failed to detect isAppAlreadyInstalled error")
 	}
 
-	// Test4: get installed app command passes but installing app fails
-	mockPodExecReturnContexts[2].StdOut = "1" //app is not yet installed or it is not enabled
-	mockPodExecReturnContexts[2].StdErr = ""  //no error thrown
+	// Test4: isAppAlreadyInstalled returns app not enabled (grep exit code 1), then install fails
+	mockPodExecReturnContexts[2].StdOut = "" // No stdout means grep didn't find ENABLED
+	mockPodExecReturnContexts[2].StdErr = "" // No stderr
+	mockPodExecReturnContexts[2].Err = fmt.Errorf("exit status 1") // grep exit code 1 = pattern not found
+	mockPodExecReturnContexts[3].StdErr = "real installation error" // This is just logged now
+	mockPodExecReturnContexts[3].Err = fmt.Errorf("install command failed") // This causes the actual failure
 
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
@@ -3081,28 +3086,35 @@ func TestRunLocalScopedPlaybook(t *testing.T) {
 		t.Errorf("Expected app install failed")
 	}
 
-	mockPodExecReturnContexts[2].StdOut = "1" //app is not yet installed or it is not enabled
-	mockPodExecReturnContexts[2].StdErr = "Could not find object"
+	// Test5: App not found scenario (Could not find object) - should proceed to install but install fails
+	mockPodExecReturnContexts[2].StdOut = ""
+	mockPodExecReturnContexts[2].StdErr = "Could not find object id=app1"
+	mockPodExecReturnContexts[2].Err = nil // This should return false, nil (app not installed)
+	// Keep the installation error from previous test to make install fail
+	mockPodExecReturnContexts[3].StdErr = "real installation error" // Install should fail
+	mockPodExecReturnContexts[3].Err = fmt.Errorf("install failed")
 
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
 	err = localInstallCtxt.runPlaybook(ctx)
 	if err == nil {
-		t.Errorf("Expected app install failed")
+		t.Errorf("Expected app install failed due to installation error")
 	}
 
-	// Test5: install app should be successful
-
-	mockPodExecReturnContexts[3].StdErr = "" //no error for app install
+	// Test6: Install succeeds with stderr content (should be ignored), but cleanup fails
+	mockPodExecReturnContexts[3].StdErr = "Some informational message in stderr" // Stderr content should be ignored
+	mockPodExecReturnContexts[3].Err = nil // No actual error for install
+	// Keep cleanup failure from previous test setup to make overall test fail
+	// mockPodExecReturnContexts[4] still has error from earlier
 
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
 	err = localInstallCtxt.runPlaybook(ctx)
 	if err == nil {
-		t.Errorf("Expected app install succeeded but app arhive deletion failed")
+		t.Errorf("Expected app install succeeded but app archive deletion failed")
 	}
 
-	// Test6: successful scenario where everything succeeds
+	// Test7: successful scenario where everything succeeds
 	mockPodExecReturnContexts[4].StdErr = ""
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
@@ -3301,19 +3313,24 @@ func TestPremiumAppScopedPlaybook(t *testing.T) {
 		t.Errorf("Failed to detect that steps to get installed app failed")
 	}
 
-	// Test3: get installed app name passes but getting installed app name failed
+	// Test3: get installed app name passes but isAppAlreadyInstalled fails with real error (not "Could not find object")
 	mockPodExecReturnContexts[1].StdErr = ""
+	mockPodExecReturnContexts[2].StdErr = "Some other real error message" // Real error, not "Could not find object"
+	mockPodExecReturnContexts[2].Err = fmt.Errorf("exit status 2")        // Real error, not grep exit code 1
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
 	err = pCtx.runPlaybook(ctx)
 	if err == nil {
-		t.Errorf("Failed to detect not able to get installed app name: err")
+		t.Errorf("Failed to detect isAppAlreadyInstalled error")
 	}
 
-	// Test4: get installed app command passes, it returns app is not enabled
-	// so app install will run and it should  fail
-	mockPodExecReturnContexts[2].StdOut = "1" //app is not yet installed or it is not enabled
-	mockPodExecReturnContexts[2].StdErr = ""  //no error thrown
+	// Test4: isAppAlreadyInstalled returns app is not enabled (grep exit code 1)
+	// so app install will run and it should fail with real error
+	mockPodExecReturnContexts[2].StdOut = ""                        // No stdout means grep didn't find ENABLED
+	mockPodExecReturnContexts[2].StdErr = ""                        // No stderr
+	mockPodExecReturnContexts[2].Err = fmt.Errorf("exit status 1")  // grep exit code 1 = pattern not found
+	mockPodExecReturnContexts[3].StdErr = "real installation error" // This is just logged now
+	mockPodExecReturnContexts[3].Err = fmt.Errorf("install command failed") // This causes the actual failure
 
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
@@ -3322,9 +3339,9 @@ func TestPremiumAppScopedPlaybook(t *testing.T) {
 		t.Errorf("Expected app install failed")
 	}
 
-	// Test5: install app should be successful but es post install fails
-
-	mockPodExecReturnContexts[3].StdErr = "" //no error for app install
+	// Test5: Install succeeds with stderr content (should be ignored), but post install fails
+	mockPodExecReturnContexts[3].StdErr = "Some informational message in stderr" // Stderr content should be ignored
+	mockPodExecReturnContexts[3].Err = nil // No actual error for install
 
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
@@ -3343,7 +3360,24 @@ func TestPremiumAppScopedPlaybook(t *testing.T) {
 		t.Errorf("Expected es post  install succeeded but app arhive deletion failed")
 	}
 
-	// Test7: successful scenario where everything succeeds
+	// Test7: App already installed with stderr content - should skip installation
+	// Reset all mock contexts for this test
+	mockPodExecReturnContexts[0].Err = nil                                  // File exists check passes
+	mockPodExecReturnContexts[1].StdErr = ""                                // Get app name passes
+	mockPodExecReturnContexts[1].StdOut = "app1"                            // App name is found
+	mockPodExecReturnContexts[2].StdOut = "app1 CONFIGURED ENABLED VISIBLE" // App is already enabled
+	mockPodExecReturnContexts[2].StdErr = "Some informational message in stderr"
+	mockPodExecReturnContexts[2].Err = nil // No error - app is found and enabled
+	// Install step should be skipped, but cleanup should still work
+	mockPodExecReturnContexts[5].StdErr = "" // Cleanup should succeed
+	localInstallCtxt.sem <- struct{}{}
+	waiter.Add(1)
+	err = pCtx.runPlaybook(ctx)
+	if err != nil {
+		t.Errorf("runPlayBook should not have returned error when app is already installed with stderr content. err=%s", err.Error())
+	}
+
+	// Test8: successful scenario where everything succeeds
 	mockPodExecReturnContexts[5].StdErr = ""
 	localInstallCtxt.sem <- struct{}{}
 	waiter.Add(1)
@@ -4462,3 +4496,126 @@ func TestAddTelAppCManager(t *testing.T) {
 	// Negative testing
 	addTelApp(ctx, mockPodExecClient, 2, &crNew)
 }
+
+func TestIsAppAlreadyInstalled(t *testing.T) {
+	ctx := context.TODO()
+
+	tests := []struct {
+		name           string
+		stdOut         string
+		stdErr         string
+		err            error
+		expectedResult bool
+		expectedError  bool
+		description    string
+	}{
+		{
+			name:           "App is enabled - success case",
+			stdOut:         "myapp CONFIGURED ENABLED VISIBLE",
+			stdErr:         "",
+			err:            nil,
+			expectedResult: true,
+			expectedError:  false,
+			description:    "App is found and enabled",
+		},
+		{
+			name:           "App not found - grep exit code 1",
+			stdOut:         "",
+			stdErr:         "",
+			err:            fmt.Errorf("command terminated with exit code 1"),
+			expectedResult: false,
+			expectedError:  false,
+			description:    "App not enabled - grep pattern not found",
+		},
+		{
+			name:           "App not found - Could not find object",
+			stdOut:         "",
+			stdErr:         "Could not find object id=myapp",
+			err:            nil,
+			expectedResult: false,
+			expectedError:  false,
+			description:    "App not installed at all",
+		},
+		{
+			name:           "App enabled with stderr content",
+			stdOut:         "myapp CONFIGURED ENABLED VISIBLE",
+			stdErr:         "Some informational message in stderr",
+			err:            nil,
+			expectedResult: true,
+			expectedError:  false,
+			description:    "Stderr content should be ignored when app is enabled",
+		},
+		{
+			name:           "App not enabled with stderr content",
+			stdOut:         "",
+			stdErr:         "Some informational message in stderr",
+			err:            fmt.Errorf("exit status 1"),
+			expectedResult: false,
+			expectedError:  false,
+			description:    "Stderr content should be ignored, grep exit code 1 means not enabled",
+		},
+		{
+			name:           "Real error - exit code 2",
+			stdOut:         "",
+			stdErr:         "Some real error occurred",
+			err:            fmt.Errorf("exit status 2"),
+			expectedResult: false,
+			expectedError:  true,
+			description:    "Real error should be returned",
+		},
+		{
+			name:           "Command succeeded but no output",
+			stdOut:         "",
+			stdErr:         "",
+			err:            nil,
+			expectedResult: false,
+			expectedError:  true,
+			description:    "Should error if command succeeds but no output",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Create a test CR
+			cr := &enterpriseApi.Standalone{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-standalone",
+					Namespace: "test",
+				},
+			}
+
+			// Create mock pod exec client with CR
+			mockPodExecClient := &spltest.MockPodExecClient{Cr: cr}
+			mockPodExecClient.SetTargetPodName(ctx, "test-pod")
+
+			// Set up the mock return context
+			mockReturnContext := &spltest.MockPodExecReturnContext{
+				StdOut: tt.stdOut,
+				StdErr: tt.stdErr,
+				Err:    tt.err,
+			}
+
+			// Add the mock command and return context - use the exact command pattern
+			command := "/opt/splunk/bin/splunk list app testapp -auth admin:`cat /mnt/splunk-secrets/password`| grep ENABLED"
+			mockPodExecClient.AddMockPodExecReturnContexts(ctx, []string{command}, mockReturnContext)
+
+			// Call the function
+			result, err := isAppAlreadyInstalled(ctx, cr, mockPodExecClient, "testapp")
+
+			// Check results
+			if tt.expectedError {
+				if err == nil {
+					t.Errorf("Expected error but got none for test: %s", tt.description)
+				}
+			} else {
+				if err != nil {
+					t.Errorf("Unexpected error for test '%s': %v", tt.description, err)
+				}
+			}
+
+			if result != tt.expectedResult {
+				t.Errorf("Expected result %v but got %v for test: %s", tt.expectedResult, result, tt.description)
+			}
+		})
+	}
+}