Fix NVMe Stage idempotency issues

emmahardison · web-flow · commit ae6ee3b621cf · 2025-04-29T09:01:20.000-06:00
diff --git a/frontend/csi/node_server.go b/frontend/csi/node_server.go
@@ -95,6 +95,7 @@ var (
 	afterInitialTrackingInfoWrite  = fiji.Register("afterInitialTrackingInfoWrite", "node_server")
 	afterNvmeLuksDeviceClosed      = fiji.Register("afterNvmeLuksDeviceClosed", "node_server")
 	afterNvmeDisconnect            = fiji.Register("afterNvmeDisconnect", "node_server")
+	beforeTrackingInfoWrite        = fiji.Register("beforeTrackingInfoWrite", "node_server")
 )
 
 const (
@@ -1219,13 +1220,13 @@ func (p *Plugin) populatePublishedSessions(ctx context.Context) {
 		}
 
 		publishInfo := &trackingInfo.VolumePublishInfo
-
 		if publishInfo.SANType != sa.NVMe {
 			newCtx := context.WithValue(ctx, iscsi.SessionInfoSource, utils.SessionSourceTrackingInfo)
 			p.iscsi.AddSession(newCtx, &publishedISCSISessions, publishInfo, volumeID, "", models.NotInvalid)
 		} else {
 			p.nvmeHandler.AddPublishedNVMeSession(&publishedNVMeSessions, publishInfo)
 		}
+
 	}
 }
 
@@ -2879,6 +2880,10 @@ func (p *Plugin) nodeStageNVMeVolume(
 		}
 	}
 
+	if err := beforeTrackingInfoWrite.Inject(); err != nil {
+		return err
+	}
+
 	volTrackingInfo := &models.VolumeTrackingInfo{
 		VolumePublishInfo: *publishInfo,
 		StagingTargetPath: stagingTargetPath,
diff --git a/internal/fiji/models/factory.go b/internal/fiji/models/factory.go
@@ -30,6 +30,8 @@ const (
 	ErrorNTimes HandlerType = "error-n-times"
 	// ErrorAfterNTimes tells the fault to error after 'n' times indefinitely.
 	ErrorAfterNTimes HandlerType = "error-after-n-times"
+	// ErrorXTimesAfterYTimes tells the fault to pass 'y' times, then error 'x' times, then succeed indefinitely.
+	ErrorXTimesAfterYTimes HandlerType = "error-x-times-after-y-times"
 	// ExitAfterNTimes tells the fault to exit the process after 'n' times.
 	ExitAfterNTimes HandlerType = "exit-after-n-times"
 )
@@ -58,6 +60,8 @@ func NewFaultHandlerFromModel(model []byte) (FaultHandler, error) {
 		return handlers.NewErrorNTimesHandler(model)
 	case ErrorAfterNTimes:
 		return handlers.NewErrorAfterNTimesHandler(model)
+	case ErrorXTimesAfterYTimes:
+		return handlers.NewErrorXTimesAfterYTimesHandler(model)
 	case ExitAfterNTimes:
 		return handlers.NewExitAfterNTimesHandler(model)
 	}
diff --git a/internal/fiji/models/handlers/error_x_times_after_y_times.go b/internal/fiji/models/handlers/error_x_times_after_y_times.go
@@ -0,0 +1,58 @@
+package handlers
+
+import (
+	"encoding/json"
+	"fmt"
+
+	. "github.com/netapp/trident/logging"
+)
+
+type ErrorXTimesAfterYTimesHandler struct { // Fault handler state: pass PassCount calls, error FailCount calls, then pass.
+	Name          string `json:"name"`          // Identifies this handler in its log and error messages.
+	HitCount      int    `json:"hitCount"`      // Passing calls counted so far; Handle increments it until it reaches PassCount.
+	PassCount     int    `json:"passCount"`     // Number of initial Handle calls that return nil; the constructor requires > 0.
+	FailCount     int    `json:"failCount"`     // Number of Handle calls that error after the passes; the constructor requires > 0.
+	ErrorHitCount int    `json:"errorHitCount"` // Errors returned so far; Handle increments it until it reaches FailCount.
+}
+
+// Handle advances the handler by one invocation and reports the outcome for that invocation.
+// The first PassCount calls return nil, the next FailCount calls return an error, and every
+// call after that returns nil.
+func (handler *ErrorXTimesAfterYTimesHandler) Handle() error {
+	Log().Debugf("Firing %s handler.", handler.Name)
+
+	switch {
+	case handler.HitCount < handler.PassCount:
+		// Still inside the initial pass window: consume one pass.
+		handler.HitCount++
+		passesLeft := handler.PassCount - handler.HitCount
+		Log().Debugf("%v remaining passes from %s handler.", passesLeft, handler.Name)
+		return nil
+
+	case handler.ErrorHitCount < handler.FailCount:
+		// Pass window is used up: consume one of the configured failures.
+		handler.ErrorHitCount++
+		errorsLeft := handler.FailCount - handler.ErrorHitCount
+		Log().Debugf("%v remaining errors from %s handler.", errorsLeft, handler.Name)
+		return fmt.Errorf("fiji error from [%s] handler; %v errors remaining", handler.Name, errorsLeft)
+
+	default:
+		// Both windows are used up; neither counter changes from here on.
+		Log().Debugf("No errors remaining from %s handler.", handler.Name)
+		return nil
+	}
+}
+
+func NewErrorXTimesAfterYTimesHandler(model []byte) (*ErrorXTimesAfterYTimesHandler, error) {
+	var handler ErrorXTimesAfterYTimesHandler
+	if err := json.Unmarshal(model, &handler); err != nil {
+		return nil, err
+	}
+
+	// Validate PassCount and FailCount
+	if handler.PassCount <= 0 {
+		return nil, fmt.Errorf("invalid value specified for passCount: must be greater than 0")
+	}
+
+	if handler.FailCount <= 0 {
+		return nil, fmt.Errorf("invalid value specified for failCount: must be greater than 0")
+	}
+
+	return &handler, nil
+}
diff --git a/internal/fiji/models/handlers/error_x_times_after_y_times_test.go b/internal/fiji/models/handlers/error_x_times_after_y_times_test.go
@@ -0,0 +1,94 @@
+package handlers
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestErrorXTimesAfterYTimesHandler verifies that the constructor rejects models whose counts are
+// missing, zero, or negative, and accepts a model whose counts are both positive.
+func TestErrorXTimesAfterYTimesHandler(t *testing.T) {
+	// modelWithCounts renders a well-formed JSON model that carries both counts.
+	modelWithCounts := func(passCount, failCount int) string {
+		return fmt.Sprintf(
+			`{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`, passCount, failCount,
+		)
+	}
+
+	tt := map[string]struct {
+		model       string
+		assertValue assert.ValueAssertionFunc
+		assertError assert.ErrorAssertionFunc
+	}{
+		"with no KVP for counts": {
+			// Used verbatim rather than through Sprintf: formatting a verb-less string with extra
+			// arguments appends "%!(EXTRA ...)", which makes the JSON malformed and would let this
+			// case pass on a decode error instead of on the count validation.
+			model:       `{"name":"error-x-times-after-y-times"}`,
+			assertValue: assert.Nil,
+			assertError: assert.Error,
+		},
+		"with negative passCount": {
+			model:       modelWithCounts(-1, 3),
+			assertValue: assert.Nil,
+			assertError: assert.Error,
+		},
+		"with negative failCount": {
+			model:       modelWithCounts(2, -1),
+			assertValue: assert.Nil,
+			assertError: assert.Error,
+		},
+		"with zero passCount": {
+			model:       modelWithCounts(0, 3),
+			assertValue: assert.Nil,
+			assertError: assert.Error,
+		},
+		"with zero failCount": {
+			model:       modelWithCounts(2, 0),
+			assertValue: assert.Nil,
+			assertError: assert.Error,
+		},
+		"with valid values": {
+			model:       modelWithCounts(2, 3),
+			assertValue: assert.NotNil,
+			assertError: assert.NoError,
+		},
+	}
+
+	for name, test := range tt {
+		t.Run(name, func(t *testing.T) {
+			handler, err := NewErrorXTimesAfterYTimesHandler([]byte(test.model))
+			test.assertError(t, err)
+			test.assertValue(t, handler)
+		})
+	}
+}
+
+func TestErrorXTimesAfterYTimesHandler_Handle(t *testing.T) {
+	passCount := 2
+	failCount := 3
+	modelJSON := fmt.Sprintf(`{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`, passCount, failCount)
+	handler, err := NewErrorXTimesAfterYTimesHandler([]byte(modelJSON))
+	assert.NoError(t, err)
+	assert.NotNil(t, handler)
+	assert.Equal(t, passCount, handler.PassCount)
+	assert.Equal(t, failCount, handler.FailCount)
+
+	// Test successful passes
+	for i := 0; i < passCount; i++ {
+		assert.NoError(t, handler.Handle())
+	}
+
+	// Test failures
+	for i := 0; i < failCount; i++ {
+		assert.Error(t, handler.Handle())
+	}
+
+	// Test succeeding indefinitely after failures
+	for i := 0; i < 5; i++ { // Arbitrary number of additional calls
+		assert.NoError(t, handler.Handle())
+	}
+}
diff --git a/utils/devices/luks/luks_linux.go b/utils/devices/luks/luks_linux.go
@@ -19,7 +19,7 @@ import (
 )
 
 const (
-	luksCommandTimeout time.Duration = time.Second * 30
+	luksCommandTimeout time.Duration = time.Second * 60
 
 	luksCypherMode = "aes-xts-plain64"
 	luksType       = "luks2"
diff --git a/utils/devices/luks/luks_linux_test.go b/utils/devices/luks/luks_linux_test.go
@@ -49,12 +49,6 @@ func mockCryptsetupLuksOpen(mock *mockexec.MockCommand) *gomock.Call {
 	)
 }
 
-func mockCryptsetupLuksClose(mock *mockexec.MockCommand) *gomock.Call {
-	return mock.EXPECT().ExecuteWithTimeoutAndInput(
-		gomock.Any(), "cryptsetup", luksCommandTimeout, true, "", "luksClose", gomock.Any(),
-	)
-}
-
 func mockCryptsetupLuksStatusWithDevicePath(mock *mockexec.MockCommand) *gomock.Call {
 	return mock.EXPECT().ExecuteWithTimeoutAndInput(
 		gomock.Any(), "cryptsetup", luksCommandTimeout, true, "", "status", gomock.Any(),
@@ -255,7 +249,6 @@ func TestLUKSDevice_ExecErrors(t *testing.T) {
 		mockCryptsetupLuksFormat(mockCommand).Return([]byte(""), luksError),
 		mockCryptsetupLuksOpen(mockCommand).Return([]byte(""), luksError),
 		mockCryptsetupLuksStatus(mockCommand).Return([]byte(""), luksError),
-		mockCryptsetupLuksClose(mockCommand).Return([]byte(""), luksError),
 	)
 
 	isFormatted, err := luksDevice.IsLUKSFormatted(context.Background())
@@ -271,10 +264,6 @@ func TestLUKSDevice_ExecErrors(t *testing.T) {
 	isOpen, err := luksDevice.IsOpen(context.Background())
 	assert.Error(t, err)
 	assert.False(t, isOpen)
-
-	devicesClient := devices.NewDetailed(mockCommand, afero.NewMemMapFs(), nil)
-	err = devicesClient.CloseLUKSDevice(context.Background(), luksDevice.MappedDevicePath())
-	assert.Error(t, err)
 }
 
 func TestEnsureLUKSDevice_FailsWithExecError(t *testing.T) {
diff --git a/utils/iscsi/iscsi_linux_test.go b/utils/iscsi/iscsi_linux_test.go
@@ -72,7 +72,7 @@ tcp: [4] 127.0.0.2:3260,1029 ` + targetIQN + ` (non-flash)`
 					"config").Return([]byte(multipathConfig("no", false)), nil)
 				mockCommand.EXPECT().Execute(context.TODO(), "iscsiadm", "-m",
 					"session").Return([]byte(iscsiadmSessionOutput), nil)
-				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", 30*time.Second, true, "",
+				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", time.Minute, true, "",
 					"status", "/dev/mapper/luks-test-volume")
 				return mockCommand
 			},
@@ -164,7 +164,7 @@ tcp: [4] 127.0.0.2:3260,1029 ` + targetIQN + ` (non-flash)`
 					"config").Return([]byte(multipathConfig("no", false)), nil)
 				mockCommand.EXPECT().Execute(context.TODO(), "iscsiadm", "-m",
 					"session").Return([]byte(iscsiadmSessionOutput), nil)
-				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", 30*time.Second, true, "",
+				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", time.Minute, true, "",
 					"status",
 					"/dev/mapper/luks-test-volume")
 				return mockCommand
diff --git a/utils/nvme/nvme.go b/utils/nvme/nvme.go
@@ -24,9 +24,13 @@ import (
 	"github.com/netapp/trident/utils/mount"
 )
 
-const NVMeAttachTimeout = 20 * time.Second
+var (
+	duringAddPublishedNVMeSession = fiji.Register("duringAddPublishedNVMeSession", "nvme")
+	afterFormatBeforeFileSystem   = fiji.Register("afterFormatBeforeFileSystem", "nvme")
+	beforeNVMeFlushDevice         = fiji.Register("beforeNVMeFlushDevice", "nvme")
+)
 
-var beforeNVMeFlushDevice = fiji.Register("beforeNVMeFlushDevice", "nvme")
+const NVMeAttachTimeout = 20 * time.Second
 
 func NewNVMeSubsystem(nqn string, command exec.Command, fs afero.Fs) *NVMeSubsystem {
 	return NewNVMeSubsystemDetailed(nqn, "", []Path{}, command, fs)
@@ -324,11 +328,16 @@ func (nh *NVMeHandler) NVMeMountVolume(
 	isLUKSDevice := convert.ToBool(publishInfo.LUKSEncryption)
 	if isLUKSDevice {
 		luksDevice := luks.NewDevice(devicePath, name, nh.command)
+
 		luksFormatted, err = luksDevice.EnsureDeviceMappedOnHost(ctx, name, secrets)
 		if err != nil {
 			return err
 		}
 
+		if err := afterFormatBeforeFileSystem.Inject(); err != nil {
+			return err
+		}
+
 		devicePath = luksDevice.MappedDevicePath()
 	}
 
@@ -559,6 +568,8 @@ func (nh *NVMeHandler) AddPublishedNVMeSession(pubSessions *NVMeSessions, publis
 	if pubSessions == nil {
 		return
 	}
+	// This fiji point is only for testing exits, so no error check needed
+	_ = duringAddPublishedNVMeSession.Inject()
 
 	pubSessions.AddNVMeSession(*NewNVMeSubsystem(publishInfo.NVMeSubsystemNQN, nh.command, nh.osFs),
 		publishInfo.NVMeTargetIPs)
diff --git a/utils/nvme/nvme_linux_test.go b/utils/nvme/nvme_linux_test.go
@@ -601,10 +601,10 @@ func TestNVMeMountVolume(t *testing.T) {
 			getMockCommand: func(ctrl *gomock.Controller) exec.Command {
 				mockCommand := mockexec.NewMockCommand(ctrl)
 				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(
-					gomock.Any(), "cryptsetup", 30*time.Second, true, "", "status",
+					gomock.Any(), "cryptsetup", time.Minute, true, "", "status",
 					"/dev/mapper/luks-mockName").Return([]byte{}, mockexec.NewMockExitError(4,
 					"device does not exist"))
-				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(gomock.Any(), "cryptsetup", 30*time.Second, true, "", "status",
+				mockCommand.EXPECT().ExecuteWithTimeoutAndInput(gomock.Any(), "cryptsetup", time.Minute, true, "", "status",
 					"/dev/mapper/luks-mockName").Return([]byte{}, nil)
 				return mockCommand
 			},

Original file line number	Diff line number	Diff line change
`@@ -95,6 +95,7 @@ var (`
`95`	`95`	`afterInitialTrackingInfoWrite = fiji.Register("afterInitialTrackingInfoWrite", "node_server")`
`96`	`96`	`afterNvmeLuksDeviceClosed = fiji.Register("afterNvmeLuksDeviceClosed", "node_server")`
`97`	`97`	`afterNvmeDisconnect = fiji.Register("afterNvmeDisconnect", "node_server")`
	`98`	`+ beforeTrackingInfoWrite = fiji.Register("beforeTrackingInfoWrite", "node_server")`
`98`	`99`	`)`
`99`	`100`
`100`	`101`	`const (`
`@@ -1219,13 +1220,13 @@ func (p *Plugin) populatePublishedSessions(ctx context.Context) {`
`1219`	`1220`	`}`
`1220`	`1221`
`1221`	`1222`	`publishInfo := &trackingInfo.VolumePublishInfo`
`1222`		`-`
`1223`	`1223`	`if publishInfo.SANType != sa.NVMe {`
`1224`	`1224`	`newCtx := context.WithValue(ctx, iscsi.SessionInfoSource, utils.SessionSourceTrackingInfo)`
`1225`	`1225`	`p.iscsi.AddSession(newCtx, &publishedISCSISessions, publishInfo, volumeID, "", models.NotInvalid)`
`1226`	`1226`	`} else {`
`1227`	`1227`	`p.nvmeHandler.AddPublishedNVMeSession(&publishedNVMeSessions, publishInfo)`
`1228`	`1228`	`}`
	`1229`	`+`
`1229`	`1230`	`}`
`1230`	`1231`	`}`
`1231`	`1232`
`@@ -2879,6 +2880,10 @@ func (p *Plugin) nodeStageNVMeVolume(`
`2879`	`2880`	`}`
`2880`	`2881`	`}`
`2881`	`2882`
	`2883`	`+ if err := beforeTrackingInfoWrite.Inject(); err != nil {`
	`2884`	`+ return err`
	`2885`	`+ }`
	`2886`	`+`
`2882`	`2887`	`volTrackingInfo := &models.VolumeTrackingInfo{`
`2883`	`2888`	`VolumePublishInfo: *publishInfo,`
`2884`	`2889`	`StagingTargetPath: stagingTargetPath,`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ import (`
`19`	`19`	`)`
`20`	`20`
`21`	`21`	`const (`
`22`		`- luksCommandTimeout time.Duration = time.Second * 30`
	`22`	`+ luksCommandTimeout time.Duration = time.Second * 60`
`23`	`23`
`24`	`24`	`luksCypherMode = "aes-xts-plain64"`
`25`	`25`	`luksType = "luks2"`