Skip to content

Commit ae6ee3b

Browse files
authored
Fix NVMe Stage idempotency issues
1 parent cb43579 commit ae6ee3b

File tree

9 files changed

+180
-19
lines changed

9 files changed

+180
-19
lines changed

frontend/csi/node_server.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ var (
9595
afterInitialTrackingInfoWrite = fiji.Register("afterInitialTrackingInfoWrite", "node_server")
9696
afterNvmeLuksDeviceClosed = fiji.Register("afterNvmeLuksDeviceClosed", "node_server")
9797
afterNvmeDisconnect = fiji.Register("afterNvmeDisconnect", "node_server")
98+
beforeTrackingInfoWrite = fiji.Register("beforeTrackingInfoWrite", "node_server")
9899
)
99100

100101
const (
@@ -1219,13 +1220,13 @@ func (p *Plugin) populatePublishedSessions(ctx context.Context) {
12191220
}
12201221

12211222
publishInfo := &trackingInfo.VolumePublishInfo
1222-
12231223
if publishInfo.SANType != sa.NVMe {
12241224
newCtx := context.WithValue(ctx, iscsi.SessionInfoSource, utils.SessionSourceTrackingInfo)
12251225
p.iscsi.AddSession(newCtx, &publishedISCSISessions, publishInfo, volumeID, "", models.NotInvalid)
12261226
} else {
12271227
p.nvmeHandler.AddPublishedNVMeSession(&publishedNVMeSessions, publishInfo)
12281228
}
1229+
12291230
}
12301231
}
12311232

@@ -2879,6 +2880,10 @@ func (p *Plugin) nodeStageNVMeVolume(
28792880
}
28802881
}
28812882

2883+
if err := beforeTrackingInfoWrite.Inject(); err != nil {
2884+
return err
2885+
}
2886+
28822887
volTrackingInfo := &models.VolumeTrackingInfo{
28832888
VolumePublishInfo: *publishInfo,
28842889
StagingTargetPath: stagingTargetPath,

internal/fiji/models/factory.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ const (
3030
ErrorNTimes HandlerType = "error-n-times"
3131
// ErrorAfterNTimes tells the fault to error after 'n' times indefinitely.
3232
ErrorAfterNTimes HandlerType = "error-after-n-times"
33+
// ErrorXTimesAfterYTimes tells the fault to error up to 'x' times after 'y' times then succeed indefinitely
34+
ErrorXTimesAfterYTimes HandlerType = "error-x-times-after-y-times"
3335
// ExitAfterNTimes tells the fault to exit the process after 'n' times.
3436
ExitAfterNTimes HandlerType = "exit-after-n-times"
3537
)
@@ -58,6 +60,8 @@ func NewFaultHandlerFromModel(model []byte) (FaultHandler, error) {
5860
return handlers.NewErrorNTimesHandler(model)
5961
case ErrorAfterNTimes:
6062
return handlers.NewErrorAfterNTimesHandler(model)
63+
case ErrorXTimesAfterYTimes:
64+
return handlers.NewErrorXTimesAfterYTimesHandler(model)
6165
case ExitAfterNTimes:
6266
return handlers.NewExitAfterNTimesHandler(model)
6367
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package handlers
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
7+
. "github.com/netapp/trident/logging"
8+
)
9+
10+
type ErrorXTimesAfterYTimesHandler struct {
11+
Name string `json:"name"`
12+
HitCount int `json:"hitCount"`
13+
PassCount int `json:"passCount"`
14+
FailCount int `json:"failCount"`
15+
ErrorHitCount int `json:"errorHitCount"`
16+
}
17+
18+
func (handler *ErrorXTimesAfterYTimesHandler) Handle() error {
19+
Log().Debugf("Firing %s handler.", handler.Name)
20+
21+
// While the passCount is greater than the hitCount, this handler should return nil.
22+
if handler.HitCount < handler.PassCount {
23+
handler.HitCount++
24+
remaining := handler.PassCount - handler.HitCount
25+
Log().Debugf("%v remaining passes from %s handler.", remaining, handler.Name)
26+
return nil
27+
}
28+
29+
// Once passCount is reached, start erroring for FailCount times.
30+
if handler.ErrorHitCount < handler.FailCount {
31+
handler.ErrorHitCount++
32+
remaining := handler.FailCount - handler.ErrorHitCount
33+
Log().Debugf("%v remaining errors from %s handler.", remaining, handler.Name)
34+
return fmt.Errorf("fiji error from [%s] handler; %v errors remaining", handler.Name, remaining)
35+
}
36+
37+
// After FailCount errors, succeed indefinitely.
38+
Log().Debugf("No errors remaining from %s handler.", handler.Name)
39+
return nil
40+
}
41+
42+
func NewErrorXTimesAfterYTimesHandler(model []byte) (*ErrorXTimesAfterYTimesHandler, error) {
43+
var handler ErrorXTimesAfterYTimesHandler
44+
if err := json.Unmarshal(model, &handler); err != nil {
45+
return nil, err
46+
}
47+
48+
// Validate PassCount and FailCount
49+
if handler.PassCount <= 0 {
50+
return nil, fmt.Errorf("invalid value specified for passCount: must be greater than 0")
51+
}
52+
53+
if handler.FailCount <= 0 {
54+
return nil, fmt.Errorf("invalid value specified for failCount: must be greater than 0")
55+
}
56+
57+
return &handler, nil
58+
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package handlers
2+
3+
import (
4+
"fmt"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestErrorXTimesAfterYTimesHandler(t *testing.T) {
11+
tt := map[string]struct {
12+
formatStr string
13+
passCount int
14+
failCount int
15+
assertValue assert.ValueAssertionFunc
16+
assertError assert.ErrorAssertionFunc
17+
}{
18+
"with no KVP for counts": {
19+
formatStr: `{"name":"error-x-times-after-y-times"}`,
20+
assertValue: assert.Nil,
21+
assertError: assert.Error,
22+
},
23+
"with negative passCount": {
24+
formatStr: `{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`,
25+
passCount: -1,
26+
failCount: 3,
27+
assertValue: assert.Nil,
28+
assertError: assert.Error,
29+
},
30+
"with negative failCount": {
31+
formatStr: `{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`,
32+
passCount: 2,
33+
failCount: -1,
34+
assertValue: assert.Nil,
35+
assertError: assert.Error,
36+
},
37+
"with zero passCount": {
38+
formatStr: `{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`,
39+
passCount: 0,
40+
failCount: 3,
41+
assertValue: assert.Nil,
42+
assertError: assert.Error,
43+
},
44+
"with zero failCount": {
45+
formatStr: `{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`,
46+
passCount: 2,
47+
failCount: 0,
48+
assertValue: assert.Nil,
49+
assertError: assert.Error,
50+
},
51+
"with valid values": {
52+
formatStr: `{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`,
53+
passCount: 2,
54+
failCount: 3,
55+
assertValue: assert.NotNil,
56+
assertError: assert.NoError,
57+
},
58+
}
59+
60+
for name, test := range tt {
61+
t.Run(name, func(t *testing.T) {
62+
modelStr := fmt.Sprintf(test.formatStr, test.passCount, test.failCount)
63+
handler, err := NewErrorXTimesAfterYTimesHandler([]byte(modelStr))
64+
test.assertError(t, err)
65+
test.assertValue(t, handler)
66+
})
67+
}
68+
}
69+
70+
func TestErrorXTimesAfterYTimesHandler_Handle(t *testing.T) {
71+
passCount := 2
72+
failCount := 3
73+
modelJSON := fmt.Sprintf(`{"name":"error-x-times-after-y-times", "passCount": %v, "failCount": %v}`, passCount, failCount)
74+
handler, err := NewErrorXTimesAfterYTimesHandler([]byte(modelJSON))
75+
assert.NoError(t, err)
76+
assert.NotNil(t, handler)
77+
assert.Equal(t, passCount, handler.PassCount)
78+
assert.Equal(t, failCount, handler.FailCount)
79+
80+
// Test successful passes
81+
for i := 0; i < passCount; i++ {
82+
assert.NoError(t, handler.Handle())
83+
}
84+
85+
// Test failures
86+
for i := 0; i < failCount; i++ {
87+
assert.Error(t, handler.Handle())
88+
}
89+
90+
// Test succeeding indefinitely after failures
91+
for i := 0; i < 5; i++ { // Arbitrary number of additional calls
92+
assert.NoError(t, handler.Handle())
93+
}
94+
}

utils/devices/luks/luks_linux.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import (
1919
)
2020

2121
const (
22-
luksCommandTimeout time.Duration = time.Second * 30
22+
luksCommandTimeout time.Duration = time.Second * 60
2323

2424
luksCypherMode = "aes-xts-plain64"
2525
luksType = "luks2"

utils/devices/luks/luks_linux_test.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,6 @@ func mockCryptsetupLuksOpen(mock *mockexec.MockCommand) *gomock.Call {
4949
)
5050
}
5151

52-
func mockCryptsetupLuksClose(mock *mockexec.MockCommand) *gomock.Call {
53-
return mock.EXPECT().ExecuteWithTimeoutAndInput(
54-
gomock.Any(), "cryptsetup", luksCommandTimeout, true, "", "luksClose", gomock.Any(),
55-
)
56-
}
57-
5852
func mockCryptsetupLuksStatusWithDevicePath(mock *mockexec.MockCommand) *gomock.Call {
5953
return mock.EXPECT().ExecuteWithTimeoutAndInput(
6054
gomock.Any(), "cryptsetup", luksCommandTimeout, true, "", "status", gomock.Any(),
@@ -255,7 +249,6 @@ func TestLUKSDevice_ExecErrors(t *testing.T) {
255249
mockCryptsetupLuksFormat(mockCommand).Return([]byte(""), luksError),
256250
mockCryptsetupLuksOpen(mockCommand).Return([]byte(""), luksError),
257251
mockCryptsetupLuksStatus(mockCommand).Return([]byte(""), luksError),
258-
mockCryptsetupLuksClose(mockCommand).Return([]byte(""), luksError),
259252
)
260253

261254
isFormatted, err := luksDevice.IsLUKSFormatted(context.Background())
@@ -271,10 +264,6 @@ func TestLUKSDevice_ExecErrors(t *testing.T) {
271264
isOpen, err := luksDevice.IsOpen(context.Background())
272265
assert.Error(t, err)
273266
assert.False(t, isOpen)
274-
275-
devicesClient := devices.NewDetailed(mockCommand, afero.NewMemMapFs(), nil)
276-
err = devicesClient.CloseLUKSDevice(context.Background(), luksDevice.MappedDevicePath())
277-
assert.Error(t, err)
278267
}
279268

280269
func TestEnsureLUKSDevice_FailsWithExecError(t *testing.T) {

utils/iscsi/iscsi_linux_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ tcp: [4] 127.0.0.2:3260,1029 ` + targetIQN + ` (non-flash)`
7272
"config").Return([]byte(multipathConfig("no", false)), nil)
7373
mockCommand.EXPECT().Execute(context.TODO(), "iscsiadm", "-m",
7474
"session").Return([]byte(iscsiadmSessionOutput), nil)
75-
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", 30*time.Second, true, "",
75+
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", time.Minute, true, "",
7676
"status", "/dev/mapper/luks-test-volume")
7777
return mockCommand
7878
},
@@ -164,7 +164,7 @@ tcp: [4] 127.0.0.2:3260,1029 ` + targetIQN + ` (non-flash)`
164164
"config").Return([]byte(multipathConfig("no", false)), nil)
165165
mockCommand.EXPECT().Execute(context.TODO(), "iscsiadm", "-m",
166166
"session").Return([]byte(iscsiadmSessionOutput), nil)
167-
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", 30*time.Second, true, "",
167+
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(context.TODO(), "cryptsetup", time.Minute, true, "",
168168
"status",
169169
"/dev/mapper/luks-test-volume")
170170
return mockCommand

utils/nvme/nvme.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,13 @@ import (
2424
"github.com/netapp/trident/utils/mount"
2525
)
2626

27-
const NVMeAttachTimeout = 20 * time.Second
27+
var (
28+
duringAddPublishedNVMeSession = fiji.Register("duringAddPublishedNVMeSession", "nvme")
29+
afterFormatBeforeFileSystem = fiji.Register("afterFormatBeforeFileSystem", "nvme")
30+
beforeNVMeFlushDevice = fiji.Register("beforeNVMeFlushDevice", "nvme")
31+
)
2832

29-
var beforeNVMeFlushDevice = fiji.Register("beforeNVMeFlushDevice", "nvme")
33+
const NVMeAttachTimeout = 20 * time.Second
3034

3135
func NewNVMeSubsystem(nqn string, command exec.Command, fs afero.Fs) *NVMeSubsystem {
3236
return NewNVMeSubsystemDetailed(nqn, "", []Path{}, command, fs)
@@ -324,11 +328,16 @@ func (nh *NVMeHandler) NVMeMountVolume(
324328
isLUKSDevice := convert.ToBool(publishInfo.LUKSEncryption)
325329
if isLUKSDevice {
326330
luksDevice := luks.NewDevice(devicePath, name, nh.command)
331+
327332
luksFormatted, err = luksDevice.EnsureDeviceMappedOnHost(ctx, name, secrets)
328333
if err != nil {
329334
return err
330335
}
331336

337+
if err := afterFormatBeforeFileSystem.Inject(); err != nil {
338+
return err
339+
}
340+
332341
devicePath = luksDevice.MappedDevicePath()
333342
}
334343

@@ -559,6 +568,8 @@ func (nh *NVMeHandler) AddPublishedNVMeSession(pubSessions *NVMeSessions, publis
559568
if pubSessions == nil {
560569
return
561570
}
571+
// This fiji point is only for testing exits, so no error check needed
572+
_ = duringAddPublishedNVMeSession.Inject()
562573

563574
pubSessions.AddNVMeSession(*NewNVMeSubsystem(publishInfo.NVMeSubsystemNQN, nh.command, nh.osFs),
564575
publishInfo.NVMeTargetIPs)

utils/nvme/nvme_linux_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -601,10 +601,10 @@ func TestNVMeMountVolume(t *testing.T) {
601601
getMockCommand: func(ctrl *gomock.Controller) exec.Command {
602602
mockCommand := mockexec.NewMockCommand(ctrl)
603603
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(
604-
gomock.Any(), "cryptsetup", 30*time.Second, true, "", "status",
604+
gomock.Any(), "cryptsetup", time.Minute, true, "", "status",
605605
"/dev/mapper/luks-mockName").Return([]byte{}, mockexec.NewMockExitError(4,
606606
"device does not exist"))
607-
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(gomock.Any(), "cryptsetup", 30*time.Second, true, "", "status",
607+
mockCommand.EXPECT().ExecuteWithTimeoutAndInput(gomock.Any(), "cryptsetup", time.Minute, true, "", "status",
608608
"/dev/mapper/luks-mockName").Return([]byte{}, nil)
609609
return mockCommand
610610
},

0 commit comments

Comments
 (0)