Skip to content

Commit 7f2918e

Browse files
authored
Safely handle CHAP rotation for stale portals during iSCSI self-healing
This commit changes iSCSI self-healing to never logout of stale non-CHAP portals. For stale CHAP portals, logouts are now only performed when the the portal is accessible.
1 parent ec1816e commit 7f2918e

File tree

9 files changed

+673
-214
lines changed

9 files changed

+673
-214
lines changed

frontend/csi/node_server.go

Lines changed: 92 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ const (
5050
iSCSINodeUnstageMaxDuration = 15 * time.Second
5151
iSCSILoginTimeout = 10 * time.Second
5252
iSCSISelfHealingLockContext = "ISCSISelfHealingThread"
53+
iSCSISelfHealingTimeout = 90 * time.Second
5354
fcpNodeUnstageMaxDuration = 15 * time.Second
5455
nvmeSelfHealingLockContext = "NVMeSelfHealingThread"
5556
defaultNodeReconciliationPeriod = 1 * time.Minute
@@ -1843,6 +1844,17 @@ func (p *Plugin) ensureAttachISCSIVolume(
18431844
return mpathSize, nil
18441845
}
18451846

1847+
// getChapInfoFromController attempts to gather CHAP info from the controller in a timebound manner.
1848+
func (p *Plugin) getChapInfoFromController(
1849+
ctx context.Context, volumeID, nodeName string,
1850+
) (*models.IscsiChapInfo, error) {
1851+
chapInfo, err := p.restClient.GetChap(ctx, volumeID, nodeName)
1852+
if err != nil {
1853+
return nil, fmt.Errorf("failed to get CHAP info from CSI controller for volume: '%s'; %w", volumeID, err)
1854+
}
1855+
return chapInfo, nil
1856+
}
1857+
18461858
func (p *Plugin) updateChapInfoFromController(
18471859
ctx context.Context, req *csi.NodeStageVolumeRequest, publishInfo *models.VolumePublishInfo,
18481860
) error {
@@ -2478,11 +2490,17 @@ func (p *Plugin) selfHealingRectifySession(ctx context.Context, portal string, a
24782490

24792491
switch action {
24802492
case models.LogoutLoginScan:
2493+
// Do not log out if the portal is unresponsive as subsequent logins are likely to fail.
2494+
if isAccessible, err := p.iscsi.IsPortalAccessible(ctx, portal); !isAccessible {
2495+
Logc(ctx).WithError(err).Warnf("Cannot safely log out of unresponsive portal '%s'.", portal)
2496+
return fmt.Errorf("cannot safely log out of unresponsive portal '%s'", portal)
2497+
}
2498+
24812499
if err = p.iscsi.Logout(ctx, publishInfo.IscsiTargetIQN, portal); err != nil {
24822500
return fmt.Errorf("error while logging out of target %s", publishInfo.IscsiTargetIQN)
2483-
} else {
2484-
Logc(ctx).Debug("Logout is successful.")
24852501
}
2502+
Logc(ctx).Debug("Logout is successful.")
2503+
24862504
// Logout is successful, fallthrough to perform login.
24872505
fallthrough
24882506
case models.LoginScan:
@@ -2506,12 +2524,17 @@ func (p *Plugin) selfHealingRectifySession(ctx context.Context, portal string, a
25062524
}
25072525

25082526
if publishedCHAPCredentials != publishInfo.IscsiChapInfo {
2527+
fields := LogFields{
2528+
"portal": portal,
2529+
"CHAPInUse": true,
2530+
}
2531+
2532+
// Update the CHAP info in the portal. This should never fail.
25092533
updateErr := publishedISCSISessions.UpdateCHAPForPortal(portal, publishInfo.IscsiChapInfo)
25102534
if updateErr != nil {
2511-
Logc(ctx).Warn("Failed to update published CHAP information.")
2535+
Logc(ctx).WithFields(fields).Warn("Failed to update published CHAP information.")
25122536
}
2513-
2514-
Logc(ctx).Debug("Updated published CHAP information.")
2537+
Logc(ctx).WithFields(fields).Debug("Updated published CHAP information after successful login.")
25152538
}
25162539

25172540
Logc(ctx).Debug("Login to target is successful.")
@@ -2564,6 +2587,64 @@ func (p *Plugin) deprecatedIgroupInUse(ctx context.Context) bool {
25642587
return false
25652588
}
25662589

2590+
// updateCHAPInfoForSessions provides a best attempt to populate up-to-date CHAP credentials within iSCSI self-healing's
2591+
// published sessions. It tracks CHAP credentials by unique IQN to reduce the number of calls to the controller.
2592+
func (p *Plugin) updateCHAPInfoForSessions(
2593+
ctx context.Context, publishedSessions, currentSessions *models.ISCSISessions,
2594+
) error {
2595+
if publishedSessions == nil || currentSessions == nil {
2596+
return nil
2597+
}
2598+
2599+
// Timebox this operation to prevent it from running indefinitely and blocking everything in self-healing and
2600+
// other operations in the node server.
2601+
cancelCtx, cancel := context.WithTimeout(ctx, iSCSISelfHealingTimeout/3)
2602+
defer cancel()
2603+
2604+
// Store the CHAP credentials we've found for certain IQNs.
2605+
// IQNs should be unique between SVMs and CHAP credentials are scoped at the SVM level.
2606+
iqnToCHAP := make(map[string]*models.IscsiChapInfo, 0)
2607+
var errs error
2608+
2609+
for portal, publishedData := range publishedSessions.Info {
2610+
// If the session isn't stale on the host, then we can ignore this portal or now.
2611+
data, ok := currentSessions.Info[portal]
2612+
if ok && !p.iscsi.IsSessionStale(cancelCtx, data.PortalInfo.SessionNumber) {
2613+
continue
2614+
} else if !publishedData.PortalInfo.CHAPInUse() || !publishedData.PortalInfo.HasTargetIQN() {
2615+
continue
2616+
}
2617+
2618+
chapInfo, ok := iqnToCHAP[publishedData.PortalInfo.ISCSITargetIQN]
2619+
if !ok {
2620+
volumeID, err := publishedSessions.VolumeIDForPortal(portal)
2621+
if err != nil {
2622+
errs = errors.Join(errs, fmt.Errorf("failed to get volume ID for portal: '%s'; %w", portal, err))
2623+
continue
2624+
}
2625+
2626+
chapInfo, err = p.getChapInfoFromController(cancelCtx, volumeID, p.nodeName)
2627+
if err != nil {
2628+
errs = errors.Join(errs, fmt.Errorf("failed to get CHAP info for portal: '%s'; %w", portal, err))
2629+
continue
2630+
}
2631+
2632+
// Store what we've learned for future iterations.
2633+
iqnToCHAP[publishedData.PortalInfo.ISCSITargetIQN] = chapInfo
2634+
}
2635+
2636+
publishedData.PortalInfo.UpdateCHAPCredentials(*chapInfo)
2637+
Logc(cancelCtx).WithField("portal", portal).Debug("Updated CHAP info for portal.")
2638+
}
2639+
2640+
if errs != nil {
2641+
Logc(cancelCtx).WithError(errs).Error("Failed to get updated CHAP info for portal(s).")
2642+
return errs
2643+
}
2644+
2645+
return nil
2646+
}
2647+
25672648
// performISCSISelfHealing inspects the desired state of the iSCSI sessions with the current state and accordingly
25682649
// identifies candidate sessions that require remediation. This function is invoked periodically.
25692650
func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
@@ -2578,7 +2659,7 @@ func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
25782659
}()
25792660

25802661
// After this time self-healing may be stopped
2581-
stopSelfHealingAt := time.Now().Add(60 * time.Second)
2662+
stopSelfHealingAt := time.Now().Add(iSCSISelfHealingTimeout)
25822663

25832664
// If there are not iSCSI volumes expected on the host skip self-healing
25842665
if publishedISCSISessions.IsEmpty() {
@@ -2608,6 +2689,11 @@ func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
26082689
Logc(ctx).Debug("No iSCSI sessions LUN mappings found.")
26092690
}
26102691

2692+
// Update CHAP info for published sessions.
2693+
if err := p.updateCHAPInfoForSessions(ctx, &publishedISCSISessions, &currentISCSISessions); err != nil {
2694+
Logc(ctx).WithError(err).Error("Failed to update CHAP credentials for published iSCSI sessions.")
2695+
}
2696+
26112697
Logc(ctx).Debugf("Published iSCSI Sessions: %v", publishedISCSISessions)
26122698
Logc(ctx).Debugf("Current iSCSI Sessions: %v", currentISCSISessions)
26132699

frontend/csi/node_server_test.go

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,91 @@ func TestNodeStageISCSIVolume(t *testing.T) {
614614
}
615615
}
616616

617+
func TestGetChapInfoFromController(t *testing.T) {
618+
type args struct {
619+
ctx context.Context
620+
volume string
621+
node string
622+
}
623+
624+
type data struct {
625+
chapInfo *models.IscsiChapInfo
626+
}
627+
628+
type assertions struct {
629+
Error assert.ErrorAssertionFunc
630+
Empty assert.ValueAssertionFunc
631+
Equal assert.ComparisonAssertionFunc
632+
}
633+
634+
tt := map[string]struct {
635+
args args
636+
data data
637+
assert assertions
638+
mocks func(mockAPI *mockControllerAPI.MockTridentController, args args, data data)
639+
}{
640+
"Successfully gets CHAP credentials": {
641+
args: args{
642+
ctx: context.Background(),
643+
volume: "foo",
644+
node: "bar",
645+
},
646+
data: data{
647+
chapInfo: &models.IscsiChapInfo{
648+
UseCHAP: true,
649+
IscsiUsername: "user",
650+
IscsiInitiatorSecret: "pass",
651+
IscsiTargetUsername: "user2",
652+
IscsiTargetSecret: "pass2",
653+
},
654+
},
655+
assert: assertions{
656+
Error: assert.NoError,
657+
Empty: assert.NotEmpty,
658+
Equal: assert.Equal,
659+
},
660+
mocks: func(mockAPI *mockControllerAPI.MockTridentController, args args, data data) {
661+
mockAPI.EXPECT().GetChap(args.ctx, args.volume, args.node).Return(data.chapInfo, nil)
662+
},
663+
},
664+
"Fails to get CHAP credentials": {
665+
args: args{
666+
ctx: context.Background(),
667+
volume: "foo",
668+
node: "bar",
669+
},
670+
data: data{
671+
chapInfo: nil,
672+
},
673+
assert: assertions{
674+
Error: assert.Error,
675+
Empty: assert.Empty,
676+
Equal: assert.Equal,
677+
},
678+
mocks: func(mockAPI *mockControllerAPI.MockTridentController, args args, data data) {
679+
mockAPI.EXPECT().GetChap(args.ctx, args.volume, args.node).Return(data.chapInfo, errors.New("api error"))
680+
},
681+
},
682+
}
683+
684+
for name, test := range tt {
685+
t.Run(name, func(t *testing.T) {
686+
mockCtrl := gomock.NewController(t)
687+
mockAPI := mockControllerAPI.NewMockTridentController(mockCtrl)
688+
test.mocks(mockAPI, test.args, test.data)
689+
690+
plugin := &Plugin{
691+
restClient: mockAPI,
692+
}
693+
694+
info, err := plugin.getChapInfoFromController(test.args.ctx, test.args.volume, test.args.node)
695+
test.assert.Error(t, err)
696+
test.assert.Empty(t, info)
697+
test.assert.Equal(t, info, test.data.chapInfo)
698+
})
699+
}
700+
}
701+
617702
func TestUpdateChapInfoFromController_Success(t *testing.T) {
618703
testCtx := context.Background()
619704
volumeName := "foo"
@@ -673,6 +758,129 @@ func TestUpdateChapInfoFromController_Error(t *testing.T) {
673758
assert.EqualValues(t, models.IscsiChapInfo{}, testPublishInfo.IscsiAccessInfo.IscsiChapInfo)
674759
}
675760

761+
func TestUpdateChapInfoForSessions(t *testing.T) {
762+
// Populate sessions with only the state that matters. The rest of the fields are not relevant for this test.
763+
publishedSessions := &models.ISCSISessions{}
764+
currentSessions := &models.ISCSISessions{}
765+
766+
// CHAP portals
767+
chapPortals := []string{"0.0.0.0", "4.4.4.4", "5.5.5.5"}
768+
769+
// Unique CHAP portal
770+
uniqueChapPortal := "9.9.9.9"
771+
uniqueIQN := "iqn.9999-99.com.netapp:target"
772+
773+
// non-CHAP portals
774+
nonChapPortals := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
775+
776+
sessionID := "0"
777+
778+
// Shared CHAP credentials
779+
sharedCHAPInfo := models.IscsiChapInfo{
780+
UseCHAP: true,
781+
IscsiUsername: "user",
782+
IscsiInitiatorSecret: "pass",
783+
IscsiTargetUsername: "user2",
784+
IscsiTargetSecret: "pass2",
785+
}
786+
787+
// Add CHAP sessions to both maps.
788+
for _, portal := range chapPortals {
789+
err := publishedSessions.AddPortal(portal, models.PortalInfo{
790+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
791+
Credentials: sharedCHAPInfo,
792+
})
793+
assert.NoError(t, err)
794+
795+
err = currentSessions.AddPortal(portal, models.PortalInfo{
796+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
797+
SessionNumber: sessionID,
798+
})
799+
assert.NoError(t, err)
800+
}
801+
802+
// Add a CHAP session with a unique IQN.
803+
err := publishedSessions.AddPortal(uniqueChapPortal, models.PortalInfo{
804+
// Use a different IQN here to prove we cache returned values from the REST API.
805+
ISCSITargetIQN: uniqueIQN,
806+
Credentials: sharedCHAPInfo,
807+
})
808+
assert.NoError(t, err)
809+
err = currentSessions.AddPortal(uniqueChapPortal, models.PortalInfo{
810+
ISCSITargetIQN: uniqueIQN,
811+
SessionNumber: sessionID,
812+
})
813+
assert.NoError(t, err)
814+
815+
// Add non-CHAP session
816+
for _, portal := range nonChapPortals {
817+
err := publishedSessions.AddPortal(portal, models.PortalInfo{
818+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
819+
Credentials: models.IscsiChapInfo{},
820+
})
821+
assert.NoError(t, err)
822+
823+
err = currentSessions.AddPortal(portal, models.PortalInfo{
824+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
825+
Credentials: models.IscsiChapInfo{},
826+
SessionNumber: sessionID,
827+
})
828+
assert.NoError(t, err)
829+
}
830+
831+
// Populate a single of LUN and volume in each session.
832+
volume := "foo"
833+
node := "bar"
834+
for _, sessionData := range publishedSessions.Info {
835+
sessionData.LUNs.Info[0] = volume
836+
}
837+
838+
// Create a mock controller client that will return the expected CHAP info.
839+
mockCtrl := gomock.NewController(t)
840+
mockISCSI := mock_iscsi.NewMockISCSI(mockCtrl)
841+
mockClient := mockControllerAPI.NewMockTridentController(mockCtrl)
842+
843+
plugin := &Plugin{
844+
nodeName: node,
845+
iscsi: mockISCSI,
846+
restClient: mockClient,
847+
}
848+
849+
// Expect calls on the mock client for all sessions that use CHAP.
850+
freshCHAPInfo := &models.IscsiChapInfo{
851+
UseCHAP: true,
852+
IscsiUsername: "user2",
853+
IscsiInitiatorSecret: "pass2",
854+
IscsiTargetUsername: "user",
855+
IscsiTargetSecret: "pass",
856+
}
857+
858+
// Mock calls to the iSCSI client
859+
count := len(currentSessions.Info)
860+
mockISCSI.EXPECT().IsSessionStale(gomock.Any(), sessionID).Return(true).Times(count)
861+
862+
// Mock API calls
863+
count = len(chapPortals) - (len(chapPortals) - 1) + 1 // Expect one more call for the unique CHAP portal.
864+
mockClient.EXPECT().GetChap(
865+
gomock.Any(), volume, node,
866+
).Return(freshCHAPInfo, nil).Times(count)
867+
868+
err = plugin.updateCHAPInfoForSessions(ctx, publishedSessions, currentSessions)
869+
assert.NoError(t, err)
870+
871+
// Verify that the CHAP info was updated in all sessions that use CHAP.
872+
for _, portal := range chapPortals {
873+
chapInfoInSession := publishedSessions.Info[portal].PortalInfo.Credentials
874+
assert.EqualValues(t, *freshCHAPInfo, chapInfoInSession)
875+
}
876+
877+
// Verify that the non-CHAP portals were not updated.
878+
for _, portal := range nonChapPortals {
879+
chapInfoInSession := publishedSessions.Info[portal].PortalInfo.Credentials
880+
assert.EqualValues(t, models.IscsiChapInfo{}, chapInfoInSession)
881+
}
882+
}
883+
676884
type PortalAction struct {
677885
Portal string
678886
Action models.ISCSIAction

0 commit comments

Comments
 (0)