Skip to content

Commit 2476631

Browse files
authored
Do not log out of stale portals during iSCSI self-healing
This commit changes iSCSI self-healing such that the logout remediation is never performed for stale portals. Instead, if a session is stale for an extended period of time, Trident will gather additional session and connection state, then log a warning to administrators indicating which portals are potentially unhealthy.
1 parent e293295 commit 2476631

File tree

3 files changed

+121
-11
lines changed

3 files changed

+121
-11
lines changed

utils/iscsi/iscsi.go

Lines changed: 118 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"os"
1313
"os/exec"
14+
"path/filepath"
1415
"regexp"
1516
"sort"
1617
"strconv"
@@ -42,6 +43,7 @@ const (
4243
SessionInfoSource = "sessionSource"
4344
SessionSourceCurrentStatus = "currentStatus"
4445
SessionSourceNodeStage = "nodeStage"
46+
sessionConnectionStateUp = "up"
4547

4648
iscsiadmLoginTimeoutValue = 10
4749
iscsiadmLoginTimeout = iscsiadmLoginTimeoutValue * time.Second
@@ -1115,7 +1117,109 @@ func (client *Client) portalsToLogin(ctx context.Context, targetIQN string, port
11151117
return portalsNotLoggedIn, loggedIn, nil
11161118
}
11171119

1118-
// IsSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
1120+
// getSessionConnectionsState returns the state of iscsi session connections stored in:
1121+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0.
1122+
func (client *Client) getSessionConnectionsState(ctx context.Context, sessionID string) []string {
1123+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionConnectionsState")
1124+
defer Logc(ctx).Debug("<<<< iscsi.getSessionConnectionsState")
1125+
1126+
// Find the session device dirs under: '/sys/class/iscsi_session/session<ID>/device/'.
1127+
sessionName := fmt.Sprintf("session%s", sessionID)
1128+
sessionDevicePath := filepath.Join(client.chrootPathPrefix, "sys", "class", "iscsi_session", sessionName, "device")
1129+
sessionDeviceEntries, err := client.os.ReadDir(sessionDevicePath)
1130+
if err != nil {
1131+
Logc(ctx).WithField("path", sessionDevicePath).WithError(err).Error("Could not read session dirs.")
1132+
return nil
1133+
}
1134+
1135+
const notFound = "<NOT FOUND>"
1136+
var errs error
1137+
1138+
// Dynamically discover the 'state' for all underlying connections and return them.
1139+
connectionStates := make([]string, 0)
1140+
for _, entry := range sessionDeviceEntries {
1141+
// Only consider: `/sys/class/iscsi_session/session<ID>/device/connection<ID>:0`
1142+
connection := entry.Name()
1143+
if !strings.HasPrefix(connection, "connection") {
1144+
continue
1145+
}
1146+
1147+
// At this point, we know we're looking at something like:
1148+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0' but we need:
1149+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0'
1150+
state := notFound
1151+
statePath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "state")
1152+
rawState, err := client.os.ReadFile(statePath)
1153+
if err != nil {
1154+
errs = errors.Join(errs, fmt.Errorf("failed to read session state at: '%s'; %w", statePath, err))
1155+
} else if len(rawState) != 0 {
1156+
state = strings.TrimSpace(string(rawState))
1157+
}
1158+
1159+
// If the connection state is "up" or not found, further inspection won't be helpful. Ignore this and move on.
1160+
if state == sessionConnectionStateUp || state == notFound {
1161+
continue
1162+
}
1163+
1164+
// Get the persistent address. This is the IP associated with a session.
1165+
address := notFound
1166+
addrPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_address")
1167+
rawAddress, err := client.os.ReadFile(addrPath)
1168+
if err != nil {
1169+
errs = errors.Join(errs, fmt.Errorf("failed to read connection IP at: '%s'; %w", addrPath, err))
1170+
} else if len(rawAddress) != 0 {
1171+
address = strings.TrimSpace(string(rawAddress))
1172+
}
1173+
1174+
// Get the persistent port. This is the port associated with a session.
1175+
port := notFound
1176+
portPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_port")
1177+
rawPort, err := client.os.ReadFile(portPath)
1178+
if err != nil {
1179+
errs = errors.Join(errs, fmt.Errorf("failed to read connection port at: '%s'; %w", portPath, err))
1180+
} else if len(rawPort) != 0 {
1181+
port = strings.TrimSpace(string(rawPort))
1182+
}
1183+
1184+
portal := fmt.Sprintf("%s:%s", address, port)
1185+
1186+
// This will allow Trident to communicate which portals have bad connections.
1187+
connectionState := fmt.Sprintf("\"portal:'%s'; connection:'%s'; state:'%s'\"", portal, connection, state)
1188+
connectionStates = append(connectionStates, connectionState)
1189+
}
1190+
1191+
if errs != nil {
1192+
Logc(ctx).WithError(errs).Error("Could not discover state of iSCSI connections.")
1193+
}
1194+
1195+
return connectionStates
1196+
}
1197+
1198+
// getSessionState returns the state stored in /sys/class/iscsi_session/session<sid>/state.
1199+
// If no state is found, an empty string is returned.
1200+
func (client *Client) getSessionState(ctx context.Context, sessionID string) string {
1201+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionState")
1202+
defer Logc(ctx).Debug("<<<< iscsi.getSessionState")
1203+
1204+
// Find the session state from the session at /sys/class/iscsi_session/sessionXXX/state
1205+
filename := fmt.Sprintf(client.chrootPathPrefix+"/sys/class/iscsi_session/session%s/state", sessionID)
1206+
sessionStateBytes, err := client.os.ReadFile(filename)
1207+
if err != nil {
1208+
Logc(ctx).WithField("path", filename).WithError(err).Error("Could not read session state file.")
1209+
return ""
1210+
}
1211+
1212+
sessionState := strings.TrimSpace(string(sessionStateBytes))
1213+
Logc(ctx).WithFields(LogFields{
1214+
"sessionID": sessionID,
1215+
"sessionState": sessionState,
1216+
"sysfsFile": filename,
1217+
}).Debug("Found iSCSI session state.")
1218+
1219+
return sessionState
1220+
}
1221+
1222+
// isSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
11191223
// Looks at the state of an already established session to identify if it is
11201224
// logged in or not, if it is not logged in then it could be a stale session.
11211225
// For now, we are relying on the sysfs files
@@ -1127,10 +1231,7 @@ func (client *Client) isSessionStale(ctx context.Context, sessionID string) bool
11271231
filename := fmt.Sprintf(client.chrootPathPrefix+"/sys/class/iscsi_session/session%s/state", sessionID)
11281232
sessionStateBytes, err := client.os.ReadFile(filename)
11291233
if err != nil {
1130-
Logc(ctx).WithFields(LogFields{
1131-
"path": filename,
1132-
"error": err,
1133-
}).Error("Could not read session state file")
1234+
Logc(ctx).WithField("path", filename).WithError(err).Error("Could not read session state file.")
11341235
return false
11351236
}
11361237

@@ -2655,7 +2756,6 @@ func (client *Client) InspectAllISCSISessions(
26552756
var candidateStalePortals, candidateNonStalePortal []string
26562757

26572758
for portal, publishedSessionData := range publishedSessions.Info {
2658-
26592759
logFields := LogFields{"portal": portal}
26602760

26612761
var publishedPortalInfo, currentPortalInfo *models.PortalInfo
@@ -2725,6 +2825,14 @@ func (client *Client) InspectAllISCSISessions(
27252825

27262826
if action != models.NoAction {
27272827
candidateStalePortals = append(candidateStalePortals, portal)
2828+
2829+
// At this point we know the iSCSI session has been stale for some time but do not know why.
2830+
// Retrieve additional state from sysfs and inform the admin of an issue.
2831+
Logc(ctx).WithFields(LogFields{
2832+
"portal": portal,
2833+
"sessionState": client.getSessionState(ctx, currentPortalInfo.SessionNumber),
2834+
"connectionState": client.getSessionConnectionsState(ctx, currentPortalInfo.SessionNumber),
2835+
}).Warn("Portal requires manual intervention; storage network connection may be unstable.")
27282836
}
27292837
continue
27302838
}
@@ -2794,7 +2902,10 @@ func isStalePortal(
27942902
} else if timeNow.Sub(publishedPortalInfo.FirstIdentifiedStaleAt) >= iSCSISessionWaitTime {
27952903
Logc(ctx).WithFields(logFields).Warningf("Portal exceeded stale wait time at %v; adding to stale portals list.",
27962904
timeNow)
2797-
return models.LogoutLoginScan
2905+
// Things like storage platform upgrades or extended network outages may result in a FREE or FAILED state on the
2906+
// session. At this point in time, there isn't a reliable mechanism to know when it would be safe to perform a
2907+
// Logout remediation step, so only ever perform a LoginScan.
2908+
return models.LoginScan
27982909
} else {
27992910
Logc(ctx).WithFields(logFields).Warningf("Portal has not exceeded stale wait time at %v.", timeNow)
28002911
}

utils/iscsi/iscsi_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5879,7 +5879,7 @@ func TestIsStalePortal(t *testing.T) {
58795879
SessionWaitTime: 10 * time.Second,
58805880
TimeNow: time.Now().Add(20 * time.Second),
58815881
Portal: ipList[0],
5882-
ResultAction: models.LogoutLoginScan,
5882+
ResultAction: models.LoginScan,
58835883
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
58845884
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()
58855885
},
@@ -5943,7 +5943,7 @@ func TestIsStalePortal(t *testing.T) {
59435943
SessionWaitTime: 10 * time.Second,
59445944
TimeNow: time.Now().Add(20 * time.Second),
59455945
Portal: ipList[0],
5946-
ResultAction: models.LogoutLoginScan,
5946+
ResultAction: models.LoginScan,
59475947
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
59485948
publishedSessions.Info[portal].PortalInfo.Credentials = chapCredentials[0]
59495949
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()
@@ -5954,7 +5954,6 @@ func TestIsStalePortal(t *testing.T) {
59545954
for _, input := range inputs {
59555955
t.Run(input.TestName, func(t *testing.T) {
59565956
portal := input.Portal
5957-
59585957
input.SimulateConditions(input.PublishedPortals, input.CurrentPortals, portal)
59595958

59605959
publishedPortalData, _ := input.PublishedPortals.Info[portal]

utils/iscsi_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2024 NetApp, Inc. All Rights Reserved.
1+
// Copyright 2025 NetApp, Inc. All Rights Reserved.
22

33
package utils
44

0 commit comments

Comments
 (0)