Skip to content

Commit 5ed9722

Browse files
authored
Do not log out of stale portals during iSCSI self-healing
This commit changes iSCSI self-healing so that the logout remediation is never performed for stale portals. Instead, if a session is stale for an extended period of time, Trident will gather additional session and connection state and then log a warning for administrators indicating which portals are potentially unhealthy.
1 parent 181fe17 commit 5ed9722

File tree

2 files changed

+120
-8
lines changed

2 files changed

+120
-8
lines changed

utils/iscsi/iscsi.go

Lines changed: 118 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"os"
1313
"os/exec"
14+
"path/filepath"
1415
"regexp"
1516
"sort"
1617
"strconv"
@@ -42,6 +43,7 @@ const (
4243
SessionInfoSource = "sessionSource"
4344
SessionSourceCurrentStatus = "currentStatus"
4445
SessionSourceNodeStage = "nodeStage"
46+
sessionConnectionStateUp = "up"
4547

4648
iscsiadmLoginTimeoutValue = 10
4749
iscsiadmLoginTimeout = iscsiadmLoginTimeoutValue * time.Second
@@ -1115,7 +1117,109 @@ func (client *Client) portalsToLogin(ctx context.Context, targetIQN string, port
11151117
return portalsNotLoggedIn, loggedIn, nil
11161118
}
11171119

1118-
// IsSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
1120+
// getSessionConnectionsState returns the state of iscsi session connections stored in:
1121+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0.
1122+
func (client *Client) getSessionConnectionsState(ctx context.Context, sessionID string) []string {
1123+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionConnectionsState")
1124+
defer Logc(ctx).Debug("<<<< iscsi.getSessionConnectionsState")
1125+
1126+
// Find the session device dirs under: '/sys/class/iscsi_session/session<ID>/device/'.
1127+
sessionName := fmt.Sprintf("session%s", sessionID)
1128+
sessionDevicePath := filepath.Join(client.chrootPathPrefix, "sys", "class", "iscsi_session", sessionName, "device")
1129+
sessionDeviceEntries, err := client.os.ReadDir(sessionDevicePath)
1130+
if err != nil {
1131+
Logc(ctx).WithField("path", sessionDevicePath).WithError(err).Error("Could not read session dirs.")
1132+
return nil
1133+
}
1134+
1135+
const notFound = "<NOT FOUND>"
1136+
var errs error
1137+
1138+
// Dynamically discover the 'state' for all underlying connections and return them.
1139+
connectionStates := make([]string, 0)
1140+
for _, entry := range sessionDeviceEntries {
1141+
// Only consider: `/sys/class/iscsi_session/session<ID>/device/connection<ID>:0`
1142+
connection := entry.Name()
1143+
if !strings.HasPrefix(connection, "connection") {
1144+
continue
1145+
}
1146+
1147+
// At this point, we know we're looking at something like:
1148+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0' but we need:
1149+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0'
1150+
state := notFound
1151+
statePath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "state")
1152+
rawState, err := client.os.ReadFile(statePath)
1153+
if err != nil {
1154+
errs = errors.Join(errs, fmt.Errorf("failed to read session state at: '%s'; %w", statePath, err))
1155+
} else if len(rawState) != 0 {
1156+
state = strings.TrimSpace(string(rawState))
1157+
}
1158+
1159+
// If the connection state is "up" or not found, further inspection won't be helpful. Ignore this and move on.
1160+
if state == sessionConnectionStateUp || state == notFound {
1161+
continue
1162+
}
1163+
1164+
// Get the persistent address. This is the IP associated with a session.
1165+
address := notFound
1166+
addrPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_address")
1167+
rawAddress, err := client.os.ReadFile(addrPath)
1168+
if err != nil {
1169+
errs = errors.Join(errs, fmt.Errorf("failed to read connection IP at: '%s'; %w", addrPath, err))
1170+
} else if len(rawAddress) != 0 {
1171+
address = strings.TrimSpace(string(rawAddress))
1172+
}
1173+
1174+
// Get the persistent port. This is the port associated with a session.
1175+
port := notFound
1176+
portPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_port")
1177+
rawPort, err := client.os.ReadFile(portPath)
1178+
if err != nil {
1179+
errs = errors.Join(errs, fmt.Errorf("failed to read connection port at: '%s'; %w", portPath, err))
1180+
} else if len(rawPort) != 0 {
1181+
port = strings.TrimSpace(string(rawPort))
1182+
}
1183+
1184+
portal := fmt.Sprintf("%s:%s", address, port)
1185+
1186+
// This will allow Trident to communicate which portals have bad connections.
1187+
connectionState := fmt.Sprintf("\"portal:'%s'; connection:'%s'; state:'%s'\"", portal, connection, state)
1188+
connectionStates = append(connectionStates, connectionState)
1189+
}
1190+
1191+
if errs != nil {
1192+
Logc(ctx).WithError(errs).Error("Could not discover state of iSCSI connections.")
1193+
}
1194+
1195+
return connectionStates
1196+
}
1197+
1198+
// getSessionState returns the state stored in /sys/class/iscsi_session/session<sid>/state.
1199+
// If no state is found, an empty string is returned.
1200+
func (client *Client) getSessionState(ctx context.Context, sessionID string) string {
1201+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionState")
1202+
defer Logc(ctx).Debug("<<<< iscsi.getSessionState")
1203+
1204+
// Find the session state from the session at /sys/class/iscsi_session/sessionXXX/state
1205+
filename := fmt.Sprintf(client.chrootPathPrefix+"/sys/class/iscsi_session/session%s/state", sessionID)
1206+
sessionStateBytes, err := client.os.ReadFile(filename)
1207+
if err != nil {
1208+
Logc(ctx).WithField("path", filename).WithError(err).Error("Could not read session state file.")
1209+
return ""
1210+
}
1211+
1212+
sessionState := strings.TrimSpace(string(sessionStateBytes))
1213+
Logc(ctx).WithFields(LogFields{
1214+
"sessionID": sessionID,
1215+
"sessionState": sessionState,
1216+
"sysfsFile": filename,
1217+
}).Debug("Found iSCSI session state.")
1218+
1219+
return sessionState
1220+
}
1221+
1222+
// isSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
11191223
// Looks at the state of an already established session to identify if it is
11201224
// logged in or not, if it is not logged in then it could be a stale session.
11211225
// For now, we are relying on the sysfs files
@@ -1127,10 +1231,7 @@ func (client *Client) isSessionStale(ctx context.Context, sessionID string) bool
11271231
filename := fmt.Sprintf(client.chrootPathPrefix+"/sys/class/iscsi_session/session%s/state", sessionID)
11281232
sessionStateBytes, err := client.os.ReadFile(filename)
11291233
if err != nil {
1130-
Logc(ctx).WithFields(LogFields{
1131-
"path": filename,
1132-
"error": err,
1133-
}).Error("Could not read session state file")
1234+
Logc(ctx).WithField("path", filename).WithError(err).Error("Could not read session state file.")
11341235
return false
11351236
}
11361237

@@ -2725,6 +2826,14 @@ func (client *Client) InspectAllISCSISessions(
27252826

27262827
if action != models.NoAction {
27272828
candidateStalePortals = append(candidateStalePortals, portal)
2829+
2830+
// At this point we know the iSCSI session has been stale for some time but do not know why.
2831+
// Retrieve additional state from sysfs and inform the admin of an issue.
2832+
Logc(ctx).WithFields(LogFields{
2833+
"portal": portal,
2834+
"sessionState": client.getSessionState(ctx, currentPortalInfo.SessionNumber),
2835+
"connectionState": client.getSessionConnectionsState(ctx, currentPortalInfo.SessionNumber),
2836+
}).Warn("Portal requires manual intervention; storage network connection may be unstable.")
27282837
}
27292838
continue
27302839
}
@@ -2794,7 +2903,10 @@ func isStalePortal(
27942903
} else if timeNow.Sub(publishedPortalInfo.FirstIdentifiedStaleAt) >= iSCSISessionWaitTime {
27952904
Logc(ctx).WithFields(logFields).Warningf("Portal exceeded stale wait time at %v; adding to stale portals list.",
27962905
timeNow)
2797-
return models.LogoutLoginScan
2906+
// Things like storage platform upgrades or extended network outages may result in a FREE or FAILED state on the
2907+
// session. At this point in time, there isn't a reliable mechanism to know when it would be safe to perform a
2908+
// Logout remediation step, so only ever perform a LoginScan.
2909+
return models.LoginScan
27982910
} else {
27992911
Logc(ctx).WithFields(logFields).Warningf("Portal has not exceeded stale wait time at %v.", timeNow)
28002912
}

utils/iscsi/iscsi_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5879,7 +5879,7 @@ func TestIsStalePortal(t *testing.T) {
58795879
SessionWaitTime: 10 * time.Second,
58805880
TimeNow: time.Now().Add(20 * time.Second),
58815881
Portal: ipList[0],
5882-
ResultAction: models.LogoutLoginScan,
5882+
ResultAction: models.LoginScan,
58835883
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
58845884
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()
58855885
},
@@ -5943,7 +5943,7 @@ func TestIsStalePortal(t *testing.T) {
59435943
SessionWaitTime: 10 * time.Second,
59445944
TimeNow: time.Now().Add(20 * time.Second),
59455945
Portal: ipList[0],
5946-
ResultAction: models.LogoutLoginScan,
5946+
ResultAction: models.LoginScan,
59475947
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
59485948
publishedSessions.Info[portal].PortalInfo.Credentials = chapCredentials[0]
59495949
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()

0 commit comments

Comments
 (0)