Skip to content

Commit dd38c0c

Browse files
authored
Do not log out of stale portals during iSCSI self-healing
This commit changes iSCSI self-healing such that the logout remediation is never performed for stale portals. Instead, if a session is stale for an extended period of time, Trident will gather additional session and connection state then log a warning to administrators to indicate which portals are potentially unhealthy.
1 parent 31a0219 commit dd38c0c

File tree

4 files changed

+141
-9
lines changed

4 files changed

+141
-9
lines changed

utils/iscsi.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2024 NetApp, Inc. All Rights Reserved.
1+
// Copyright 2025 NetApp, Inc. All Rights Reserved.
22

33
package utils
44

@@ -726,6 +726,14 @@ func InspectAllISCSISessions(
726726

727727
if action != models.NoAction {
728728
candidateStalePortals = append(candidateStalePortals, portal)
729+
730+
// At this point we know the iSCSI session has been stale for some time but do not know why.
731+
// Retrieve additional state from sysfs and inform the admin of an issue.
732+
Logc(ctx).WithFields(LogFields{
733+
"portal": portal,
734+
"sessionState": GetSessionState(ctx, currentPortalInfo.SessionNumber),
735+
"connectionState": GetSessionConnectionsState(ctx, currentPortalInfo.SessionNumber),
736+
}).Warn("Portal requires manual intervention; storage network connection may be unstable.")
729737
}
730738
continue
731739
}
@@ -795,7 +803,10 @@ func isStalePortal(
795803
} else if timeNow.Sub(publishedPortalInfo.FirstIdentifiedStaleAt) >= iSCSISessionWaitTime {
796804
Logc(ctx).WithFields(logFields).Warningf("Portal exceeded stale wait time at %v; adding to stale portals list.",
797805
timeNow)
798-
return models.LogoutLoginScan
806+
// Things like storage platform upgrades or extended network outages may result in a FREE or FAILED state on the
807+
// session. At this point in time, there isn't a reliable mechanism to know when it would be safe to perform a
808+
// Logout remediation step, so only ever perform a LoginScan.
809+
return models.LoginScan
799810
} else {
800811
Logc(ctx).WithFields(logFields).Warningf("Portal has not exceeded stale wait time at %v.", timeNow)
801812
}
@@ -915,6 +926,14 @@ func iSCSIScanTargetLUN(ctx context.Context, lunID int, hosts []int) error {
915926
return iscsiClient.ScanTargetLUN(ctx, lunID, hosts)
916927
}
917928

929+
func GetSessionState(ctx context.Context, sessionID string) string {
930+
return iscsiClient.GetSessionState(ctx, sessionID)
931+
}
932+
933+
func GetSessionConnectionsState(ctx context.Context, sessionID string) []string {
934+
return iscsiClient.GetSessionConnectionsState(ctx, sessionID)
935+
}
936+
918937
func IsISCSISessionStale(ctx context.Context, sessionNumber string) bool {
919938
return iscsiClient.IsSessionStale(ctx, sessionNumber)
920939
}

utils/iscsi/expose.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2024 NetApp, Inc. All Rights Reserved.
1+
// Copyright 2025 NetApp, Inc. All Rights Reserved.
22

33
package iscsi
44

@@ -54,6 +54,14 @@ func (client *Client) GetLunSerial(ctx context.Context, path string) (string, er
5454
return client.getLunSerial(ctx, path)
5555
}
5656

57+
func (client *Client) GetSessionState(ctx context.Context, sessionID string) string {
58+
return client.getSessionState(ctx, sessionID)
59+
}
60+
61+
func (client *Client) GetSessionConnectionsState(ctx context.Context, sessionID string) []string {
62+
return client.getSessionConnectionsState(ctx, sessionID)
63+
}
64+
5765
// IsSessionStale is the exported entry point for the unexported isSessionStale;
// it passes ctx and sessionID through unchanged and returns the result as-is.
func (client *Client) IsSessionStale(ctx context.Context, sessionID string) bool {
	stale := client.isSessionStale(ctx, sessionID)
	return stale
}

utils/iscsi/iscsi.go

Lines changed: 108 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2024 NetApp, Inc. All Rights Reserved.
1+
// Copyright 2025 NetApp, Inc. All Rights Reserved.
22

33
package iscsi
44

@@ -15,6 +15,7 @@ import (
1515
"fmt"
1616
"os"
1717
"os/exec"
18+
"path/filepath"
1819
"regexp"
1920
"strconv"
2021
"strings"
@@ -38,8 +39,9 @@ const (
3839
DevPrefix = "/dev/"
3940
DevMapperRoot = "/dev/mapper/"
4041

41-
sessionStateLoggedIn = "LOGGED_IN"
42-
SessionInfoSource = "sessionSource"
42+
sessionStateLoggedIn = "LOGGED_IN"
43+
SessionInfoSource = "sessionSource"
44+
sessionConnectionStateUp = "up"
4345

4446
iscsiadmLoginTimeoutValue = 10
4547
iscsiadmLoginTimeout = iscsiadmLoginTimeoutValue * time.Second
@@ -1191,6 +1193,109 @@ func (client *Client) portalsToLogin(ctx context.Context, targetIQN string, port
11911193
return portalsNotLoggedIn, loggedIn, nil
11921194
}
11931195

1196+
// getSessionConnectionsState returns the state of iscsi session connections stored in:
1197+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0.
1198+
func (client *Client) getSessionConnectionsState(ctx context.Context, sessionID string) []string {
1199+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionConnectionsState")
1200+
defer Logc(ctx).Debug("<<<< iscsi.getSessionConnectionsState")
1201+
1202+
// Find the session device dirs under: '/sys/class/iscsi_session/session<ID>/device/'.
1203+
sessionName := fmt.Sprintf("session%s", sessionID)
1204+
sessionDevicePath := filepath.Join(client.chrootPathPrefix, "sys", "class", "iscsi_session", sessionName, "device")
1205+
sessionDeviceEntries, err := client.os.ReadDir(sessionDevicePath)
1206+
if err != nil {
1207+
Logc(ctx).WithField("path", sessionDevicePath).WithError(err).Error("Could not read session dirs.")
1208+
return nil
1209+
}
1210+
1211+
const notFound = "<NOT FOUND>"
1212+
var errs error
1213+
1214+
// Dynamically discover the 'state' for all underlying connections and return them.
1215+
connectionStates := make([]string, 0)
1216+
for _, entry := range sessionDeviceEntries {
1217+
// Only consider: `/sys/class/iscsi_session/session<ID>/device/connection<ID>:0`
1218+
connection := entry.Name()
1219+
if !strings.HasPrefix(connection, "connection") {
1220+
continue
1221+
}
1222+
1223+
// At this point, we know we're looking at something like:
1224+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0' but we need:
1225+
// '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0'
1226+
state := notFound
1227+
statePath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "state")
1228+
rawState, err := client.os.ReadFile(statePath)
1229+
if err != nil {
1230+
errs = errors.Join(errs, fmt.Errorf("failed to read session state at: '%s'; %w", statePath, err))
1231+
} else if len(rawState) != 0 {
1232+
state = strings.TrimSpace(string(rawState))
1233+
}
1234+
1235+
// If the connection state is "up" or not found, further inspection won't be helpful. Ignore this and move on.
1236+
if state == sessionConnectionStateUp || state == notFound {
1237+
continue
1238+
}
1239+
1240+
// Get the persistent address. This is the IP associated with a session.
1241+
address := notFound
1242+
addrPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_address")
1243+
rawAddress, err := client.os.ReadFile(addrPath)
1244+
if err != nil {
1245+
errs = errors.Join(errs, fmt.Errorf("failed to read connection IP at: '%s'; %w", addrPath, err))
1246+
} else if len(rawAddress) != 0 {
1247+
address = strings.TrimSpace(string(rawAddress))
1248+
}
1249+
1250+
// Get the persistent port. This is the port associated with a session.
1251+
port := notFound
1252+
portPath := filepath.Join(sessionDevicePath, connection, "iscsi_connection", connection, "persistent_port")
1253+
rawPort, err := client.os.ReadFile(portPath)
1254+
if err != nil {
1255+
errs = errors.Join(errs, fmt.Errorf("failed to read connection port at: '%s'; %w", portPath, err))
1256+
} else if len(rawPort) != 0 {
1257+
port = strings.TrimSpace(string(rawPort))
1258+
}
1259+
1260+
portal := fmt.Sprintf("%s:%s", address, port)
1261+
1262+
// This will allow Trident to communicate which portals have bad connections.
1263+
connectionState := fmt.Sprintf("\"portal:'%s'; connection:'%s'; state:'%s'\"", portal, connection, state)
1264+
connectionStates = append(connectionStates, connectionState)
1265+
}
1266+
1267+
if errs != nil {
1268+
Logc(ctx).WithError(errs).Error("Could not discover state of iSCSI connections.")
1269+
}
1270+
1271+
return connectionStates
1272+
}
1273+
1274+
func (client *Client) getSessionState(ctx context.Context, sessionID string) string {
1275+
Logc(ctx).WithField("sessionID", sessionID).Debug(">>>> iscsi.getSessionState")
1276+
defer Logc(ctx).Debug("<<<< iscsi.getSessionState")
1277+
1278+
// Find the session state from the session at /sys/class/iscsi_session/sessionXXX/state
1279+
filename := fmt.Sprintf(client.chrootPathPrefix+"/sys/class/iscsi_session/session%s/state", sessionID)
1280+
sessionStateBytes, err := client.os.ReadFile(filename)
1281+
if err != nil {
1282+
Logc(ctx).WithFields(LogFields{
1283+
"path": filename,
1284+
"error": err,
1285+
}).Error("Could not read session state file.")
1286+
return ""
1287+
}
1288+
1289+
sessionState := strings.TrimSpace(string(sessionStateBytes))
1290+
Logc(ctx).WithFields(LogFields{
1291+
"sessionID": sessionID,
1292+
"sessionState": sessionState,
1293+
"sysfsFile": filename,
1294+
}).Debug("Found iSCSI session state.")
1295+
1296+
return sessionState
1297+
}
1298+
11941299
// IsSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
11951300
// Looks at the state of an already established session to identify if it is
11961301
// logged in or not, if it is not logged in then it could be a stale session.

utils/iscsi_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2024 NetApp, Inc. All Rights Reserved.
1+
// Copyright 2025 NetApp, Inc. All Rights Reserved.
22

33
package utils
44

@@ -268,7 +268,7 @@ func TestIsStalePortal(t *testing.T) {
268268
SessionWaitTime: 10 * time.Second,
269269
TimeNow: time.Now().Add(20 * time.Second),
270270
Portal: ipList[0],
271-
ResultAction: models.LogoutLoginScan,
271+
ResultAction: models.LoginScan,
272272
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
273273
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()
274274
},
@@ -332,7 +332,7 @@ func TestIsStalePortal(t *testing.T) {
332332
SessionWaitTime: 10 * time.Second,
333333
TimeNow: time.Now().Add(20 * time.Second),
334334
Portal: ipList[0],
335-
ResultAction: models.LogoutLoginScan,
335+
ResultAction: models.LoginScan,
336336
SimulateConditions: func(publishedSessions, currentSessions *models.ISCSISessions, portal string) {
337337
publishedSessions.Info[portal].PortalInfo.Credentials = chapCredentials[0]
338338
publishedSessions.Info[portal].PortalInfo.FirstIdentifiedStaleAt = time.Now()

0 commit comments

Comments
 (0)