@@ -11,6 +11,7 @@ import (
1111 "fmt"
1212 "os"
1313 "os/exec"
14+ "path/filepath"
1415 "regexp"
1516 "sort"
1617 "strconv"
@@ -42,6 +43,7 @@ const (
4243 SessionInfoSource = "sessionSource"
4344 SessionSourceCurrentStatus = "currentStatus"
4445 SessionSourceNodeStage = "nodeStage"
46+ sessionConnectionStateUp = "up"
4547
4648 iscsiadmLoginTimeoutValue = 10
4749 iscsiadmLoginTimeout = iscsiadmLoginTimeoutValue * time .Second
@@ -1115,7 +1117,109 @@ func (client *Client) portalsToLogin(ctx context.Context, targetIQN string, port
11151117 return portalsNotLoggedIn , loggedIn , nil
11161118}
11171119
1118- // IsSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
1120+ // getSessionConnectionsState returns the state of iscsi session connections stored in:
1121+ // '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0.
1122+ func (client * Client ) getSessionConnectionsState (ctx context.Context , sessionID string ) []string {
1123+ Logc (ctx ).WithField ("sessionID" , sessionID ).Debug (">>>> iscsi.getSessionConnectionsState" )
1124+ defer Logc (ctx ).Debug ("<<<< iscsi.getSessionConnectionsState" )
1125+
1126+ // Find the session device dirs under: '/sys/class/iscsi_session/session<ID>/device/'.
1127+ sessionName := fmt .Sprintf ("session%s" , sessionID )
1128+ sessionDevicePath := filepath .Join (client .chrootPathPrefix , "sys" , "class" , "iscsi_session" , sessionName , "device" )
1129+ sessionDeviceEntries , err := client .os .ReadDir (sessionDevicePath )
1130+ if err != nil {
1131+ Logc (ctx ).WithField ("path" , sessionDevicePath ).WithError (err ).Error ("Could not read session dirs." )
1132+ return nil
1133+ }
1134+
1135+ const notFound = "<NOT FOUND>"
1136+ var errs error
1137+
1138+ // Dynamically discover the 'state' for all underlying connections and return them.
1139+ connectionStates := make ([]string , 0 )
1140+ for _ , entry := range sessionDeviceEntries {
1141+ // Only consider: `/sys/class/iscsi_session/session<ID>/device/connection<ID>:0`
1142+ connection := entry .Name ()
1143+ if ! strings .HasPrefix (connection , "connection" ) {
1144+ continue
1145+ }
1146+
1147+ // At this point, we know we're looking at something like:
1148+ // '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0' but we need:
1149+ // '/sys/class/iscsi_session/session<ID>/device/connection<ID>:0/iscsi_connection/connection<ID>:0'
1150+ state := notFound
1151+ statePath := filepath .Join (sessionDevicePath , connection , "iscsi_connection" , connection , "state" )
1152+ rawState , err := client .os .ReadFile (statePath )
1153+ if err != nil {
1154+ errs = errors .Join (errs , fmt .Errorf ("failed to read session state at: '%s'; %w" , statePath , err ))
1155+ } else if len (rawState ) != 0 {
1156+ state = strings .TrimSpace (string (rawState ))
1157+ }
1158+
1159+ // If the connection state is "up" or not found, further inspection won't be helpful. Ignore this and move on.
1160+ if state == sessionConnectionStateUp || state == notFound {
1161+ continue
1162+ }
1163+
1164+ // Get the persistent address. This is the IP associated with a session.
1165+ address := notFound
1166+ addrPath := filepath .Join (sessionDevicePath , connection , "iscsi_connection" , connection , "persistent_address" )
1167+ rawAddress , err := client .os .ReadFile (addrPath )
1168+ if err != nil {
1169+ errs = errors .Join (errs , fmt .Errorf ("failed to read connection IP at: '%s'; %w" , addrPath , err ))
1170+ } else if len (rawAddress ) != 0 {
1171+ address = strings .TrimSpace (string (rawAddress ))
1172+ }
1173+
1174+ // Get the persistent port. This is the port associated with a session.
1175+ port := notFound
1176+ portPath := filepath .Join (sessionDevicePath , connection , "iscsi_connection" , connection , "persistent_port" )
1177+ rawPort , err := client .os .ReadFile (portPath )
1178+ if err != nil {
1179+ errs = errors .Join (errs , fmt .Errorf ("failed to read connection port at: '%s'; %w" , portPath , err ))
1180+ } else if len (rawPort ) != 0 {
1181+ port = strings .TrimSpace (string (rawPort ))
1182+ }
1183+
1184+ portal := fmt .Sprintf ("%s:%s" , address , port )
1185+
1186+ // This will allow Trident to communicate which portals have bad connections.
1187+ connectionState := fmt .Sprintf ("\" portal:'%s'; connection:'%s'; state:'%s'\" " , portal , connection , state )
1188+ connectionStates = append (connectionStates , connectionState )
1189+ }
1190+
1191+ if errs != nil {
1192+ Logc (ctx ).WithError (errs ).Error ("Could not discover state of iSCSI connections." )
1193+ }
1194+
1195+ return connectionStates
1196+ }
1197+
1198+ // getSessionState returns the state stored in /sys/class/iscsi_session/session<sid>/state.
1199+ // If no state is found, an empty string is returned.
1200+ func (client * Client ) getSessionState (ctx context.Context , sessionID string ) string {
1201+ Logc (ctx ).WithField ("sessionID" , sessionID ).Debug (">>>> iscsi.getSessionState" )
1202+ defer Logc (ctx ).Debug ("<<<< iscsi.getSessionState" )
1203+
1204+ // Find the session state from the session at /sys/class/iscsi_session/sessionXXX/state
1205+ filename := fmt .Sprintf (client .chrootPathPrefix + "/sys/class/iscsi_session/session%s/state" , sessionID )
1206+ sessionStateBytes , err := client .os .ReadFile (filename )
1207+ if err != nil {
1208+ Logc (ctx ).WithField ("path" , filename ).WithError (err ).Error ("Could not read session state file." )
1209+ return ""
1210+ }
1211+
1212+ sessionState := strings .TrimSpace (string (sessionStateBytes ))
1213+ Logc (ctx ).WithFields (LogFields {
1214+ "sessionID" : sessionID ,
1215+ "sessionState" : sessionState ,
1216+ "sysfsFile" : filename ,
1217+ }).Debug ("Found iSCSI session state." )
1218+
1219+ return sessionState
1220+ }
1221+
1222+ // isSessionStale - reads /sys/class/iscsi_session/session<sid>/state and returns true if it is not "LOGGED_IN".
11191223// Looks at the state of an already established session to identify if it is
11201224// logged in or not, if it is not logged in then it could be a stale session.
11211225// For now, we are relying on the sysfs files
@@ -1127,10 +1231,7 @@ func (client *Client) isSessionStale(ctx context.Context, sessionID string) bool
11271231 filename := fmt .Sprintf (client .chrootPathPrefix + "/sys/class/iscsi_session/session%s/state" , sessionID )
11281232 sessionStateBytes , err := client .os .ReadFile (filename )
11291233 if err != nil {
1130- Logc (ctx ).WithFields (LogFields {
1131- "path" : filename ,
1132- "error" : err ,
1133- }).Error ("Could not read session state file" )
1234+ Logc (ctx ).WithField ("path" , filename ).WithError (err ).Error ("Could not read session state file." )
11341235 return false
11351236 }
11361237
@@ -2655,7 +2756,6 @@ func (client *Client) InspectAllISCSISessions(
26552756 var candidateStalePortals , candidateNonStalePortal []string
26562757
26572758 for portal , publishedSessionData := range publishedSessions .Info {
2658-
26592759 logFields := LogFields {"portal" : portal }
26602760
26612761 var publishedPortalInfo , currentPortalInfo * models.PortalInfo
@@ -2725,6 +2825,14 @@ func (client *Client) InspectAllISCSISessions(
27252825
27262826 if action != models .NoAction {
27272827 candidateStalePortals = append (candidateStalePortals , portal )
2828+
2829+ // At this point we know the iSCSI session has been stale for some time but do not know why.
2830+ // Retrieve additional state from sysfs and inform the admin of an issue.
2831+ Logc (ctx ).WithFields (LogFields {
2832+ "portal" : portal ,
2833+ "sessionState" : client .getSessionState (ctx , currentPortalInfo .SessionNumber ),
2834+ "connectionState" : client .getSessionConnectionsState (ctx , currentPortalInfo .SessionNumber ),
2835+ }).Warn ("Portal requires manual intervention; storage network connection may be unstable." )
27282836 }
27292837 continue
27302838 }
@@ -2794,7 +2902,10 @@ func isStalePortal(
27942902 } else if timeNow .Sub (publishedPortalInfo .FirstIdentifiedStaleAt ) >= iSCSISessionWaitTime {
27952903 Logc (ctx ).WithFields (logFields ).Warningf ("Portal exceeded stale wait time at %v; adding to stale portals list." ,
27962904 timeNow )
2797- return models .LogoutLoginScan
2905+ // Things like storage platform upgrades or extended network outages may result in a FREE or FAILED state on the
2906+ // session. At this point in time, there isn't a reliable mechanism to know when it would be safe to perform a
2907+ // Logout remediation step, so only ever perform a LoginScan.
2908+ return models .LoginScan
27982909 } else {
27992910 Logc (ctx ).WithFields (logFields ).Warningf ("Portal has not exceeded stale wait time at %v." , timeNow )
28002911 }
0 commit comments