Skip to content

Commit 25ceda3

Browse files
Tim Smith authored and MarkSymsCtx committed
CA-395560 Improve logging and error checking on sg_readcap
When reading information using sg_readcap, it would be useful in the event of failure to log what the return code was so that we know what failure we're dealing with. Also, per the manual pages for these commands, there are quite a few more return codes which should be considered retryable, plus a non-zero success return code which is theoretically possible. Add a function to gather up the checks and logs for these things and then wrap a call to that function in a short retry loop. Signed-off-by: Tim Smith <tim.smith@cloud.com>
1 parent d87b53a commit 25ceda3

File tree

1 file changed

+52
-5
lines changed

1 file changed

+52
-5
lines changed

drivers/scsiutil.py

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -681,15 +681,62 @@ def remove_stale_luns(hostids, lunid, expectedPath, mpath):
681681
" up properly! Error: %s" % str(e))
682682

683683

684+
def sg_return_check(rc, logmsg, stderr):
    """
    Classify the return code of a command such as sg_readcap.

    Returns True when rc means success, False when rc means a failure
    that the caller may retry, and raises util.SMException (built from
    logmsg, rc and stderr) for every other return code.

    Where a pause before retrying is wanted, the pause happens here, so
    the caller can simply loop on a False result.
    """
    # Failures that can be retried straight away, paired with the text
    # appended to logmsg when they are logged. "sense miscompare" is
    # treated as retryable until proven otherwise.
    retry_immediately = (
        (6, "unit attention"),
        (11, "command aborted"),
        (14, "sense miscompare"),
        (33, "timed out"),
    )

    # Plain success.
    if rc == 0:
        return True

    # A recovered error still counts as success, but it is unusual
    # enough that it is worth recording along with stderr.
    if rc == 21:
        util.SMlog(f"{logmsg}: recovered error: {stderr}")
        return True

    # "Device not ready": give the device a second before the retry.
    if rc == 2:
        util.SMlog(f"{logmsg}: not ready")
        time.sleep(1)
        return False

    for code, reason in retry_immediately:
        if rc == code:
            util.SMlog(f"{logmsg}: {reason}")
            return False

    # Anything else is not considered retryable.
    raise util.SMException(f"{logmsg}: RC={rc}, STDERR={stderr}")
723+
724+
684725
def sg_readcap(device):
685726
device = os.path.join('/dev', getdev(device))
686727
readcapcommand = ['/usr/bin/sg_readcap', '-b', device]
687-
(rc, stdout, stderr) = util.doexec(readcapcommand)
688-
if rc == 6:
689-
# retry one time for "Capacity data has changed"
728+
attempts = 3
729+
succeeded = False
730+
while attempts > 0:
731+
attempts -= 1
690732
(rc, stdout, stderr) = util.doexec(readcapcommand)
691-
if rc != 0:
692-
raise util.SMException("scsiutil.sg_readcap(%s) failed" % (device))
733+
if sg_return_check(rc, f"scsiutil.sg_readcap({device})", stderr):
734+
succeeded = True
735+
break
736+
737+
if not succeeded:
738+
raise util.SMException(f"scsiutil.sg_readcap({device}): too many failures")
739+
693740
match = re.search('(^|.*\n)(0x[0-9a-fA-F]+) (0x[0-9a-fA-F]+)\n$', stdout)
694741
if not match:
695742
raise util.SMException("scsiutil.sg_readcap(%s) failed to parse: %s"

0 commit comments

Comments
 (0)