Skip to content

Commit c58f781

Browse files
committed
Make sure to wait until server is powered off before returning that fencing was successful
To avoid cases where we may try to power on before the power off is complete, let's wait until the Redfish or IPMI interface reports that the server is OFF. Jira: https://issues.redhat.com/browse/OSPRH-20863
1 parent 6f2c429 commit c58f781

File tree

1 file changed

+33
-6
lines changed

1 file changed

+33
-6
lines changed

templates/instanceha/bin/instanceha.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,18 @@ def _redfish_get_power_state(url, user, passwd, timeout):
552552
logging.error('Failed to get power state: %s' % str(e))
553553
return None
554554

555+
def _ipmi_get_power_state(ip, port, user, passwd, timeout):
556+
"""Get the power state from IPMI"""
557+
try:
558+
cmd = ['ipmitool', '-I', 'lanplus', '-H', ip, '-U', user, '-P', passwd, '-p', port, 'power', 'status']
559+
cmd_output = subprocess.run(cmd, timeout=timeout, capture_output=True, text=True, check=True)
560+
return cmd_output.stdout.strip().upper()
561+
except subprocess.TimeoutExpired:
562+
logging.error('Failed to get IPMI power state: timeout expired')
563+
except subprocess.CalledProcessError as e:
564+
logging.error('Failed to get IPMI power state: command failed with return code %d' % e.returncode)
565+
return None
566+
555567
def _bmh_fence(token, namespace, host, action):
556568

557569
url = "https://kubernetes.default.svc/apis/metal3.io/v1alpha1/namespaces/%s/baremetalhosts/%s?fieldManager=kubectl-patch" % (namespace, host)
@@ -602,6 +614,7 @@ def _host_fence(host, action):
602614
port = str(fencing_data["ipport"])
603615
user = str(fencing_data["login"])
604616
passwd = str(fencing_data["passwd"])
617+
timeout = fencing_data["timeout"] if "timeout" in fencing_data else 30
605618

606619
logging.debug('Checking %s power status' % host)
607620

@@ -610,20 +623,27 @@ def _host_fence(host, action):
610623
if action == 'off':
611624
cmd = ["ipmitool", "-I", "lanplus", "-H", "%s" % ip, "-U", "%s" % user, "-P", "%s" % passwd, "-p", "%s" % port, "power", "off"]
612625
try:
613-
cmd_output = subprocess.run(cmd, timeout=30, capture_output=True, text=True, check=True)
626+
cmd_output = subprocess.run(cmd, timeout=timeout, capture_output=True, text=True, check=True)
614627
except subprocess.TimeoutExpired:
615628
logging.error('Timeout expired while sending IPMI command for power off of %s' % host)
616629
return False
617630
except subprocess.CalledProcessError as e:
618631
logging.error('Error while sending IPMI command for power off of %s' % host)
619632
return False
620633

621-
logging.info('Successfully powered off %s' % host)
622-
return True
634+
# Wait for server to actually power off
635+
for _ in range(timeout):
636+
time.sleep(1)
637+
power_state = _ipmi_get_power_state(ip, port, user, passwd, timeout)
638+
if power_state == 'CHASSIS POWER IS OFF':
639+
logging.info('Successfully powered off %s' % host)
640+
return True
641+
logging.error('Power off of %s timed out' % host)
642+
return False
623643
else:
624644
cmd = ["ipmitool", "-I", "lanplus", "-H", "%s" % ip, "-U", "%s" % user, "-P", "%s" % passwd, "-p", "%s" % port, "power", "on"]
625645
try:
626-
cmd_output = subprocess.run(cmd, timeout=30, capture_output=True, text=True, check=True)
646+
cmd_output = subprocess.run(cmd, timeout=timeout, capture_output=True, text=True, check=True)
627647
except subprocess.TimeoutExpired:
628648
logging.error('Timeout expired while sending IPMI command for power on of %s' % host)
629649
return False
@@ -657,8 +677,15 @@ def _host_fence(host, action):
657677
r = _redfish_reset(url, user, passwd, timeout, "ForceOff")
658678

659679
if r.status_code in [200, 204]:
660-
logging.info('Power off of %s ok' % host)
661-
return True
680+
# Wait for server to actually power off
681+
for _ in range(timeout):
682+
time.sleep(1)
683+
power_state = _redfish_get_power_state(url, user, passwd, timeout)
684+
if power_state == 'OFF':
685+
logging.info('Power off of %s ok' % host)
686+
return True
687+
logging.error('Power off of %s timed out' % host)
688+
return False
662689
elif r.status_code == 409:
663690
# Check if server is already powered off
664691
power_state = _redfish_get_power_state(url, user, passwd, timeout)

0 commit comments

Comments
 (0)