Skip to content

Commit a32dc6e

Browse files
Resize unreachable nodes fix
1 parent c7f894b commit a32dc6e

File tree

2 files changed

+8
-16
lines changed

2 files changed

+8
-16
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Resizing of HPC cluster with Cluster Network consist of 2 major sub-steps:
6060

6161
## resize.sh usage
6262

63-
The resize.sh is deployed on the bastion node as part of the HPC cluster Stack deployment. Unreachable nodes have been causing issues. If nodes in the inventory are unreachable, we will not do cluster modification to the cluster unless --remove_unreachable is also specified. That will remove the node from the inventory and then do the requested action. To avoid losing track of nodes, we advise you to remove the unreachable nodes with `resize.sh remove_unreachable --nodes nodename` before doing any action on the cluster.
63+
The resize.sh script is deployed on the bastion node as part of the HPC cluster Stack deployment. Unreachable nodes have been causing issues. If nodes in the inventory are unreachable, no modification will be made to the cluster unless `--remove_unreachable` is also specified. That option terminates the unreachable nodes before running the requested action (for example, adding a node).
6464

6565
```
6666
/opt/oci-hpc/bin/resize.sh -h

bin/resize.py

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -240,11 +240,8 @@ def add_reconfigure(comp_ocid,cn_ocid,inventory,CN,specific_hosts=None):
240240
if inv_vars.startswith("compute_username"):
241241
username=inv_vars.split("compute_username=")[1].strip()
242242
break
243-
if remove_unreachable:
244-
reachable_instances,unreachable_instances = getreachable(instances,username,delay=1200)
245-
else:
246-
reachable_instances=instances
247-
unreachable_instances=[]
243+
reachable_instances=instances
244+
unreachable_instances=[]
248245
if not os.path.isfile(inventory):
249246
print("There is no inventory file, are you on the bastion? The cluster has been resized but not reconfigured")
250247
exit()
@@ -305,12 +302,7 @@ def reconfigure(comp_ocid,cn_ocid,inventory,CN, crucial=False):
305302
if inv_vars.startswith("compute_username"):
306303
username=inv_vars.split("compute_username=")[1].strip()
307304
break
308-
if remove_unreachable:
309-
reachable_instances,unreachable_instances = getreachable(instances,username)
310-
else:
311-
reachable_instances=instances
312-
unreachable_instances=[]
313-
for node in reachable_instances:
305+
for node in instances:
314306
name=node['display_name']
315307
ip=node['ip']
316308
nodeline=name+" ansible_host="+ip+" ansible_user="+username+" role=compute\n"
@@ -521,7 +513,7 @@ def updateTFState(inventory,cluster_name,size):
521513
parser.add_argument('--user_logging', help='If present. Use the default settings in ~/.oci/config to connect to the API. Default is using instance_principal',action='store_true',default=False)
522514
parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False)
523515
parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed',action='store_true',default=False)
524-
parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be removed from the config. They will however not be removed from Slurm to avoid losing track of the down nodes. If you need to remove them from Slurm after terminating the nodes in the console. Run sudo scontrol update nodename=name state=Future',action='store_true',default=False)
516+
parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False)
525517

526518
args = parser.parse_args()
527519

@@ -682,8 +674,8 @@ def updateTFState(inventory,cluster_name,size):
682674
hostnames_to_remove2 = list(hostnames)
683675
hostnames_to_remove2.extend(x for x in hostnames_to_remove if x not in hostnames_to_remove2)
684676
hostnames_to_remove=hostnames_to_remove2
685-
686-
if len(hostnames_to_remove):
677+
hostnames_to_remove_len=len(hostnames_to_remove)
678+
if hostnames_to_remove_len:
687679
if not no_reconfigure:
688680
playbook = playbooks_dir+"resize_remove_unreachable.yml"
689681
error_code = destroy_unreachable_reconfigure(inventory,hostnames_to_remove,playbook)
@@ -720,7 +712,7 @@ def updateTFState(inventory,cluster_name,size):
720712
reconfigure(comp_ocid,cn_ocid,inventory,CN)
721713

722714
if args.mode == 'add':
723-
size = current_size + args.number
715+
size = current_size - hostnames_to_remove_len + args.number
724716
update_size = oci.core.models.UpdateInstancePoolDetails(size=size)
725717
ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING'])
726718
updateTFState(inventory,cluster_name,size)

0 commit comments

Comments
 (0)