Skip to content

Commit 6c9652e

Browse files
Merge pull request #44 from oracle-quickstart/2.11.0
2.11.0 Release
2 parents 382c496 + e5ee24e commit 6c9652e

File tree

158 files changed

+45035
-2601
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

158 files changed

+45035
-2601
lines changed

README.md

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Allow dynamic-group instance_principal to manage compute-management-family in co
2323
Allow dynamic-group instance_principal to manage instance-family in compartment compartmentName
2424
Allow dynamic-group instance_principal to use virtual-network-family in compartment compartmentName
2525
Allow dynamic-group instance_principal to use volumes in compartment compartmentName
26+
Allow dynamic-group instance_principal to manage dns in compartment compartmentName
2627
```
2728
or:
2829

@@ -34,12 +35,9 @@ The stack allows various combinations of OS. Here is a list of what has been test
3435

3536
| Controller | Compute |
3637
|---------------|--------------|
37-
| OL7 | OL7 |
38-
| OL7 | OL8 |
39-
| OL7 | CentOS7 |
40-
| OL8 | OL8 |
41-
| OL8 | OL7 |
42-
| Ubuntu 20.04 | Ubuntu 20.04 |
38+
| OL8 | OL8 |
39+
| OL8 | OL7 |
40+
| Ubuntu 22.04 | Ubuntu 22.04 |
4341

4442
When switching to Ubuntu, make sure the username is changed from opc to ubuntu in the ORM for both the controller and compute nodes.
4543
## How is resizing different from autoscaling ?
@@ -276,10 +274,6 @@ Example:
276274
```
277275
/opt/oci-hpc/bin/create_cluster.sh 4 compute2-1-hpc HPC_instance compute2
278276
```
279-
The name of the cluster must be
280-
queueName-clusterNumber-instanceType_keyword
281-
282-
The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be registered in Slurm
283277

284278
### Cluster Deletion:
285279
```
@@ -422,3 +416,14 @@ By default, this check box is enabled. By selecting, this check-box, a PAR would
422416
Step 2: Use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage.
423417
User needs to use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. User could configure metrics
424418
collection limit and interval through config file: rdma_metrics_collection_config.conf.
419+
420+
## Meshpinger
421+
422+
Meshpinger is a tool for validating network layer connectivity between RDMA NICs on a cluster network in OCI. The tool is capable of initiating ICMP ping from every RDMA NIC port on the cluster network to every other RDMA NIC port on the same cluster network and
423+
reporting back the success/failure status of the pings performed in the form of logs
424+
425+
Running the tool before starting workload on a cluster network should serve as a good precheck step to gain confidence on the network reachability between RDMA NICs. Typical causes for reachability failures that the tool can help pinpoint are,
426+
1. Link down on the RDMA NIC
427+
2. RDMA interface initialization or configuration issues including IP address assignment to
428+
the interface
429+
3. Insufficient ARP table size on the node to store all needed peer mac addresses

autoscaling/crontab/autoscale_slurm.sh

Lines changed: 51 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -111,12 +111,12 @@ def getDefaultsConfig(config,queue_name):
111111
for instance_type in partition["instance_types"]:
112112
if "default" in instance_type.keys():
113113
if instance_type["default"]:
114-
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
114+
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
115115
if len(partition["instance_types"])>0:
116116
instance_type=partition["instance_types"][0]
117117
print ("No default configuration was found, there may be a problem in your queues.conf file")
118118
print ("Selecting "+instance_type["name"]+" as default")
119-
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
119+
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
120120
print ("The queue "+queue_name+" was not found in the queues.conf file")
121121
return None
122122

@@ -125,7 +125,7 @@ def getJobConfig(config,queue_name,instance_type_name):
125125
if queue_name == partition["name"]:
126126
for instance_type in partition["instance_types"]:
127127
if instance_type_name == instance_type["name"]:
128-
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "instance_keyword":instance_type["instance_keyword"]}
128+
return {"queue":partition["name"], "instance_type":instance_type["name"], "shape":instance_type["shape"], "cluster_network":instance_type["cluster_network"], "hostname_convention":instance_type["hostname_convention"]}
129129
return None
130130

131131
def getQueueLimits(config,queue_name,instance_type_name):
@@ -136,11 +136,11 @@ def getQueueLimits(config,queue_name,instance_type_name):
136136
return {"max_number_nodes": int(instance_type["max_number_nodes"]), "max_cluster_size": int(instance_type["max_cluster_size"]),"max_cluster_count": int(instance_type["max_cluster_count"])}
137137
return {"max_number_nodes": 0, "max_cluster_size": 0,"max_cluster_count": 0}
138138

139-
def getInstanceType(config,queue_name,instance_keyword):
139+
def getInstanceType(config,queue_name,hostname_convention):
140140
for partition in config:
141141
if queue_name == partition["name"]:
142142
for instance_type in partition["instance_types"]:
143-
if instance_keyword == instance_type["instance_keyword"]:
143+
if hostname_convention == instance_type["hostname_convention"]:
144144
return instance_type["name"]
145145
return None
146146

@@ -161,26 +161,33 @@ def getAllClusterNames(config):
161161
return availableNames
162162

163163
def getClusterName(node):
164-
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
165-
stdout,stderr = out.communicate()
166-
clusterName = None
167-
try:
168-
if len(stdout.split('\n')) > 2:
169-
for output in stdout.split('\n')[:-1]:
170-
if "Switches=" in output:
171-
clusterName=output.split()[0].split('SwitchName=')[1]
172-
break
173-
elif "SwitchName=inactive-" in output:
174-
continue
175-
else:
176-
clusterName=output.split()[0].split('SwitchName=')[1]
177-
elif len(stdout.split('\n')) == 2:
178-
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
179-
if clusterName.startswith("inactive-"):
164+
details=getNodeDetails(node)
165+
clusterName="NOCLUSTERFOUND"
166+
for feature in details[0].split(","):
167+
if feature.startswith('CN__'):
168+
clusterName=feature[4:]
169+
if clusterName == "NOCLUSTERFOUND":
170+
out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
171+
stdout,stderr = out.communicate()
172+
clusterName = None
173+
try:
174+
if len(stdout.split('\n')) > 2:
175+
for output in stdout.split('\n')[:-1]:
176+
if "Switches=" in output:
177+
clusterName=output.split()[0].split('SwitchName=')[1]
178+
break
179+
elif "SwitchName=inactive-" in output:
180+
continue
181+
else:
182+
clusterName=output.split()[0].split('SwitchName=')[1]
183+
elif len(stdout.split('\n')) == 2:
184+
clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
185+
if clusterName.startswith("inactive-"):
186+
return "NOCLUSTERFOUND"
187+
except:
188+
print('No ClusterName could be found for '+node)
189+
print('There seems to be some issues in the slurm topology file')
180190
return "NOCLUSTERFOUND"
181-
except:
182-
print('No ClusterName could be found for '+node)
183-
return "NOCLUSTERFOUND"
184191
return clusterName
185192

186193
def getstatus_slurm():
@@ -246,7 +253,7 @@ def getstatus_slurm():
246253
clustername=getClusterName(node)
247254
if clustername is None:
248255
continue
249-
instanceType=features[-1]
256+
instanceType=features[0]
250257
if queue in current_nodes.keys():
251258
if instanceType in current_nodes[queue].keys():
252259
current_nodes[queue][instanceType]+=1
@@ -276,7 +283,9 @@ def getstatus_slurm():
276283
cluster_to_destroy=[]
277284
for clustername in nodes_to_destroy_temp.keys():
278285
destroyEntireCluster=True
279-
if clustername in running_cluster or clustername == "NOCLUSTERFOUND":
286+
if clustername == "NOCLUSTERFOUND":
287+
destroyEntireCluster=False
288+
elif clustername in running_cluster:
280289
nodes_to_destroy[clustername]=nodes_to_destroy_temp[clustername]
281290
destroyEntireCluster=False
282291
else:
@@ -295,10 +304,10 @@ def getstatus_slurm():
295304
for clusterName in os.listdir(clusters_path):
296305
if len(clusterName.split('-')) < 3:
297306
continue
298-
instance_keyword='-'.join(clusterName.split('-')[2:])
307+
hostname_convention='-'.join(clusterName.split('-')[2:])
299308
clusterNumber=int(clusterName.split('-')[1])
300309
queue=clusterName.split('-')[0]
301-
instanceType=getInstanceType(config,queue,instance_keyword)
310+
instanceType=getInstanceType(config,queue,hostname_convention)
302311
if not queue in used_index.keys():
303312
used_index[queue]={}
304313
if not instanceType in used_index[queue].keys():
@@ -311,19 +320,19 @@ def getstatus_slurm():
311320
nodes = line.split()[0]
312321
instance_type = line.split()[1]
313322
queue = line.split()[2]
314-
try:
315-
cluster_building.append([int(nodes),instance_type,queue])
316-
if queue in building_nodes.keys():
317-
if instance_type in building_nodes[queue].keys():
318-
building_nodes[queue][instance_type]+=int(nodes)
323+
try:
324+
cluster_building.append([int(nodes),instance_type,queue])
325+
if queue in building_nodes.keys():
326+
if instance_type in building_nodes[queue].keys():
327+
building_nodes[queue][instance_type]+=int(nodes)
328+
else:
329+
building_nodes[queue][instance_type]=int(nodes)
319330
else:
320-
building_nodes[queue][instance_type]=int(nodes)
321-
else:
322-
building_nodes[queue]={instance_type:int(nodes)}
323-
except ValueError:
324-
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
325-
print ('Ignoring')
326-
continue
331+
building_nodes[queue]={instance_type:int(nodes)}
332+
except ValueError:
333+
print ('The cluster '+ clusterName + ' does not have a valid entry for \"currently_building\"')
334+
print ('Ignoring')
335+
continue
327336
if os.path.isfile(os.path.join(clusters_path,clusterName,'currently_destroying')):
328337
cluster_destroying.append(clusterName)
329338
return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes
@@ -422,7 +431,7 @@ if autoscaling == "true":
422431
nextIndex=i
423432
used_index[queue][instance_type].append(i)
424433
break
425-
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"]
434+
clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["hostname_convention"]
426435
if not queue in current_nodes.keys():
427436
current_nodes[queue]={instance_type:0}
428437
else:
@@ -448,5 +457,5 @@ if autoscaling == "true":
448457
traceback.print_exc()
449458
os.remove(lockfile)
450459
else:
451-
print("Autoscaling is false")
460+
print("Autoscaling is false (set in /etc/ansible/hosts)")
452461
exit()

autoscaling/tf_init/cluster-network.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ resource "oci_core_cluster_network" "cluster_network" {
2828
}
2929
freeform_tags = {
3030
"user" = var.tags
31+
"cluster_name" = local.cluster_name
32+
"parent_cluster" = local.cluster_name
3133
}
3234
placement_configuration {
3335
availability_domain = var.ad

autoscaling/tf_init/compute-cluster.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ resource "oci_core_compute_cluster" "compute_cluster" {
77
#Optional
88
display_name = local.cluster_name
99
freeform_tags = {
10+
"user" = var.tags
1011
"cluster_name" = local.cluster_name
1112
"parent_cluster" = local.cluster_name
1213
}

autoscaling/tf_init/controller_update.tf

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
locals {
3-
controller_path = "${var.autoscaling_folder}/clusters/${var.cluster_name}"
3+
controller_path = "${var.autoscaling_folder}/clusters/${local.cluster_name}"
44
}
55

66
resource "null_resource" "create_path" {
@@ -12,7 +12,7 @@ resource "null_resource" "create_path" {
1212
resource "local_file" "hosts" {
1313
depends_on = [null_resource.create_path,oci_core_cluster_network.cluster_network]
1414
content = join("\n", local.cluster_instances_ips)
15-
filename = "${local.controller_path}/hosts_${var.cluster_name}"
15+
filename = "${local.controller_path}/hosts_${local.cluster_name}"
1616
}
1717

1818
resource "local_file" "inventory" {
@@ -24,6 +24,8 @@ resource "local_file" "inventory" {
2424
backup_ip = var.backup_ip,
2525
login_name = var.login_name,
2626
login_ip = var.login_ip,
27+
monitoring_name = var.monitoring_name,
28+
monitoring_ip = var.monitoring_ip,
2729
compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
2830
public_subnet = var.public_subnet,
2931
private_subnet = var.private_subnet,
@@ -66,7 +68,7 @@ resource "local_file" "inventory" {
6668
instance_pool_ocpus=local.instance_pool_ocpus,
6769
queue=var.queue,
6870
instance_type=var.instance_type,
69-
monitoring=var.monitoring,
71+
cluster_monitoring=var.cluster_monitoring,
7072
autoscaling_monitoring = var.autoscaling_monitoring,
7173
unsupported = var.unsupported,
7274
hyperthreading = var.hyperthreading,
@@ -78,7 +80,9 @@ resource "local_file" "inventory" {
7880
pam = var.pam,
7981
sacct_limits = var.sacct_limits,
8082
use_compute_agent=var.use_compute_agent,
81-
healthchecks=var.healthchecks
83+
healthchecks=var.healthchecks,
84+
change_hostname=var.change_hostname,
85+
hostname_convention=var.hostname_convention
8286
})
8387
filename = "${local.controller_path}/inventory"
8488
}

autoscaling/tf_init/instance-pool.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ resource "oci_core_instance_pool" "instance_pool" {
2727
display_name = local.cluster_name
2828
freeform_tags = {
2929
"user" = var.tags
30+
"cluster_name" = local.cluster_name
31+
"parent_cluster" = local.cluster_name
3032
}
3133
placement_configurations {
3234
availability_domain = var.ad

autoscaling/tf_init/inventory.tpl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ ${controller_name} ansible_host=${controller_ip} ansible_user=${controller_usern
44
%{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${controller_username} role=controller%{ endif }
55
[login]
66
%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif }
7+
[monitoring]
8+
%{ if monitoring_name != "" }${monitoring_name} ansible_host=${monitoring_ip} ansible_user=${compute_username} role=monitoring%{ endif }
79
[compute_to_add]
810
[compute_configured]
911
%{ for host, ip in compute ~}
@@ -62,7 +64,7 @@ log_vol=${log_vol}
6264
ldap=${ldap}
6365
queue=${queue}
6466
instance_type=${instance_type}
65-
monitoring=${monitoring}
67+
cluster_monitoring=${cluster_monitoring}
6668
hyperthreading=${hyperthreading}
6769
privilege_sudo=${privilege_sudo}
6870
privilege_group_name=${privilege_group_name}
@@ -74,4 +76,6 @@ sacct_limits=${sacct_limits}
7476
use_compute_agent=${use_compute_agent}
7577
zone_name=${zone_name}
7678
dns_entries=${dns_entries}
77-
healthchecks=${healthchecks}
79+
healthchecks=${healthchecks}
80+
change_hostname=${change_hostname}
81+
hostname_convention=${hostname_convention}

autoscaling/tf_init/locals.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,6 @@ locals {
3838

3939
timeout_per_batch= var.cluster_network ? var.use_multiple_ads ? 15 : 30 : var.use_multiple_ads ? 6 : 15
4040
timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"])
41-
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.H100.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"
41+
platform_type = local.shape == "BM.GPU4.8" ? "AMD_ROME_BM_GPU" : local.shape == "BM.GPU.B4.8" || local.shape == "BM.GPU.A100-v2.8" ? "AMD_MILAN_BM_GPU" : local.shape == "BM.Standard.E3.128" ? "AMD_ROME_BM" : local.shape == "BM.Standard.E4.128" || local.shape == "BM.DenseIO.E4.128" ? "AMD_MILAN_BM" : "GENERIC_BM"
4242

4343
}

autoscaling/tf_init/network.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,10 @@ resource "oci_dns_rrset" "rrset-cluster-network-SLURM" {
183183

184184
for_each = var.slurm && var.dns_entries ? toset([for v in range(var.node_count) : tostring(v)]) : []
185185
zone_name_or_id = data.oci_dns_zones.dns_zones.zones[0].id
186-
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
186+
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
187187
rtype = "A"
188188
items {
189-
domain = "${var.queue}-${var.instance_type}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
189+
domain = "${var.hostname_convention}-${local.cluster_instances_ips_index[tonumber(each.key)]}.${var.zone_name}"
190190
rtype = "A"
191191
rdata = "${local.cluster_instances_ips[tonumber(each.key)]}"
192192
ttl = 3600

autoscaling/tf_init/versions.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
terraform {
2-
required_version = ">= 1.0"
2+
required_version = ">= 1.2"
33
required_providers {
44
oci = {
55
source = "oracle/oci"
6-
version = "5.37.0"
6+
version = "6.9.0"
77
}
88
}
99
}

0 commit comments

Comments
 (0)