
Commit f0499b7

Merge pull request #28 from oracle-quickstart/2.10.3
2.10.3
2 parents 763d350 + 3c2978a commit f0499b7

105 files changed: +12846 −9565 lines changed


README.md

Lines changed: 52 additions & 0 deletions

@@ -37,6 +37,8 @@ The stack allows various combinations of OS. Here is a list of what has been tested:
 | OL7 | OL7 |
 | OL7 | OL8 |
 | OL7 | CentOS7 |
+| OL8 | OL8 |
+| OL8 | OL7 |
 | Ubuntu 20.04 | Ubuntu 20.04 |
 
 When switching to Ubuntu, make sure the username is changed from opc to ubuntu in the ORM for both the bastion and compute nodes.
@@ -358,3 +360,53 @@ You can combine all the options together such as:
 validate -n y -p y -g y -e y -cn <cluster name file>
 
 
+## /opt/oci-hpc/scripts/collect_logs.py
+
+This is a script to collect the NVIDIA bug report, sosreport, and console history logs.
+
+The script needs to be run from the bastion. If a host is not reachable over SSH, only its console history logs are collected.
+
+It requires the argument:
+--hostname <HOSTNAME>
+
+The argument --compartment-id <COMPARTMENT_ID> is optional (by default, the host is assumed to be in the same compartment as the bastion).
+
+HOSTNAME is the name of the node for which you need the above logs, and COMPARTMENT_ID is the OCID of the compartment where the node resides.
+
+The script collects all of the above logs and puts them in a node-specific folder under /home/{user}. It prints the folder name as its output.
+
+Assumption: to get the console history logs, the script expects the node name to be present in the /etc/hosts file.
+
+Examples:
+
+python3 collect_logs.py --hostname compute-permanent-node-467
+The nvidia bug report, sosreport, and console history logs for compute-permanent-node-467 are at /home/ubuntu/compute-permanent-node-467_06132023191024
+
+python3 collect_logs.py --hostname inst-jxwf6-keen-drake
+The nvidia bug report, sosreport, and console history logs for inst-jxwf6-keen-drake are at /home/ubuntu/inst-jxwf6-keen-drake_11112022001138
+
+for x in $(cat /home/opc/hostlist); do echo $x; python3 collect_logs.py --hostname $x; done
+compute-permanent-node-467
+The nvidia bug report, sosreport, and console history logs for compute-permanent-node-467 are at /home/ubuntu/compute-permanent-node-467_11112022011318
+compute-permanent-node-787
+The nvidia bug report, sosreport, and console history logs for compute-permanent-node-787 are at /home/ubuntu/compute-permanent-node-787_11112022011835
+
+where hostlist has the following contents:
+compute-permanent-node-467
+compute-permanent-node-787
+
+
+## Collect RDMA NIC Metrics and Upload to Object Storage
+
+OCI-HPC is deployed in the customer tenancy, so OCI service teams cannot access metrics from these OCI-HPC stack clusters. To overcome this, this release introduces a feature to collect RDMA NIC metrics and upload them to Object Storage. The Object Storage URL can then be shared with OCI service teams, who can access the metrics and use them for debugging purposes.
+
+To collect RDMA NIC metrics and upload them to Object Storage, follow these steps:
+
+Step 1: Create a PAR (PreAuthenticated Request).
+To create a PAR, select the check-box "Create Object Storage PAR" during Resource Manager stack creation. This check-box is enabled by default; selecting it creates a PAR.
+
+Step 2: Use the shell script upload_rdma_nic_metrics.sh to collect metrics and upload them to Object Storage.
+The metrics collection limit and interval can be configured through the config file rdma_metrics_collection_config.conf.
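
A quick pre-check for the collect_logs.py assumption above: the node name must appear in the bastion's /etc/hosts before console history logs can be gathered. A minimal sketch, using a placeholder hostname taken from the examples:

# Confirm the node name is present in /etc/hosts on the bastion
grep compute-permanent-node-467 /etc/hosts || echo "node not in /etc/hosts; console history collection may fail"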
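
For Step 1 above, the stack's check-box creates the PAR for you; as a hedged alternative, a bucket-level PAR can also be created manually with the OCI CLI. The bucket name, PAR name, and expiry below are placeholders, not values taken from the stack:

# Create a write-only PAR on an existing bucket (placeholder names and date)
oci os preauth-request create \
    --bucket-name rdma-metrics-bucket \
    --name rdma-metrics-par \
    --access-type AnyObjectWrite \
    --time-expires "2025-12-31T00:00:00Z"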
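
For Step 2 above, a minimal usage sketch as run from the bastion. The script directory is an assumption (mirroring the /opt/oci-hpc/scripts path used by collect_logs.py), and the keys inside rdma_metrics_collection_config.conf are defined by the stack, so they are not reproduced here:

# Review the collection limit/interval, then collect RDMA NIC metrics and upload via the PAR
cd /opt/oci-hpc/scripts        # assumed script location
cat rdma_metrics_collection_config.conf
./upload_rdma_nic_metrics.sh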

autoscaling/tf_init/bastion_update.tf

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ resource "local_file" "hosts" {
 }
 
 resource "local_file" "inventory" {
-  depends_on = [oci_core_cluster_network.cluster_network]
+  depends_on = [oci_core_cluster_network.cluster_network, oci_core_cluster_network.cluster_network]
   content = templatefile("${local.bastion_path}/inventory.tpl", {
     bastion_name = var.bastion_name,
     bastion_ip = var.bastion_ip,

autoscaling/tf_init/cluster-network-configuration.tf

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 resource "oci_core_instance_configuration" "cluster-network-instance_configuration" {
-  count = var.cluster_network ? 1 : 0
+  count = ( ! var.compute_cluster ) && var.cluster_network ? 1 : 0
   depends_on = [oci_core_app_catalog_subscription.mp_image_subscription]
   compartment_id = var.targetCompartment
   display_name = local.cluster_name

autoscaling/tf_init/cluster-network.tf

Lines changed: 3 additions & 3 deletions

@@ -1,5 +1,5 @@
 resource "oci_core_volume" "nfs-cluster-network-volume" {
-  count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
+  count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
   availability_domain = var.ad
   compartment_id = var.targetCompartment
   display_name = "${local.cluster_name}-nfs-volume"
@@ -9,7 +9,7 @@ resource "oci_core_volume" "nfs-cluster-network-volume" {
 }
 
 resource "oci_core_volume_attachment" "cluster_network_volume_attachment" {
-  count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
+  count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
   attachment_type = "iscsi"
   volume_id = oci_core_volume.nfs-cluster-network-volume[0].id
   instance_id = local.cluster_instances_ids[0]
@@ -18,7 +18,7 @@ resource "oci_core_volume_attachment" "cluster_network_volume_attachment" {
 }
 
 resource "oci_core_cluster_network" "cluster_network" {
-  count = var.cluster_network && var.node_count > 0 ? 1 : 0
+  count = ( ! var.compute_cluster ) && var.cluster_network && var.node_count > 0 ? 1 : 0
   depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet]
   compartment_id = var.targetCompartment
   instance_pools {

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+resource "oci_core_compute_cluster" "compute_cluster" {
+  count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 1 : 0
+  #Required
+  availability_domain = var.ad
+  compartment_id = var.targetCompartment
+
+  #Optional
+  display_name = local.cluster_name
+  freeform_tags = {
+    "cluster_name" = local.cluster_name
+    "parent_cluster" = local.cluster_name
+  }
+}

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+resource "oci_core_volume" "nfs-compute-cluster-volume" {
+  count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
+  availability_domain = var.ad
+  compartment_id = var.targetCompartment
+  display_name = "${local.cluster_name}-nfs-volume"
+
+  size_in_gbs = var.cluster_block_volume_size
+  vpus_per_gb = split(".", var.cluster_block_volume_performance)[0]
+}
+
+resource "oci_core_volume_attachment" "compute_cluster_volume_attachment" {
+  count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0
+  attachment_type = "iscsi"
+  volume_id = oci_core_volume.nfs-compute-cluster-volume[0].id
+  instance_id = oci_core_instance.compute_cluster_instances[0].id
+  display_name = "${local.cluster_name}-compute-cluster-volume-attachment"
+  device = "/dev/oracleoci/oraclevdb"
+}
+
+resource "oci_core_instance" "compute_cluster_instances" {
+  count = var.compute_cluster ? var.node_count : 0
+  depends_on = [oci_core_compute_cluster.compute_cluster]
+  availability_domain = var.ad
+  compartment_id = var.targetCompartment
+  shape = var.cluster_network_shape
+
+  agent_config {
+    is_management_disabled = true
+  }
+
+  display_name = "${local.cluster_name}-node-${var.compute_cluster_start_index+count.index}"
+
+  freeform_tags = {
+    "cluster_name" = local.cluster_name
+    "parent_cluster" = local.cluster_name
+    "user" = var.tags
+  }
+
+  metadata = {
+    ssh_authorized_keys = file("/home/${var.bastion_username}/.ssh/id_rsa.pub")
+    user_data = base64encode(data.template_file.config.rendered)
+  }
+  source_details {
+    source_id = local.cluster_network_image
+    source_type = "image"
+    boot_volume_size_in_gbs = var.boot_volume_size
+  }
+  compute_cluster_id = length(var.compute_cluster_id) > 2 ? var.compute_cluster_id : oci_core_compute_cluster.compute_cluster[0].id
+  create_vnic_details {
+    subnet_id = local.subnet_id
+    assign_public_ip = false
+  }
+}

autoscaling/tf_init/data.tf

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@ data "oci_core_services" "services" {
 }
 
 data "oci_core_cluster_network_instances" "cluster_network_instances" {
-  count = var.cluster_network && var.node_count > 0 ? 1 : 0
+  count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 1 : 0
   cluster_network_id = oci_core_cluster_network.cluster_network[0].id
   compartment_id = var.targetCompartment
 }
@@ -22,7 +22,7 @@ data "oci_core_instance_pool_instances" "instance_pool_instances" {
 }
 
 data "oci_core_instance" "cluster_network_instances" {
-  count = var.cluster_network && var.node_count > 0 ? var.node_count : 0
+  count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? var.node_count : 0
   instance_id = data.oci_core_cluster_network_instances.cluster_network_instances[0].instances[count.index]["id"]
 }

autoscaling/tf_init/inventory.tpl

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bastion]
-${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion
+${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion ansible_python_interpreter=/usr/bin/python
 [slurm_backup]
 %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${bastion_username} role=bastion%{ endif }
 [login]

autoscaling/tf_init/locals.tf

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,13 @@
 locals {
   // display names of instances
-  cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id
-  cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name
+  cluster_instances_ids = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.id : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id
+  cluster_instances_names = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.display_name : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name
   image_ocid = var.unsupported ? var.image_ocid : var.image
 
   shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape
   instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus
   // ips of the instances
-  cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip
+  cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip
 
   // subnet id derived either from created subnet or existing if specified
   subnet_id = var.private_deployment ? var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 1) : var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0)

autoscaling/tf_init/outputs.tf

Lines changed: 1 addition & 1 deletion

@@ -8,5 +8,5 @@ output "ocids" {
   value = join(",", local.cluster_instances_ids)
 }
 output "cluster_ocid" {
-  value = var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id
+  value = var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id
 }
