Skip to content

Commit 763d350

Browse files
Merge pull request #26 from oracle-quickstart/2.10.2.1
2.10.2.1
2 parents 3595386 + b763575 commit 763d350

File tree

27 files changed

+205
-108
lines changed

27 files changed

+205
-108
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ validate -g y -cn <cluster name file> --> This will run the GPU throttle check.
350350

351351
validate -g <gpu check host file> --> This will run the GPU throttle check on the hosts provided in the file given. The gpu check host file should have a host name on each line.
352352

353-
validate -e y -cn <cluster name file> --> This will run the GPU throttle check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
353+
validate -e y -cn <cluster name file> --> This will run the /etc/hosts md5 sum check. The clusters considered will be from the file specified by -cn option. The number of nodes considered will be from the resize script using the clusters from the file.
354354

355355
validate -e <md5 sum check host file> --> This will run the /etc/hosts md5 sum check on the hosts provided in the file given. The md5 sum check host file should have a host name on each line.
356356

autoscaling/tf_init/bastion_update.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ resource "local_file" "inventory" {
6262
instance_pool_ocpus=local.instance_pool_ocpus,
6363
queue=var.queue,
6464
instance_type=var.instance_type,
65+
monitoring=var.monitoring,
6566
autoscaling_monitoring = var.autoscaling_monitoring,
6667
unsupported = var.unsupported,
6768
hyperthreading = var.hyperthreading,

autoscaling/tf_init/inventory.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ log_vol=${log_vol}
6060
ldap=${ldap}
6161
queue=${queue}
6262
instance_type=${instance_type}
63+
monitoring=${monitoring}
6364
hyperthreading=${hyperthreading}
6465
privilege_sudo=${privilege_sudo}
6566
privilege_group_name=${privilege_group_name}

autoscaling/tf_init/versions.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ terraform {
33
required_providers {
44
oci = {
55
source = "oracle/oci"
6-
version = "4.115.0"
6+
version = "5.1.0"
77
}
88
}
99
}

conf/variables.tpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ variable "hyperthreading" { default = ##HT## }
116116
variable "unsupported" { default = ${unsupported} }
117117
variable "image_ocid" { default = "##IMAGE##" }
118118
variable "ldap" { default = ${ldap} }
119+
variable "monitoring" { default = ${monitoring} }
119120
variable "autoscaling_monitoring" { default = ${autoscaling_monitoring} }
120121

121122

playbooks/new_nodes.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@
4646
become: true
4747
gather_facts: true
4848
tasks:
49+
- include_role:
50+
name: oci-cn-auth
51+
when: cluster_network|bool
4952
- include_role:
5053
name: rdma-interface
5154
when: cluster_network|bool
@@ -200,6 +203,9 @@
200203
- include_role:
201204
name: slurm
202205
when: slurm|default(false)|bool
206+
- include_role:
207+
name: influxdb
208+
when: monitoring|default(false)|bool
203209
- include_role:
204210
name: telegraf
205211
when: monitoring|default(false)|bool

playbooks/resize_add.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
become: true
4545
gather_facts: true
4646
tasks:
47+
- include_role:
48+
name: oci-cn-auth
49+
when: cluster_network|bool
4750
- include_role:
4851
name: rdma-interface
4952
when: cluster_network|bool
@@ -202,6 +205,9 @@
202205
- include_role:
203206
name: slurm
204207
when: slurm|default(false)|bool
208+
- include_role:
209+
name: influxdb
210+
when: monitoring|default(false)|bool
205211
- include_role:
206212
name: telegraf
207213
when: monitoring|default(false)|bool

playbooks/roles/autoscaling_mon/tasks/el.yml

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -51,35 +51,9 @@
5151
sslcacert: /etc/pki/tls/certs/ca-bundle.crt
5252

5353
- name: install grafana
54-
vars:
55-
package_name:
56-
- grafana-8.5.21-1
57-
package_state: present
5854
include_role:
59-
name: safe_yum
60-
61-
- name: start grafana
62-
become: true
63-
service:
64-
name: grafana-server
65-
state: restarted
66-
enabled: true
67-
68-
- name: Ensure grafana key directory exists
69-
file:
70-
path: "/etc/opt/oci-hpc/passwords/grafana"
71-
state: directory
72-
delegate_to: localhost
73-
74-
- name: Check api key list
75-
uri:
76-
url: "{{ grafana_api_url }}/api/auth/keys"
77-
user: "{{ grafana_security.admin_user }}"
78-
password: "{{ grafana_security.admin_password }}"
79-
force_basic_auth: true
80-
return_content: true
81-
no_log: false
82-
register: existing_api_keys
55+
name: grafana
56+
when: not monitoring|default(false)|bool
8357

8458
- name: install mysql-shell and connector
8559
vars:

playbooks/roles/autoscaling_mon/tasks/ubuntu.yml

Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -50,52 +50,10 @@
5050
# sslverify: 1
5151
# sslcacert: /etc/pki/tls/certs/ca-bundle.crt
5252

53-
- name: Add grafana key
54-
become: true
55-
apt_key:
56-
state: present
57-
url: https://packages.grafana.com/gpg.key
58-
59-
- name: Manage grafana APT repositories
60-
become: true
61-
apt_repository:
62-
repo: deb https://packages.grafana.com/oss/deb stable main
63-
state: present
64-
6553
- name: install grafana
66-
vars:
67-
package_name:
68-
- grafana-8.5.21-1
69-
package_state: present
7054
include_role:
71-
name: safe_yum
72-
73-
- name: start grafana
74-
become: true
75-
service:
76-
name: grafana-server
77-
state: restarted
78-
enabled: true
79-
80-
- name: Ensure grafana key directory exists
81-
file:
82-
path: "/etc/opt/oci-hpc/passwords/grafana"
83-
state: directory
84-
delegate_to: localhost
85-
86-
- name: Check api key list
87-
uri:
88-
url: "{{ grafana_api_url }}/api/auth/keys"
89-
method: GET
90-
user: "{{ grafana_security.admin_user }}"
91-
password: "{{ grafana_security.admin_password }}"
92-
force_basic_auth: true
93-
return_content: true
94-
no_log: false
95-
register: existing_api_keys
96-
retries: 5
97-
delay: 5
98-
until: existing_api_keys is not failed
55+
name: grafana
56+
when: not monitoring|default(false)|bool
9957

10058
# - name: Import mysql-2022 key
10159
# become: true

playbooks/roles/fix_broken/tasks/ubuntu.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@
7272
delay: 10
7373
until: result.stdout | int == 0
7474

75+
- name: Ensure lock file is removed
76+
become: true
77+
file:
78+
path: "/var/lib/apt/lists/lock"
79+
state: absent
80+
retries: 30
81+
delay: 10
82+
until: result.stdout | int == 0
7583

7684
- name: Run apt-get update
7785
become: true
@@ -80,7 +88,6 @@
8088
PID1=$!
8189
wait $PID1
8290
83-
8491
- name: Run fix-broken
8592
become: true
8693
shell: |

0 commit comments

Comments
 (0)