Skip to content

Commit 56b757c

Browse files
Merge pull request #106 from oci-hpc/2.10.2_ds_nvpeermem
load nvidia_peermem module if not loaded
2 parents f4a6673 + 02f50d2 commit 56b757c

File tree

7 files changed

+38
-0
lines changed

7 files changed

+38
-0
lines changed

playbooks/new_nodes.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
- include_role:
4949
name: rdma-interface
5050
when: cluster_network|bool
51+
- include_role:
52+
name: nvidia_peermem
5153

5254
- hosts: bastion,slurm_backup,login,compute
5355
become: true

playbooks/resize_add.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
- include_role:
4747
name: rdma-interface
4848
when: cluster_network|bool
49+
- include_role:
50+
name: nvidia_peermem
4951

5052
- hosts: bastion,slurm_backup,login,compute
5153
become: true
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
# defaults file for nvidia_peermem
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
---
# Load the nvidia_peermem kernel module on hosts whose shape name contains
# "GPU" and that already have the nvidia kernel module present. Each task
# below is gated on the values registered by the tasks before it.

# Ask the instance metadata endpoint for the shape and keep only a line that
# contains "GPU". grep exits non-zero when nothing matches, so failures are
# ignored; an empty stdout is what marks "not a GPU shape" further down.
# The module's `warn` argument is intentionally not used: it was removed in
# ansible-core 2.14 and passing it makes the task error out.
- name: Check if its a GPU shape
  shell:
    cmd: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU'
  register: shape
  failed_when: false
  changed_when: false

# The pipeline's exit status is that of wc, so a missing version file gives
# stdout "0" instead of a failed task.
- name: Check if nvidia drivers are installed
  shell: cat /sys/module/nvidia/version | wc -l
  register: nvidia
  changed_when: false
  when: shape.stdout != ""

# Count only the module's own row in lsmod (name at start of line), not the
# rows of other modules that list it in their "Used by" column, so the
# result is "0" when it is not loaded and "1" when it is.
- name: Check if nvidia_peermem module is loaded
  shell: lsmod | grep '^nvidia_peermem ' | wc -l
  register: result
  changed_when: false
  when: shape.stdout != "" and nvidia.stdout == '1'

# Only runs when the probe above found no nvidia_peermem row.
- name: Load nvidia_peermem module
  become: true
  shell: modprobe nvidia_peermem
  when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout == '0'
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
---
# tasks file for nvidia_peermem
# Entry point of the role: pulls in the tasks from common.yml.
# include_tasks is used instead of the bare `include` action, which is
# deprecated and no longer available in ansible-core 2.16 and later.
- name: Include nvidia_peermem common tasks
  include_tasks: common.yml
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
---
2+
# vars file for nvidia_peermem

playbooks/site.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@
5050
- include_role:
5151
name: rdma-interface
5252
when: cluster_network|bool
53+
- include_role:
54+
name: nvidia_peermem
5355

5456
- hosts: bastion
5557
become: true

0 commit comments

Comments
 (0)