Skip to content

Commit 52d670b

Browse files
authored
Merge pull request #79 from nebius/SCHED-1210
SCHED-1210 use cuda_force_upgrade for upgrading CUDA version and upgrade ansible requirements.txt
2 parents 535f8fc + ad3a0e5 commit 52d670b

File tree

6 files changed

+54
-45
lines changed

6 files changed

+54
-45
lines changed

ansible/common-packages.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
---
22

33
- name: Install common packages
4-
hosts: localhost
4+
hosts:
5+
- localhost
6+
- all
57
gather_facts: true
68
gather_subset:
79
- min

ansible/python.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
---
22

33
- name: Install python
4-
hosts: localhost
4+
hosts:
5+
- localhost
6+
- all
57
become: true
68
roles:
79
- python

ansible/requirements.txt

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,36 @@
1-
ansible==12.2.0
2-
ansible-compat==25.8.2
3-
ansible-core==2.19.4
4-
ansible-lint==25.9.2
1+
ansible==13.4.0
2+
ansible-compat==25.12.1
3+
ansible-core==2.20.4
4+
ansible-lint==26.3.0
55
argcomplete==3.6.3
6-
attrs==25.4.0
7-
black==25.9.0
6+
attrs==26.1.0
7+
black==26.3.1
88
bracex==2.6
99
cffi==2.0.0
10-
charset-normalizer==3.4.4
11-
click==8.3.0
12-
cryptography==46.0.3
10+
charset-normalizer==3.4.6
11+
click==8.3.1
12+
cryptography==46.0.5
1313
distro==1.9.0
14-
filelock==3.20.1
15-
importlib_metadata==8.7.0
14+
filelock==3.25.2
15+
importlib_metadata==9.0.0
1616
Jinja2==3.1.6
17-
jsonschema==4.25.1
17+
jsonschema==4.26.0
1818
jsonschema-specifications==2025.9.1
1919
MarkupSafe==3.0.3
2020
mypy_extensions==1.1.0
21-
packaging==25.0
22-
pathspec==0.12.1
23-
platformdirs==4.5.0
24-
pycparser==2.23
25-
python-debian==1.0.1
26-
pytokens==0.2.0
21+
packaging==26.0
22+
pathspec==1.0.4
23+
platformdirs==4.9.4
24+
pycparser==3.0
25+
python-debian==1.1.0
26+
pytokens==0.4.1
2727
PyYAML==6.0.3
2828
referencing==0.37.0
2929
resolvelib==1.2.1
30-
rpds-py==0.28.0
31-
ruamel.yaml==0.18.16
32-
ruamel.yaml.clib==0.2.14
30+
rpds-py==0.30.0
31+
ruamel.yaml==0.19.1
32+
ruamel.yaml.clib==0.2.15
3333
subprocess-tee==0.4.2
3434
wcmatch==10.1
35-
yamllint==1.37.1
35+
yamllint==1.38.0
3636
zipp==3.23.0

ansible/roles/cuda/tasks/main.yml

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,24 +54,39 @@
5454
tags:
5555
- cuda
5656

57-
- name: Unhold old CUDA packages
58-
ansible.builtin.dpkg_selections:
59-
name: "{{ item }}"
60-
selection: install
61-
loop: "{{ cuda_hold_packages[cuda_old_version] }}"
62-
when: cuda_old_version is defined
57+
- name: Get held packages
58+
ansible.builtin.command: apt-mark showhold
59+
register: cuda_apt_hold
60+
changed_when: false
61+
check_mode: false
62+
when: cuda_force_upgrade | default(false) | bool
63+
tags:
64+
- cuda
65+
66+
- name: Unhold only held CUDA packages
67+
ansible.builtin.command: "apt-mark unhold {{ item }}"
68+
loop: >-
69+
{{
70+
cuda_apt_hold.stdout_lines
71+
| intersect(cuda_hold_packages | dict2items | map(attribute='value') | flatten | unique)
72+
}}
73+
register: cuda_unhold_out
74+
changed_when: "'Canceled hold on' in (cuda_unhold_out.stdout | default(''))"  # must reference the name given in `register:`
75+
failed_when: cuda_unhold_out.rc != 0
76+
when: cuda_force_upgrade | default(false) | bool
6377
tags:
6478
- cuda
6579

6680
- name: Remove old CUDA packages
6781
ansible.builtin.apt:
68-
name: "{{ cuda_packages[cuda_old_version] }}"
82+
name: "{{ cuda_hold_packages | dict2items | map(attribute='value') | flatten | unique }}"
6983
state: absent
7084
purge: true
7185
autoremove: true
7286
update_cache: false
7387
force_apt_get: true
74-
when: cuda_old_version is defined
88+
allow_change_held_packages: true
89+
when: cuda_force_upgrade | default(false) | bool
7590
tags:
7691
- cuda
7792

ansible/roles/dcgmi/tasks/main.yml

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,6 @@
5151
tags:
5252
- dcgmi
5353

54-
- name: Remove old dcgmi packages
55-
ansible.builtin.apt:
56-
name: "{{ dcgmi_cuda_packages_map[dcgmi_cuda_old_version] }}"
57-
state: absent
58-
purge: true
59-
autoremove: true
60-
update_cache: false
61-
force_apt_get: true
62-
when: dcgmi_cuda_old_version is defined
63-
tags:
64-
- dcgmi
65-
6654
- name: Ensure dcgmi packages is installed for detected CUDA {{ dcgmi_cuda_major_version }}
6755
ansible.builtin.apt:
6856
name: "{{ dcgmi_packages }}"

ansible/slurm.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
---
22

33
- name: Install slurm packages
4-
hosts: localhost
4+
hosts:
5+
- localhost
6+
- all
57
become: true
68
gather_facts: true
79
gather_subset:

0 commit comments

Comments
 (0)