|
56 | 56 | cvmfs_config_overrides: {}
|
57 | 57 | cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}"
|
58 | 58 |
|
| 59 | + openhpc_conf_server: control_node_ip |
| 60 | + openhpc_gres_template: /etc/ansible-init/templates/gres.conf.j2 |
| 61 | + openhpc_slurm_service_enabled: true |
| 62 | + openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" |
| 63 | + openhpc_enable: |
| 64 | + control: false |
| 65 | + batch: true |
| 66 | + database: false |
| 67 | + runtime: true |
59 | 68 |
|
60 | 69 | tasks:
|
61 | 70 | - name: Configure resolve.conf
|
|
223 | 232 |
|
224 | 233 |
|
225 | 234 | - name: Configure EESSI
|
226 |
| - gather_facts: false |
227 | 235 | block:
|
228 | 236 | - name: Download Cern GPG key
|
229 | 237 | ansible.builtin.get_url:
|
|
247 | 255 | # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok?
|
248 | 256 | disable_gpg_check: true
|
249 | 257 |
|
250 |
| - # Alternative version using official repo - still no GPG key :( |
251 |
| - # - name: Add EESSI repo |
252 |
| - # dnf: |
253 |
| - # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm |
254 |
| - |
255 |
| - # - name: Install EESSI CVMFS config |
256 |
| - # dnf: |
257 |
| - # name: cvmfs-config-eessi |
258 |
| - |
259 | 258 | - name: Add base CVMFS config
|
260 | 259 | community.general.ini_file:
|
261 | 260 | dest: /etc/cvmfs/default.local
|
|
273 | 272 |
|
274 | 273 | - name: Configure openhpc
|
275 | 274 | block:
|
276 |
| - - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist |
277 |
| - assert: |
278 |
| - that: |
279 |
| - - openhpc_slurm_control_host is defined |
280 |
| - - openhpc_cluster_name is defined |
281 |
| - - openhpc_cluster_name != '' |
282 |
| - - openhpc_slurm_partitions is defined |
283 |
| - fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." |
284 |
| - |
285 |
| - - name: Fail if control host not in play and munge key not specified |
286 |
| - fail: |
287 |
| - msg: "Either the slurm control node must be in the play or `openhpc_munge_key` must be set" |
288 |
| - when: |
289 |
| - - openhpc_slurm_control_host not in ansible_play_hosts |
290 |
| - - not openhpc_munge_key |
291 |
| - |
292 |
| - # - name: Ensure Slurm directories exists |
293 |
| - # file: |
294 |
| - # path: "{{ openhpc_state_save_location }}" |
295 |
| - # owner: slurm |
296 |
| - # group: slurm |
297 |
| - # mode: 0755 |
298 |
| - # state: directory |
299 |
| - # when: inventory_hostname == openhpc_slurm_control_host |
300 |
| - |
301 |
| - # - name: Generate a Munge key on control host |
302 |
| - # # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler |
303 |
| - # command: "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024" |
304 |
| - # args: |
305 |
| - # creates: "/etc/munge/munge.key" |
306 |
| - # when: inventory_hostname == openhpc_slurm_control_host |
307 |
| - |
308 |
| - # - name: Retrieve Munge key from control host |
309 |
| - # slurp: |
310 |
| - # src: "/etc/munge/munge.key" |
311 |
| - # register: openhpc_control_munge_key |
312 |
| - # delegate_to: "{{ openhpc_slurm_control_host }}" |
313 |
| - # when: openhpc_slurm_control_host in ansible_play_hosts |
314 |
| - |
315 | 275 | - name: Fix permissions on /etc to pass Munge startup checks
|
316 | 276 | # Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 makes /etc g=rwx rather than g=rx (where group=root)
|
317 | 277 | # which fails munged startup checks
|
|
320 | 280 | state: directory
|
321 | 281 | mode: g-w
|
322 | 282 |
|
323 |
| - - name: Write Munge key |
| 283 | + - name: Copy Munge key from NFS-mounted directory to /etc/munge |
324 | 284 | copy:
|
325 |
| - content: "{{ openhpc_munge_key or (openhpc_control_munge_key.content | b64decode) }}" |
| 285 | + src: "/mnt/openhpc_munge.key" |
326 | 286 | dest: "/etc/munge/munge.key"
|
327 | 287 | owner: munge
|
328 | 288 | group: munge
|
329 | 289 | mode: 0400
|
330 |
| - notify: |
331 |
| - - Restart Munge service |
332 |
| - |
333 |
| - - name: Ensure JobComp logfile exists |
334 |
| - file: |
335 |
| - path: "{{ openhpc_slurm_job_comp_loc }}" |
336 |
| - state: touch |
337 |
| - owner: slurm |
338 |
| - group: slurm |
339 |
| - mode: 0644 |
340 |
| - access_time: preserve |
341 |
| - modification_time: preserve |
342 |
| - when: openhpc_slurm_job_comp_type == 'jobcomp/filetxt' |
343 |
| - |
344 |
| - - name: Template slurmdbd.conf |
345 |
| - template: |
346 |
| - src: slurmdbd.conf.j2 |
347 |
| - dest: /etc/slurm/slurmdbd.conf |
348 |
| - mode: "0600" |
349 |
| - owner: slurm |
350 |
| - group: slurm |
351 |
| - notify: Restart slurmdbd service |
352 |
| - when: openhpc_enable.database | default(false) | bool |
353 |
| - |
354 |
| - - name: Make local tempfile for slurm.conf templating # ensures simultaneous runs don't clobber each other |
355 |
| - ansible.builtin.tempfile: |
356 |
| - register: _slurm_conf_tmpfile |
357 |
| - delegate_to: localhost |
358 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
359 |
| - changed_when: false # so molecule doesn't fail |
360 |
| - become: no |
361 |
| - |
362 |
| - - name: Template basic slurm.conf |
363 |
| - template: |
364 |
| - src: slurm.conf.j2 |
365 |
| - dest: "{{ _slurm_conf_tmpfile.path }}" |
366 |
| - lstrip_blocks: true |
367 |
| - mode: 0644 |
368 |
| - delegate_to: localhost |
369 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
370 |
| - changed_when: false # so molecule doesn't fail |
371 |
| - become: no |
372 |
| - |
373 |
| - - name: Customise slurm.conf |
374 |
| - community.general.ini_file: |
375 |
| - path: "{{ _slurm_conf_tmpfile.path }}" |
376 |
| - option: "{{ item.key }}" |
377 |
| - section: '' |
378 |
| - value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" |
379 |
| - no_extra_spaces: true |
380 |
| - create: no |
381 |
| - mode: 0644 |
382 |
| - loop: "{{ openhpc_config | dict2items }}" |
383 |
| - delegate_to: localhost |
384 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
385 |
| - changed_when: false # so molecule doesn't fail |
386 |
| - become: no |
387 |
| - |
388 |
| - - name: Create slurm.conf |
389 |
| - copy: |
390 |
| - src: "{{ _slurm_conf_tmpfile.path }}" |
391 |
| - dest: /etc/slurm/slurm.conf |
392 |
| - owner: root |
393 |
| - group: root |
394 |
| - mode: 0644 |
395 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
396 |
| - notify: |
397 |
| - - Restart slurmctld service |
398 |
| - register: ohpc_slurm_conf |
399 |
| - # NB uses restart rather than reload as number of nodes might have changed |
400 | 290 |
|
401 | 291 | - name: Create gres.conf
|
402 | 292 | template:
|
|
405 | 295 | mode: "0600"
|
406 | 296 | owner: slurm
|
407 | 297 | group: slurm
|
408 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
409 |
| - notify: |
410 |
| - - Restart slurmctld service |
| 298 | + when: openhpc_enable.control | default(false) |
411 | 299 | register: ohpc_gres_conf
|
412 |
| - # NB uses restart rather than reload as this is needed in some cases |
413 |
| - |
414 |
| - - name: Template cgroup.conf |
415 |
| - # appears to be required even with NO cgroup plugins: https://slurm.schedmd.com/cgroups.html#cgroup_design |
416 |
| - template: |
417 |
| - src: cgroup.conf.j2 |
418 |
| - dest: /etc/slurm/cgroup.conf |
419 |
| - mode: "0644" # perms/ownership based off src from ohpc package |
420 |
| - owner: root |
421 |
| - group: root |
422 |
| - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless |
423 |
| - |
424 |
| - - name: Remove local tempfile for slurm.conf templating |
425 |
| - ansible.builtin.file: |
426 |
| - path: "{{ _slurm_conf_tmpfile.path }}" |
427 |
| - state: absent |
428 |
| - when: _slurm_conf_tmpfile.path is defined |
429 |
| - delegate_to: localhost |
430 |
| - changed_when: false # so molecule doesn't fail |
431 |
| - become: no |
432 |
| - |
433 |
| - - name: Notify handler for slurmd restart |
434 |
| - debug: |
435 |
| - msg: "notifying handlers" # meta: noop doesn't support 'when' |
436 |
| - changed_when: true |
437 |
| - when: |
438 |
| - - openhpc_slurm_control_host in ansible_play_hosts |
439 |
| - - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler |
440 |
| - notify: |
441 |
| - - Restart slurmd service |
442 | 300 |
|
443 | 301 | - name: Set slurmctld location for configless operation
|
444 | 302 | lineinfile:
|
445 | 303 | path: /etc/sysconfig/slurmd
|
446 |
| - line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" |
| 304 | + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_conf_server }}'" |
447 | 305 | regexp: "^SLURMD_OPTIONS="
|
448 | 306 | create: yes
|
449 | 307 | owner: root
|
450 | 308 | group: root
|
451 | 309 | mode: 0644
|
452 |
| - when: |
453 |
| - - openhpc_enable.batch | default(false) |
454 |
| - - openhpc_slurm_configless |
455 |
| - notify: |
456 |
| - - Restart slurmd service |
457 |
| - # Reloading is sufficient, but using a single handler means no bounce. Realistically this won't regularly change on a running slurmd so restarting is ok. |
458 |
| - |
459 |
| - # Munge state could be unchanged but the service is not running. |
460 |
| - # Handle that here. |
| 310 | + |
461 | 311 | - name: Configure Munge service
|
462 | 312 | service:
|
463 | 313 | name: munge
|
464 | 314 | enabled: "{{ openhpc_slurm_service_enabled | bool }}"
|
465 | 315 | state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}"
|
466 | 316 |
|
467 |
| - - name: Flush handler |
468 |
| - meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced |
469 |
| - |
470 |
| - - name: Ensure slurmdbd state |
471 |
| - service: |
472 |
| - name: slurmdbd |
473 |
| - enabled: "{{ openhpc_slurm_service_enabled | bool }}" |
474 |
| - state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" |
475 |
| - when: openhpc_enable.database | default(false) | bool |
476 |
| - |
477 |
| - # - name: Ensure slurmctld state |
478 |
| - # service: |
479 |
| - # name: slurmctld |
480 |
| - # enabled: "{{ openhpc_slurm_service_enabled | bool }}" |
481 |
| - # state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" |
482 |
| - # when: openhpc_enable.control | default(false) | bool |
483 |
| - |
484 | 317 | - name: Ensure slurmd state
|
485 | 318 | service:
|
486 | 319 | name: slurmd
|
|
0 commit comments