@@ -1,7 +1,11 @@
 """
 Common utilities related to working with Marian.
 """

+from logging import Logger
+import os
+import sys
+import subprocess
 from pathlib import Path
 from typing import Union

@@ -51,3 +55,68 @@ def marian_args_to_dict(extra_marian_args: list[str]) -> dict[str, Union[str, bo |
             decoder_config[previous_key] = [prev_value, arg]

     return decoder_config
+
+
+def assert_gpus_available(logger: Logger) -> None:
+    """
+    Sometimes the GPUs aren't available when running tasks on GPU machines in the cloud.
+    This function reports on the available GPUs, and exits the task with an
+    EX_TEMPFAIL (75) exit code when no GPUs can be found. Taskcluster can then
+    restart the task via the `retry-exit-status` property.
+    """
+
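+    # The GPU check is skipped for tasks that are explicitly configured to run on the CPU.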
+    if "USE_CPU" in os.environ or "COMET_CPU" in os.environ:
+        return
+
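+    # Map each nvidia-smi --query-gpu field name to a human readable label for the logs.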
+    query = {
+        "name": "Name",
+        "driver_version": "Driver Version",
+        "vbios_version": "GPU BIOS",
+        "memory.total": "Memory Total",
+        "memory.free": "Memory Free",
+        "compute_cap": "Compute Capability (https://developer.nvidia.com/cuda-gpus)",
+        "temperature.gpu": "GPU temperature (Celsius)",
+    }
+
+    fields = list(query.keys())
+
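+    # Ask nvidia-smi for one CSV row per GPU, without a header row or units.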
+    try:
+        result = subprocess.run(
+            [
+                "nvidia-smi",
+                f"--query-gpu={','.join(fields)}",
+                "--format=csv,noheader,nounits",
+            ],
+            check=False,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+    except FileNotFoundError:
+        raise Exception(
+            "nvidia-smi not found. Ensure NVIDIA drivers are installed and nvidia-smi is in PATH."
+        )
+
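+    # Any nvidia-smi failure is treated as the GPUs being unavailable, and the task
+    # exits so that Taskcluster can retry it.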
+    if result.returncode != 0:
+        stdout = result.stdout.strip()
+        stderr = result.stderr.strip()
+        logger.error(f"nvidia-smi failed with return code {result.returncode}")
+        for line in stdout.splitlines():
+            logger.error(f"[nvidia-smi(stdout)] {line}")
+        for line in stderr.splitlines():
+            logger.error(f"[nvidia-smi(stderr)] {line}")
+        logger.error("No available GPUs were found on this machine. Exiting with EX_TEMPFAIL (75).")
+        sys.exit(75)
+
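+    # An empty result means nvidia-smi ran but did not report any GPUs.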
+    output = result.stdout.strip()
+    if not output:
+        logger.info("No GPUs found by nvidia-smi, exiting EX_TEMPFAIL (75).")
+        sys.exit(75)
+
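+    # Log the queried fields for every GPU that was detected, one block per GPU.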
+    logger.info("CUDA-capable GPU(s) detected.")
+    for idx, line in enumerate(output.splitlines()):
+        values = [v.strip() for v in line.split(",")]
+        logger.info(f"GPU {idx}:")
+        for key, value in zip(query.values(), values):
+            logger.info(f" {key}: {value}")
+        logger.info("")
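
As a sketch of how a task entry point might use this check (the module path and the use of the standard library logger here are assumptions, not taken from this commit):

    import logging

    from pipeline.common.marian import assert_gpus_available  # assumed module path

    logger = logging.getLogger(__name__)

    def main() -> None:
        # Exits with EX_TEMPFAIL (75) when no GPUs are visible, which lets
        # Taskcluster retry the task via its `retry-exit-status` setting.
        assert_gpus_available(logger)
        ...  # continue with the GPU work, e.g. Marian training or decoding

    if __name__ == "__main__":
        main()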