Commit 6545be6

Assert that GPUs are available and restart tasks if they are not (#1112)
* Assert that GPUs are available and output information about them
* Add the GPU assertion to pipeline/translate/translate.py
* Fix linting error
* Add the USE_CPU environment variable
1 parent: 5d33b26

File tree: 20 files changed, +132 −41 lines
pipeline/bicleaner/bicleaner.sh

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ else
   # The GPU devices have failed to be found, and bicleaner AI falls back
   # to operate on the CPU very slowly. To guard against this wasting expensive
   # GPU time, always check that it can find GPUs.
-  python3 -c "import tensorflow; exit(0) if tensorflow.config.list_physical_devices('GPU') else exit(9001)"
+  python3 -c "import tensorflow; exit(0) if tensorflow.config.list_physical_devices('GPU') else exit(75)"
   bicleaner-ai-classify --disable_hardrules --scol ${scol} --tcol ${tcol} - - $1
 }
 export -f biclean
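
The one-liner above is the entire guard: `tensorflow.config.list_physical_devices('GPU')` returns an empty list when TensorFlow cannot see a GPU, and the script now exits with 75 (EX_TEMPFAIL) instead of the arbitrary 9001 so the task's `retry-exit-status` can match it. Expanded into readable form, the check is roughly this sketch (assuming TensorFlow is installed in the task environment):

    import sys

    import tensorflow

    # list_physical_devices returns [] when no GPU is visible to TensorFlow.
    gpus = tensorflow.config.list_physical_devices("GPU")
    if not gpus:
        # EX_TEMPFAIL (75) lets Taskcluster retry the task instead of letting
        # bicleaner-ai fall back to a very slow CPU run.
        sys.exit(75)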

pipeline/common/marian.py

Lines changed: 69 additions & 0 deletions
@@ -2,6 +2,10 @@
 Common utilities related to working with Marian.
 """
 
+from logging import Logger
+import os
+import sys
+import subprocess
 from pathlib import Path
 from typing import Union
 
@@ -51,3 +55,68 @@ def marian_args_to_dict(extra_marian_args: list[str]) -> dict[str, Union[str, bo
             decoder_config[previous_key] = [prev_value, arg]
 
     return decoder_config
+
+
+def assert_gpus_available(logger: Logger) -> None:
+    """
+    Sometimes the GPUs aren't available when running tasks on GPU machines in the cloud.
+    This function reports on the GPUs available, and exits the task with an
+    EX_TEMPFAIL (75) exit code when the GPUs are not available. Taskcluster can
+    restart the tasks via the `retry-exit-status` property.
+    """
+
+    if "USE_CPU" in os.environ or "COMET_CPU" in os.environ:
+        return
+
+    query = {
+        "name": "Name",
+        "driver_version": "Driver Version",
+        "vbios_version": "GPU BIOS",
+        "memory.total": "Memory Total",
+        "memory.free": "Memory Free",
+        "compute_cap": "Compute Capability (https://developer.nvidia.com/cuda-gpus)",
+        "temperature.gpu": "GPU temperature (Celsius)",
+    }
+
+    fields = list(query.keys())
+
+    try:
+        result = subprocess.run(
+            [
+                "nvidia-smi",
+                f"--query-gpu={','.join(fields)}",
+                "--format=csv,noheader,nounits",
+            ],
+            check=False,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+    except FileNotFoundError:
+        raise Exception(
+            "nvidia-smi not found. Ensure NVIDIA drivers are installed and nvidia-smi is in PATH."
+        )
+
+    if result.returncode != 0:
+        stdout = result.stdout.strip()
+        stderr = result.stderr.strip()
+        logger.error(f"nvidia-smi failed with return code {result.returncode}")
+        for line in stdout.splitlines():
+            logger.error(f"[nvidia-smi(stdout)] {line}")
+        for line in stderr.splitlines():
+            logger.error(f"[nvidia-smi(stderr)] {line}")
+        logger.error("No GPUs were found available on this machine. Exiting with EX_TEMPFAIL (75)")
+        sys.exit(75)
+
+    output = result.stdout.strip()
+    if not output:
+        logger.info("No GPUs found by nvidia-smi, exiting EX_TEMPFAIL (75).")
+        sys.exit(75)
+
+    logger.info("CUDA-capable GPU(s) detected.")
+    for idx, line in enumerate(output.splitlines()):
+        values = [v.strip() for v in line.split(",")]
+        logger.info(f"GPU {idx}:")
+        for key, value in zip(query.values(), values):
+            logger.info(f"  {key}: {value}")
+        logger.info("")
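
The literal 75 matches EX_TEMPFAIL from BSD's sysexits.h, which Python exposes on POSIX platforms as `os.EX_TEMPFAIL`. A hypothetical variant of the exit calls above, using the named constant rather than the magic number:

    import os
    import sys

    # os.EX_TEMPFAIL == 75 on POSIX platforms; the constant does not exist on
    # Windows, which may be why the function hardcodes the literal instead.
    sys.exit(os.EX_TEMPFAIL)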

pipeline/eval/eval.py

Lines changed: 3 additions & 0 deletions
@@ -53,6 +53,7 @@
 
 from pipeline.common.downloads import decompress_file
 from pipeline.common.logging import get_logger
+from pipeline.common.marian import assert_gpus_available
 
 logger = get_logger("eval")
 try:
@@ -212,6 +213,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
 
     logger.info("Save the original target sentences to the artifacts")
 
+    assert_gpus_available(logger)
+
     decompress_file(target_file_compressed, keep_original=False, decompressed_path=target_ref_file)
 
     run_bash_oneliner(

pipeline/train/train.py

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,7 @@
 from pipeline.common.downloads import read_lines, write_lines
 from pipeline.common.logging import get_logger
 from pipeline.common.command_runner import apply_command_args, run_command_pipeline
+from pipeline.common.marian import assert_gpus_available
 
 logger = get_logger(__file__)
 train_dir = Path(__file__).parent
@@ -478,6 +479,8 @@ def main() -> None:
         help="Additional parameters for the training script",
     )
 
+    assert_gpus_available(logger)
+
     with tempfile.TemporaryDirectory() as temp_dir:
         train_cli = TrainCLI(parser.parse_args(), Path(temp_dir))
         train_cli.log_config()

pipeline/translate/translate.py

Lines changed: 4 additions & 10 deletions
@@ -7,7 +7,6 @@
 from glob import glob
 import os
 from pathlib import Path
-import sys
 import tempfile
 
 from pipeline.common.command_runner import apply_command_args, run_command
@@ -22,6 +21,7 @@
 )
 from pipeline.common.marian import get_combined_config
 from pipeline.translate.translate_ctranslate2 import translate_with_ctranslate2
+from pipeline.common.marian import assert_gpus_available
 
 logger = get_logger(__file__)
 
@@ -187,6 +187,8 @@ def main() -> None:
             pass
         return
 
+    assert_gpus_available(logger)
+
     if decoder == Decoder.ctranslate2:
         translate_with_ctranslate2(
             input_zst=input_zst,
@@ -257,12 +259,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    try:
-        main()
-    except RuntimeError as e:
-        # On GCP instances, we occasionally find that a GPU is not found even
-        # when it has been requested. Exiting with a unique error code in these
-        # cases allows us to automatically retry such tasks in Taskcluster.
-        if len(e.args) > 0 and "no CUDA-capable device is detected" in e.args[0]:
-            logger.exception("couldn't find GPU, exiting with 9002")
-            sys.exit(9002)
+    main()
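
This removes the last of the bespoke GPU exit codes: instead of reactively parsing "no CUDA-capable device is detected" out of a Marian RuntimeError and exiting with 9002, translate.py now checks for GPUs proactively via `assert_gpus_available` and shares the single EX_TEMPFAIL (75) convention with the other pipeline steps.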

taskcluster/kinds/backtranslations-mono-trg-translate/kind.yml

Lines changed: 2 additions & 2 deletions
@@ -111,9 +111,9 @@ tasks:
             CUDA_DIR: fetches/cuda-toolkit
             CUDNN_DIR: fetches/cuda-toolkit
             MARIAN: $MOZ_FETCHES_DIR
+        # 75 - EX_TEMPFAIL, used when the GPUs aren't available on the machine.
         # 128 happens when cloning this repository fails
-        # 9002 happens if no GPU is attached
-        retry-exit-status: [128, 9002]
+        retry-exit-status: [75, 128]
 
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []

taskcluster/kinds/backtranslations-train-backwards-model/kind.yml

Lines changed: 3 additions & 3 deletions
@@ -74,7 +74,9 @@ tasks:
         docker-image: {"in-tree": "train"}
         max-run-time: 2592000
         # train_taskcluster.py exits with 17 if a request to Taskcluster fails
-        retry-exit-status: [17]
+        # 75 - EX_TEMPFAIL, used when the GPUs aren't available on the machine.
+        # 128 happens when cloning this repository fails
+        retry-exit-status: [17, 75, 128]
         env:
             # Weight & Biases trigger
             WANDB_PUBLICATION: "{wandb_publication}"
@@ -91,8 +93,6 @@ tasks:
 
         # Taskcluster proxy is required to read secrets
         taskcluster-proxy: true
-        # 128 happens when cloning this repository fails
-        retry-exit-status: [128]
 
         # The task needs to be able to read that secret to publish on Weight & Biases
         scopes:

taskcluster/kinds/corpus-clean-parallel-bicleaner-ai/kind.yml

Lines changed: 4 additions & 4 deletions
@@ -79,10 +79,10 @@ tasks:
             SRC: "{src_locale}"
             TRG: "{trg_locale}"
         # 128 happens when cloning this repository fails
-        # 9001 is the code for when tensorflow fails to find GPUs on the system,
-        # and biclenaer reverts to CPU time. Rather than waste time, we should
-        # restart the task.
-        retry-exit-status: [128,9001]
+        # 75 is the unix code EX_TEMPFAIL, which indicates a temporary failure.
+        # It is used when the GPUs can't be accessed. Bicleaner falls back to the
+        # CPU in this case, which wastes GPU time, so the task should be restarted.
+        retry-exit-status: [128, 75]
 
         # Don't run unless explicitly scheduled
         run-on-tasks-for: []

taskcluster/kinds/distillation-mono-src-translate/kind.yml

Lines changed: 2 additions & 2 deletions
@@ -112,9 +112,9 @@ tasks:
             CUDA_DIR: fetches/cuda-toolkit
             CUDNN_DIR: fetches/cuda-toolkit
             MARIAN: $MOZ_FETCHES_DIR
+        # 75 - EX_TEMPFAIL, used when the GPUs aren't available on the machine.
         # 128 happens when cloning this repository fails
-        # 9002 happens if no GPU is attached
-        retry-exit-status: [128, 9002]
+        retry-exit-status: [75, 128]
 
         marian-args:
             from-parameters: training_config.marian-args.decoding-teacher

taskcluster/kinds/distillation-parallel-src-translate/kind.yml

Lines changed: 2 additions & 2 deletions
@@ -113,9 +113,9 @@ tasks:
             CUDA_DIR: fetches/cuda-toolkit
             CUDNN_DIR: fetches/cuda-toolkit
             MARIAN: $MOZ_FETCHES_DIR
+        # 75 - EX_TEMPFAIL, used when the GPUs aren't available on the machine.
         # 128 happens when cloning this repository fails
-        # 9002 happens if no GPU is attached
-        retry-exit-status: [128, 9002]
+        retry-exit-status: [75, 128]
 
         run:
             using: run-task
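
All of these kind.yml changes follow the same pattern: the task script exits with 75 when no GPU is visible, and listing 75 in `retry-exit-status` tells the Taskcluster worker to re-run the task instead of failing it. Schematically, the worker-side decision amounts to this sketch (hypothetical names, not the worker's actual implementation):

    # Hypothetical sketch of the retry decision driven by retry-exit-status.
    EX_TEMPFAIL = 75
    retry_exit_status = [17, 75, 128]  # e.g. the train-backwards task's list

    def should_retry(exit_code: int) -> bool:
        # Exit codes on the list mark transient failures worth re-running.
        return exit_code in retry_exit_status

    assert should_retry(EX_TEMPFAIL)  # no GPU attached: retry the task
    assert not should_retry(1)        # an ordinary failure still fails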
