Skip to content

Commit 48bfb20

Browse files
authored
Merge pull request #45 from ROCm/fix_diagnostic_feedback
Fix diagnostic feedback
2 parents 1c5bd4f + 260a3ac commit 48bfb20

File tree

4 files changed

+119
-94
lines changed

4 files changed

+119
-94
lines changed

bitsandbytes/cextension.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def get_native_library() -> BNBNativeLibrary:
120120
hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
121121
HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
122122
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
123-
BNB_BACKEND = "ROCM"
123+
BNB_BACKEND = "ROCm"
124124
else:
125125
HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
126126
BNB_HIP_VERSION_SHORT = ""

bitsandbytes/diagnostics/cuda.py

Lines changed: 113 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import torch
77

8-
from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT, get_cuda_bnb_library_path
8+
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
99
from bitsandbytes.consts import NONPYTORCH_DOC_URL
1010
from bitsandbytes.cuda_specs import CUDASpecs
1111
from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,16 +32,18 @@
3232
"_", # current Python interpreter
3333
}
3434

35-
CUDA_RUNTIME_LIB_PATTERNS = (
36-
"cudart64*.dll", # Windows
37-
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
38-
"nvcuda*.dll", # Windows
39-
)
35+
logger = logging.getLogger(__name__)
4036

41-
if HIP_ENVIRONMENT:
42-
CUDA_RUNTIME_LIB_PATTERNS = ("libamdhip64.so*",)
4337

44-
logger = logging.getLogger(__name__)
38+
def get_runtime_lib_patterns() -> tuple:
39+
if HIP_ENVIRONMENT:
40+
return ("libamdhip64.so*",)
41+
else:
42+
return (
43+
"cudart64*.dll", # Windows
44+
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
45+
"nvcuda*.dll", # Windows
46+
)
4547

4648

4749
def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
@@ -58,8 +60,8 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
5860
continue
5961
except OSError: # Assume an esoteric error trying to poke at the directory
6062
pass
61-
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
62-
for pth in dir.rglob(lib_pattern):
63+
for lib_pattern in get_runtime_lib_patterns():
64+
for pth in dir.glob(lib_pattern):
6365
if pth.is_file() and not pth.is_symlink():
6466
yield pth
6567
except (OSError, PermissionError):
@@ -107,59 +109,38 @@ def find_cudart_libraries() -> Iterator[Path]:
107109
yield from find_cuda_libraries_in_path_list(value)
108110

109111

110-
def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
111-
if not HIP_ENVIRONMENT:
112-
print(
113-
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
114-
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
115-
)
116-
else:
117-
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
112+
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
113+
print(
114+
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
115+
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
116+
)
118117

119118
binary_path = get_cuda_bnb_library_path(cuda_specs)
120119
if not binary_path.exists():
121-
if not HIP_ENVIRONMENT:
122-
print_dedented(
123-
f"""
124-
Library not found: {binary_path}. Maybe you need to compile it from source?
125-
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
126-
for example, `make CUDA_VERSION=113`.
127-
128-
The CUDA version for the compile might depend on your conda install, if using conda.
129-
Inspect CUDA version via `conda list | grep cuda`.
130-
""",
131-
)
132-
else:
133-
print_dedented(
134-
f"""
135-
Library not found: {binary_path}.
136-
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
137-
in PyTorch Settings matches your ROCM install. If not, reinstall PyTorch for your ROCm version
138-
and rebuild bitsandbytes.
139-
""",
140-
)
120+
print_dedented(
121+
f"""
122+
Library not found: {binary_path}. Maybe you need to compile it from source?
123+
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
124+
for example, `make CUDA_VERSION=113`.
125+
126+
The CUDA version for the compile might depend on your conda install, if using conda.
127+
Inspect CUDA version via `conda list | grep cuda`.
128+
""",
129+
)
141130

142131
cuda_major, cuda_minor = cuda_specs.cuda_version_tuple
143-
if not HIP_ENVIRONMENT:
144-
if cuda_major < 11:
145-
print_dedented(
146-
"""
147-
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
148-
You will be only to use 8-bit optimizers and quantization routines!
149-
""",
150-
)
151-
152-
print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")
153-
else:
154-
if (cuda_major, cuda_minor) < (6, 1):
155-
print_dedented(
156-
"""
157-
WARNING: bitandbytes is fully supported only from ROCm 6.1.
158-
""",
159-
)
132+
if cuda_major < 11:
133+
print_dedented(
134+
"""
135+
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
136+
You will be only to use 8-bit optimizers and quantization routines!
137+
""",
138+
)
139+
140+
print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")
160141

161142
# 7.5 is the minimum CC for cublaslt
162-
if not cuda_specs.has_cublaslt and not HIP_ENVIRONMENT:
143+
if not cuda_specs.has_cublaslt:
163144
print_dedented(
164145
"""
165146
WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU!
@@ -173,44 +154,88 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
173154
# (2) Multiple CUDA versions installed
174155

175156

176-
def print_cuda_runtime_diagnostics() -> None:
157+
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
158+
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
159+
160+
binary_path = get_cuda_bnb_library_path(cuda_specs)
161+
if not binary_path.exists():
162+
print_dedented(
163+
f"""
164+
Library not found: {binary_path}.
165+
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
166+
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
167+
and rebuild bitsandbytes.
168+
""",
169+
)
170+
171+
hip_major, hip_minor = cuda_specs.cuda_version_tuple
172+
if (hip_major, hip_minor) < (6, 1):
173+
print_dedented(
174+
"""
175+
WARNING: bitsandbytes is fully supported only from ROCm 6.1.
176+
""",
177+
)
178+
179+
180+
def print_diagnostics(cuda_specs: CUDASpecs) -> None:
181+
if HIP_ENVIRONMENT:
182+
_print_hip_diagnostics(cuda_specs)
183+
else:
184+
_print_cuda_diagnostics(cuda_specs)
185+
186+
187+
def _print_cuda_runtime_diagnostics() -> None:
177188
cudart_paths = list(find_cudart_libraries())
178189
if not cudart_paths:
179-
print(f"{BNB_BACKEND} SETUP: WARNING! {BNB_BACKEND} runtime files not found in any environmental path.")
190+
print("WARNING! CUDA runtime files not found in any environmental path.")
180191
elif len(cudart_paths) > 1:
181-
backend_version = torch.version.cuda if not HIP_ENVIRONMENT else torch.version.hip
182192
print_dedented(
183193
f"""
184-
Found duplicate {BNB_BACKEND} runtime files (see below).
194+
Found duplicate CUDA runtime files (see below).
195+
196+
We select the PyTorch default CUDA runtime, which is {torch.version.cuda},
197+
but this might mismatch with the CUDA version that is needed for bitsandbytes.
198+
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.
199+
200+
For example, if you want to use the CUDA version 122,
201+
BNB_CUDA_VERSION=122 python ...
202+
203+
OR set the environmental variable in your .bashrc:
204+
export BNB_CUDA_VERSION=122
185205
186-
We select the PyTorch default {BNB_BACKEND} runtime, which is {backend_version},
187-
but this might mismatch with the {BNB_BACKEND} version that is needed for bitsandbytes.
206+
In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
207+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
208+
""",
209+
)
210+
for pth in cudart_paths:
211+
print(f"* Found CUDA runtime at: {pth}")
212+
213+
214+
def _print_hip_runtime_diagnostics() -> None:
215+
cudart_paths = list(find_cudart_libraries())
216+
if not cudart_paths:
217+
print("WARNING! ROCm runtime files not found in any environmental path.")
218+
elif len(cudart_paths) > 1:
219+
print_dedented(
220+
f"""
221+
Found duplicate ROCm runtime files (see below).
222+
223+
We select the PyTorch default ROCm runtime, which is {torch.version.hip},
224+
but this might mismatch with the ROCm version that is needed for bitsandbytes.
225+
226+
To resolve it, install PyTorch built for the ROCm version you want to use
227+
228+
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
229+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
188230
""",
189231
)
190-
if not HIP_ENVIRONMENT:
191-
print_dedented(
192-
"""
193-
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.
194-
195-
For example, if you want to use the CUDA version 122,
196-
BNB_CUDA_VERSION=122 python ...
197-
198-
OR set the environmental variable in your .bashrc:
199-
export BNB_CUDA_VERSION=122
200-
201-
In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
202-
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
203-
""",
204-
)
205-
else:
206-
print_dedented(
207-
"""
208-
To resolve it, install PyTorch built for the ROCm version you want to use
209-
210-
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
211-
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/rocm-6.1.2,
212-
""",
213-
)
214232

215233
for pth in cudart_paths:
216-
print(f"* Found {BNB_BACKEND} runtime at: {pth}")
234+
print(f"* Found ROCm runtime at: {pth}")
235+
236+
237+
def print_runtime_diagnostics() -> None:
238+
if HIP_ENVIRONMENT:
239+
_print_hip_runtime_diagnostics()
240+
else:
241+
_print_cuda_runtime_diagnostics()

bitsandbytes/diagnostics/main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
from bitsandbytes.consts import PACKAGE_GITHUB_URL
88
from bitsandbytes.cuda_specs import get_cuda_specs
99
from bitsandbytes.diagnostics.cuda import (
10-
print_cuda_diagnostics,
11-
print_cuda_runtime_diagnostics,
10+
print_diagnostics,
11+
print_runtime_diagnostics,
1212
)
1313
from bitsandbytes.diagnostics.utils import print_dedented, print_header
1414

@@ -63,8 +63,8 @@ def main():
6363
print(f"2. {BNB_BACKEND} not installed")
6464
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
6565
if cuda_specs:
66-
print_cuda_diagnostics(cuda_specs)
67-
print_cuda_runtime_diagnostics()
66+
print_diagnostics(cuda_specs)
67+
print_runtime_diagnostics()
6868
print_header("")
6969
print_header("DEBUG INFO END")
7070
print_header("")

csrc/ops.hip

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
618618
if (returnedAlgoCount == 0)
619619
{
620620
has_error = 1;
621-
printf("Error: Matmul Algo Heurisitic didn't return algorithms\n");
621+
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
622622
}
623623
else
624624
{

0 commit comments

Comments (0)