Skip to content

Commit 45b7d14

Browse files
pnunna93, akx, and Titus-von-Koeller
authored
AMD: Clarify diagnostic messages; free up disk space for CI build
* Add build job for rocm * Add rocm build script * Copy shared obj file into output_dir * upload build artifacts and enable wheels build * Remove cuda build temporarily * Add ROCm version to .so filename * Add rocm_version to whls build * Revert "Remove cuda build temporarily" This reverts commit 1413c5f. * Add rocm_version env var * Remove thrust header files * Print node info * print cuda node info * Revert "print cuda node info" This reverts commit cdb209a. * Revert "Print node info" This reverts commit 7e9a65c. * Add rocm arch to compile command * Rename .so files to rocm * Update default gpu arch * Skip cpu based igemmlt int tests on ROCm * Update Documentation * Update upstream repo name * Update docs * Update string format Co-authored-by: Aarni Koskela <[email protected]> * Remove pre-release option for torch install * Update pytorch install path Co-authored-by: Titus <[email protected]> * Add messages for Heuristics error * Remove toolcache for disk space * print disk usage * Clean disk space for linux * Fix for ubuntu * Add sudo for apt clean * Update clean up disk list * remove disk usage print * Add BNB_BACKEND variable * Update diagnostic functions for ROCm * Fix tuple error * Fix library detection bug for recursive and symlink cases * fix pre-commit errors * Remove recursive path lib search * Create function for runtime lib patterns * Update logger format Co-authored-by: Aarni Koskela <[email protected]> * Update error reporting Co-authored-by: Aarni Koskela <[email protected]> * Remove commented code Co-authored-by: Aarni Koskela <[email protected]> * Update error reporting Co-authored-by: Aarni Koskela <[email protected]> * Update error reporting * Create hip diagnostics functions * Fix Typo * Fix pre-commit checks --------- Co-authored-by: Aarni Koskela <[email protected]> Co-authored-by: Titus <[email protected]>
1 parent 2784653 commit 45b7d14

File tree

5 files changed

+137
-41
lines changed

5 files changed

+137
-41
lines changed

.github/workflows/python-package.yml

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,23 @@ jobs:
116116
uses: docker/setup-qemu-action@v2
117117
- name: Clean up disk space
118118
run: |
119-
sudo rm -rf /usr/share/dotnet
120-
sudo rm -rf /opt/ghc
121-
sudo rm -rf "/usr/local/share/boost"
122-
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
119+
sudo rm -rf \
120+
/usr/share/dotnet \
121+
/opt/ghc \
122+
"/usr/local/share/boost" \
123+
"$AGENT_TOOLSDIRECTORY" \
124+
/opt/hostedtoolcache \
125+
/opt/google/chrome \
126+
/opt/microsoft/msedge \
127+
/opt/microsoft/powershell \
128+
/opt/pipx \
129+
/usr/lib/mono \
130+
/usr/local/julia* \
131+
/usr/local/lib/android \
132+
/usr/local/lib/node_modules \
133+
/usr/local/share/chromium \
134+
/usr/local/share/powershell \
135+
/usr/share/swift
123136
- name: Build C++
124137
run: bash .github/scripts/build-rocm.sh
125138
env:

bitsandbytes/cextension.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_native_library() -> BNBNativeLibrary:
9999
if cuda_binary_path.exists():
100100
binary_path = cuda_binary_path
101101
else:
102-
logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path)
102+
logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
103103
logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
104104
dll = ct.cdll.LoadLibrary(str(binary_path))
105105

@@ -116,21 +116,24 @@ def get_native_library() -> BNBNativeLibrary:
116116
hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
117117
HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
118118
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
119+
BNB_BACKEND = "ROCm"
119120
else:
120121
HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
121122
BNB_HIP_VERSION_SHORT = ""
123+
BNB_BACKEND = "CUDA"
124+
122125
lib = get_native_library()
123126
except Exception as e:
124127
lib = None
125128
logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
126129
if torch.cuda.is_available():
127130
logger.warning(
128-
"""
129-
CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
131+
f"""
132+
{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:
130133
131134
python -m bitsandbytes
132135
133-
Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
136+
Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
134137
to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
135138
and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues
136139
""",

bitsandbytes/diagnostics/cuda.py

Lines changed: 77 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import torch
77

8-
from bitsandbytes.cextension import get_cuda_bnb_library_path
8+
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
99
from bitsandbytes.consts import NONPYTORCH_DOC_URL
1010
from bitsandbytes.cuda_specs import CUDASpecs
1111
from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,15 +32,20 @@
3232
"_", # current Python interpreter
3333
}
3434

35-
CUDA_RUNTIME_LIB_PATTERNS = (
36-
"cudart64*.dll", # Windows
37-
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
38-
"nvcuda*.dll", # Windows
39-
)
40-
4135
logger = logging.getLogger(__name__)
4236

4337

38+
def get_runtime_lib_patterns() -> tuple:
39+
if HIP_ENVIRONMENT:
40+
return ("libamdhip64.so*",)
41+
else:
42+
return (
43+
"cudart64*.dll", # Windows
44+
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
45+
"nvcuda*.dll", # Windows
46+
)
47+
48+
4449
def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
4550
for dir_string in paths_list_candidate.split(os.pathsep):
4651
if not dir_string:
@@ -55,9 +60,9 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
5560
continue
5661
except OSError: # Assume an esoteric error trying to poke at the directory
5762
pass
58-
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
63+
for lib_pattern in get_runtime_lib_patterns():
5964
for pth in dir.glob(lib_pattern):
60-
if pth.is_file():
65+
if pth.is_file() and not pth.is_symlink():
6166
yield pth
6267
except (OSError, PermissionError):
6368
pass
@@ -104,7 +109,7 @@ def find_cudart_libraries() -> Iterator[Path]:
104109
yield from find_cuda_libraries_in_path_list(value)
105110

106111

107-
def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
112+
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
108113
print(
109114
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
110115
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -149,10 +154,40 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
149154
# (2) Multiple CUDA versions installed
150155

151156

152-
def print_cuda_runtime_diagnostics() -> None:
157+
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
158+
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
159+
160+
binary_path = get_cuda_bnb_library_path(cuda_specs)
161+
if not binary_path.exists():
162+
print_dedented(
163+
f"""
164+
Library not found: {binary_path}.
165+
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
166+
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
167+
and rebuild bitsandbytes.
168+
""",
169+
)
170+
171+
hip_major, hip_minor = cuda_specs.cuda_version_tuple
172+
if (hip_major, hip_minor) < (6, 1):
173+
print_dedented(
174+
"""
175+
WARNING: bitsandbytes is fully supported only from ROCm 6.1.
176+
""",
177+
)
178+
179+
180+
def print_diagnostics(cuda_specs: CUDASpecs) -> None:
181+
if HIP_ENVIRONMENT:
182+
_print_hip_diagnostics(cuda_specs)
183+
else:
184+
_print_cuda_diagnostics(cuda_specs)
185+
186+
187+
def _print_cuda_runtime_diagnostics() -> None:
153188
cudart_paths = list(find_cudart_libraries())
154189
if not cudart_paths:
155-
print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
190+
print("WARNING! CUDA runtime files not found in any environmental path.")
156191
elif len(cudart_paths) > 1:
157192
print_dedented(
158193
f"""
@@ -174,3 +209,33 @@ def print_cuda_runtime_diagnostics() -> None:
174209
)
175210
for pth in cudart_paths:
176211
print(f"* Found CUDA runtime at: {pth}")
212+
213+
214+
def _print_hip_runtime_diagnostics() -> None:
215+
cudart_paths = list(find_cudart_libraries())
216+
if not cudart_paths:
217+
print("WARNING! ROCm runtime files not found in any environmental path.")
218+
elif len(cudart_paths) > 1:
219+
print_dedented(
220+
f"""
221+
Found duplicate ROCm runtime files (see below).
222+
223+
We select the PyTorch default ROCm runtime, which is {torch.version.hip},
224+
but this might mismatch with the ROCm version that is needed for bitsandbytes.
225+
226+
To resolve it, install PyTorch built for the ROCm version you want to use
227+
228+
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
229+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
230+
""",
231+
)
232+
233+
for pth in cudart_paths:
234+
print(f"* Found ROCm runtime at: {pth}")
235+
236+
237+
def print_runtime_diagnostics() -> None:
238+
if HIP_ENVIRONMENT:
239+
_print_hip_runtime_diagnostics()
240+
else:
241+
_print_cuda_runtime_diagnostics()

bitsandbytes/diagnostics/main.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33

44
import torch
55

6+
from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
67
from bitsandbytes.consts import PACKAGE_GITHUB_URL
78
from bitsandbytes.cuda_specs import get_cuda_specs
89
from bitsandbytes.diagnostics.cuda import (
9-
print_cuda_diagnostics,
10-
print_cuda_runtime_diagnostics,
10+
print_diagnostics,
11+
print_runtime_diagnostics,
1112
)
1213
from bitsandbytes.diagnostics.utils import print_dedented, print_header
1314

@@ -16,12 +17,13 @@ def sanity_check():
1617
from bitsandbytes.cextension import lib
1718

1819
if lib is None:
20+
compute_backend = "cuda" if not HIP_ENVIRONMENT else "hip"
1921
print_dedented(
20-
"""
22+
f"""
2123
Couldn't load the bitsandbytes library, likely due to missing binaries.
2224
Please ensure bitsandbytes is properly installed.
2325
24-
For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`.
26+
For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND={compute_backend} -S .`.
2527
See the documentation for more details if needed.
2628
2729
Trying a simple check anyway, but this will likely fail...
@@ -49,19 +51,24 @@ def main():
4951

5052
print_header("OTHER")
5153
cuda_specs = get_cuda_specs()
52-
print("CUDA specs:", cuda_specs)
54+
if HIP_ENVIRONMENT:
55+
rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
56+
rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
57+
print(f"{BNB_BACKEND} specs:{rocm_specs}")
58+
else:
59+
print(f"{BNB_BACKEND} specs:{cuda_specs}")
5360
if not torch.cuda.is_available():
54-
print("Torch says CUDA is not available. Possible reasons:")
55-
print("1. CUDA driver not installed")
56-
print("2. CUDA not installed")
57-
print("3. You have multiple conflicting CUDA libraries")
61+
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
62+
print(f"1. {BNB_BACKEND} driver not installed")
63+
print(f"2. {BNB_BACKEND} not installed")
64+
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
5865
if cuda_specs:
59-
print_cuda_diagnostics(cuda_specs)
60-
print_cuda_runtime_diagnostics()
66+
print_diagnostics(cuda_specs)
67+
print_runtime_diagnostics()
6168
print_header("")
6269
print_header("DEBUG INFO END")
6370
print_header("")
64-
print("Checking that the library is importable and CUDA is callable...")
71+
print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
6572
try:
6673
sanity_check()
6774
print("SUCCESS!")

csrc/ops.hip

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -576,6 +576,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
576576
if (returnedAlgoCount == 0)
577577
{
578578
has_error = 1;
579+
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
579580
}
580581
else
581582
{
@@ -614,18 +615,25 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
614615
heuristicResult,
615616
&returnedAlgoCount));
616617

617-
if(!SCALE_ROWS)
618+
if (returnedAlgoCount == 0)
618619
{
619-
float alpha = 1.0f, beta = 0.0f;
620-
621-
has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
620+
has_error = 1;
621+
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
622622
}
623623
else
624624
{
625-
//has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
626-
float beta = 0.0f;
627-
628-
has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
625+
if(!SCALE_ROWS)
626+
{
627+
float alpha = 1.0f, beta = 0.0f;
628+
629+
has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
630+
}
631+
else
632+
{
633+
float beta = 0.0f;
634+
635+
has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
636+
}
629637
}
630638
}
631639

@@ -635,7 +643,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
635643
if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
636644
if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
637645
if(has_error == 1)
638-
printf("error detected");
646+
fprintf(stderr, "error detected\n");
639647

640648
return has_error;
641649
#endif // NO_HIPBLASLT

0 commit comments

Comments (0)