Skip to content

Commit 2483659

Browse files
timthluTim Lu
andauthored
Enable LLVM ThreadSanitizer for triton-shared (#301)
This PR builds on top of #294, which enabled LLVM AddressSanitizer for triton-shared. Enabling ThreadSanitizer support involves the same infrastructure, with some small differences: - Requires `-fsanitize=thread` during compiling and linking - Requires parallelizing the Triton grid using OpenMP in order to detect data races. The LLVM build script has also been changed to reflect this - Requires some TSan-specific suppressions and libraries (archer), added to the run script --------- Co-authored-by: Tim Lu <[email protected]>
1 parent 1338c49 commit 2483659

File tree

7 files changed

+87
-10
lines changed

7 files changed

+87
-10
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ $ ls /tmp/ir_dumps
206206
ll.ir ll.mlir tt.mlir ttshared.mlir
207207
```
208208

209+
## Debugging Triton Programs
210+
Triton-shared includes a build option that enables LLVM-sanitizers - AddressSanitizer (ASan) and ThreadSanitizer (TSan) - to help detect memory safety and concurrency issues in Triton programs. These sanitizers dynamically analyze the program during execution, identifying bugs such as buffer overflows and data races respectively. For more details and setup instructions, refer [here](https://github.com/microsoft/triton-shared/blob/main/scripts/SANITIZER.md).
211+
209212
## Contributing
210213

211214
This project welcomes contributions and suggestions. Most contributions require you to agree to a

backend/compiler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,10 @@ def _dump_ir_if_needed(files):
3636

3737
def _get_sanitizer_type():
3838
# returns "" if not set
39-
# throws error if set to something other than "asan"
39+
# throws error if set to something other than "asan" or "tsan"
4040
sanitizer_type = os.getenv("TRITON_SHARED_SANITIZER_TYPE", "")
4141

42-
if sanitizer_type != "" and sanitizer_type != "asan":
42+
if sanitizer_type != "" and sanitizer_type != "asan" and sanitizer_type != "tsan":
4343
# throw error
4444
raise Exception(f"TRITON_SHARED_SANITIZER_TYPE {sanitizer_type} is invalid.")
4545

@@ -164,6 +164,8 @@ def _llir_to_bin(llir: str, metadata):
164164

165165
if sanitizer_type == "asan":
166166
subprocess_args.extend(["-g", "-fsanitize=address", "-mllvm", "-asan-stack=0"])
167+
elif sanitizer_type == "tsan":
168+
subprocess_args.extend(["-g", "-fsanitize=thread"])
167169

168170
subprocess.check_call(subprocess_args)
169171
else:

backend/driver.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ def _get_llvm_bin_path(bin_name: str) -> str:
2020

2121
def _get_sanitizer_type():
2222
# returns "" if not set
23-
# throws error if set to something other than "asan"
23+
# throws error if set to something other than "asan" or "tsan"
2424
sanitizer_type = os.getenv("TRITON_SHARED_SANITIZER_TYPE", "")
2525

26-
if sanitizer_type != "" and sanitizer_type != "asan":
26+
if sanitizer_type != "" and sanitizer_type != "asan" and sanitizer_type != "tsan":
2727
# throw error
2828
raise Exception(f"TRITON_SHARED_SANITIZER_TYPE {sanitizer_type} is invalid.")
2929

@@ -114,6 +114,9 @@ def _generate_launcher(constants, signature, kernel_name):
114114
static void _launch(int gridX, int gridY, int gridZ, {arg_decls}) {{
115115
if (gridX*gridY*gridZ > 0) {{
116116
// Cast "function" to the real function type.
117+
// apply parallelization to the triton grid when using ThreadSanitizer (TSan)
118+
// to help detect potential data races across program instances during kernel execution
119+
{"#pragma omp parallel for collapse(3)" if _get_sanitizer_type() == "tsan" else ""}
117120
for(int x = 0; x < gridX; x++) {{
118121
for(int y = 0; y < gridY; y++) {{
119122
for(int z = 0; z < gridZ; z++) {{
@@ -317,6 +320,16 @@ def launch(
317320

318321
if sanitizer_type == "asan":
319322
subprocess_args.extend(["-g", "-fsanitize=address", "-mllvm", "-asan-stack=0"])
323+
elif sanitizer_type == "tsan":
324+
# ensure that openmp is available
325+
libomp_path = next(Path(Path(_get_llvm_bin_path("")).parent).rglob("libomp.so"), None)
326+
327+
if not libomp_path:
328+
raise Exception(f"libomp.so does not exist.")
329+
330+
libomp_path = str(libomp_path.parent)
331+
332+
subprocess_args.extend(["-g", "-fsanitize=thread", "-fopenmp", f"-Wl,-rpath,{libomp_path}"])
320333

321334
subprocess.check_call(subprocess_args)
322335
else:

scripts/SANITIZER.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,5 @@ For building LLVM and triton_shared for usage with sanitizers, run the scripts i
3838

3939
For runtime setup (one-time per shell): `source setup_runtime_for_sanitizers.sh <existing path to venv> <existing path to llvm install dir> <existing path to triton shared>`
4040

41-
For running a python program with sanitizers enabled: `run_triton_with_sanitizers.sh <sanitizer type> python program.py`. Currently, the only supported `<sanitizer type>` is `asan`.
41+
For running a python program with sanitizers enabled: `run_triton_with_sanitizers.sh <sanitizer type> python program.py`. Currently, the supported `<sanitizer type>`s are `asan` and `tsan`.
4242

scripts/build_llvm_for_sanitizers.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ LLVM_SOURCE_DIR="${LLVM_PATH}/llvm-project"
4040
LLVM_SOURCE="${LLVM_SOURCE_DIR}/llvm"
4141

4242
# compiler-rt and clang are the sanitizer-specific LLVM projects
43-
LLVM_PROJECTS="clang;compiler-rt;mlir"
43+
# openmp is used for parallelizing the triton grid for ThreadSanitizer (TSan)
44+
LLVM_PROJECTS="clang;compiler-rt;openmp;mlir"
4445

4546
# these are the targets supported by the Triton language
4647
# Triton's build script for LLVM uses these exact targets

scripts/build_triton_shared_for_sanitizers.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,19 @@ if [ ! -e "$TRITON_SHARED_PATH" ]; then
2929
exit 1
3030
fi
3131

32+
# check whether ~/.triton is empty
33+
# llvm being used from before may be cached and may be different from the custom llvm
34+
# will cause linking errors during the triton-shared build
35+
if [ -e "~/.triton" ]; then
36+
echo "Error: Please remove ~/.triton and run this script again."
37+
exit 1
38+
fi
39+
3240
cd "$TRITON_SHARED_PATH"
3341

3442
# prepare for triton_shared build
3543
export PATH="${LLVM_INSTALL_PATH}/bin:${PATH}"
3644
which clang
37-
rm -rf ~/.triton
3845

3946
# build triton-shared with the custom LLVM
4047
cd "${TRITON_SHARED_PATH}/triton"

scripts/run_triton_with_sanitizers.sh

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
# use LLVM_BINARY_DIR to obtain the locations of the .so files used for LD_PRELOAD
44

55
if [ "$#" -eq 0 ]; then
6-
echo "Usage: $0 <sanitizer type> python program.py. <sanitizer type> can be \"asan\""
6+
echo "Usage: $0 <sanitizer type> python program.py. <sanitizer type> can be \"asan\", \"tsan\""
77
exit 1
88
fi
99

1010
sanitizer_type=$1
1111

12-
if [ "$sanitizer_type" != "asan" ]; then
13-
echo "Error: Unsupported <sanitizer type> $sanitizer_type. Usage: $0 <sanitizer type> python program.py. <sanitizer type> can be \"asan\""
12+
if [ "$sanitizer_type" != "asan" ] && [ "$sanitizer_type" != "tsan" ]; then
13+
echo "Error: Unsupported <sanitizer type> $sanitizer_type. Usage: $0 <sanitizer type> python program.py. <sanitizer type> can be \"asan\", \"tsan\""
1414
exit 1
1515
fi
1616

@@ -42,6 +42,57 @@ ASAN_OPTIONS=\"detect_leaks=0\""
4242

4343
# shift command line arguments to the left by 1 to account for "asan"
4444
shift 1
45+
elif [ "${sanitizer_type}" = "tsan" ]; then
46+
# find path to tsan shared library
47+
tsan_dir="$(find "$llvm_install_dir" -type f -name "libclang_rt.tsan.so")"
48+
49+
if [ -z "$tsan_dir" ]; then
50+
echo "Error: unable to find libclang_rt.tsan.so in $llvm_install_dir"
51+
exit 1
52+
fi
53+
54+
count=$(echo "$tsan_dir" | wc -l)
55+
56+
if [ "$count" -gt 1 ]; then
57+
echo "Error: multiple libclang_rt.tsan.so found in $llvm_install_dir"
58+
echo "$tsan_dir"
59+
exit 1
60+
fi
61+
62+
# find path to archer library
63+
archer_dir="$(find "$llvm_install_dir" -type f -name "libarcher.so")"
64+
65+
if [ -z "$archer_dir" ]; then
66+
echo "Error: unable to find libarcher.so in $llvm_install_dir"
67+
exit 1
68+
fi
69+
70+
count=$(echo "$archer_dir" | wc -l)
71+
72+
if [ "$count" -gt 1 ]; then
73+
echo "Error: multiple libarcher.so found in $llvm_install_dir"
74+
echo "$archer_dir"
75+
exit 1
76+
fi
77+
78+
# make new suppression.txt file if it doesn't exist already
79+
if [ ! -f "suppression.txt" ]; then
80+
echo "called_from_lib:libomp.so
81+
called_from_lib:libtorch_python.so
82+
called_from_lib:libtorch_cpu.so
83+
called_from_lib:libtorch_cuda.so" > "./suppression.txt"
84+
fi
85+
86+
env_args="LD_PRELOAD=\"$tsan_dir\" \
87+
TRITON_ALWAYS_COMPILE=1 \
88+
TRITON_SHARED_SANITIZER_TYPE=\"tsan\" \
89+
TSAN_OPTIONS=\"ignore_noninstrumented_modules=0:suppressions=suppression.txt\" \
90+
OMP_NUM_THREADS=16 \
91+
OMP_TOOL_LIBRARIES=\"$archer_dir\" \
92+
ARCHER_OPTIONS=\"verbose=1\""
93+
94+
# shift command line arguments to the left by 1 to account for "tsan"
95+
shift 1
4596
fi
4697

4798
# invoke python function

0 commit comments

Comments
 (0)