
Commit 73cb057

Switch to use rbe workers (#405)
* Switch to use rbe workers
* Introduce rocm rbe pools
* First check for multigpu tag
* Fix buildifier issue
* Use valid platform name
* Fix docker url
* Run multigpu tests locally
* Build multigpu tests locally
* Ignore numa related leaks
* Limit the supported archs
* Build locally, run remotely
* Enable remote and disk_cache for jax tests
* Execute iota_test locally
* Mark failing tests as local
* Ignore iota_test as it is flaky and large
* Make dot operation local and exclude flaky test
* Bump to rocm7 container
* Disable failing test on tsan
* Switch rbe image
* Fix hermetic build
* Force tsan builds to run locally
* Set proper multigpu tag for all_reduce_test
* Run tests that are flaky on rbe locally
* Make collective ops test local
1 parent 2e59711 commit 73cb057

12 files changed: 107 additions, 33 deletions

build_tools/rocm/BUILD

Lines changed: 17 additions & 5 deletions
@@ -34,21 +34,33 @@ filegroup(
 )
 
 genrule(
-    name = "san_wrapper_script",
-    srcs = [":sanitizer_ignore_lists"],
-    outs = ["san_wrapper.sh"],
+    name = "exclusive_wrapper_script",
+    outs = ["exclusive_wrapper.sh"],
     cmd = """
         echo '#!/bin/bash' > $@
-        echo 'exec "$$@"' >> $@
+        echo 'exec {lock_fd}>/var/lock/gpulock || exit 1' >> $@
+        echo 'flock "$$lock_fd"' >> $@
+        echo '"$$@"' >> $@
+        echo 'return_code=$$?' >> $@
+        echo 'flock -u "$$lock_fd"' >> $@
+        echo 'exit $$return_code' >> $@
         chmod +x $@
     """,
 )
 
+# this wrapper ensures the test target
+# take into account any changes in the ignore list files
+sh_binary(
+    name = "exclusive_local_wrapper",
+    srcs = [":exclusive_wrapper_script"],
+    visibility = ["//visibility:public"],
+)
+
 # this wrapper ensures the test target
 # take into account any changes in the ignore list files
 sh_binary(
     name = "sanitizer_wrapper",
-    srcs = [":san_wrapper_script"],
+    srcs = [":exclusive_wrapper_script"],
     data = [":sanitizer_ignore_lists"],
     visibility = ["//visibility:public"],
 )
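For reference, the genrule above just writes out a small bash script; with Bazel's $$ escaping resolved, the generated exclusive_wrapper.sh would look roughly like this (a sketch reconstructed from the echo lines, not a file shipped by the commit):

    #!/bin/bash
    # open /var/lock/gpulock on a dynamically assigned fd, or bail out
    exec {lock_fd}>/var/lock/gpulock || exit 1
    # block until we hold the lock, then run the wrapped test command
    flock "$lock_fd"
    "$@"
    return_code=$?
    # release the lock and propagate the wrapped command's exit status
    flock -u "$lock_fd"
    exit $return_code

Because every wrapped test takes the same /var/lock/gpulock, only one test binary at a time can hold the GPUs on a machine, which is what allows run_xla_multi_gpu.sh below to swap --test_strategy=exclusive for this --run_under wrapper.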

build_tools/rocm/lsan_ignore_list.txt

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@ leak:libstdc++.so
 leak:libamdhip64.so
 leak:libhiprtc.so
 leak:librccl.so
+leak:hwloc_bitmap_alloc

build_tools/rocm/platform/linux_x64/BUILD

Lines changed: 4 additions & 2 deletions
@@ -11,7 +11,8 @@ platform(
         "@bazel_tools//tools/cpp:clang",
     ],
     exec_properties = {
-        "container-image": "rocm/tensorflow-build@sha256:7cd444ac48657fee2f5087fbda7766266704d3f8fb2299f681952ae4eabed060",
+        # rocm/tensorflow-build:2.18-jammy-python3.9-rocm7.0.2
+        "container-image": "docker://rocm/tensorflow-build@sha256:a2672ff2510b369b4a5f034272a518dc93c2e492894e3befaeef19649632ccaa",
         "OSFamily": "Linux",
     },
 )
@@ -24,7 +25,8 @@ platform(
         "@bazel_tools//tools/cpp:clang",
     ],
     exec_properties = {
-        "container-image": "rocm/tensorflow-build@sha256:7cd444ac48657fee2f5087fbda7766266704d3f8fb2299f681952ae4eabed060",
+        # rocm/tensorflow-build:2.18-jammy-python3.9-rocm7.0.2
+        "container-image": "docker://rocm/tensorflow-build@sha256:a2672ff2510b369b4a5f034272a518dc93c2e492894e3befaeef19649632ccaa",
         "OSFamily": "Linux",
         "Pool": "linux_x64_gpu",
     },

build_tools/rocm/rocm_xla.bazelrc

Lines changed: 7 additions & 5 deletions
@@ -5,9 +5,9 @@ build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"
 
 build:rocm_rbe --bes_backend="grpcs://wardite.cluster.engflow.com"
 build:rocm_rbe --bes_results_url="https://wardite.cluster.engflow.com/invocation/"
-build:rocm_rbe --host_platform="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
-build:rocm_rbe --extra_execution_platforms="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
-build:rocm_rbe --platforms="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
+build:rocm_rbe --host_platform="//build_tools/rocm/platform/linux_x64:linux_x64"
+build:rocm_rbe --extra_execution_platforms="//build_tools/rocm/platform/linux_x64:linux_x64"
+build:rocm_rbe --platforms="//build_tools/rocm/platform/linux_x64:linux_x64"
 build:rocm_rbe --bes_timeout=600s
 build:rocm_rbe --tls_client_certificate="/tf/certificates/ci-cert.crt"
 build:rocm_rbe --tls_client_key="/tf/certificates/ci-cert.key"
@@ -19,7 +19,7 @@ build:rocm_rbe --remote_timeout=3600
 build:rocm_rbe --remote_download_minimal
 build:rocm_rbe --remote_upload_local_results
 
-test:rocm_rbe --strategy=TestRunner=local
+test:rocm_rbe --strategy=TestRunner=remote,local
 
 build:asan --strip=never
 build:asan --copt -fsanitize=address
@@ -62,7 +62,9 @@ build:xla_sgpu -- \
   -//xla/pjrt/distributed:topology_util_test \
   -//xla/pjrt/distributed:client_server_test \
   -//xla/service/gpu/tests:dynamic_shared_memory_test_amdgpu_any \
-  -//xla/service/gpu/tests:gpu_cub_sort_test_amdgpu_any
+  -//xla/service/gpu/tests:gpu_cub_sort_test_amdgpu_any \
+  -//xla/tests:iota_test_amdgpu_any \
+  -//xla/tests:reduce_window_test_amdgpu_any # TODO: return when it is not flaky!
 
 test:xla_mgpu -- \
   //xla/tests:collective_ops_e2e_test \
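With the platform flags now pointing at the plain linux_x64 execution platform and TestRunner allowed to fall back from remote to local, a CI invocation of these configs would look roughly like the following (a sketch only; the target set comes from the xla_sgpu config itself, and certificates and other flags are supplied by the surrounding scripts):

    bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
        --config=rocm_rbe \
        --config=xla_sgpu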

build_tools/rocm/run_jax_ut.sh

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,10 @@ python build/build.py build \
 # TODO: run the tests when they are green
 bazel build \
     --config=rocm \
+    --disk_cache=/tf/disk_cache/jaxlib-v0.7.1 \
+    --remote_cache="grpcs://wardite.cluster.engflow.com" \
+    --tls_client_certificate="/tf/certificates/ci-cert.crt" \
+    --tls_client_key="/tf/certificates/ci-cert.key" \
     --build_tag_filters=cpu,gpu,-tpu,-config-cuda-only \
     --test_tag_filters=cpu,gpu,-tpu,-config-cuda-only \
     --action_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942 \

build_tools/rocm/run_xla.sh

Lines changed: 16 additions & 7 deletions
@@ -27,17 +27,17 @@ N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
 rocm-smi -i
 STATUS=$?
 if [ $STATUS -ne 0 ]; then TF_GPU_COUNT=1; else
-    TF_GPU_COUNT=$(rocm-smi -i|grep 'Device ID' |grep 'GPU' |wc -l)
+    TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep 'GPU' | wc -l)
 fi
 TF_TESTS_PER_GPU=1
 N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
-amdgpuname=(`rocminfo | grep gfx | head -n 1`)
+amdgpuname=($(rocminfo | grep gfx | head -n 1))
 AMD_GPU_GFX_ID=${amdgpuname[1]}
 echo ""
 echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
 echo ""
 
-export PYTHON_BIN_PATH=`which python3`
+export PYTHON_BIN_PATH=$(which python3)
 export TF_NEED_ROCM=1
 export ROCM_PATH="/opt/rocm"
 
@@ -99,12 +99,13 @@ BAZEL_DISK_CACHE_SIZE=100G
 BAZEL_DISK_CACHE_DIR="/tf/disk_cache/rocm-jaxlib-v0.7.1"
 mkdir -p ${BAZEL_DISK_CACHE_DIR}
 if [ ! -d /tf/pkg ]; then
-  mkdir -p /tf/pkg
+    mkdir -p /tf/pkg
 fi
 
 SCRIPT_DIR=$(realpath $(dirname $0))
 TAG_FILTERS=$($SCRIPT_DIR/rocm_tag_filters.sh),-multigpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial
 
+RBE_OPTIONS=()
 SANITIZER_ARGS=()
 if [[ $1 == "asan" ]]; then
     SANITIZER_ARGS+=("--config=asan")
@@ -121,6 +122,12 @@ elif [[ $1 == "tsan" ]]; then
         HostExecuteStartThunkTest*
         HostExecuteDoneThunkTest*
     )
+
+    # tsan tests appear to be flaky in rbe due to the heavy load
+    # force them to run locally
+    RBE_OPTIONS+=(
+        --strategy=TestRunner=local
+    )
     shift
 fi
 
@@ -139,16 +146,18 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
     --flaky_test_attempts=3 \
     --keep_going \
     --local_test_jobs=${N_TEST_JOBS} \
+    --repo_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942,gfx1100 \
     --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
     --run_under=//build_tools/ci:parallel_gpu_execute \
     --test_env=MIOPEN_FIND_ENFORCE=5 \
     --test_env=MIOPEN_FIND_MODE=1 \
     --test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
     "${SANITIZER_ARGS[@]}" \
-    "$@"
+    "$@" \
+    "${RBE_OPTIONS[@]}"
 
 # clean up bazel disk_cache
 bazel shutdown \
-  --disk_cache=${BAZEL_DISK_CACHE_DIR} \
-  --experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}
+    --disk_cache=${BAZEL_DISK_CACHE_DIR} \
+    --experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}
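The --test_filter line above relies on a small shell idiom: the command substitution $(IFS=: ; echo "${EXCLUDED_TESTS[*]}") joins the array entries with ':' separators. For the two tsan patterns visible in this hunk (the full array may contain more entries), the flag would expand roughly to:

    # EXCLUDED_TESTS=(HostExecuteStartThunkTest* HostExecuteDoneThunkTest*)
    --test_filter=-HostExecuteStartThunkTest*:HostExecuteDoneThunkTest*

The leading '-' turns the whole colon-separated list into exclusions in googletest-style filter syntax.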

build_tools/rocm/run_xla_multi_gpu.sh

Lines changed: 10 additions & 2 deletions
@@ -76,6 +76,7 @@ EXCLUDED_TESTS=(
 SCRIPT_DIR=$(realpath $(dirname $0))
 TAG_FILTERS="$($SCRIPT_DIR/rocm_tag_filters.sh)"
 
+RBE_OPTIONS=()
 SANITIZER_ARGS=()
 if [[ $1 == "asan" ]]; then
     SANITIZER_ARGS+=("--run_under=//build_tools/rocm:sanitizer_wrapper")
@@ -99,6 +100,12 @@ elif [[ $1 == "tsan" ]]; then
         AsyncMemcpyCollectiveOps*
         RaggedAllToAllTest*
     )
+
+    # tsan tests appear to be flaky in rbe due to the heavy load
+    # force them to run locally
+    RBE_OPTIONS+=(
+        --strategy=TestRunner=local
+    )
     shift
 fi
 
@@ -116,14 +123,15 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
     --test_output=errors \
     --flaky_test_attempts=3 \
     --keep_going \
-    --test_strategy=exclusive \
+    --run_under=//build_tools/rocm:exclusive_local_wrapper \
+    --repo_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942,gfx1100 \
     --action_env=XLA_FLAGS=--xla_gpu_force_compilation_parallelism=16 \
     --action_env=XLA_FLAGS=--xla_gpu_enable_llvm_module_compilation_parallelism=true \
     --action_env=NCCL_MAX_NCHANNELS=1 \
     --test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
     "${SANITIZER_ARGS[@]}" \
     "$@" \
-    --strategy=TestRunner=local # execute multigpu tests locally as there is no gpu exclusive protection on rbe
+    "${RBE_OPTIONS[@]}"
 
 # clean up bazel disk_cache
 bazel shutdown \
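Instead of forcing the whole test phase to run exclusively, the script now leans on Bazel's --run_under: each test is launched through the exclusive_local_wrapper target (the flock script shown earlier), so on a multi-GPU node the effective invocation looks roughly like this (paths are illustrative, not taken from the commit):

    # what Bazel effectively runs for each multi-GPU test under --run_under
    exclusive_wrapper.sh bazel-bin/xla/tests/collective_ops_e2e_test <test args>

Tests can still be scheduled in parallel slots, but they serialize on the GPU lock inside the wrapper.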

third_party/gpus/rocm/BUILD.tpl

Lines changed: 14 additions & 10 deletions
@@ -140,7 +140,7 @@ cc_library(
     name = "rocm_rpath",
     linkopts = select({
         ":build_hermetic": [
-            "-Wl,-rpath,%{rocm_toolkit_path}/lib",
+            "-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib",
         ],
         ":multiple_rocm_paths": [
             "-Wl,-rpath=%{rocm_lib_paths}",
@@ -163,7 +163,7 @@ cc_library(
 
 cc_library(
     name = "rocm_hip",
-    srcs = glob(["%{rocm_root}/lib/libamdhip*.so"]),
+    srcs = glob(["%{rocm_root}/lib/libamdhip*.so*"]),
     hdrs = glob(["%{rocm_root}/include/hip/**"]),
     include_prefix = "rocm",
     includes = [
@@ -181,7 +181,10 @@ cc_library(
 # Used by jax_rocm_plugin to minimally link to hip runtime.
 cc_library(
     name = "hip_runtime",
-    srcs = glob(["%{rocm_root}/lib/libamdhip*.so"]),
+    srcs = glob([
+        "%{rocm_root}/lib/libamdhip*.so*",
+        "%{rocm_root}/lib/libamd_comgr.so*",
+    ]),
     hdrs = glob(["%{rocm_root}/include/hip/**"]),
     include_prefix = "rocm",
     includes = [
@@ -215,15 +218,16 @@ cc_library(
     ],
     # workaround to bring tensile files to the same fs layout as expected in the lib
     # rocblas assumes that tensile files are located in ../roblas/libraries directory
-    linkopts = ["-Wl,-rpath,local_config_rocm/rocm/rocm_dis/lib"],
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
     deps = [":rocm_config"],
 )
 
 cc_library(
     name = "rocfft",
-    srcs = glob(["%{rocm_root}/lib/librocfft*.so*"]),
+    data = glob(["%{rocm_root}/lib/librocfft*.so*"]),
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     include_prefix = "rocm",
     includes = [
         "%{rocm_root}/include",
@@ -235,7 +239,8 @@ cc_library(
 
 cc_library(
     name = "hipfft",
-    srcs = glob(["%{rocm_root}/lib/libhipfft*.so*"]),
+    data = glob(["%{rocm_root}/lib/libhipfft*.so*"]),
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     include_prefix = "rocm",
     includes = [
         "%{rocm_root}/include",
@@ -266,7 +271,6 @@ miopen_libs = glob([
 
 cc_library(
     name = "miopen",
-    srcs = glob(["%{rocm_root}/lib/libMIOpen*.so*"]),
     hdrs = glob(["%{rocm_root}/include/miopen/**"]),
     data = select({
         ":build_hermetic": miopen_libs,
@@ -279,7 +283,7 @@ cc_library(
     ],
     # workaround to bring miopen db files to the same fs layout as expected in the lib
    # rocblas assumes that miopen db files are located in ../share/miopen/db directory
-    linkopts = ["-Wl,-rpath,local_config_rocm/rocm/rocm_dis/lib"],
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
     deps = [":rocm_config"],
@@ -429,13 +433,13 @@ hipblas_libs = glob(["%{rocm_root}/lib/libhipblas.so*"])
 
 cc_library(
     name = "hipblas",
-    srcs = glob(["%{rocm_root}/lib/libhipblas.so*"]),
     hdrs = glob(["%{rocm_root}/include/hipblas/**"]),
     data = select({
         ":build_hermetic": hipblas_libs,
         ":multiple_rocm_paths": hipblas_libs,
         "//conditions:default": [],
     }),
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     include_prefix = "rocm",
     includes = [
         "%{rocm_root}/include/",
@@ -479,7 +483,7 @@ cc_library(
     ],
     # workaround to bring tensile files to the same fs layout as expected in the lib
     # hibplatslt assumes that tensile files are located in ../hipblaslt/libraries directory
-    linkopts = ["-Wl,-rpath,local_config_rocm/rocm/rocm_dis/lib"],
+    linkopts = ["-Wl,-rpath,external/local_config_rocm/rocm/%{rocm_root}/lib"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
     deps = [":rocm_config"],
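Several of these targets now stop bundling the shared objects as srcs and instead carry them as data while pointing the linker at an rpath of external/local_config_rocm/rocm/%{rocm_root}/lib. One quick way to sanity-check the resulting layout on a built binary (a suggestion only, not something this commit adds; the binary path is illustrative) is to inspect its dynamic section:

    # print the RPATH/RUNPATH entries of a built test or tool binary
    readelf -d bazel-bin/xla/tests/dot_operation_test | grep -E 'RPATH|RUNPATH'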

xla/backends/gpu/runtime/BUILD

Lines changed: 2 additions & 0 deletions
@@ -1812,6 +1812,8 @@ xla_test(
     backend_tags = {
         "gpu": [
             "multi_gpu_h100",
+            "multi_gpu",
+            "local",
             "no_oss",
         ],
     },

xla/tests/BUILD

Lines changed: 6 additions & 0 deletions
@@ -1009,6 +1009,7 @@ xla_test(
     srcs = ["dot_operation_test.cc"],
     shard_count = 20,
     tags = [
+        "local",  # TODO: remove when remote execution is fixed
         "optonly",
     ],
     deps = [
@@ -1242,6 +1243,7 @@ xla_test(
     },
     shard_count = 20,
     tags = [
+        "local",
         "optonly",
     ],
     deps = [
@@ -1969,6 +1971,7 @@ cc_library(
     # This is set intentionally as to avoid the default behavior of the TSL
     # `cc_library` definition that is used in this file.
     compatible_with = [],
+    tags = ["local"],  # TODO: remove when remote execution is fixed
     deps = [
         ":client_library_test_base",
         ":hlo_test_base",
@@ -2622,6 +2625,7 @@ xla_test(
     backend_tags = {
         "gpu": [
             "multi_gpu",
+            "local",
             "no_oss",
         ],
         "cpu": [
@@ -2721,6 +2725,7 @@ xla_test(
     backend_tags = {
         "gpu": [
             "multi_gpu",
+            "local",
             "no_oss",
         ],
     },
@@ -3356,6 +3361,7 @@ xla_test(
     },
     shard_count = 50,
     tags = [
+        "local",  # TODO: remove when remote execution is fixed
         "test_migrated_to_hlo_runner_pjrt",
     ],
     deps = [
