Skip to content

Commit d40f3c8

Browse files
rocm-mici, mhalk, amd-hhashemi
authored and committed
[AUTOGENERATED] [release/2.5] [ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#1800)
Cherry-pick of #1688 Co-authored-by: Michael Halkenhäuser <[email protected]> Co-authored-by: Hashem Hashemi <[email protected]> (cherry picked from commit f8544af) (cherry picked from commit ed48754) (cherry picked from commit d62a39e)
1 parent e4d62b1 commit d40f3c8

File tree

3 files changed

+28
-0
lines changed

3 files changed

+28
-0
lines changed

aten/src/ATen/native/cuda/layer_norm_kernel.cu

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,11 @@ WelfordDataLN cuWelfordOnlineSum(
131131
{
132132
U delta = val - curr_sum.mean;
133133
U new_count = curr_sum.count + 1.f;
134+
#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
135+
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
136+
#else
134137
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
138+
#endif
135139
return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
136140
}
137141

@@ -145,7 +149,11 @@ WelfordDataLN cuWelfordCombine(
145149
U count = dataA.count + dataB.count;
146150
U mean, sigma2;
147151
if (count > decltype(dataB.count){0}) {
152+
#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
153+
auto coef = __builtin_amdgcn_rcpf(count);
154+
#else
148155
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
156+
#endif
149157
auto nA = dataA.count * coef;
150158
auto nB = dataB.count * coef;
151159
mean = nA*dataA.mean + nB*dataB.mean;

cmake/Dependencies.cmake

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1048,6 +1048,22 @@ if(USE_ROCM)
10481048
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
10491049
endif(CMAKE_BUILD_TYPE MATCHES Debug)
10501050

1051+
# Get EnVar 'PYTORCH_LAYERNORM_FAST_RECIPROCAL' (or default to on).
1052+
if(DEFINED ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
1053+
set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
1054+
else()
1055+
set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE ON)
1056+
endif()
1057+
1058+
set(PYTORCH_LAYERNORM_FAST_RECIPROCAL
1059+
${PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE}
1060+
CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE
1061+
)
1062+
1063+
if(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
1064+
add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL)
1065+
endif()
1066+
10511067
# needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
10521068
list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)
10531069

setup.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -153,6 +153,10 @@
153153
# USE_ROCM_KERNEL_ASSERT=1
154154
# Enable kernel assert in ROCm platform
155155
#
156+
# PYTORCH_LAYERNORM_FAST_RECIPROCAL
157+
# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t.
158+
# layer normalization. Default: enabled.
159+
#
156160
# Environment variables we respect (these environment variables are
157161
# conventional and are often understood/set by other software.)
158162
#

0 commit comments

Comments
 (0)