From a00c7414df225d5a0c9fbaef7ecfef3dd43f622c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 30 Sep 2024 23:10:49 -0700 Subject: [PATCH 1/2] [ExecuTorch] Dramatically improve op_clamp build time (2/2) Instead of building `O(|CTYPE_IN| * |CTYPE_MIN| * |CTYPE_MAX| * |CTYPE_OUT|)` kernel code (where |T| means the number of possibilities for type T), we build `O((|CTYPE_IN| + |CTYPE_MIN| + |CTYPE_MAX|) * |CTYPE_OUT|)` kernel code. (Concretely, `ET_SWITCH_REALHB_TYPES` has 9 possibilities, so I estimate that we went from 9**4 = 6561 template instantiations to 9 * 3 * 9 = 243 kernels, or a 27x reduction.) Differential Revision: [D63681034](https://our.internmc.facebook.com/intern/diff/D63681034/) [ghstack-poisoned] --- kernels/portable/cpu/op_clamp.cpp | 78 +++++++++++++++++-------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 0c3894bcf47..03e28fd644b 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -214,43 +214,51 @@ Tensor& clamp_tensor_out( constexpr auto name = "clamp.Tensor_out"; - ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + using ToCtypeOutFn = CTYPE_OUT (*)(const void*); + ToCtypeOutFn in_to_out; + ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { + in_to_out = [](const void* inPtr) { + return static_cast<CTYPE_OUT>( + *reinterpret_cast<const CTYPE_IN*>(inPtr)); + }; + }); + ToCtypeOutFn min_to_out; ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { - ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - apply_ternary_elementwise_fn<CTYPE_OUT>( - [has_min, has_max]( - const CTYPE_OUT val_in, - const CTYPE_OUT val_min, - const CTYPE_OUT val_max) { - CTYPE_OUT val_out = val_in; - if (has_min) { - val_out = utils::max_override(val_out, val_min); - } - if (has_max) { - val_out = 
utils::min_override(val_out, val_max); - } - return val_out; - }, - in, - min, - max, - out, - [](const void* inPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_IN*>(inPtr)); - }, - [](const void* minPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_MIN*>(minPtr)); - }, - [](const void* maxPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_MAX*>(maxPtr)); - }); - }); - }); + min_to_out = [](const void* minPtr) { + return static_cast<CTYPE_OUT>( + *reinterpret_cast<const CTYPE_MIN*>(minPtr)); + }; }); + ToCtypeOutFn max_to_out; + ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { + max_to_out = [](const void* maxPtr) { + return static_cast<CTYPE_OUT>( + *reinterpret_cast<const CTYPE_MAX*>(maxPtr)); + }; + }); + + apply_ternary_elementwise_fn<CTYPE_OUT>( + [has_min, has_max]( + const CTYPE_OUT val_in, + const CTYPE_OUT val_min, + const CTYPE_OUT val_max) { + CTYPE_OUT val_out = val_in; + if (has_min) { + val_out = utils::max_override(val_out, val_min); + } + if (has_max) { + val_out = utils::min_override(val_out, val_max); + } + return val_out; + }, + in, + min, + max, + out, + in_to_out, + min_to_out, + max_to_out); }); return out; From 69461cb630ee47e11017e9ba7d88c49faad3c87e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Oct 2024 13:00:54 -0700 Subject: [PATCH 2/2] slight improvement on "[ExecuTorch] Dramatically improve op_clamp build time (2/2)" Instead of building `O(|CTYPE_IN| * |CTYPE_MIN| * |CTYPE_MAX| * |CTYPE_OUT|)` kernel code (where |T| means the number of possibilities for type T), we build `O((|CTYPE_IN| + |CTYPE_MIN| + |CTYPE_MAX|) * |CTYPE_OUT|)` kernel code. (Concretely, `ET_SWITCH_REALHB_TYPES` has 9 possibilities, so I estimate that we went from 9**4 = 6561 template instantiations to 9 * 3 * 9 = 243 instantiations, or a 27x reduction.) 
Differential Revision: [D63681034](https://our.internmc.facebook.com/intern/diff/D63681034/) [ghstack-poisoned] --- kernels/portable/cpu/op_clamp.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 03e28fd644b..afabd917b9a 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -66,6 +66,10 @@ ET_NODISCARD bool check_bounds( return is_valid; } +template <typename To, typename From> +To load_and_convert(const void* fromPtr) { + return static_cast<To>(*reinterpret_cast<const From*>(fromPtr)); +} } // namespace Tensor& clamp_out( @@ -218,24 +222,15 @@ Tensor& clamp_tensor_out( using ToCtypeOutFn = CTYPE_OUT (*)(const void*); ToCtypeOutFn in_to_out; ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { - in_to_out = [](const void* inPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_IN*>(inPtr)); - }; + in_to_out = load_and_convert<CTYPE_OUT, CTYPE_IN>; }); ToCtypeOutFn min_to_out; ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { - min_to_out = [](const void* minPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_MIN*>(minPtr)); - }; + min_to_out = load_and_convert<CTYPE_OUT, CTYPE_MIN>; }); ToCtypeOutFn max_to_out; ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { - max_to_out = [](const void* maxPtr) { - return static_cast<CTYPE_OUT>( - *reinterpret_cast<const CTYPE_MAX*>(maxPtr)); - }; + max_to_out = load_and_convert<CTYPE_OUT, CTYPE_MAX>; }); apply_ternary_elementwise_fn<CTYPE_OUT>(