Skip to content

Commit 239a9ad

Browse files
sterrettm2 authored and pytorchmergebot committed
Adds support for accelerated sorting with x86-simd-sort (pytorch#127936)
Adds x86-simd-sort as a submodule to accelerate sorting for 32-bit and 64-bit datatypes when AVX2 or AVX512 are available. For contiguous data, this can be over a 10x speedup for large arrays. For discontiguous data, it can give over a 4x speedup with larger arrays. These benchmarks were gathered on a Skylake system (7900x), limited to 8 threads. <details> <summary><b>Contiguous Benchmarks</b></summary> ``` float32, normally distributed (in microseconds) size Default AVX2 AVX512 Default/AVX2 Default/AVX512 16 7.150844336 6.886271477 7.132277489 1.038420335 1.002603214 128 9.208030939 8.478154898 7.846915245 1.086089019 1.173458697 1024 37.79037627 23.60707456 16.44122627 1.600807257 2.298513241 10000 714.7355628 203.9921844 105.5683001 3.503739934 6.770361577 100000 8383.074408 721.6333354 465.3709247 11.61680593 18.01374766 1000000 97124.31945 5632.054572 3920.148401 17.24491803 24.77567416 10000000 1161974.907 86070.48988 71533.82301 13.50027063 16.24371323 int32_t, uniformly distributed (in microseconds) size Default AVX2 AVX512 Default/AVX2 Default/AVX512 16 7.203208685 6.92212224 7.014458179 1.040606975 1.026908779 128 8.972388983 8.195516348 7.592543125 1.094792396 1.18173698 1024 32.77489477 23.6874548 15.36617105 1.383639359 2.132925285 10000 607.8824128 193.3402024 99.25090471 3.144107667 6.124703997 100000 523.9384684 608.1836536 442.3166784 0.861480682 1.184532472 1000000 5211.348627 5271.598405 3518.861883 0.988570871 1.480975611 10000000 133853.6263 81463.05084 67852.97394 1.643120714 1.972700952 ``` </details> Note that the int32_t sort is accelerated by FBGEMM's radix sort for larger arrays, but this only handles contiguous data and in one sorting direction. 
<details>
<summary><b>Discontiguous Benchmarks</b></summary>

```
float, normally distributed, discontiguous in sorted dimension (in microseconds)
size      Default      AVX2         AVX512       Default/AVX2  Default/AVX512
16        3.836543679  4.011214256  3.84376061   0.956454439   0.99812243
128       5.755310194  5.755723127  4.820394962  0.999928257   1.193949923
1024      49.46946019  24.78790785  15.47874362  1.995709379   3.195960952
10000     665.2505291  236.6165959  143.9490662  2.811512551   4.621429974
100000    4328.002203  1329.001212  818.3516414  3.256582586   5.288682743
1000000   47651.5018   16693.72045  11827.39551  2.854456677   4.028909133
10000000  556655.1288  236252.6258  184215.9828  2.356185998   3.021752621

int32_t, uniformly distributed, discontiguous in sorted dimension (in microseconds)
size      Default      AVX2         AVX512       Default/AVX2  Default/AVX512
16        3.817994356  3.878117442  3.770039797  0.984496837   1.012719908
128       5.578731397  5.577152082  4.716770534  1.000283176   1.182743862
1024      43.3412619   23.61275801  14.55446819  1.835501887   2.977866408
10000     634.3997478  224.4322851  133.9518324  2.826686667   4.736028889
100000    4084.358152  1292.363303  781.7867576  3.16037924    5.22438902
1000000   46262.20465  16608.35284  11367.51817  2.785478192   4.06968381
10000000  541231.9104  235185.1861  180249.9294  2.301301028   3.002674742
```
</details>

Pull Request resolved: pytorch#127936
Approved by: https://github.com/jgong5, https://github.com/peterbell10
1 parent d2455b9 commit 239a9ad

File tree

8 files changed

+207
-2
lines changed

8 files changed

+207
-2
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,6 @@
127127
[submodule "third_party/NVTX"]
128128
path = third_party/NVTX
129129
url = https://github.com/NVIDIA/NVTX.git
130+
[submodule "third_party/x86-simd-sort"]
131+
path = third_party/x86-simd-sort
132+
url = https://github.com/intel/x86-simd-sort.git

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,7 @@ else()
262262
cmake_dependent_option(USE_CUFILE "Use cuFile" OFF "USE_CUDA AND NOT WIN32" OFF)
263263
endif()
264264
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
265+
option(USE_X86_SIMD_SORT "Use x86-simd-sort to accelerate sorting and topk for AVX2/AVX512" ON)
265266
option(USE_KINETO "Use Kineto profiling library" ON)
266267
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
267268
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
@@ -907,6 +908,13 @@ if(USE_FBGEMM)
907908
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
908909
endif()
909910

911+
if(USE_X86_SIMD_SORT)
912+
string(APPEND CMAKE_CXX_FLAGS " -DUSE_X86_SIMD_SORT")
913+
if(USE_XSS_OPENMP)
914+
string(APPEND CMAKE_CXX_FLAGS " -DXSS_USE_OPENMP")
915+
endif()
916+
endif()
917+
910918
if(USE_PYTORCH_QNNPACK)
911919
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
912920
endif()

NOTICE

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,3 +454,37 @@ and reference the following license:
454454
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
455455
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
456456
PERFORMANCE OF THIS SOFTWARE.
457+
458+
=======================================================================
459+
x86-simd-sort BSD 3-Clause License
460+
=======================================================================
461+
462+
Code derived from implementations in x86-simd-sort should mention its
463+
derivation and reference the following license:
464+
465+
Copyright (c) 2022, Intel. All rights reserved.
466+
467+
Redistribution and use in source and binary forms, with or without
468+
modification, are permitted provided that the following conditions are met:
469+
470+
1. Redistributions of source code must retain the above copyright notice, this
471+
list of conditions and the following disclaimer.
472+
473+
2. Redistributions in binary form must reproduce the above copyright notice,
474+
this list of conditions and the following disclaimer in the documentation
475+
and/or other materials provided with the distribution.
476+
477+
3. Neither the name of the copyright holder nor the names of its
478+
contributors may be used to endorse or promote products derived from
479+
this software without specific prior written permission.
480+
481+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
482+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
483+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
484+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
485+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
486+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
487+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
488+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
489+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
490+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

aten/src/ATen/native/cpu/SortingKernel.cpp

Lines changed: 129 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,18 @@
1515
#include <ATen/native/CompositeRandomAccessor.h>
1616
#include <ATen/native/TopKImpl.h>
1717
#include <c10/core/WrapDimMinimal.h>
18+
#include <c10/util/SmallBuffer.h>
1819
#include <c10/util/irange.h>
20+
1921
#ifdef USE_FBGEMM
2022
#include <fbgemm/Utils.h>
2123
#endif
2224

25+
#if USE_X86_SIMD_SORT && (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2))
26+
#define XSS_COMPILE_TIME_SUPPORTED
27+
#include <src/x86simdsort-static-incl.h>
28+
#endif
29+
2330
namespace at::native {
2431

2532
namespace {
@@ -119,6 +126,7 @@ static void parallel_sort1d_kernel(
119126
std::vector<int64_t> tmp_vals(elements);
120127
const scalar_t* sorted_keys = nullptr;
121128
const int64_t* sorted_vals = nullptr;
129+
122130
std::tie(sorted_keys, sorted_vals) = fbgemm::radix_sort_parallel(
123131
keys,
124132
vals,
@@ -167,6 +175,116 @@ static inline void sort_kernel_impl(const value_accessor_t& value_accessor,
167175
}
168176
}
169177

178+
#if defined(XSS_COMPILE_TIME_SUPPORTED)
179+
180+
#define AT_DISPATCH_CASE_XSS_TYPES(...) \
181+
AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
182+
AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
183+
AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
184+
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)
185+
186+
#define AT_DISPATCH_XSS_TYPES(TYPE, NAME, ...) \
187+
AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_XSS_TYPES(__VA_ARGS__))
188+
189+
static bool can_use_xss_sort(const TensorBase& values, const TensorBase& indices, int64_t dim, const bool stable) {
190+
// xss_sort is not a stable sort
191+
if (stable) return false;
192+
193+
auto type = values.scalar_type();
194+
if (! (type == ScalarType::Long || type == ScalarType::Int || type == ScalarType::Double || type == ScalarType::Float)) return false;
195+
196+
return true;
197+
}
198+
199+
static bool xss_sort_preferred(const TensorBase& values, const bool descending) {
200+
#if defined(XSS_USE_OPENMP) || !defined(USE_FBGEMM)
201+
return true;
202+
#else
203+
// Without OpenMP support for x86-simd-sort, fbgemm radix sort is faster when it can be used
204+
return !can_use_radix_sort(values, descending);
205+
#endif
206+
}
207+
208+
static void xss_sort_kernel(
209+
const TensorBase& values,
210+
const TensorBase& indices,
211+
int64_t dim,
212+
bool descending) {
213+
auto iter = TensorIteratorConfig()
214+
.check_all_same_dtype(false)
215+
.resize_outputs(false)
216+
.declare_static_shape(values.sizes(), /*squash_dims=*/dim)
217+
.add_output(values)
218+
.add_output(indices)
219+
.build();
220+
221+
using index_t = int64_t;
222+
223+
AT_DISPATCH_XSS_TYPES(values.scalar_type(), "xss_sort_kernel", [&] {
224+
225+
auto values_dim_stride = values.stride(dim);
226+
auto indices_dim_stride = indices.stride(dim);
227+
auto dim_size = values.size(dim);
228+
229+
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
230+
auto* values_data_bytes = data[0];
231+
auto* indices_data_bytes = data[1];
232+
233+
if(values_data_bytes==nullptr || indices_data_bytes==nullptr){
234+
return;
235+
}
236+
237+
if (values_dim_stride == 1 && indices_dim_stride == 1){
238+
for (const auto i C10_UNUSED : c10::irange(n)) {
239+
x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
240+
reinterpret_cast<scalar_t*>(values_data_bytes),
241+
reinterpret_cast<index_t*>(indices_data_bytes),
242+
dim_size,
243+
true,
244+
descending);
245+
246+
values_data_bytes += strides[0];
247+
indices_data_bytes += strides[1];
248+
}
249+
}else{
250+
c10::SmallBuffer<scalar_t, 0> tmp_values(dim_size);
251+
c10::SmallBuffer<index_t, 0> tmp_indices(dim_size);
252+
253+
for (const auto i : c10::irange(n)) {
254+
TensorAccessor<scalar_t, 1> mode_values_acc(
255+
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
256+
&dim_size, &values_dim_stride);
257+
TensorAccessor<index_t, 1> mode_indices_acc(
258+
reinterpret_cast<index_t*>(data[1] + i * strides[1]),
259+
&dim_size, &indices_dim_stride);
260+
261+
for (const auto j : c10::irange(dim_size)) {
262+
tmp_values[j] = mode_values_acc[j];
263+
tmp_indices[j] = j;
264+
}
265+
266+
x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
267+
tmp_values.data(),
268+
tmp_indices.data(),
269+
dim_size,
270+
true,
271+
descending);
272+
273+
for (const auto j : c10::irange(dim_size)) {
274+
mode_values_acc[j] = tmp_values[j];
275+
mode_indices_acc[j] = tmp_indices[j];
276+
}
277+
}
278+
}
279+
};
280+
281+
int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, dim_size);
282+
iter.for_each(loop, /*grain_size=*/grain_size);
283+
284+
});
285+
}
286+
#endif
287+
170288
static void sort_kernel(
171289
const TensorBase& self,
172290
const TensorBase& values,
@@ -181,6 +299,14 @@ static void sort_kernel(
181299
// https://github.com/pytorch/pytorch/issues/91420
182300
return;
183301
}
302+
303+
#if defined(XSS_COMPILE_TIME_SUPPORTED)
304+
if (can_use_xss_sort(values, indices, dim, stable) && xss_sort_preferred(values, descending)){
305+
xss_sort_kernel(values, indices, dim, descending);
306+
return;
307+
}
308+
#endif
309+
184310
#ifdef USE_FBGEMM
185311
if (can_use_radix_sort(values, descending)) {
186312
parallel_sort1d_kernel(values, indices);
@@ -232,6 +358,7 @@ static void topk_kernel(
232358
int64_t dim,
233359
bool largest,
234360
bool sorted) {
361+
235362
auto sizes = self.sizes();
236363
auto iter = TensorIteratorConfig()
237364
.check_all_same_dtype(false)
@@ -266,7 +393,7 @@ static void topk_kernel(
266393

267394
} // anonymous namespace
268395

269-
REGISTER_DISPATCH(sort_stub, &sort_kernel);
270-
REGISTER_DISPATCH(topk_stub, &topk_kernel);
396+
ALSO_REGISTER_AVX512_DISPATCH(sort_stub, &sort_kernel);
397+
ALSO_REGISTER_AVX512_DISPATCH(topk_stub, &topk_kernel);
271398

272399
} //at::native

cmake/Dependencies.cmake

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1328,6 +1328,28 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX)
13281328
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS})
13291329
endif()
13301330

1331+
# --[ x86-simd-sort integration
1332+
if(USE_X86_SIMD_SORT)
1333+
if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
1334+
message(WARNING
1335+
"x64 operating system is required for x86-simd-sort. "
1336+
"Not compiling with x86-simd-sort. "
1337+
"Turn this warning off by USE_X86_SIMD_SORT=OFF.")
1338+
set(USE_X86_SIMD_SORT OFF)
1339+
endif()
1340+
1341+
if(USE_X86_SIMD_SORT)
1342+
if(USE_OPENMP AND NOT MSVC)
1343+
set(USE_XSS_OPENMP ON)
1344+
else()
1345+
set(USE_XSS_OPENMP OFF)
1346+
endif()
1347+
1348+
set(XSS_SIMD_SORT_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/../third_party/x86-simd-sort)
1349+
include_directories(SYSTEM ${XSS_SIMD_SORT_INCLUDE_DIR})
1350+
endif()
1351+
endif()
1352+
13311353
# --[ ATen checks
13321354
set(USE_LAPACK 0)
13331355

cmake/Summary.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ function(caffe2_print_configuration_summary)
133133
endif()
134134
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
135135
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
136+
message(STATUS " USE_X86_SIMD_SORT : ${USE_X86_SIMD_SORT}")
136137
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
137138
message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}")
138139
message(STATUS " USE_KINETO : ${USE_KINETO}")

test/inductor/test_torchinductor_opinfo.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,9 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
466466
("nn.functional.interpolate.bicubic", u8): {"atol": 1, "rtol": 0},
467467
# High atol due to precision loss
468468
("nn.functional.interpolate.bicubic", f32): {"atol": 5e-3, "rtol": 0},
469+
# reference_in_float can cause erroneous failures in sorting tests
470+
"argsort": {"reference_in_float": False},
471+
"sort": {"reference_in_float": False},
469472
}
470473

471474
inductor_override_kwargs["cuda"] = {
@@ -536,6 +539,9 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
536539
("index_reduce.amax", f32): {"check_gradient": False},
537540
("index_reduce.amax", f16): {"check_gradient": False},
538541
("tanh", f16): {"atol": 1e-4, "rtol": 1e-2},
542+
# reference_in_float can cause erroneous failures in sorting tests
543+
"argsort": {"reference_in_float": False},
544+
"sort": {"reference_in_float": False},
539545
}
540546

541547
inductor_override_kwargs["xpu"] = {
@@ -655,6 +661,9 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
655661
("nn.functional.embedding_bag", f64): {"check_gradient": False},
656662
("_unsafe_masked_index", f16): {"atol": 1e-5, "rtol": 2e-3},
657663
("_unsafe_masked_index_put_accumulate", f16): {"atol": 1e-5, "rtol": 5e-3},
664+
# reference_in_float can cause erroneous failures in sorting tests
665+
"argsort": {"reference_in_float": False},
666+
"sort": {"reference_in_float": False},
658667
}
659668

660669
# Test with one sample only for following ops

third_party/x86-simd-sort

Submodule x86-simd-sort added at 9a1b616

0 commit comments

Comments
 (0)