numpy
diff --git a/‎LICENSE.md‎
Lines changed: 28 additions & 0 deletions b/‎LICENSE.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 27 additions & 0 deletions b/‎Makefile‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 117 additions & 0 deletions b/‎README.md‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎_clang-format‎
Lines changed: 98 additions & 0 deletions b/‎_clang-format‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎benchmarks/bench.hpp‎
Lines changed: 74 additions & 0 deletions b/‎benchmarks/bench.hpp‎
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,28 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, Intel. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,27 @@
+# Build the unit-test (testexe) and benchmark (benchexe) executables.
+# CXX and CXXFLAGS may be overridden or extended from the environment.
+CXX		?= g++
+SRCDIR		= ./src
+TESTDIR		= ./tests
+BENCHDIR	= ./benchmarks
+UTILS		= ./utils
+SRCS		= $(wildcard $(SRCDIR)/*.hpp)
+TESTS		= $(wildcard $(TESTDIR)/*.cpp)
+TESTOBJS	= $(patsubst $(TESTDIR)/%.cpp,$(TESTDIR)/%.o,$(TESTS))
+# main.cpp is compiled directly on the link line of the test target, so its
+# object file is excluded from the list of separately built objects.
+TESTOBJS	:= $(filter-out $(TESTDIR)/main.o ,$(TESTOBJS))
+GTEST_LIB	= gtest
+GTEST_INCLUDE	= /usr/local/include
+CXXFLAGS	+= -I$(SRCDIR) -I$(GTEST_INCLUDE) -I$(UTILS)
+LD_FLAGS	= -L /usr/local/lib -l $(GTEST_LIB) -l pthread
+
+# None of these targets produce a file with the target's own name; declaring
+# them phony keeps them working even if a file called "test", "bench",
+# "clean" or "all" appears in the tree.
+.PHONY: all test bench clean
+
+all : test bench
+
+# Every test object depends on all library headers, so a header change
+# rebuilds every test translation unit.
+$(TESTDIR)/%.o : $(TESTDIR)/%.cpp $(SRCS)
+		$(CXX) -march=icelake-client -O3 $(CXXFLAGS) -c $< -o $@
+
+test: $(TESTDIR)/main.cpp $(TESTOBJS) $(SRCS)
+		$(CXX) $(TESTDIR)/main.cpp $(TESTOBJS) $(CXXFLAGS) $(LD_FLAGS) -o testexe
+
+bench: $(BENCHDIR)/main.cpp $(SRCS)
+		$(CXX) $(BENCHDIR)/main.cpp $(CXXFLAGS) -march=icelake-client -O3 -o benchexe
+
+clean:
+		rm -f $(TESTDIR)/*.o testexe benchexe
@@ -0,0 +1,117 @@
+# x86-simd-sort
+
+C++ header file library for SIMD based 16-bit, 32-bit and 64-bit data type
+sorting on x86 processors. Source header files are available in src directory.
+We currently only have AVX-512 based implementation of quicksort. This
+repository also includes a test suite which can be built and run to test the
+sorting algorithms for correctness. It also has benchmarking code to compare
+its performance relative to std::sort.
+
+## Algorithm details
+
+The ideas and code are based on these two research papers [1] and [2]. On a
+high level, the idea is to vectorize quicksort partitioning using AVX-512
+compressstore instructions. If the array size is < 128, then use Bitonic
+sorting network implemented on 512-bit registers.  The precise network
+definitions depend on the size of the dtype and are defined in separate files:
+`avx512-16bit-qsort.hpp`, `avx512-32bit-qsort.hpp` and
+`avx512-64bit-qsort.hpp`. Article [4] is a good resource for bitonic sorting
+network. The core implementations of the vectorized qsort functions
+`avx512_qsort<T>(T*, int64_t)` are modified versions of avx2 quicksort
+presented in the paper [2] and source code associated with that paper [3].
+
+## Handling NAN in float and double arrays
+
+If you expect your array to contain NANs, please be aware that these
+routines **do not preserve your NANs as you pass them**. The
+`avx512_qsort<T>()` routine will put all your NANs at the end of the sorted
+array and replace them with `std::nan("1")`. Please take a look at
+`avx512_qsort<float>()` and `avx512_qsort<double>()` functions for details.
+
+## Example to include and build this in a C++ code
+
+### Sample code `main.cpp`
+
+```cpp
+#include "src/avx512-32bit-qsort.hpp"
+
+int main() {
+    const int ARRSIZE = 10;
+    std::vector<float> arr;
+
+    /* Initialize elements in reverse order */
+    for (int ii = 0; ii < ARRSIZE; ++ii) {
+        arr.push_back(ARRSIZE - ii);
+    }
+
+    /* call avx512 quicksort */
+    avx512_qsort<float>(arr.data(), ARRSIZE);
+    return 0;
+}
+
+```
+
+### Build using gcc
+
+```
+g++ main.cpp -mavx512f -mavx512dq -O3
+```
+
+This is a header-only library and we do not provide any compile-time or
+run-time checks, which are recommended when including this in your source
+code. A slightly modified version of this source code has been contributed to
+[NumPy](https://github.com/numpy/numpy) (see this [pull
+request](https://github.com/numpy/numpy/pull/22315) for details). This NumPy
+pull request is a good reference for how to include and build this library with
+your source code.
+
+## Build requirements
+
+None, it is a header-only library. However, you will need `make` or `meson` to
+build the unit tests and benchmarking suite. You will need a relatively modern
+compiler to build.
+
+```
+gcc >= 8.x
+```
+
+### Build using Make
+
+`make` command builds two executables:
+- `testexe`: runs a bunch of tests written in ./tests directory.
+- `benchexe`: measures performance of these algorithms for various data types
+  and compares them to std::sort.
+
+You can use `make test` and `make bench` to build just the `testexe` and
+`benchexe` respectively.
+
+### Build using Meson
+
+You can also build `testexe` and `benchexe` using Meson/Ninja with the following
+command:
+
+```
+meson setup builddir && cd builddir && ninja
+```
+
+## Requirements and dependencies
+
+The sorting routines rely only on the C++ Standard Library and require a
+relatively modern compiler to build (gcc 8.x and above). Since they use the
+AVX-512 instruction set, they can only run on processors that have AVX-512.
+Specifically, the 32-bit and 64-bit sorting routines require the AVX-512F and
+AVX-512DQ instruction sets. The 16-bit sorting requires the AVX-512F,
+AVX-512BW and AVX-512 VBMI2 instruction sets. The test suite is written using
+the Google test framework.
+
+## References
+
+* [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
+    https://drops.dagstuhl.de/opus/volltexte/2021/13775/
+
+* [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
+Skylake https://arxiv.org/pdf/1704.08579.pdf
+
+* [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT
+
+* [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
+
@@ -0,0 +1,98 @@
+---
+Language:        Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: DontAlign
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   true
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      true
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+FixNamespaceComments: true
+ForEachMacros:
+IncludeBlocks:   Preserve
+IndentCaseLabels: true
+# IndentPPDirectives: AfterHash
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments:  false
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        4
+UseTab:          Never
+...
+# vim:ft=conf et ts=2 sw=2
@@ -0,0 +1,74 @@
+/*******************************************
+ * * Copyright (C) 2022 Intel Corporation
+ * * SPDX-License-Identifier: BSD-3-Clause
+ * *******************************************/
+
+#include "avx512-16bit-qsort.hpp"
+#include "avx512-32bit-qsort.hpp"
+#include "avx512-64bit-qsort.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+
+/*
+ * Samples the time-stamp counter at the beginning of a timed region.
+ *
+ * Executes `cpuid` followed by `rdtsc`, then returns the 64-bit value built
+ * from the EDX output (upper 32 bits) and the EAX output (lower 32 bits).
+ *
+ * NOTE(review): `cpuid` is presumably used here as a serializing barrier so
+ * that earlier instructions complete before `rdtsc` reads the counter --
+ * confirm against the Intel SDM. No input operand sets EAX, so the CPUID leaf
+ * that gets queried is whatever the register happens to hold.
+ */
+static inline uint64_t cycles_start(void)
+{
+    unsigned a, d;
+    __asm__ __volatile__(
+            "cpuid\n\t"
+            "rdtsc\n\t"
+            : "=a"(a), // comma separated output operands
+              "=d"(d)
+            : // comma separated input operands
+            : "rbx", "rcx" // list of clobbered registers
+    );
+    // Combine the two 32-bit halves into one 64-bit counter value.
+    return (((uint64_t)d << 32) | a);
+}
+
+/*
+ * Samples the time-stamp counter at the end of a timed region.
+ *
+ * Executes `rdtscp`, copies EAX/EDX into the `low`/`high` outputs, and only
+ * then executes `cpuid`. Returns the 64-bit value built from `high` (upper 32
+ * bits) and `low` (lower 32 bits).
+ *
+ * NOTE(review): the trailing `cpuid` presumably prevents later instructions
+ * from being executed before the counter has been read -- confirm against the
+ * Intel SDM.
+ */
+static inline uint64_t cycles_end(void)
+{
+    unsigned high, low;
+    __asm__ __volatile__(
+            "rdtscp\n\t"
+            // Save the counter halves before cpuid overwrites EAX and EDX.
+            "movl %%eax, %[low]\n\t"
+            "movl %%edx, %[high]\n\t"
+            "cpuid\n\t"
+            : [high] "=r"(high), [low] "=r"(low)
+            :
+            // All four registers are declared clobbered, so the "=r" outputs
+            // are not allocated to any of them.
+            : "rax", "rbx", "rcx", "rdx");
+    // Combine the two 32-bit halves into one 64-bit counter value.
+    return (((uint64_t)high << 32) | low);
+}
+
+/*
+ * Times avx512_qsort<T> and std::sort on identical copies of `arr`.
+ *
+ * Each sort is run `iters` times; before every run the working buffer is
+ * reset to the unsorted input. The elapsed counter ticks reported by
+ * cycles_start()/cycles_end() are recorded per run, and the mean over the
+ * last `lastfew` runs is reported for each algorithm.
+ *
+ * @param arr      unsorted input; it is never modified
+ * @param iters    number of timed runs per algorithm
+ * @param lastfew  number of trailing runs to average; values larger than
+ *                 `iters` are clamped to `iters`
+ * @return tuple of (mean ticks for avx512_qsort, mean ticks for std::sort);
+ *         both entries are 0 when there are no runs to average
+ */
+template <typename T>
+std::tuple<uint64_t, uint64_t> bench_sort(const std::vector<T> arr,
+                                          const uint64_t iters,
+                                          const uint64_t lastfew)
+{
+    // Averaging over more samples than were collected would step the
+    // iterator before begin(); clamp the window to what actually exists.
+    const uint64_t nsamples = std::min(lastfew, iters);
+
+    // Mean of the trailing `nsamples` entries, or 0 for an empty window
+    // (avoids a division by zero when lastfew or iters is 0).
+    const auto tail_mean
+            = [nsamples](const std::vector<uint64_t> &runtimes) -> uint64_t {
+        if (nsamples == 0) { return 0; }
+        const auto window
+                = static_cast<std::vector<uint64_t>::difference_type>(
+                        nsamples);
+        return std::accumulate(
+                       runtimes.end() - window, runtimes.end(), (uint64_t)0)
+                / nsamples;
+    };
+
+    std::vector<T> arr_bckup = arr;
+    std::vector<uint64_t> runtimes1, runtimes2;
+    runtimes1.reserve(iters);
+    runtimes2.reserve(iters);
+    uint64_t start(0), end(0);
+
+    for (uint64_t ii = 0; ii < iters; ++ii) {
+        start = cycles_start();
+        avx512_qsort<T>(arr_bckup.data(), arr_bckup.size());
+        end = cycles_end();
+        runtimes1.emplace_back(end - start);
+        // Restore the unsorted input outside the timed region.
+        arr_bckup = arr;
+    }
+    const uint64_t avx_sort = tail_mean(runtimes1);
+
+    for (uint64_t ii = 0; ii < iters; ++ii) {
+        start = cycles_start();
+        std::sort(arr_bckup.begin(), arr_bckup.end());
+        end = cycles_end();
+        runtimes2.emplace_back(end - start);
+        // Restore the unsorted input outside the timed region.
+        arr_bckup = arr;
+    }
+    const uint64_t std_sort = tail_mean(runtimes2);
+
+    return std::make_tuple(avx_sort, std_sort);
+}