Skip to content

Commit 9c44d2a

Browse files
author
Raghuveer Devulapalli
authored
Merge pull request #34 from r-devulap/argsort
Add AVX-512 argsort for 32 and 64-bit data type
2 parents 62b03bf + 353dc3c commit 9c44d2a

16 files changed

+1892
-509
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
CXX = g++-12
1+
CXX ?= g++-12
22
SRCDIR = ./src
33
TESTDIR = ./tests
44
BENCHDIR = ./benchmarks

benchmarks/bench-qsort-common.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
#ifndef AVX512_BENCH_COMMON
22
#define AVX512_BENCH_COMMON
33

4-
#include <benchmark/benchmark.h>
5-
#include "rand_array.h"
6-
#include "cpuinfo.h"
74
#include "avx512-16bit-qsort.hpp"
85
#include "avx512-32bit-qsort.hpp"
6+
#include "avx512-64bit-argsort.hpp"
97
#include "avx512-64bit-qsort.hpp"
8+
#include "cpuinfo.h"
9+
#include "rand_array.h"
10+
#include <benchmark/benchmark.h>
1011

1112
#define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \
1213
BENCHMARK_PRIVATE_DECLARE(func) \

benchmarks/bench_argsort.hpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include "bench-qsort-common.h"
2+
3+
// Scalar reference argsort built on std::sort.
//
// Returns a vector holding the indices 0 .. array.size()-1, reordered so that
// the elements of `array` they refer to are in ascending order according to
// operator<. `array` is taken by const reference and is not modified. An empty
// input yields an empty index vector.
//
// std::sort makes no promise about the relative order of equal elements, so
// indices of equal values may come back in any order.
template <typename T>
4+
std::vector<int64_t> stdargsort(const std::vector<T> &array)
5+
{
6+
// One index slot per input element, filled with 0, 1, 2, ...
std::vector<int64_t> indices(array.size());
7+
std::iota(indices.begin(), indices.end(), 0);
8+
// Only the index vector is permuted; the comparator looks the values up in
// `array`, which is captured by reference.
std::sort(indices.begin(),
9+
indices.end(),
10+
[&array](int64_t left, int64_t right) -> bool {
11+
// sort indices according to corresponding array element
12+
return array[left] < array[right];
13+
});
14+
15+
return indices;
16+
}
17+
18+
// Benchmark body that times the std::sort-based stdargsort(const std::vector<T>&)
// overload, i.e. the scalar baseline.
//
// The captured arguments are read positionally:
//   args[0] -> number of elements in the input array (ARRSIZE)
//   args[1] -> input pattern name: "random", "sorted", "constant" or "reverse"
//
// NOTE(review): there is no final `else`; a pattern name that matches none of
// the four strings leaves `arr` empty, so the timed loop would sort nothing.
// Confirm that callers only ever pass the four names above.
template <typename T, class... Args>
19+
static void stdargsort(benchmark::State &state, Args &&...args)
20+
{
21+
// Pack the captured arguments into a tuple so they can be read by position.
auto args_tuple = std::make_tuple(std::move(args)...);
22+
// Setup (not timed): build the input array for the requested pattern.
23+
size_t ARRSIZE = std::get<0>(args_tuple);
24+
std::vector<T> arr;
25+
std::vector<int64_t> inx;
26+
27+
// get_uniform_rand_array is defined outside this file; presumably it returns
// the requested number of random values of type T -- TODO confirm.
std::string arrtype = std::get<1>(args_tuple);
28+
if (arrtype == "random") { arr = get_uniform_rand_array<T>(ARRSIZE); }
29+
else if (arrtype == "sorted") {
30+
// Random values put into ascending order.
arr = get_uniform_rand_array<T>(ARRSIZE);
31+
std::sort(arr.begin(), arr.end());
32+
}
33+
else if (arrtype == "constant") {
34+
// A single generated value repeated ARRSIZE times.
T temp = get_uniform_rand_array<T>(1)[0];
35+
for (size_t ii = 0; ii < ARRSIZE; ++ii) {
36+
arr.push_back(temp);
37+
}
38+
}
39+
else if (arrtype == "reverse") {
40+
// Random values sorted ascending and then reversed.
arr = get_uniform_rand_array<T>(ARRSIZE);
41+
std::sort(arr.begin(), arr.end());
42+
std::reverse(arr.begin(), arr.end());
43+
}
44+
45+
/* timed region: std::sort-based argsort (scalar baseline). `arr` itself is
 * never modified, so every iteration sorts the same input. */
46+
for (auto _ : state) {
47+
inx = stdargsort(arr);
48+
}
49+
}
50+
51+
// Benchmark body that times avx512_argsort<T> on an array of type T.
//
// The captured arguments are read positionally:
//   args[0] -> number of elements in the input array (ARRSIZE)
//   args[1] -> input pattern name: "random", "sorted", "constant" or "reverse"
//
// The benchmark is marked as skipped when cpu_has_avx512bw() reports false.
//
// NOTE(review): there is no final `else`; a pattern name that matches none of
// the four strings leaves `arr` empty while ARRSIZE is still passed to
// avx512_argsort together with arr.data(). Confirm that callers only ever pass
// the four names above.
template <typename T, class... Args>
52+
static void avx512argsort(benchmark::State &state, Args &&...args)
53+
{
54+
// Pack the captured arguments into a tuple so they can be read by position.
auto args_tuple = std::make_tuple(std::move(args)...);
55+
// NOTE(review): nothing returns after SkipWithMessage, so the array setup
// below still runs on CPUs that fail this check. The Google Benchmark user
// guide leaves exiting the function to the caller -- confirm whether an
// early return is wanted here.
if (!cpu_has_avx512bw()) {
56+
state.SkipWithMessage("Requires AVX512 BW ISA");
57+
}
58+
// Setup (not timed): build the input array for the requested pattern.
59+
size_t ARRSIZE = std::get<0>(args_tuple);
60+
std::vector<T> arr;
61+
std::vector<int64_t> inx;
62+
63+
// get_uniform_rand_array is defined outside this file; presumably it returns
// the requested number of random values of type T -- TODO confirm.
std::string arrtype = std::get<1>(args_tuple);
64+
if (arrtype == "random") { arr = get_uniform_rand_array<T>(ARRSIZE); }
65+
else if (arrtype == "sorted") {
66+
// Random values put into ascending order.
arr = get_uniform_rand_array<T>(ARRSIZE);
67+
std::sort(arr.begin(), arr.end());
68+
}
69+
else if (arrtype == "constant") {
70+
// A single generated value repeated ARRSIZE times.
T temp = get_uniform_rand_array<T>(1)[0];
71+
for (size_t ii = 0; ii < ARRSIZE; ++ii) {
72+
arr.push_back(temp);
73+
}
74+
}
75+
else if (arrtype == "reverse") {
76+
// Random values sorted ascending and then reversed.
arr = get_uniform_rand_array<T>(ARRSIZE);
77+
std::sort(arr.begin(), arr.end());
78+
std::reverse(arr.begin(), arr.end());
79+
}
80+
81+
/* timed region: AVX-512 argsort. The result is assigned to `inx`; whether
 * avx512_argsort leaves `arr` untouched is not visible from this file. */
82+
for (auto _ : state) {
83+
inx = avx512_argsort<T>(arr.data(), ARRSIZE);
84+
}
85+
}
86+
87+
// Registers the AVX-512 argsort benchmark and the std::sort-based baseline for
// one element type, so the two appear side by side in the results.
// BENCH is defined outside this file; presumably it expands to the benchmark
// registrations for the given function and type -- TODO confirm.
//
// NOTE(review): the last line of the macro body also ends with a line
// continuation, so the definition extends onto the blank line that follows.
// That is harmless as written, but removing the blank line would pull the
// first BENCH_BOTH(...) use into the macro body.
#define BENCH_BOTH(type)\
88+
BENCH(avx512argsort, type)\
89+
BENCH(stdargsort, type)\
90+
91+
// Element types covered: the 64-bit types first, then the 32-bit types.
BENCH_BOTH(int64_t)
92+
BENCH_BOTH(uint64_t)
93+
BENCH_BOTH(double)
94+
BENCH_BOTH(int32_t)
95+
BENCH_BOTH(uint32_t)
96+
BENCH_BOTH(float)

benchmarks/bench_partial_qsort.hpp

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#include "bench-qsort-common.h"
22

33
template <typename T>
4-
static void avx512_partial_qsort(benchmark::State& state) {
4+
static void avx512_partial_qsort(benchmark::State &state)
5+
{
56
if (!cpu_has_avx512bw()) {
67
state.SkipWithMessage("Requires AVX512 BW ISA");
78
}
@@ -29,7 +30,8 @@ static void avx512_partial_qsort(benchmark::State& state) {
2930
}
3031

3132
template <typename T>
32-
static void stdpartialsort(benchmark::State& state) {
33+
static void stdpartialsort(benchmark::State &state)
34+
{
3335
// Perform setup here
3436
int64_t K = state.range(0);
3537
size_t ARRSIZE = 10000;
@@ -53,20 +55,48 @@ static void stdpartialsort(benchmark::State& state) {
5355
// Register the function as a benchmark
5456
BENCHMARK(avx512_partial_qsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
5557
BENCHMARK(stdpartialsort<float>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
56-
BENCHMARK(avx512_partial_qsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
58+
BENCHMARK(avx512_partial_qsort<uint32_t>)
59+
->Arg(10)
60+
->Arg(100)
61+
->Arg(1000)
62+
->Arg(5000);
5763
BENCHMARK(stdpartialsort<uint32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
58-
BENCHMARK(avx512_partial_qsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
64+
BENCHMARK(avx512_partial_qsort<int32_t>)
65+
->Arg(10)
66+
->Arg(100)
67+
->Arg(1000)
68+
->Arg(5000);
5969
BENCHMARK(stdpartialsort<int32_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
6070

61-
BENCHMARK(avx512_partial_qsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
71+
BENCHMARK(avx512_partial_qsort<double>)
72+
->Arg(10)
73+
->Arg(100)
74+
->Arg(1000)
75+
->Arg(5000);
6276
BENCHMARK(stdpartialsort<double>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
63-
BENCHMARK(avx512_partial_qsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
77+
BENCHMARK(avx512_partial_qsort<uint64_t>)
78+
->Arg(10)
79+
->Arg(100)
80+
->Arg(1000)
81+
->Arg(5000);
6482
BENCHMARK(stdpartialsort<uint64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
65-
BENCHMARK(avx512_partial_qsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
83+
BENCHMARK(avx512_partial_qsort<int64_t>)
84+
->Arg(10)
85+
->Arg(100)
86+
->Arg(1000)
87+
->Arg(5000);
6688
BENCHMARK(stdpartialsort<int64_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
6789

6890
//BENCHMARK(avx512_partial_qsort<float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
69-
BENCHMARK(avx512_partial_qsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
91+
BENCHMARK(avx512_partial_qsort<uint16_t>)
92+
->Arg(10)
93+
->Arg(100)
94+
->Arg(1000)
95+
->Arg(5000);
7096
BENCHMARK(stdpartialsort<uint16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
71-
BENCHMARK(avx512_partial_qsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
97+
BENCHMARK(avx512_partial_qsort<int16_t>)
98+
->Arg(10)
99+
->Arg(100)
100+
->Arg(1000)
101+
->Arg(5000);
72102
BENCHMARK(stdpartialsort<int16_t>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);

benchmarks/bench_qselect.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#include "bench-qsort-common.h"
22

33
template <typename T>
4-
static void avx512_qselect(benchmark::State& state) {
4+
static void avx512_qselect(benchmark::State &state)
5+
{
56
if (!cpu_has_avx512bw()) {
67
state.SkipWithMessage("Requires AVX512 BW ISA");
78
}
@@ -29,7 +30,8 @@ static void avx512_qselect(benchmark::State& state) {
2930
}
3031

3132
template <typename T>
32-
static void stdnthelement(benchmark::State& state) {
33+
static void stdnthelement(benchmark::State &state)
34+
{
3335
// Perform setup here
3436
int64_t K = state.range(0);
3537
size_t ARRSIZE = 10000;

benchmarks/bench_qsort.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
#include "bench_qsort.hpp"
2-
#include "bench_qselect.hpp"
2+
#include "bench_argsort.hpp"
33
#include "bench_partial_qsort.hpp"
4+
#include "bench_qselect.hpp"

benchmarks/bench_qsort.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,15 @@ static void avx512qsort(benchmark::State &state, Args &&...args)
8080
}
8181
}
8282

83-
#define BENCH_ALL(type)\
83+
#define BENCH_BOTH_QSORT(type)\
8484
BENCH(avx512qsort, type)\
8585
BENCH(stdsort, type)
8686

87-
BENCH_ALL(uint64_t)
88-
BENCH_ALL(int64_t)
89-
BENCH_ALL(uint32_t)
90-
BENCH_ALL(int32_t)
91-
BENCH_ALL(uint16_t)
92-
BENCH_ALL(int16_t)
93-
BENCH_ALL(float)
94-
BENCH_ALL(double)
87+
BENCH_BOTH_QSORT(uint64_t)
88+
BENCH_BOTH_QSORT(int64_t)
89+
BENCH_BOTH_QSORT(uint32_t)
90+
BENCH_BOTH_QSORT(int32_t)
91+
BENCH_BOTH_QSORT(uint16_t)
92+
BENCH_BOTH_QSORT(int16_t)
93+
BENCH_BOTH_QSORT(float)
94+
BENCH_BOTH_QSORT(double)

benchmarks/bench_qsortfp16.cpp

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
#include <benchmark/benchmark.h>
2-
#include "rand_array.h"
3-
#include "cpuinfo.h"
41
#include "avx512fp16-16bit-qsort.hpp"
2+
#include "cpuinfo.h"
3+
#include "rand_array.h"
4+
#include <benchmark/benchmark.h>
55

66
template <typename T>
7-
static void avx512_qsort(benchmark::State& state) {
7+
static void avx512_qsort(benchmark::State &state)
8+
{
89
if (cpu_has_avx512fp16()) {
910
// Perform setup here
1011
size_t ARRSIZE = state.range(0);
@@ -13,7 +14,7 @@ static void avx512_qsort(benchmark::State& state) {
1314

1415
/* Initialize elements */
1516
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
16-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
17+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
1718
arr.push_back(temp);
1819
}
1920
arr_bkp = arr;
@@ -32,15 +33,16 @@ static void avx512_qsort(benchmark::State& state) {
3233
}
3334

3435
template <typename T>
35-
static void stdsort(benchmark::State& state) {
36+
static void stdsort(benchmark::State &state)
37+
{
3638
if (cpu_has_avx512fp16()) {
3739
// Perform setup here
3840
size_t ARRSIZE = state.range(0);
3941
std::vector<T> arr;
4042
std::vector<T> arr_bkp;
4143

4244
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
43-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
45+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
4446
arr.push_back(temp);
4547
}
4648
arr_bkp = arr;
@@ -63,7 +65,8 @@ BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000);
6365
BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000);
6466

6567
template <typename T>
66-
static void avx512_qselect(benchmark::State& state) {
68+
static void avx512_qselect(benchmark::State &state)
69+
{
6770
if (cpu_has_avx512fp16()) {
6871
// Perform setup here
6972
int64_t K = state.range(0);
@@ -73,7 +76,7 @@ static void avx512_qselect(benchmark::State& state) {
7376

7477
/* Initialize elements */
7578
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
76-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
79+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
7780
arr.push_back(temp);
7881
}
7982
arr_bkp = arr;
@@ -93,7 +96,8 @@ static void avx512_qselect(benchmark::State& state) {
9396
}
9497

9598
template <typename T>
96-
static void stdnthelement(benchmark::State& state) {
99+
static void stdnthelement(benchmark::State &state)
100+
{
97101
if (cpu_has_avx512fp16()) {
98102
// Perform setup here
99103
int64_t K = state.range(0);
@@ -103,7 +107,7 @@ static void stdnthelement(benchmark::State& state) {
103107

104108
/* Initialize elements */
105109
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
106-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
110+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
107111
arr.push_back(temp);
108112
}
109113
arr_bkp = arr;
@@ -127,7 +131,8 @@ BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
127131
BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
128132

129133
template <typename T>
130-
static void avx512_partial_qsort(benchmark::State& state) {
134+
static void avx512_partial_qsort(benchmark::State &state)
135+
{
131136
if (cpu_has_avx512fp16()) {
132137
// Perform setup here
133138
int64_t K = state.range(0);
@@ -137,7 +142,7 @@ static void avx512_partial_qsort(benchmark::State& state) {
137142

138143
/* Initialize elements */
139144
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
140-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
145+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
141146
arr.push_back(temp);
142147
}
143148
arr_bkp = arr;
@@ -157,7 +162,8 @@ static void avx512_partial_qsort(benchmark::State& state) {
157162
}
158163

159164
template <typename T>
160-
static void stdpartialsort(benchmark::State& state) {
165+
static void stdpartialsort(benchmark::State &state)
166+
{
161167
if (cpu_has_avx512fp16()) {
162168
// Perform setup here
163169
int64_t K = state.range(0);
@@ -167,7 +173,7 @@ static void stdpartialsort(benchmark::State& state) {
167173

168174
/* Initialize elements */
169175
for (size_t jj = 0; jj < ARRSIZE; ++jj) {
170-
_Float16 temp = (float) rand() / (float)(RAND_MAX);
176+
_Float16 temp = (float)rand() / (float)(RAND_MAX);
171177
arr.push_back(temp);
172178
}
173179
arr_bkp = arr;
@@ -187,5 +193,9 @@ static void stdpartialsort(benchmark::State& state) {
187193
}
188194

189195
// Register the function as a benchmark
190-
BENCHMARK(avx512_partial_qsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);
196+
BENCHMARK(avx512_partial_qsort<_Float16>)
197+
->Arg(10)
198+
->Arg(100)
199+
->Arg(1000)
200+
->Arg(5000);
191201
BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000);

0 commit comments

Comments
 (0)