Skip to content

Commit 0eb633c

Browse files
committed
Add huge sum-reduction and pepper kernel with restrict keyword, it
doubles the speed.
1 parent b7b94aa commit 0eb633c

File tree

3 files changed

+95
-27
lines changed

3 files changed

+95
-27
lines changed

src/gpuarray_buffer_cuda.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,7 @@ static const char CUDA_PREAMBLE[] =
510510
"#define GA_DECL_SHARED_PARAM(type, name)\n"
511511
"#define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[];\n"
512512
"#define GA_WARP_SIZE warpSize\n"
513+
"#define restrict __restrict__\n"
513514
"#line 1\n";
514515

515516
/* XXX: add complex, quads, longlong */

src/gpuarray_reduction.c

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,6 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int r
416416
static size_t reduxInvEstimateParallelism (const redux_ctx* ctx);
417417
static int reduxInvRequiresDst (const redux_ctx* ctx);
418418
static int reduxInvRequiresDstArg (const redux_ctx* ctx);
419-
static int reduxInvKernelRequiresDst (const redux_ctx* ctx);
420419
static unsigned reduxInvGetSplitFree (const redux_ctx* ctx);
421420
static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx);
422421
static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i);
@@ -1144,12 +1143,6 @@ static int reduxInvRequiresDst (const redux_ctx* ctx){
11441143
static int reduxInvRequiresDstArg (const redux_ctx* ctx){
11451144
return reduxGenRequiresDstArg(ctx->gr);
11461145
}
1147-
static int reduxInvKernelRequiresDst (const redux_ctx* ctx){
1148-
return reduxGenKernelRequiresDst(ctx->gr);
1149-
}
1150-
static int reduxInvKernelRequiresDstArg (const redux_ctx* ctx){
1151-
return reduxGenKernelRequiresDstArg(ctx->gr);
1152-
}
11531146
static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){
11541147
if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){
11551148
return axisGetIntraLen(ctx->xdSplit);
@@ -1512,26 +1505,26 @@ static void reduxGenIterArgs (GpuReduction* gr,
15121505
for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){
15131506
fn(gr, GA_SIZE, "TX", "l%dPDim", k, user);
15141507
}
1515-
fn(gr, GA_BUFFER, "const GLOBAL_MEM char*", "s", 0, user);
1508+
fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "s", 0, user);
15161509
fn(gr, GA_SSIZE, "TX", "sOff", 0, user);
15171510
for(k=0;k < gr->nds;k++){
15181511
fn(gr, GA_SIZE, "TX", "sJ%d", k, user);
15191512
}
15201513
if(reduxGenRequiresDst (gr)){
1521-
fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "d", 0, user);
1514+
fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "d", 0, user);
15221515
fn(gr, GA_SSIZE, "TX", "dOff", 0, user);
15231516
for(k=0;k < gr->ndd;k++){
15241517
fn(gr, GA_SIZE, "TX", "dJ%d", k, user);
15251518
}
15261519
}
15271520
if(reduxGenRequiresDstArg(gr)){
1528-
fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "a", 0, user);
1521+
fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "a", 0, user);
15291522
fn(gr, GA_SSIZE, "TX", "aOff", 0, user);
15301523
for(k=0;k < gr->ndd;k++){
15311524
fn(gr, GA_SIZE, "TX", "aJ%d", k, user);
15321525
}
15331526
}
1534-
fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "w", 0, user);
1527+
fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "w", 0, user);
15351528
if(reduxGenKernelRequiresDst (gr)){
15361529
fn(gr, GA_SSIZE, "TX", "wdOff", 0, user);
15371530
fn(gr, GA_SSIZE, "TX", "pdOff", 0, user);
@@ -1624,9 +1617,9 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){
16241617
*/
16251618

16261619
if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){
1627-
srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((TS*)(p));}while(0)\n");
1620+
srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((const TS* restrict)(p));}while(0)\n");
16281621
}else{
1629-
srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(TS*)(p);}while(0)\n");
1622+
srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(const TS* restrict)(p);}while(0)\n");
16301623
}
16311624

16321625

@@ -1737,9 +1730,9 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){
17371730

17381731
if (reduxGenRequiresDst(gr)){
17391732
if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){
1740-
srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)\n");
1733+
srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD* restrict)(p), (v));}while(0)\n");
17411734
}else{
1742-
srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD*)(p) = (v);}while(0)\n");
1735+
srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD* restrict)(p) = (v);}while(0)\n");
17431736
}
17441737
}else{
17451738
srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n");
@@ -1753,7 +1746,7 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){
17531746
*/
17541747

17551748
if (reduxGenRequiresDstArg(gr)){
1756-
srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA*)(p) = (v);}while(0)\n");
1749+
srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA* restrict)(p) = (v);}while(0)\n");
17571750
}else{
17581751
srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n");
17591752
}
@@ -2085,17 +2078,17 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){
20852078
srcbAppends(&gr->srcGen, " \n");
20862079
if(reduxGenKernelRequiresDst(gr)){
20872080
srcbAppends(&gr->srcGen,
2088-
" TK* wd = (TK*)(w + wdOff);\n"
2089-
" TK* wdL = &wd[0];\n"
2090-
" TK* wdR = &wd[GDIM_0*D];\n"
2091-
" TK* pd = (TK*)(SHMEM + pdOff);\n");
2081+
" TK* restrict wd = (TK* restrict)(w + wdOff);\n"
2082+
" TK* restrict wdL = &wd[0];\n"
2083+
" TK* restrict wdR = &wd[GDIM_0*D];\n"
2084+
" TK* restrict pd = (TK* restrict)(SHMEM + pdOff);\n");
20922085
}
20932086
if(reduxGenKernelRequiresDstArg(gr)){
20942087
srcbAppends(&gr->srcGen,
2095-
" TA* wa = (TA*)(w + waOff);\n"
2096-
" TA* waL = &wa[0];\n"
2097-
" TA* waR = &wa[GDIM_0*D];\n"
2098-
" TA* pa = (TA*)(SHMEM + paOff);\n");
2088+
" TA* restrict wa = (TA* restrict)(w + waOff);\n"
2089+
" TA* restrict waL = &wa[0];\n"
2090+
" TA* restrict waR = &wa[GDIM_0*D];\n"
2091+
" TA* restrict pa = (TA* restrict)(SHMEM + paOff);\n");
20992092
}
21002093
srcbAppends(&gr->srcGen, " \n");
21012094
}
@@ -2173,12 +2166,12 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){
21732166
" local_barrier();\n");
21742167
}
21752168
srcbAppends(&gr->srcGen, " \n"
2176-
" const char* ts = s + sOff;\n");
2169+
" const char* restrict ts = s + sOff;\n");
21772170
if(reduxGenRequiresDst(gr)){
2178-
srcbAppends(&gr->srcGen, " char* td = d + dOff;\n");
2171+
srcbAppends(&gr->srcGen, " char* restrict td = d + dOff;\n");
21792172
}
21802173
if(reduxGenRequiresDstArg(gr)){
2181-
srcbAppends(&gr->srcGen, " char* ta = a + aOff;\n");
2174+
srcbAppends(&gr->srcGen, " char* restrict ta = a + aOff;\n");
21822175
}
21832176
srcbAppends(&gr->srcGen, " \n"
21842177
" \n");

tests/check_reduction.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2054,6 +2054,79 @@ START_TEST(test_sum_alldimsreduced){
20542054
GpuArray_clear(&gaD);
20552055
}END_TEST
20562056

2057+
START_TEST(test_sum_huge){
2058+
pcgSeed(1);
2059+
2060+
/**
2061+
* We test here a reduction of a huge 1D tensor on all dimensions.
2062+
*/
2063+
2064+
size_t i;
2065+
size_t dims[1] = {100000000};
2066+
size_t prodDims = dims[0];
2067+
const int reduxList[] = {0};
2068+
const float TOL = 1e-2;
2069+
2070+
float* pS = calloc(1, sizeof(*pS) * dims[0]);
2071+
float* pD = calloc(1, sizeof(*pD));
2072+
2073+
ck_assert_ptr_ne(pS, NULL);
2074+
ck_assert_ptr_ne(pD, NULL);
2075+
2076+
2077+
/**
2078+
* Initialize source data.
2079+
*/
2080+
2081+
for(i=0;i<prodDims;i++){
2082+
pS[i] = pcgRand01()-0.5;
2083+
}
2084+
2085+
2086+
/**
2087+
* Run the kernel.
2088+
*/
2089+
2090+
GpuArray gaS;
2091+
GpuArray gaD;
2092+
2093+
ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_FLOAT, 1, &dims[0], GA_C_ORDER));
2094+
ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER));
2095+
2096+
ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims));
2097+
ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */
2098+
2099+
GpuReduction* gr;
2100+
GpuReduction_new(&gr, GpuArray_context(&gaS),
2101+
GA_REDUCE_SUM, 0, 1, gaS.typecode, 0);
2102+
ck_assert_ptr_nonnull(gr);
2103+
ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 1, reduxList, 0));
2104+
GpuReduction_free(gr);
2105+
2106+
ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD));
2107+
2108+
2109+
/**
2110+
* Check that the destination tensors are correct.
2111+
*/
2112+
2113+
double gtD = 0;
2114+
for(i=0;i<dims[0];i++){
2115+
double v = pS[i];
2116+
gtD += v;
2117+
}
2118+
ck_assert_double_eq_tol(gtD, pD[0], TOL);
2119+
2120+
/**
2121+
* Deallocate.
2122+
*/
2123+
2124+
free(pS);
2125+
free(pD);
2126+
GpuArray_clear(&gaS);
2127+
GpuArray_clear(&gaD);
2128+
}END_TEST
2129+
20572130
START_TEST(test_prod_reduction){
20582131
pcgSeed(1);
20592132

@@ -3938,6 +4011,7 @@ Suite *get_suite(void) {
39384011
tcase_add_test(tc, test_sum_reduction);
39394012
tcase_add_test(tc, test_sum_veryhighrank);
39404013
tcase_add_test(tc, test_sum_alldimsreduced);
4014+
tcase_add_test(tc, test_sum_huge);
39414015

39424016
tcase_add_test(tc, test_prod_reduction);
39434017
tcase_add_test(tc, test_prod_veryhighrank);

0 commit comments

Comments
 (0)