Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clang/lib/DPCT/APINames_CUB.inc
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ ENTRY_MEMBER_FUNCTION(cub::BlockExchange, cub::BlockExchange, ScatterToStripedFl
ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, InitHistogram, InitHistogram, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, Histogram, Histogram, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, Composite, Composite, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockLoad, cub::BlockLoad, Load, Load, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockStore, cub::BlockStore, Store, Store, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockRadixSort, cub::BlockRadixSort, Sort, Sort, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockRadixSort, cub::BlockRadixSort, SortDescending, SortDescending, false, NO_FLAG, P4, "Comment")
Expand All @@ -123,6 +122,7 @@ ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Offset, Offset, fals
ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Rotate, Rotate, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Up, Up, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Down, Down, false, NO_FLAG, P4, "Comment")
ENTRY_MEMBER_FUNCTION(cub::BlockLoad, cub::BlockLoad, Load, Load, true, NO_FLAG, P4, "Successful")


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove extra blank line change.

// Device Level
Expand Down
27 changes: 21 additions & 6 deletions clang/lib/DPCT/CUBAPIMigration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,14 +558,14 @@ void CubDeviceLevelRule::removeRedundantTempVar(const CallExpr *CE) {
void CubRule::registerMatcher(ast_matchers::MatchFinder &MF) {
MF.addMatcher(
typeLoc(loc(qualType(hasDeclaration(namedDecl(hasAnyName(
"WarpScan", "WarpReduce", "BlockScan", "BlockReduce"))))))
"WarpScan", "WarpReduce", "BlockScan", "BlockReduce", "BlockLoad"))))))
.bind("TypeLoc"),
this);

MF.addMatcher(
typedefDecl(
hasType(hasCanonicalType(qualType(hasDeclaration(namedDecl(hasAnyName(
"WarpScan", "WarpReduce", "BlockScan", "BlockReduce")))))))
"WarpScan", "WarpReduce", "BlockScan", "BlockReduce", "BlockLoad")))))))
.bind("TypeDefDecl"),
this);

Expand Down Expand Up @@ -684,7 +684,8 @@ void CubRule::processCubDeclStmt(const DeclStmt *DS) {
ObjTypeStr.find("class cub::WarpReduce") == 0) {
Repl = DpctGlobalInfo::getSubGroup(DRE);
} else if (ObjTypeStr.find("class cub::BlockScan") == 0 ||
ObjTypeStr.find("class cub::BlockReduce") == 0) {
ObjTypeStr.find("class cub::BlockReduce") == 0 ||
ObjTypeStr.find("class cub::BlockLoad") == 0) {
Repl = DpctGlobalInfo::getGroup(DRE);
} else {
continue;
Expand Down Expand Up @@ -749,7 +750,8 @@ void CubRule::processCubTypeDef(const TypedefDecl *TD) {
!(ObjTypeStr.find("class cub::WarpScan") == 0 ||
ObjTypeStr.find("class cub::WarpReduce") == 0 ||
ObjTypeStr.find("class cub::BlockScan") == 0 ||
ObjTypeStr.find("class cub::BlockReduce") == 0)) {
ObjTypeStr.find("class cub::BlockReduce") == 0 ||
ObjTypeStr.find("class cub::BlockLoad") == 0)) {
DeleteFlag = false;
break;
}
Expand Down Expand Up @@ -1139,6 +1141,17 @@ void CubRule::processBlockLevelMemberCall(const CXXMemberCallExpr *BlockMC) {
CubParamAs << GroupOrWorkitem << InEA.getReplacedString() << OpRepl;
Repl = NewFuncName + "(" + ParamList + ")";
emplaceTransformation(new ReplaceStmt(BlockMC, Repl));
} else if (FuncName == "Load") {

GroupOrWorkitem = DpctGlobalInfo::getItem(BlockMC);
NewFuncName = Mapnames::getClNamespace() + "load";
const Expr *InData = FuncArgs[0];
ExprAnalysis InEA(InData);
OpRepl = getOpRepl(nullptr);
CubParamAs << GroupOrWorkitem << InEA.getReplacedString() << OpRepl;
Repl = NewFuncName + "(" + ParamList + ")";
emplaceTransformation(new ReplaceStmt(BlockMC, Repl));

}
}

Expand Down Expand Up @@ -1304,7 +1317,8 @@ void CubRule::processCubMemberCall(const CXXMemberCallExpr *MC) {
ObjTypeStr.find("class cub::WarpReduce") == 0) {
processWarpLevelMemberCall(MC);
} else if (ObjTypeStr.find("class cub::BlockScan") == 0 ||
ObjTypeStr.find("class cub::BlockReduce") == 0) {
ObjTypeStr.find("class cub::BlockReduce") == 0 ||
ObjTypeStr.find("class cub::BlockLoad") == 0) {
processBlockLevelMemberCall(MC);
} else {
report(MC->getBeginLoc(), Diagnostics::API_NOT_MIGRATED, false, ObjTypeStr);
Expand All @@ -1328,7 +1342,8 @@ void CubRule::processTypeLoc(const TypeLoc *TL) {
MapNames::getClNamespace() + "sub_group",
SM));
} else if (TypeName.find("class cub::BlockScan") == 0 ||
TypeName.find("class cub::BlockReduce") == 0) {
TypeName.find("class cub::BlockReduce") == 0 ||
ObjTypeStr.find("class cub::BlockLoad") == 0) {
auto DeviceFuncDecl = DpctGlobalInfo::findAncestor<FunctionDecl>(TL);
if (DeviceFuncDecl && (DeviceFuncDecl->hasAttr<CUDADeviceAttr>() ||
DeviceFuncDecl->hasAttr<CUDAGlobalAttr>())) {
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/DPCT/ExprAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1170,7 +1170,7 @@ void ExprAnalysis::analyzeType(TypeLoc TL, const Expr *CSCE,
}
}
if (OS.str() != "cub::WarpScan" && OS.str() != "cub::WarpReduce" &&
OS.str() != "cub::BlockReduce" && OS.str() != "cub::BlockScan") {
OS.str() != "cub::BlockReduce" && OS.str() != "cub::BlockScan" && OS.str() != "cub::BlockLoad") {
SR.setEnd(TSTL.getTemplateNameLoc());
}
analyzeTemplateSpecializationType(TSTL);
Expand Down
83 changes: 83 additions & 0 deletions clang/test/dpct/cub/blocklevel/blockload.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
// RUN: dpct -in-root %S -out-root %T/blocklevel/blockload %S/blockload.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
// RUN: FileCheck --input-file %T/blocklevel/blockscan/blockload.dp.cpp --match-full-lines %s

#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <cub/cub.cuh>

#define WARP_SIZE 32

const int N = 256;
const int BlockSize = 128;
const int ItemsPerThread = 4;


void init_data(int* data, int num) {
for(int i = 0; i < num; i++)
data[i] = i;
}
void verify_data(int* data, int num) {
return;
}
void print_data(int* data, int num) {
for (int i = 0; i < num; i++) {
std::cout << data[i] << " ";
}
std::cout << std::endl;
}

//CHECK: void BlockLoadKernel(int* data,
//CHECK-NEXT: const sycl::nd_item<3> &item_ct1) {
//CHECK-EMPTY:
//CHECK-NEXT: int threadid = item_ct1.get_local_id(2);
//CHECK-EMPTY:
//CHECK-NEXT: int input = data[threadid];
//CHECK-NEXT: int output = 0;
//CHECK-NEXT: output = sycl::load(item_ct1.get_group(), input, sycl::plus<>());
//CHECK-NEXT: data[threadid] = output;
//CHECK-NEXT:}

__global__ void BlockLoadKernel(int *d_data)
{
// Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
typedef cub::BlockLoad<int, BlockSize, ItemsPerThread, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
// Allocate shared memory for BlockLoad
__shared__ typename BlockLoad::TempStorage temp_storage;
// Load a segment of consecutive items that are blocked across threads
int thread_data[ItemsPerThread];
int offset = threadIdx.x * ItemsPerThread;
BlockLoad(temp_storage).Load(d_data + offset, thread_data);

// Print loaded data
printf("Thread %d loaded: %d %d %d %d\n", threadIdx.x, thread_data[0], thread_data[1], thread_data[2], thread_data[3]);
}

int main()
{
int h_data[N];
init_data(h_data, N);
int *d_data;
cudaMalloc((void**)&d_data, N * sizeof(int));
cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice);

//CHECK: q_ct1.parallel_for(
//CHECK-NEXT: sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
//CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
//CHECK-NEXT: BlockLoadKernel(dev_data, item_ct1);
//CHECK-NEXT: });

dim3 block(BlockSize);
dim3 grid((N + BlockSize - 1) / BlockSize);

BlockLoadKernel<<<grid, block>>>(d_data);
cudaDeviceSynchronize();
//verify_data(d_data, N);

cudaFree(d_data);

return 0;
}