oneapi-src · abhilash1910 · Sep 15, 2023 · Sep 18, 2023 · Sep 18, 2023 · Sep 18, 2023
@@ -107,7 +107,6 @@ ENTRY_MEMBER_FUNCTION(cub::BlockExchange, cub::BlockExchange, ScatterToStripedFl
 ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, InitHistogram, InitHistogram, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, Histogram, Histogram, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockHistogram, cub::BlockHistogram, Composite, Composite, false, NO_FLAG, P4, "Comment")
-ENTRY_MEMBER_FUNCTION(cub::BlockLoad, cub::BlockLoad, Load, Load, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockStore, cub::BlockStore, Store, Store, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockRadixSort, cub::BlockRadixSort, Sort, Sort, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockRadixSort, cub::BlockRadixSort, SortDescending, SortDescending, false, NO_FLAG, P4, "Comment")
@@ -123,6 +122,7 @@ ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Offset, Offset, fals
 ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Rotate, Rotate, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Up, Up, false, NO_FLAG, P4, "Comment")
 ENTRY_MEMBER_FUNCTION(cub::BlockShuffle, cub::BlockShuffle, Down, Down, false, NO_FLAG, P4, "Comment")
+ENTRY_MEMBER_FUNCTION(cub::BlockLoad, cub::BlockLoad, Load, Load, true, NO_FLAG, P4, "Successful")
 
 
 // Device Level

@@ -558,14 +558,14 @@ void CubDeviceLevelRule::removeRedundantTempVar(const CallExpr *CE) {
 void CubRule::registerMatcher(ast_matchers::MatchFinder &MF) {
   MF.addMatcher(
       typeLoc(loc(qualType(hasDeclaration(namedDecl(hasAnyName(
-                  "WarpScan", "WarpReduce", "BlockScan", "BlockReduce"))))))
+                  "WarpScan", "WarpReduce", "BlockScan", "BlockReduce", "BlockLoad"))))))
           .bind("TypeLoc"),
       this);
 
   MF.addMatcher(
       typedefDecl(
           hasType(hasCanonicalType(qualType(hasDeclaration(namedDecl(hasAnyName(
-              "WarpScan", "WarpReduce", "BlockScan", "BlockReduce")))))))
+              "WarpScan", "WarpReduce", "BlockScan", "BlockReduce", "BlockLoad")))))))
           .bind("TypeDefDecl"),
       this);
 
@@ -684,7 +684,8 @@ void CubRule::processCubDeclStmt(const DeclStmt *DS) {
                  ObjTypeStr.find("class cub::WarpReduce") == 0) {
         Repl = DpctGlobalInfo::getSubGroup(DRE);
       } else if (ObjTypeStr.find("class cub::BlockScan") == 0 ||
-                 ObjTypeStr.find("class cub::BlockReduce") == 0) {
+                 ObjTypeStr.find("class cub::BlockReduce") == 0 ||
+                 ObjTypeStr.find("class cub::BlockLoad") == 0) {
         Repl = DpctGlobalInfo::getGroup(DRE);
       } else {
         continue;
@@ -749,7 +750,8 @@ void CubRule::processCubTypeDef(const TypedefDecl *TD) {
               !(ObjTypeStr.find("class cub::WarpScan") == 0 ||
                 ObjTypeStr.find("class cub::WarpReduce") == 0 ||
                 ObjTypeStr.find("class cub::BlockScan") == 0 ||
-                ObjTypeStr.find("class cub::BlockReduce") == 0)) {
+                ObjTypeStr.find("class cub::BlockReduce") == 0 ||
+                ObjTypeStr.find("class cub::BlockLoad") == 0)) {
             DeleteFlag = false;
             break;
           }
@@ -1139,6 +1141,17 @@ void CubRule::processBlockLevelMemberCall(const CXXMemberCallExpr *BlockMC) {
     CubParamAs << GroupOrWorkitem << InEA.getReplacedString() << OpRepl;
     Repl = NewFuncName + "(" + ParamList + ")";
     emplaceTransformation(new ReplaceStmt(BlockMC, Repl));
+  } else if (FuncName == "Load") {
+
+    GroupOrWorkitem = DpctGlobalInfo::getItem(BlockMC);
+    NewFuncName = Mapnames::getClNamespace() + "load";
+    const Expr *InData = FuncArgs[0];
+    ExprAnalysis InEA(InData);
+    OpRepl = getOpRepl(nullptr);
+    CubParamAs << GroupOrWorkitem << InEA.getReplacedString() << OpRepl;
+    Repl = NewFuncName + "(" + ParamList + ")";
+    emplaceTransformation(new ReplaceStmt(BlockMC, Repl));
+
   }
 }
 
@@ -1304,7 +1317,8 @@ void CubRule::processCubMemberCall(const CXXMemberCallExpr *MC) {
              ObjTypeStr.find("class cub::WarpReduce") == 0) {
     processWarpLevelMemberCall(MC);
   } else if (ObjTypeStr.find("class cub::BlockScan") == 0 ||
-             ObjTypeStr.find("class cub::BlockReduce") == 0) {
+             ObjTypeStr.find("class cub::BlockReduce") == 0 ||
+             ObjTypeStr.find("class cub::BlockLoad") == 0) {
     processBlockLevelMemberCall(MC);
   } else {
     report(MC->getBeginLoc(), Diagnostics::API_NOT_MIGRATED, false, ObjTypeStr);
@@ -1328,7 +1342,8 @@ void CubRule::processTypeLoc(const TypeLoc *TL) {
                                       MapNames::getClNamespace() + "sub_group",
                                       SM));
   } else if (TypeName.find("class cub::BlockScan") == 0 ||
-             TypeName.find("class cub::BlockReduce") == 0) {
+             TypeName.find("class cub::BlockReduce") == 0 ||
+             ObjTypeStr.find("class cub::BlockLoad") == 0) {
     auto DeviceFuncDecl = DpctGlobalInfo::findAncestor<FunctionDecl>(TL);
     if (DeviceFuncDecl && (DeviceFuncDecl->hasAttr<CUDADeviceAttr>() ||
                            DeviceFuncDecl->hasAttr<CUDAGlobalAttr>())) {

diff --git a/clang/lib/DPCT/ExprAnalysis.cpp b/clang/lib/DPCT/ExprAnalysis.cpp
@@ -1170,7 +1170,7 @@ void ExprAnalysis::analyzeType(TypeLoc TL, const Expr *CSCE,
       }
     }
     if (OS.str() != "cub::WarpScan" && OS.str() != "cub::WarpReduce" &&
-        OS.str() != "cub::BlockReduce" && OS.str() != "cub::BlockScan") {
+        OS.str() != "cub::BlockReduce" && OS.str() != "cub::BlockScan" && OS.str() != "cub::BlockLoad") {
       SR.setEnd(TSTL.getTemplateNameLoc());
     }
     analyzeTemplateSpecializationType(TSTL);

diff --git a/clang/test/dpct/cub/blocklevel/blockload.cu b/clang/test/dpct/cub/blocklevel/blockload.cu
@@ -0,0 +1,83 @@
+// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
+// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
+// RUN: dpct -in-root %S -out-root %T/blocklevel/blockload %S/blockload.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
+// RUN: FileCheck --input-file %T/blocklevel/blockscan/blockload.dp.cpp --match-full-lines %s
+
+#include <iostream>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+
+#define WARP_SIZE 32
+
+const int N = 256;
+const int BlockSize = 128;
+const int ItemsPerThread = 4;
+
+
+void init_data(int* data, int num) {
+  for(int i = 0; i < num; i++)
+    data[i] = i;
+}
+void verify_data(int* data, int num) {
+  return;
+}
+void print_data(int* data, int num) {
+  for (int i = 0; i < num; i++) {
+    std::cout << data[i] << " ";
+  }
+  std::cout << std::endl;
+}
+
+//CHECK: void BlockLoadKernel(int* data,
+//CHECK-NEXT:  const sycl::nd_item<3> &item_ct1) {
+//CHECK-EMPTY:
+//CHECK-NEXT:  int threadid = item_ct1.get_local_id(2);
+//CHECK-EMPTY:
+//CHECK-NEXT:  int input = data[threadid];
+//CHECK-NEXT:  int output = 0;
+//CHECK-NEXT:  output = sycl::load(item_ct1.get_group(), input, sycl::plus<>());
+//CHECK-NEXT:  data[threadid] = output;
+//CHECK-NEXT:}
+
+__global__ void BlockLoadKernel(int *d_data)
+{
+    // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+    typedef cub::BlockLoad<int, BlockSize, ItemsPerThread, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+    // Allocate shared memory for BlockLoad
+    __shared__ typename BlockLoad::TempStorage temp_storage;
+    // Load a segment of consecutive items that are blocked across threads
+    int thread_data[ItemsPerThread];
+    int offset = threadIdx.x * ItemsPerThread;
+    BlockLoad(temp_storage).Load(d_data + offset, thread_data);
+
+    // Print loaded data
+    printf("Thread %d loaded: %d %d %d %d\n", threadIdx.x, thread_data[0], thread_data[1], thread_data[2], thread_data[3]);
+}
+
+int main()
+{
+    int h_data[N];
+    init_data(h_data, N);
+    int *d_data;
+    cudaMalloc((void**)&d_data, N * sizeof(int));
+    cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice);
+
+    //CHECK:  q_ct1.parallel_for(
+    //CHECK-NEXT:        sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
+    //CHECK-NEXT:        [=](sycl::nd_item<3> item_ct1) {
+    //CHECK-NEXT:          BlockLoadKernel(dev_data, item_ct1);
+    //CHECK-NEXT:        });
+
+    dim3 block(BlockSize);
+    dim3 grid((N + BlockSize - 1) / BlockSize);
+
+    BlockLoadKernel<<<grid, block>>>(d_data);
+    cudaDeviceSynchronize();
+    //verify_data(d_data, N);   
+
+    cudaFree(d_data);
+
+    return 0;
+}