Skip to content

Commit 2b44d2b

Browse files
authored
[SYCLomatic][ASM] Support migration of 2 PTX instruction: shfl and shfl.sync (#2447)
Signed-off-by: chenwei.sun <[email protected]>
1 parent fa328ba commit 2b44d2b

File tree

6 files changed

+245
-1
lines changed

6 files changed

+245
-1
lines changed

clang/lib/DPCT/RulesAsm/AsmMigration.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,6 +1093,132 @@ class SYCLGen : public SYCLGenBase {
10931093
return HandleAddSub(Inst);
10941094
}
10951095

1096+
bool HandleShflSync(const InlineAsmInstruction *Inst) {
1097+
1098+
if (emitStmt(Inst->getOutputOperand()))
1099+
return SYCLGenError();
1100+
OS() << " = ";
1101+
1102+
std::string Op[4];
1103+
if (tryEmitAllInputOperands(Op, Inst))
1104+
return SYCLGenError();
1105+
1106+
OS() << MapNames::getDpctNamespace();
1107+
1108+
if (DpctGlobalInfo::useMaskedSubGroupFunction()) {
1109+
OS() << "experimental::";
1110+
if (Inst->hasAttr(InstAttr::up)) {
1111+
// to handle "shfl.up.b32 %0, %1, %2, %3;"
1112+
OS() << "shift_sub_group_right(" << Op[3] << ", ";
1113+
} else if (Inst->hasAttr(InstAttr::down)) {
1114+
// to handle "shfl.down.b32 %0, %1, %2, %3;"
1115+
OS() << "shift_sub_group_left(" << Op[3] << ", ";
1116+
} else if (Inst->hasAttr(InstAttr::idx)) {
1117+
// to handle "shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
1118+
OS() << "select_from_sub_group(" << Op[3] << ", ";
1119+
} else if (Inst->hasAttr(InstAttr::bfly)) {
1120+
// to handle "shfl.bfly.b32 %0, %1, %2, %3;"
1121+
OS() << "permute_sub_group_by_xor(" << Op[3] << ", ";
1122+
}
1123+
1124+
OS() << DpctGlobalInfo::getItem(GAS) << ".get_sub_group(), " << Op[0]
1125+
<< ", " << Op[1] << ")";
1126+
} else {
1127+
llvm::StringRef Str =
1128+
". You can specify "
1129+
"\"--use-experimental-features=masked-sub-group-operation\" to "
1130+
"use the experimental helper function to migrate inline asm "
1131+
"instruction ";
1132+
1133+
auto CommonStr = llvm::Twine(Str)
1134+
.concat("\"")
1135+
.concat(GAS->getAsmString()->getString())
1136+
.concat("\"")
1137+
.str();
1138+
1139+
if (Inst->hasAttr(InstAttr::up)) {
1140+
// to handle "shfl.sync.up.b32 %0, %1, %2, %3, %4;"
1141+
report(Diagnostics::MASK_UNSUPPORTED, true,
1142+
llvm::Twine("shift_sub_group_right").concat(CommonStr).str());
1143+
OS() << "shift_sub_group_right(";
1144+
} else if (Inst->hasAttr(InstAttr::down)) {
1145+
// to handle "shfl.sync.down.b32 %0, %1, %2, %3, %4;"
1146+
report(Diagnostics::MASK_UNSUPPORTED, true,
1147+
llvm::Twine("shift_sub_group_left").concat(CommonStr).str());
1148+
OS() << "shift_sub_group_left(";
1149+
} else if (Inst->hasAttr(InstAttr::idx)) {
1150+
// to handle "shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
1151+
report(Diagnostics::MASK_UNSUPPORTED, true,
1152+
llvm::Twine("select_from_sub_group").concat(CommonStr).str());
1153+
OS() << "select_from_sub_group(";
1154+
} else if (Inst->hasAttr(InstAttr::bfly)) {
1155+
// to handle "shfl.sync.bfly.b32 %0, %1, %2, %3, %4;"
1156+
report(Diagnostics::MASK_UNSUPPORTED, true,
1157+
llvm::Twine("permute_sub_group_by_xor").concat(CommonStr).str());
1158+
OS() << "permute_sub_group_by_xor(";
1159+
}
1160+
1161+
OS() << DpctGlobalInfo::getItem(GAS) << ".get_sub_group(), " << Op[0]
1162+
<< ", " << Op[1] << ")";
1163+
}
1164+
1165+
endstmt();
1166+
return SYCLGenSuccess();
1167+
}
1168+
1169+
bool HandleShfl(const InlineAsmInstruction *Inst) {
1170+
if (emitStmt(Inst->getOutputOperand()))
1171+
return SYCLGenError();
1172+
OS() << " = ";
1173+
1174+
std::string Op[3];
1175+
if (tryEmitAllInputOperands(Op, Inst))
1176+
return SYCLGenError();
1177+
1178+
OS() << MapNames::getDpctNamespace();
1179+
if (Inst->hasAttr(InstAttr::up)) {
1180+
// to handle "shfl.up.b32 %0, %1, %2, %3;"
1181+
OS() << "shift_sub_group_right(";
1182+
} else if (Inst->hasAttr(InstAttr::down)) {
1183+
// to handle "shfl.down.b32 %0, %1, %2, %3;"
1184+
OS() << "shift_sub_group_left(";
1185+
} else if (Inst->hasAttr(InstAttr::idx)) {
1186+
// to handle "shfl.idx.b32 %0, %1, %2, %3;"
1187+
OS() << "select_from_sub_group(";
1188+
} else if (Inst->hasAttr(InstAttr::bfly)) {
1189+
// to handle "shfl.bfly.b32 %0, %1, %2, %3;"
1190+
OS() << "permute_sub_group_by_xor(";
1191+
}
1192+
1193+
OS() << DpctGlobalInfo::getItem(GAS) << ".get_sub_group(), " << Op[0]
1194+
<< ", " << Op[1] << ")";
1195+
1196+
endstmt();
1197+
return SYCLGenSuccess();
1198+
}
1199+
1200+
bool handle_shfl(const InlineAsmInstruction *Inst) override {
1201+
1202+
if (DpctGlobalInfo::useSYCLCompat()) {
1203+
report(Diagnostics::UNSUPPORT_SYCLCOMPAT, /*UseTextBegin=*/true,
1204+
GAS->getAsmString()->getString());
1205+
cutOffMigration();
1206+
return SYCLGenSuccess();
1207+
}
1208+
1209+
if (Inst->getNumInputOperands() == 4 && Inst->getNumTypes() == 1 &&
1210+
Inst->hasAttr(InstAttr::sync)) {
1211+
return HandleShflSync(Inst);
1212+
}
1213+
1214+
if (Inst->getNumInputOperands() == 3 && Inst->getNumTypes() == 1 &&
1215+
!Inst->hasAttr(InstAttr::sync)) {
1216+
return HandleShfl(Inst);
1217+
}
1218+
1219+
return SYCLGenError();
1220+
}
1221+
10961222
bool handle_sub(const InlineAsmInstruction *Inst) override {
10971223
return HandleAddSub(Inst);
10981224
}

clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,10 @@ MODIFIER(wrap, ".wrap")
398398
MODIFIER(wide, ".wide")
399399
MODIFIER(sync, ".sync")
400400
MODIFIER(warp, ".warp")
401+
MODIFIER(up, ".up")
402+
MODIFIER(down, ".down")
403+
MODIFIER(idx, ".idx")
404+
MODIFIER(bfly, ".bfly")
401405

402406
#undef LINKAGE
403407
#undef TARGET

clang/lib/DPCT/SrcAPI/APINames_ASM.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ ENTRY("set", "set", false, NO_FLAG, P1, "Comment")
115115
ENTRY("setmaxnreg", "setmaxnreg", false, NO_FLAG, P1, "Comment")
116116
ENTRY("setp", "setp", true, NO_FLAG, P1, "Successful")
117117
ENTRY("shf", "shf", false, NO_FLAG, P1, "Comment")
118-
ENTRY("shfl", "shfl", false, NO_FLAG, P1, "Comment")
118+
ENTRY("shfl", "shfl", true, NO_FLAG, P1, "Comment")
119119
ENTRY("shl", "shl", true, NO_FLAG, P1, "Successful")
120120
ENTRY("shr", "shr", true, NO_FLAG, P1, "Successful")
121121
ENTRY("sin", "sin", true, NO_FLAG, P1, "Successful")

clang/test/dpct/asm/shfl.cu

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct --format-range=none -out-root %T/shfl %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck %s --match-full-lines --input-file %T/shfl/shfl.dp.cpp
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/shfl/shfl.dp.cpp -o %T/shfl/shfl.dp.o %}
6+
7+
// clang-format off
8+
#include <cuda_runtime.h>
9+
#include <cstdint>
10+
11+
__global__ void shfl() {
12+
int value;
13+
unsigned mask = 0xFFFFFFFF;
14+
int offset;
15+
int output;
16+
17+
// CHECK: output = dpct::shift_sub_group_right(item_ct1.get_sub_group(), value, offset);
18+
asm volatile("shfl.up.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(value), "r"(offset),"r"(mask));
19+
20+
// CHECK: output = dpct::shift_sub_group_right(item_ct1.get_sub_group(), value, offset);
21+
asm volatile("shfl.up.b32 %0, %1, %2, 0xFFFFFFFF;" : "=r"(output) : "r"(value), "r"(offset));
22+
23+
// CHECK: output = dpct::shift_sub_group_left(item_ct1.get_sub_group(), value, offset);
24+
asm volatile("shfl.down.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(value), "r"(offset),"r"(mask));
25+
26+
// CHECK: output = dpct::shift_sub_group_left(item_ct1.get_sub_group(), value, offset);
27+
asm volatile("shfl.down.b32 %0, %1, %2, 0xFFFFFFFF;" : "=r"(output) : "r"(value), "r"(offset));
28+
29+
// CHECK: output = dpct::select_from_sub_group(item_ct1.get_sub_group(), value, offset);
30+
asm volatile("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(value), "r"(offset),"r"(mask));
31+
32+
// CHECK: output = dpct::select_from_sub_group(item_ct1.get_sub_group(), value, offset);
33+
asm volatile("shfl.idx.b32 %0, %1, %2, 0xFFFFFFFF;" : "=r"(output) : "r"(value), "r"(offset));
34+
35+
// CHECK: output = dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), value, offset);
36+
asm volatile("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(value), "r"(offset),"r"(mask));
37+
38+
// CHECK: output = dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), value, offset);
39+
asm volatile("shfl.bfly.b32 %0, %1, %2, 0xFFFFFFFF;" : "=r"(output) : "r"(value), "r"(offset));
40+
}
41+
42+
// clang-format off

clang/test/dpct/asm/shfl_sync.cu

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct --format-range=none -out-root %T/shfl_sync %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck %s --match-full-lines --input-file %T/shfl_sync/shfl_sync.dp.cpp
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/shfl_sync/shfl_sync.dp.cpp -o %T/shfl_sync/shfl_sync.dp.o %}
6+
7+
// clang-format off
8+
#include <cuda_runtime.h>
9+
#include <cstdint>
10+
11+
__global__ void shfl_sync() {
12+
int value;
13+
unsigned mask = 0xFFFFFFFF;
14+
int offset;
15+
int output;
16+
17+
// CHECK: /*
18+
// CHECK-NEXT: DPCT1023:{{[0-9]+}}: The SYCL sub-group does not support mask options for shift_sub_group_right. You can specify "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate inline asm instruction "shfl.sync.up.b32 %0, %1, %2, %3, %4;".
19+
// CHECK-NEXT: */
20+
// CHECK-NEXT: output = dpct::shift_sub_group_right(item_ct1.get_sub_group(), value, offset);
21+
asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
22+
23+
// CHECK: /*
24+
// CHECK-NEXT: DPCT1023:{{[0-9]+}}: The SYCL sub-group does not support mask options for shift_sub_group_left. You can specify "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate inline asm instruction "shfl.sync.down.b32 %0, %1, %2, %3, %4;".
25+
// CHECK-NEXT: */
26+
// CHECK-NEXT output = dpct::shift_sub_group_left(item_ct1.get_sub_group(), value, offset);
27+
asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
28+
29+
// CHECK: /*
30+
// CHECK-NEXT: DPCT1023:{{[0-9]+}}: The SYCL sub-group does not support mask options for select_from_sub_group. You can specify "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate inline asm instruction "shfl.sync.idx.b32 %0, %1, %2, %3, %4;".
31+
// CHECK-NEXT: */
32+
// CHECK-NEXT output = dpct::select_from_sub_group(item_ct1.get_sub_group(), value, offset);
33+
asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
34+
35+
// CHECK: /*
36+
// CHECK-NEXT: DPCT1023:{{[0-9]+}}: The SYCL sub-group does not support mask options for permute_sub_group_by_xor. You can specify "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate inline asm instruction "shfl.sync.bfly.b32 %0, %1, %2, %3, %4;".
37+
// CHECK-NEXT: */
38+
// CHECK-NEXT: output = dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), value, offset);
39+
asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
40+
}
41+
42+
// clang-format off
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct --use-experimental-features=masked-sub-group-operation --format-range=none -out-root %T/shfl_sync_with_exp %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck %s --match-full-lines --input-file %T/shfl_sync_with_exp/shfl_sync_with_exp.dp.cpp
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/shfl_sync_with_exp/shfl_sync_with_exp.dp.cpp -o %T/shfl_sync_with_exp/shfl_sync_with_exp.dp.o %}
6+
7+
// clang-format off
8+
#include <cuda_runtime.h>
9+
#include <cstdint>
10+
11+
__global__ void shfl_sync_with_exp() {
12+
int value;
13+
unsigned mask = 0xFFFFFFFF;
14+
int offset;
15+
int output;
16+
17+
// CHECK: output = dpct::experimental::shift_sub_group_right(mask, item_ct1.get_sub_group(), value, offset);
18+
asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
19+
20+
// CHECK: output = dpct::experimental::shift_sub_group_left(mask, item_ct1.get_sub_group(), value, offset);
21+
asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
22+
23+
// CHECK: output = dpct::experimental::select_from_sub_group(mask, item_ct1.get_sub_group(), value, offset);
24+
asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
25+
26+
// CHECK: output = dpct::experimental::permute_sub_group_by_xor(mask, item_ct1.get_sub_group(), value, offset);
27+
asm volatile("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(output) : "r"(value), "r"(offset), "r"(0), "r"(mask));
28+
}
29+
30+
// clang-format off

0 commit comments

Comments
 (0)