Skip to content

Commit 110ab5a

Browse files
shiltian and kosarev authored
[AMDGPU] Add builtins and intrinsics for cluster attributes (#157877)
Co-authored-by: Ivan Kosarev <[email protected]>
1 parent 6241cb3 commit 110ab5a

File tree

3 files changed

+194
-0
lines changed

3 files changed

+194
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,20 @@ BUILTIN(__builtin_amdgcn_workgroup_id_x, "Ui", "nc")
3434
BUILTIN(__builtin_amdgcn_workgroup_id_y, "Ui", "nc")
3535
BUILTIN(__builtin_amdgcn_workgroup_id_z, "Ui", "nc")
3636

37+
// Cluster attribute builtins. Unlike the plain BUILTIN(...) workgroup/workitem
// ID entries around them, every entry here is a TARGET_BUILTIN that names the
// "gfx1250-insts" feature. All share the prototype string "Ui" and the
// attribute string "nc" (per Clang's builtin encoding: no parameters, unsigned
// int result; nothrow + const -- confirm against the Builtins.def legend).
//
// Cluster ID, one builtin per dimension (x/y/z).
TARGET_BUILTIN(__builtin_amdgcn_cluster_id_x, "Ui", "nc", "gfx1250-insts")
38+
TARGET_BUILTIN(__builtin_amdgcn_cluster_id_y, "Ui", "nc", "gfx1250-insts")
39+
TARGET_BUILTIN(__builtin_amdgcn_cluster_id_z, "Ui", "nc", "gfx1250-insts")
40+
41+
// Cluster workgroup ID: per-dimension (x/y/z) variants plus a flat variant.
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_id_x, "Ui", "nc", "gfx1250-insts")
42+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_id_y, "Ui", "nc", "gfx1250-insts")
43+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_id_z, "Ui", "nc", "gfx1250-insts")
44+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_flat_id, "Ui", "nc", "gfx1250-insts")
45+
46+
// Cluster workgroup max ID: per-dimension (x/y/z) variants plus a flat variant.
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_max_id_x, "Ui", "nc", "gfx1250-insts")
47+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_max_id_y, "Ui", "nc", "gfx1250-insts")
48+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_max_id_z, "Ui", "nc", "gfx1250-insts")
49+
TARGET_BUILTIN(__builtin_amdgcn_cluster_workgroup_max_flat_id, "Ui", "nc", "gfx1250-insts")
50+
3751
BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")
3852
BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")
3953
BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,174 @@ void test_sat_pk4_i4_i8(ushort *out, uint src)
10641064
*out = __builtin_amdgcn_sat_pk4_u4_u8(src);
10651065
}
10661066

1067+
// CHECK-LABEL: @test_get_cluster_id(
1068+
// CHECK-NEXT: entry:
1069+
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1070+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1071+
// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
1072+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1073+
// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
1074+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1075+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
1076+
// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
1077+
// CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
1078+
// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]]
1079+
// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]]
1080+
// CHECK-NEXT: ]
1081+
// CHECK: sw.bb:
1082+
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.id.x()
1083+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1084+
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
1085+
// CHECK-NEXT: br label [[SW_EPILOG:%.*]]
1086+
// CHECK: sw.bb1:
1087+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.id.y()
1088+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1089+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1090+
// CHECK-NEXT: br label [[SW_EPILOG]]
1091+
// CHECK: sw.bb2:
1092+
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.id.z()
1093+
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1094+
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1095+
// CHECK-NEXT: br label [[SW_EPILOG]]
1096+
// CHECK: sw.default:
1097+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1098+
// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4
1099+
// CHECK-NEXT: br label [[SW_EPILOG]]
1100+
// CHECK: sw.epilog:
1101+
// CHECK-NEXT: ret void
1102+
//
1103+
// Codegen test: selects on d (0 -> x, 1 -> y, 2 -> z) and stores the result of
// the matching __builtin_amdgcn_cluster_id_* builtin to *out; any other d
// stores 0. The expected IR above pins this exact statement shape, so keep the
// body in sync with those lines when editing.
void test_get_cluster_id(int d, global int *out)
1104+
{
1105+
switch (d) {
1106+
case 0: *out = __builtin_amdgcn_cluster_id_x(); break;
1107+
case 1: *out = __builtin_amdgcn_cluster_id_y(); break;
1108+
case 2: *out = __builtin_amdgcn_cluster_id_z(); break;
1109+
default: *out = 0;
1110+
}
1111+
}
1112+
1113+
// CHECK-LABEL: @test_get_cluster_group_id(
1114+
// CHECK-NEXT: entry:
1115+
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1116+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1117+
// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
1118+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1119+
// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
1120+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1121+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
1122+
// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
1123+
// CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
1124+
// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]]
1125+
// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]]
1126+
// CHECK-NEXT: ]
1127+
// CHECK: sw.bb:
1128+
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.x()
1129+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1130+
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
1131+
// CHECK-NEXT: br label [[SW_EPILOG:%.*]]
1132+
// CHECK: sw.bb1:
1133+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.y()
1134+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1135+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1136+
// CHECK-NEXT: br label [[SW_EPILOG]]
1137+
// CHECK: sw.bb2:
1138+
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.id.z()
1139+
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1140+
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1141+
// CHECK-NEXT: br label [[SW_EPILOG]]
1142+
// CHECK: sw.default:
1143+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1144+
// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4
1145+
// CHECK-NEXT: br label [[SW_EPILOG]]
1146+
// CHECK: sw.epilog:
1147+
// CHECK-NEXT: ret void
1148+
//
1149+
// Codegen test: selects on d (0 -> x, 1 -> y, 2 -> z) and stores the result of
// the matching __builtin_amdgcn_cluster_workgroup_id_* builtin to *out; any
// other d stores 0.
// NOTE(review): the function name says "cluster_group" while the builtins
// under test are "cluster_workgroup_id"; a rename would also have to update
// the label line in the expected-IR block above.
void test_get_cluster_group_id(int d, global int *out)
1150+
{
1151+
switch (d) {
1152+
case 0: *out = __builtin_amdgcn_cluster_workgroup_id_x(); break;
1153+
case 1: *out = __builtin_amdgcn_cluster_workgroup_id_y(); break;
1154+
case 2: *out = __builtin_amdgcn_cluster_workgroup_id_z(); break;
1155+
default: *out = 0;
1156+
}
1157+
}
1158+
1159+
// CHECK-LABEL: @test_cluster_workgroup_flat_id(
1160+
// CHECK-NEXT: entry:
1161+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1162+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1163+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1164+
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.flat.id()
1165+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1166+
// CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4
1167+
// CHECK-NEXT: ret void
1168+
//
1169+
// Codegen test: stores the result of
// __builtin_amdgcn_cluster_workgroup_flat_id() to *out. The expected IR above
// is a single call to @llvm.amdgcn.cluster.workgroup.flat.id followed by the
// store.
void test_cluster_workgroup_flat_id(global uint *out)
1170+
{
1171+
*out = __builtin_amdgcn_cluster_workgroup_flat_id();
1172+
}
1173+
1174+
// CHECK-LABEL: @test_get_cluster_workgroups_max_id(
1175+
// CHECK-NEXT: entry:
1176+
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1177+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1178+
// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
1179+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1180+
// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
1181+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1182+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
1183+
// CHECK-NEXT: switch i32 [[TMP0]], label [[SW_DEFAULT:%.*]] [
1184+
// CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
1185+
// CHECK-NEXT: i32 1, label [[SW_BB1:%.*]]
1186+
// CHECK-NEXT: i32 2, label [[SW_BB2:%.*]]
1187+
// CHECK-NEXT: ]
1188+
// CHECK: sw.bb:
1189+
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.x()
1190+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1191+
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
1192+
// CHECK-NEXT: br label [[SW_EPILOG:%.*]]
1193+
// CHECK: sw.bb1:
1194+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.y()
1195+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1196+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1197+
// CHECK-NEXT: br label [[SW_EPILOG]]
1198+
// CHECK: sw.bb2:
1199+
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.id.z()
1200+
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1201+
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1202+
// CHECK-NEXT: br label [[SW_EPILOG]]
1203+
// CHECK: sw.default:
1204+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1205+
// CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP7]], align 4
1206+
// CHECK-NEXT: br label [[SW_EPILOG]]
1207+
// CHECK: sw.epilog:
1208+
// CHECK-NEXT: ret void
1209+
//
1210+
// Codegen test: selects on d (0 -> x, 1 -> y, 2 -> z) and stores the result of
// the matching __builtin_amdgcn_cluster_workgroup_max_id_* builtin to *out;
// any other d stores 0. The expected IR above pins this exact statement shape.
void test_get_cluster_workgroups_max_id(int d, global int *out)
1211+
{
1212+
switch (d) {
1213+
case 0: *out = __builtin_amdgcn_cluster_workgroup_max_id_x(); break;
1214+
case 1: *out = __builtin_amdgcn_cluster_workgroup_max_id_y(); break;
1215+
case 2: *out = __builtin_amdgcn_cluster_workgroup_max_id_z(); break;
1216+
default: *out = 0;
1217+
}
1218+
}
1219+
1220+
// CHECK-LABEL: @test_get_cluster_workgroup_max_flat_id(
1221+
// CHECK-NEXT: entry:
1222+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1223+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1224+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1225+
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.cluster.workgroup.max.flat.id()
1226+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1227+
// CHECK-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[TMP1]], align 4
1228+
// CHECK-NEXT: ret void
1229+
//
1230+
// Codegen test: stores the result of
// __builtin_amdgcn_cluster_workgroup_max_flat_id() to *out. The expected IR
// above is a single call to @llvm.amdgcn.cluster.workgroup.max.flat.id
// followed by the store.
void test_get_cluster_workgroup_max_flat_id(global int *out)
1231+
{
1232+
*out = __builtin_amdgcn_cluster_workgroup_max_flat_id();
1233+
}
1234+
10671235
// CHECK-LABEL: @test_permlane16_swap(
10681236
// CHECK-NEXT: entry:
10691237
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,18 @@ defm int_amdgcn_workitem_id
165165

166166
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
167167
<"__builtin_amdgcn_workgroup_id">;
168+
// Cluster ID intrinsics. Instantiated through the same
// AMDGPUReadPreloadRegisterIntrinsic_xyz_named multiclass as
// int_amdgcn_workgroup_id (multiclass body is outside this view); the string
// is the Clang builtin name prefix, and the accompanying codegen test shows
// the _x/_y/_z builtins lowering to llvm.amdgcn.cluster.id.{x,y,z}.
defm int_amdgcn_cluster_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
169+
<"__builtin_amdgcn_cluster_id">;
170+
// Per-dimension (x/y/z) cluster workgroup ID intrinsics.
defm int_amdgcn_cluster_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
171+
<"__builtin_amdgcn_cluster_workgroup_id">;
172+
// Flat cluster workgroup ID: no operands, i32 result, bound to a single Clang
// builtin and marked IntrNoMem + IntrSpeculatable.
def int_amdgcn_cluster_workgroup_flat_id:
173+
ClangBuiltin<"__builtin_amdgcn_cluster_workgroup_flat_id">,
174+
Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
175+
// Per-dimension (x/y/z) cluster workgroup max ID intrinsics.
defm int_amdgcn_cluster_workgroup_max_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
176+
<"__builtin_amdgcn_cluster_workgroup_max_id">;
177+
// Flat cluster workgroup max ID: no operands, i32 result, bound to a single
// Clang builtin and marked IntrNoMem + IntrSpeculatable.
def int_amdgcn_cluster_workgroup_max_flat_id:
178+
ClangBuiltin<"__builtin_amdgcn_cluster_workgroup_max_flat_id">,
179+
Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
168180

169181
def int_amdgcn_dispatch_ptr :
170182
DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],

0 commit comments

Comments
 (0)