@@ -1070,6 +1070,61 @@ void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
10701070 * out = __builtin_amdgcn_permlane_idx_gen (src0 , src1 );
10711071}
10721072
1073+ // CHECK-LABEL: @test_perm_pk(
1074+ // CHECK-NEXT: entry:
1075+ // CHECK-NEXT: [[A32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1076+ // CHECK-NEXT: [[A64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1077+ // CHECK-NEXT: [[B32_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1078+ // CHECK-NEXT: [[B64_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1079+ // CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
1080+ // CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1081+ // CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1082+ // CHECK-NEXT: [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1083+ // CHECK-NEXT: [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr
1084+ // CHECK-NEXT: [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr
1085+ // CHECK-NEXT: [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr
1086+ // CHECK-NEXT: [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr
1087+ // CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
1088+ // CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
1089+ // CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
1090+ // CHECK-NEXT: [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr
1091+ // CHECK-NEXT: store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4
1092+ // CHECK-NEXT: store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4
1093+ // CHECK-NEXT: store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4
1094+ // CHECK-NEXT: store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4
1095+ // CHECK-NEXT: store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
1096+ // CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
1097+ // CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
1098+ // CHECK-NEXT: store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8
1099+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
1100+ // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4
1101+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1102+ // CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]])
1103+ // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8
1104+ // CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8
1105+ // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4
1106+ // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
1107+ // CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP6]] to i64
1108+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1109+ // CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]])
1110+ // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8
1111+ // CHECK-NEXT: store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16
1112+ // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4
1113+ // CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[TMP10]] to i64
1114+ // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4
1115+ // CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[TMP11]] to i64
1116+ // CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8
1117+ // CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]])
1118+ // CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8
1119+ // CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16
1120+ // CHECK-NEXT: ret void
1121+ //
// Calls each of the three __builtin_amdgcn_perm_pk16_{b4,b6,b8}_u4 builtins
// once and stores the result through out2 (uint2), out3 (uint3) and
// out4 (uint4) respectively. The selector operand c is shared by all calls.
//
// The CHECK lines above pin the exact IR emitted for this body (one load per
// operand use, then the intrinsic call, then the store), so the statements
// must stay in this order and shape; update the CHECK lines if they change.
//
// NOTE(review): a64 and b64 are declared uint, so the CHECK lines show them
// being zero-extended (zext i32 -> i64) before the b6/b8 intrinsic calls.
// The parameter names suggest ulong may have been intended -- confirm.
1122+ void test_perm_pk (uint a32 , uint a64 , uint b32 , uint b64 , uint2 c , uint2 * out2 , uint3 * out3 , uint4 * out4 ) {
1123+ * out2 = __builtin_amdgcn_perm_pk16_b4_u4 (a32 , b32 , c );
1124+ * out3 = __builtin_amdgcn_perm_pk16_b6_u4 (a32 , b64 , c );
1125+ * out4 = __builtin_amdgcn_perm_pk16_b8_u4 (a64 , b64 , c );
1126+ }
1127+
10731128// CHECK-LABEL: @test_prefetch(
10741129// CHECK-NEXT: entry:
10751130// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
0 commit comments