Skip to content

Commit 005acd6

Browse files
rampitecmikolaj-pirog
authored andcommitted
[AMDGPU] Add intrinsics for v_[pk]_add_{min|max}_* instructions (llvm#164731)
1 parent 9fffd90 commit 005acd6

File tree

14 files changed

+399
-19
lines changed

14 files changed

+399
-19
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,15 @@ TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-c
830830
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts")
831831
TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts")
832832

833+
TARGET_BUILTIN(__builtin_amdgcn_add_max_i32, "iiiiIb", "nc", "add-min-max-insts")
834+
TARGET_BUILTIN(__builtin_amdgcn_add_max_u32, "UiUiUiUiIb", "nc", "add-min-max-insts")
835+
TARGET_BUILTIN(__builtin_amdgcn_add_min_i32, "iiiiIb", "nc", "add-min-max-insts")
836+
TARGET_BUILTIN(__builtin_amdgcn_add_min_u32, "UiUiUiUiIb", "nc", "add-min-max-insts")
837+
TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts")
838+
TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts")
839+
TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts")
840+
TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts")
841+
833842
// GFX1250 WMMA builtins
834843
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
835844
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; }
2626
// CHECK-NEXT: ret void
2727
//
2828
//.
29-
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
30-
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
29+
// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" }
30+
// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" }
3131
// CHECK: attributes #[[ATTR2]] = { convergent nounwind }
3232
//.
3333
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@
109109
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111111
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
112-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
113-
// GFX1251: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
112+
// GFX1250: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
113+
// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
114114

115115
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
116116

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ typedef float __attribute__((ext_vector_type(8))) float8;
2121
typedef float __attribute__((ext_vector_type(16))) float16;
2222
typedef float __attribute__((ext_vector_type(32))) float32;
2323
typedef short __attribute__((ext_vector_type(2))) short2;
24+
typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
2425

2526
// CHECK-LABEL: @test_setprio_inc_wg(
2627
// CHECK-NEXT: entry:
@@ -1718,3 +1719,111 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a)
17181719
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2);
17191720
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3);
17201721
}
1722+
1723+
// CHECK-LABEL: @test_add_min_max(
1724+
// CHECK-NEXT: entry:
1725+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1726+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1727+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1728+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1729+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1730+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
1731+
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
1732+
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
1733+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1734+
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
1735+
// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
1736+
// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
1737+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
1738+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
1739+
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
1740+
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.add.max.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false)
1741+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1742+
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1743+
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
1744+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
1745+
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
1746+
// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.add.max.u32(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true)
1747+
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1748+
// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
1749+
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
1750+
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
1751+
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
1752+
// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.add.min.i32(i32 [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i1 false)
1753+
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1754+
// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
1755+
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
1756+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4
1757+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4
1758+
// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.add.min.u32(i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i1 true)
1759+
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1760+
// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
1761+
// CHECK-NEXT: ret void
1762+
//
1763+
void test_add_min_max(global int *out, int a, int b, int c)
1764+
{
1765+
*out = __builtin_amdgcn_add_max_i32(a, b, c, false);
1766+
*out = __builtin_amdgcn_add_max_u32(a, b, c, true);
1767+
*out = __builtin_amdgcn_add_min_i32(a, b, c, false);
1768+
*out = __builtin_amdgcn_add_min_u32(a, b, c, true);
1769+
}
1770+
1771+
// CHECK-LABEL: @test_pk_add_min_max(
1772+
// CHECK-NEXT: entry:
1773+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1774+
// CHECK-NEXT: [[UOUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1775+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1776+
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1777+
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1778+
// CHECK-NEXT: [[UA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1779+
// CHECK-NEXT: [[UB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1780+
// CHECK-NEXT: [[UC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
1781+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
1782+
// CHECK-NEXT: [[UOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UOUT_ADDR]] to ptr
1783+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
1784+
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
1785+
// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
1786+
// CHECK-NEXT: [[UA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UA_ADDR]] to ptr
1787+
// CHECK-NEXT: [[UB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UB_ADDR]] to ptr
1788+
// CHECK-NEXT: [[UC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UC_ADDR]] to ptr
1789+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
1790+
// CHECK-NEXT: store ptr addrspace(1) [[UOUT:%.*]], ptr [[UOUT_ADDR_ASCAST]], align 8
1791+
// CHECK-NEXT: store <2 x i16> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
1792+
// CHECK-NEXT: store <2 x i16> [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
1793+
// CHECK-NEXT: store <2 x i16> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4
1794+
// CHECK-NEXT: store <2 x i16> [[UA:%.*]], ptr [[UA_ADDR_ASCAST]], align 4
1795+
// CHECK-NEXT: store <2 x i16> [[UB:%.*]], ptr [[UB_ADDR_ASCAST]], align 4
1796+
// CHECK-NEXT: store <2 x i16> [[UC:%.*]], ptr [[UC_ADDR_ASCAST]], align 4
1797+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
1798+
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
1799+
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
1800+
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]], i1 false)
1801+
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1802+
// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1803+
// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
1804+
// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
1805+
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
1806+
// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], <2 x i16> [[TMP7]], i1 true)
1807+
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
1808+
// CHECK-NEXT: store <2 x i16> [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
1809+
// CHECK-NEXT: [[TMP10:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4
1810+
// CHECK-NEXT: [[TMP11:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4
1811+
// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4
1812+
// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> [[TMP10]], <2 x i16> [[TMP11]], <2 x i16> [[TMP12]], i1 false)
1813+
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
1814+
// CHECK-NEXT: store <2 x i16> [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
1815+
// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4
1816+
// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4
1817+
// CHECK-NEXT: [[TMP17:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4
1818+
// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> [[TMP15]], <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], i1 true)
1819+
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8
1820+
// CHECK-NEXT: store <2 x i16> [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
1821+
// CHECK-NEXT: ret void
1822+
//
1823+
void test_pk_add_min_max(global short2 *out, global ushort2 *uout, short2 a, short2 b, short2 c, ushort2 ua, ushort2 ub, ushort2 uc)
1824+
{
1825+
*out = __builtin_amdgcn_pk_add_max_i16(a, b, c, false);
1826+
*uout = __builtin_amdgcn_pk_add_max_u16(ua, ub, uc, true);
1827+
*out = __builtin_amdgcn_pk_add_min_i16(a, b, c, false);
1828+
*uout = __builtin_amdgcn_pk_add_min_u16(ua, ub, uc, true);
1829+
}

0 commit comments

Comments
 (0)