@@ -1678,7 +1678,8 @@ void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, fl
16781678// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
16791679// CHECK-NEXT: ret void
16801680//
// Stores the result of __builtin_amdgcn_bitop3_b32(a, b, c, 1) through `out`.
// The body is a single statement so that the emitted IR matches the expected
// sequence listed directly above this function.
// NOTE(review): the trailing literal 1 is presumably an immediate selecting the
// three-input bitwise operation -- confirm against the builtin's definition.
1681- void test_bitop3_b32 (global uint * out , uint a , uint b , uint c ) {
1681+ void test_bitop3_b32 (global uint * out , uint a , uint b , uint c )
1682+ {
16821683 * out = __builtin_amdgcn_bitop3_b32 (a , b , c , 1 );
16831684}
16841685
@@ -1700,6 +1701,67 @@ void test_bitop3_b32(global uint* out, uint a, uint b, uint c) {
17001701// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
17011702// CHECK-NEXT: ret void
17021703//
// Stores the result of __builtin_amdgcn_bitop3_b16(a, b, c, 1) through `out`;
// the operands and the destination are all `ushort`.
// The body is a single statement so that the emitted IR matches the expected
// sequence listed directly above this function.
// NOTE(review): the trailing literal 1 is presumably an immediate selecting the
// three-input bitwise operation -- confirm against the builtin's definition.
1703- void test_bitop3_b16 (global ushort * out , ushort a , ushort b , ushort c ) {
1704+ void test_bitop3_b16 (global ushort * out , ushort a , ushort b , ushort c )
1705+ {
17041706 * out = __builtin_amdgcn_bitop3_b16 (a , b , c , 1 );
17051707}
1708+
1709+ // CHECK-LABEL: @test_cvt_sr_bf16_f32(
1710+ // CHECK-NEXT: entry:
1711+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1712+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1713+ // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1714+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1715+ // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1716+ // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1717+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1718+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4
1719+ // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1720+ // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1721+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1722+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1723+ // CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1724+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1725+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4
1726+ // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1727+ // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1728+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1729+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1730+ // CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1731+ // CHECK-NEXT: ret void
1732+ //
// Calls __builtin_amdgcn_cvt_sr_bf16_f32 twice with the same `src` and `seed`.
// Each call passes the current value of *out as the first operand and stores
// the result back through `out`, so the second call consumes the first call's
// output. The calls differ only in the final literal argument (0, then 1),
// which the expected IR above shows as `i1 false` and `i1 true` on
// @llvm.amdgcn.cvt.sr.bf16.f32.
// NOTE(review): the final flag presumably selects which half of the packed
// <2 x bfloat> value is written -- confirm against the builtin's definition.
1733+ void test_cvt_sr_bf16_f32 (global bfloat2 * out , float src , uint seed )
1734+ {
1735+ * out = __builtin_amdgcn_cvt_sr_bf16_f32 (* out , src , seed , 0 );
1736+ * out = __builtin_amdgcn_cvt_sr_bf16_f32 (* out , src , seed , 1 );
1737+ }
1738+
1739+ // CHECK-LABEL: @test_cvt_sr_f16_f32(
1740+ // CHECK-NEXT: entry:
1741+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1742+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1743+ // CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1744+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1745+ // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1746+ // CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1747+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1748+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
1749+ // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1750+ // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1751+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1752+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1753+ // CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1754+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1755+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
1756+ // CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1757+ // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1758+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1759+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1760+ // CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1761+ // CHECK-NEXT: ret void
1762+ //
// Calls __builtin_amdgcn_cvt_sr_f16_f32 twice with the same `src` and `seed`.
// Each call passes the current value of *out as the first operand and stores
// the result back through `out`, so the second call consumes the first call's
// output. The calls differ only in the final literal argument (0, then 1),
// which the expected IR above shows as `i1 false` and `i1 true` on
// @llvm.amdgcn.cvt.sr.f16.f32.
// NOTE(review): the final flag presumably selects which half of the packed
// <2 x half> value is written -- confirm against the builtin's definition.
1763+ void test_cvt_sr_f16_f32 (global half2 * out , float src , uint seed )
1764+ {
1765+ * out = __builtin_amdgcn_cvt_sr_f16_f32 (* out , src , seed , 0 );
1766+ * out = __builtin_amdgcn_cvt_sr_f16_f32 (* out , src , seed , 1 );
1767+ }
0 commit comments