@@ -398,6 +398,144 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
   out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
 }
 
+// CHECK-LABEL: @test_cvt_pk_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+void test_cvt_pk_bf8_f16(global short* out, half2 a)
+{
+  *out = __builtin_amdgcn_cvt_pk_bf8_f16(a);
+}
+
+// CHECK-LABEL: @test_cvt_pk_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+void test_cvt_pk_fp8_f16(global short* out, half2 a)
+{
+  *out = __builtin_amdgcn_cvt_pk_fp8_f16(a);
+}
+
+// CHECK-LABEL: @test_cvt_sr_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1)
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2)
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3)
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old)
+{
+  *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0);
+  *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1);
+  *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2);
+  *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3);
+}
+
+// CHECK-LABEL: @test_cvt_sr_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5)
+// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1)
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2)
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3)
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
+{
+  *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0);
+  *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1);
+  *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2);
+  *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
+}
+
 // CHECK-LABEL: @test_sat_pk4_i4_i8(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)