@@ -442,5 +442,130 @@ define void @truncstore_v6i32_to_v6i16(ptr addrspace(1) %out, <6 x i32> %val) {
442442 ret void
443443}
444444
; Codegen test: fptrunc of the <6 x float> argument %src to <6 x bfloat>,
; with the result stored through the addrspace(1) pointer argument %ptr.
; The SI check lines expect each pair of lanes to be packed with
; v_alignbit_b32 and written as a dword store at offset 8 plus a dwordx2
; store; the VI check lines expect a per-lane bfe/add/or/cmp/cndmask
; sequence followed by a single flat_store_dwordx3.
; NOTE(review): the assertion lines look autogenerated (the note at the end
; of the file says the prefix list is) - presumably they should be
; regenerated rather than edited by hand; confirm against the file header.
445+ define void @global_fp_truncstore_v6f32_to_v6bf16 (ptr addrspace (1 ) %ptr , <6 x float > %src ) {
446+ ; SI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
447+ ; SI: ; %bb.0:
448+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449+ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
450+ ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
451+ ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
452+ ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
453+ ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
454+ ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
455+ ; SI-NEXT: v_alignbit_b32 v3, v3, v2, 16
456+ ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
457+ ; SI-NEXT: s_mov_b32 s6, 0
458+ ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
459+ ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
460+ ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
461+ ; SI-NEXT: s_mov_b32 s7, 0xf000
462+ ; SI-NEXT: s_mov_b32 s4, s6
463+ ; SI-NEXT: s_mov_b32 s5, s6
464+ ; SI-NEXT: v_alignbit_b32 v2, v2, v5, 16
465+ ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
466+ ; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
467+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
468+ ; SI-NEXT: s_setpc_b64 s[30:31]
469+ ;
470+ ; VI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
471+ ; VI: ; %bb.0:
472+ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473+ ; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
474+ ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
475+ ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
476+ ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
477+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
478+ ; VI-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
479+ ; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
480+ ; VI-NEXT: s_movk_i32 s4, 0x7fff
481+ ; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
482+ ; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v9
483+ ; VI-NEXT: v_or_b32_e32 v8, 0x400000, v7
484+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
485+ ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
486+ ; VI-NEXT: v_bfe_u32 v8, v4, 16, 1
487+ ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4
488+ ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
489+ ; VI-NEXT: v_add_u32_e32 v8, vcc, s4, v8
490+ ; VI-NEXT: v_alignbit_b32 v6, v7, v6, 16
491+ ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
492+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
493+ ; VI-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc
494+ ; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
495+ ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
496+ ; VI-NEXT: v_add_u32_e32 v8, vcc, s4, v8
497+ ; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
498+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
499+ ; VI-NEXT: v_cndmask_b32_e32 v5, v8, v7, vcc
500+ ; VI-NEXT: v_bfe_u32 v7, v2, 16, 1
501+ ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v2
502+ ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
503+ ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
504+ ; VI-NEXT: v_alignbit_b32 v5, v5, v4, 16
505+ ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
506+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
507+ ; VI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
508+ ; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
509+ ; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
510+ ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
511+ ; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3
512+ ; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
513+ ; VI-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc
514+ ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
515+ ; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
516+ ; VI-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
517+ ; VI-NEXT: s_waitcnt vmcnt(0)
518+ ; VI-NEXT: s_setpc_b64 s[30:31]
519+ %trunc = fptrunc <6 x float > %src to <6 x bfloat>
520+ store <6 x bfloat> %trunc , ptr addrspace (1 ) %ptr
521+ ret void
522+ }
523+
524+
; Codegen test: fptrunc of the <6 x float> argument %src to <6 x half>,
; with the result stored through the addrspace(1) pointer argument %ptr.
; The SI check lines expect six v_cvt_f16_f32 conversions packed pairwise
; with shift+or, written as a dword store at offset 8 plus a dwordx2 store;
; the VI check lines expect the odd lanes converted via the sdwa form into
; WORD_1, or'ed with the even lanes, and a single flat_store_dwordx3.
; NOTE(review): the assertion lines look autogenerated (the note at the end
; of the file says the prefix list is) - presumably they should be
; regenerated rather than edited by hand; confirm against the file header.
525+ define void @global_fp_truncstore_v6f32_to_v6f16 (ptr addrspace (1 ) %ptr , <6 x float > %src ) {
526+ ; SI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
527+ ; SI: ; %bb.0:
528+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529+ ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
530+ ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
531+ ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
532+ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
533+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
534+ ; SI-NEXT: v_or_b32_e32 v4, v4, v5
535+ ; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
536+ ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
537+ ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
538+ ; SI-NEXT: s_mov_b32 s6, 0
539+ ; SI-NEXT: v_or_b32_e32 v3, v2, v3
540+ ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
541+ ; SI-NEXT: s_mov_b32 s7, 0xf000
542+ ; SI-NEXT: s_mov_b32 s4, s6
543+ ; SI-NEXT: s_mov_b32 s5, s6
544+ ; SI-NEXT: v_or_b32_e32 v2, v6, v2
545+ ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
546+ ; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
547+ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
548+ ; SI-NEXT: s_setpc_b64 s[30:31]
549+ ;
550+ ; VI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
551+ ; VI: ; %bb.0:
552+ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
553+ ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
554+ ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
555+ ; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
556+ ; VI-NEXT: v_cvt_f16_f32_e32 v8, v4
557+ ; VI-NEXT: v_cvt_f16_f32_sdwa v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
558+ ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
559+ ; VI-NEXT: v_or_b32_e32 v4, v6, v7
560+ ; VI-NEXT: v_or_b32_e32 v3, v8, v5
561+ ; VI-NEXT: v_or_b32_e32 v2, v2, v9
562+ ; VI-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
563+ ; VI-NEXT: s_waitcnt vmcnt(0)
564+ ; VI-NEXT: s_setpc_b64 s[30:31]
565+ %trunc = fptrunc <6 x float > %src to <6 x half >
566+ store <6 x half > %trunc , ptr addrspace (1 ) %ptr
567+ ret void
568+ }
569+
445570;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
446571; GCN: {{.*}}
0 commit comments