Skip to content

Commit 3c9c4c6

Browse files
arsenm authored and Honey Goyal committed
AMDGPU: Fix truncstore from v6f32 to v6f16 (llvm#171212)
The v6bf16 cases work, but that's likely because v6bf16 isn't currently an MVT. Fixes: SWDEV-570985
1 parent 56b7b6f commit 3c9c4c6

File tree

4 files changed

+188
-0
lines changed

4 files changed

+188
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
338338
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339339
setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340340
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341+
setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
341342
setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
342343
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
343344
setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);

llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp32_to_bf16.ll

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,54 @@ entry:
119119
ret void
120120
}
121121

122+
; Truncates a <6 x float> argument to <6 x bfloat> and stores the result
; through an addrspace(1) pointer with 16-byte alignment.
; The expected output packs the six converted values pairwise with v_perm_b32
; and writes them with a single global_store_dwordx3.
define void @v6(<6 x float> %num, ptr addrspace(1) %p) {
123+
; CHECK-LABEL: v6:
124+
; CHECK: ; %bb.0: ; %entry
125+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126+
; CHECK-NEXT: v_bfe_u32 v8, v4, 16, 1
127+
; CHECK-NEXT: s_movk_i32 s4, 0x7fff
128+
; CHECK-NEXT: v_add3_u32 v8, v8, v4, s4
129+
; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v4
130+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
131+
; CHECK-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
132+
; CHECK-NEXT: v_bfe_u32 v8, v5, 16, 1
133+
; CHECK-NEXT: v_add3_u32 v8, v8, v5, s4
134+
; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v5
135+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
136+
; CHECK-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
137+
; CHECK-NEXT: s_mov_b32 s5, 0x7060302
138+
; CHECK-NEXT: v_perm_b32 v4, v5, v4, s5
139+
; CHECK-NEXT: v_bfe_u32 v5, v2, 16, 1
140+
; CHECK-NEXT: v_add3_u32 v5, v5, v2, s4
141+
; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v2
142+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
143+
; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
144+
; CHECK-NEXT: v_bfe_u32 v5, v3, 16, 1
145+
; CHECK-NEXT: v_add3_u32 v5, v5, v3, s4
146+
; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v3
147+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
148+
; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc
149+
; CHECK-NEXT: v_perm_b32 v3, v3, v2, s5
150+
; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
151+
; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
152+
; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v0
153+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
154+
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
155+
; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
156+
; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
157+
; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v1
158+
; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
159+
; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
160+
; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
161+
; CHECK-NEXT: global_store_dwordx3 v[6:7], v[2:4], off
162+
; CHECK-NEXT: s_waitcnt vmcnt(0)
163+
; CHECK-NEXT: s_setpc_b64 s[30:31]
164+
entry:
165+
%conv = fptrunc <6 x float> %num to <6 x bfloat>
166+
store <6 x bfloat> %conv, ptr addrspace(1) %p, align 16
167+
ret void
168+
}
169+
122170
define void @v8(<8 x float> %num, ptr addrspace(1) %p) {
123171
; CHECK-LABEL: v8:
124172
; CHECK: ; %bb.0: ; %entry

llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,17 @@ define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(ptr addrspace(1) %
5353
store <16 x half> %cvt, ptr addrspace(1) %out
5454
ret void
5555
}
56+
57+
; Truncates a <6 x double> argument to <6 x half> and stores the result
; through an addrspace(1) pointer. Only the function label is matched here;
; no instruction sequence is pinned.
; NOTE(review): presumably a compiles-without-crashing test -- confirm against
; the RUN lines at the top of the test file.
; GCN-LABEL: {{^}}global_truncstore_v6f64_to_v6f16:
58+
define void @global_truncstore_v6f64_to_v6f16(ptr addrspace(1) %ptr, <6 x double> %src) {
59+
%trunc = fptrunc <6 x double> %src to <6 x half>
60+
store <6 x half> %trunc, ptr addrspace(1) %ptr
61+
ret void
62+
}
63+
64+
; Truncates a <6 x double> argument to <6 x bfloat> and stores the result
; through an addrspace(1) pointer. Only the function label is matched here;
; no instruction sequence is pinned.
; NOTE(review): presumably a compiles-without-crashing test -- confirm against
; the RUN lines at the top of the test file.
; GCN-LABEL: {{^}}global_truncstore_v6f64_to_v6bf16:
65+
define void @global_truncstore_v6f64_to_v6bf16(ptr addrspace(1) %ptr, <6 x double> %src) {
66+
%trunc = fptrunc <6 x double> %src to <6 x bfloat>
67+
store <6 x bfloat> %trunc, ptr addrspace(1) %ptr
68+
ret void
69+
}

llvm/test/CodeGen/AMDGPU/trunc-store.ll

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,5 +442,130 @@ define void @truncstore_v6i32_to_v6i16(ptr addrspace(1) %out, <6 x i32> %val) {
442442
ret void
443443
}
444444

445+
; Truncates a <6 x float> argument to <6 x bfloat> and stores the result
; through an addrspace(1) pointer.
; Expected output for the SI prefix: the packed result is written with a
; buffer_store_dword at offset 8 plus a buffer_store_dwordx2.
; Expected output for the VI prefix: the packed result is written with a
; single flat_store_dwordx3.
define void @global_fp_truncstore_v6f32_to_v6bf16(ptr addrspace(1) %ptr, <6 x float> %src) {
446+
; SI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
447+
; SI: ; %bb.0:
448+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
449+
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
450+
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
451+
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
452+
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
453+
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
454+
; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
455+
; SI-NEXT: v_alignbit_b32 v3, v3, v2, 16
456+
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7
457+
; SI-NEXT: s_mov_b32 s6, 0
458+
; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16
459+
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
460+
; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
461+
; SI-NEXT: s_mov_b32 s7, 0xf000
462+
; SI-NEXT: s_mov_b32 s4, s6
463+
; SI-NEXT: s_mov_b32 s5, s6
464+
; SI-NEXT: v_alignbit_b32 v2, v2, v5, 16
465+
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
466+
; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
467+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
468+
; SI-NEXT: s_setpc_b64 s[30:31]
469+
;
470+
; VI-LABEL: global_fp_truncstore_v6f32_to_v6bf16:
471+
; VI: ; %bb.0:
472+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473+
; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
474+
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
475+
; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
476+
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6
477+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
478+
; VI-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
479+
; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
480+
; VI-NEXT: s_movk_i32 s4, 0x7fff
481+
; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
482+
; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v9
483+
; VI-NEXT: v_or_b32_e32 v8, 0x400000, v7
484+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
485+
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc
486+
; VI-NEXT: v_bfe_u32 v8, v4, 16, 1
487+
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4
488+
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
489+
; VI-NEXT: v_add_u32_e32 v8, vcc, s4, v8
490+
; VI-NEXT: v_alignbit_b32 v6, v7, v6, 16
491+
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
492+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
493+
; VI-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc
494+
; VI-NEXT: v_bfe_u32 v8, v5, 16, 1
495+
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v5
496+
; VI-NEXT: v_add_u32_e32 v8, vcc, s4, v8
497+
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
498+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
499+
; VI-NEXT: v_cndmask_b32_e32 v5, v8, v7, vcc
500+
; VI-NEXT: v_bfe_u32 v7, v2, 16, 1
501+
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v2
502+
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
503+
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
504+
; VI-NEXT: v_alignbit_b32 v5, v5, v4, 16
505+
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
506+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
507+
; VI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
508+
; VI-NEXT: v_bfe_u32 v7, v3, 16, 1
509+
; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v3
510+
; VI-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
511+
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v3
512+
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
513+
; VI-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc
514+
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
515+
; VI-NEXT: v_alignbit_b32 v4, v3, v2, 16
516+
; VI-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
517+
; VI-NEXT: s_waitcnt vmcnt(0)
518+
; VI-NEXT: s_setpc_b64 s[30:31]
519+
%trunc = fptrunc <6 x float> %src to <6 x bfloat>
520+
store <6 x bfloat> %trunc, ptr addrspace(1) %ptr
521+
ret void
522+
}
523+
524+
525+
; Truncates a <6 x float> argument to <6 x half> and stores the result
; through an addrspace(1) pointer.
; Expected output for the SI prefix: each element is converted with
; v_cvt_f16_f32, pairs are packed with shift/or, and the result is written with
; a buffer_store_dword at offset 8 plus a buffer_store_dwordx2.
; Expected output for the VI prefix: the odd elements are converted directly
; into the high word via the sdwa form, pairs are combined with v_or_b32, and
; the result is written with a single flat_store_dwordx3.
; NOTE(review): presumably this exercises the v6f32 -> v6f16 truncating-store
; Expand action registered in AMDGPUISelLowering.cpp -- confirm.
define void @global_fp_truncstore_v6f32_to_v6f16(ptr addrspace(1) %ptr, <6 x float> %src) {
526+
; SI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
527+
; SI: ; %bb.0:
528+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529+
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
530+
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
531+
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
532+
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
533+
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
534+
; SI-NEXT: v_or_b32_e32 v4, v4, v5
535+
; SI-NEXT: v_cvt_f16_f32_e32 v5, v7
536+
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
537+
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
538+
; SI-NEXT: s_mov_b32 s6, 0
539+
; SI-NEXT: v_or_b32_e32 v3, v2, v3
540+
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
541+
; SI-NEXT: s_mov_b32 s7, 0xf000
542+
; SI-NEXT: s_mov_b32 s4, s6
543+
; SI-NEXT: s_mov_b32 s5, s6
544+
; SI-NEXT: v_or_b32_e32 v2, v6, v2
545+
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
546+
; SI-NEXT: buffer_store_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
547+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
548+
; SI-NEXT: s_setpc_b64 s[30:31]
549+
;
550+
; VI-LABEL: global_fp_truncstore_v6f32_to_v6f16:
551+
; VI: ; %bb.0:
552+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
553+
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
554+
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
555+
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
556+
; VI-NEXT: v_cvt_f16_f32_e32 v8, v4
557+
; VI-NEXT: v_cvt_f16_f32_sdwa v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
558+
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
559+
; VI-NEXT: v_or_b32_e32 v4, v6, v7
560+
; VI-NEXT: v_or_b32_e32 v3, v8, v5
561+
; VI-NEXT: v_or_b32_e32 v2, v2, v9
562+
; VI-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
563+
; VI-NEXT: s_waitcnt vmcnt(0)
564+
; VI-NEXT: s_setpc_b64 s[30:31]
565+
%trunc = fptrunc <6 x float> %src to <6 x half>
566+
store <6 x half> %trunc, ptr addrspace(1) %ptr
567+
ret void
568+
}
569+
445570
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
446571
; GCN: {{.*}}

0 commit comments

Comments
 (0)