@@ -24,6 +24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
2424 ret void
2525}
2626
; i64 operand. The checks below expect one v_mov_b32_dpp per 32-bit register
; (v0 and v1) and a single 64-bit global store of v[0:1]; either store
; mnemonic spelling in the regex alternation is accepted.
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
  %res = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1)
  store i64 %res, ptr addrspace(1) %out
  ret void
}
36+
; <2 x i32> operand. The checks below expect one v_mov_b32_dpp per element
; (v0 and v1) and a single 64-bit global store of v[0:1].
; The intrinsic name carries the .v2i32 suffix so that the mangled overload
; type agrees with the <2 x i32> operand and result of the call.
; GFX10PLUS-LABEL: {{^}}dpp8_v2i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) {
  %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1)
  store <2 x i32> %tmp0, ptr addrspace(1) %out
  ret void
}
46+
; <3 x i32> operand. The checks below expect three v_mov_b32_dpp
; instructions (v0, v1, v2) and a single 96-bit global store of v[0:2].
; GFX10PLUS-LABEL: {{^}}dpp8_v3i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) {
  %res = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1)
  store <3 x i32> %res, ptr addrspace(1) %out
  ret void
}
57+
; <4 x i32> operand. The checks below expect four v_mov_b32_dpp
; instructions (v0 through v3) and a single 128-bit global store of v[0:3].
; The intrinsic name carries the .v4i32 suffix so that the mangled overload
; type agrees with the <4 x i32> operand and result of the call.
; GFX10PLUS-LABEL: {{^}}dpp8_v4i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) {
  %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1)
  store <4 x i32> %tmp0, ptr addrspace(1) %out
  ret void
}
69+
; Default-address-space pointer operand. The checks below expect two
; v_mov_b32_dpp instructions (v0 and v1) and a single 64-bit global store
; of v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_p0:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) {
  %res = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1)
  store ptr %res, ptr addrspace(1) %out
  ret void
}
79+
; Scalar addrspace(3) pointer operand. The checks below expect a single
; v_mov_b32_dpp on v0 and a 32-bit global store of v0.
; The intrinsic name carries the .p3 suffix so that the mangled overload
; type agrees with the scalar ptr addrspace(3) operand and result; the
; vector-of-pointers overload (.v3p3) is exercised by its own test.
; GFX10PLUS-LABEL: {{^}}dpp8_p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) {
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1)
  store ptr addrspace(3) %tmp0, ptr addrspace(1) %out
  ret void
}
88+
; <3 x ptr addrspace(3)> operand. The checks below expect three
; v_mov_b32_dpp instructions (v0, v1, v2) and a single 96-bit global store
; of v[0:2].
; GFX10PLUS-LABEL: {{^}}dpp8_v3p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) {
  %res = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1)
  store <3 x ptr addrspace(3)> %res, ptr addrspace(1) %out
  ret void
}
99+
; i16 operand. The checks below expect a single v_mov_b32_dpp on v0 and a
; 16-bit global store of v0.
; GFX10PLUS-LABEL: {{^}}dpp8_i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) {
  %res = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1)
  store i16 %res, ptr addrspace(1) %out
  ret void
}
108+
; <4 x i16> operand. The checks below expect two v_mov_b32_dpp instructions
; (v0 and v1) and a single 64-bit global store of v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_v4i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) {
  %res = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1)
  store <4 x i16> %res, ptr addrspace(1) %out
  ret void
}
118+
; <4 x half> operand. The checks below expect two v_mov_b32_dpp instructions
; (v0 and v1) and a single 64-bit global store of v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_v4f16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) {
  %res = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1)
  store <4 x half> %res, ptr addrspace(1) %out
  ret void
}
128+
; float operand. The checks below expect a single v_mov_b32_dpp on v0 and a
; 32-bit global store of v0.
; GFX10PLUS-LABEL: {{^}}dpp8_float:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) {
  %res = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1)
  store float %res, ptr addrspace(1) %out
  ret void
}
137+
; <3 x float> operand. The checks below expect three v_mov_b32_dpp
; instructions (v0, v1, v2) and a single 96-bit global store of v[0:2].
; GFX10PLUS-LABEL: {{^}}dpp8_v3f32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) {
  %res = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1)
  store <3 x float> %res, ptr addrspace(1) %out
  ret void
}
148+
; half operand. The checks below expect a single v_mov_b32_dpp on v0 and a
; 16-bit global store of v0.
; GFX10PLUS-LABEL: {{^}}dpp8_half:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) {
  %res = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1)
  store half %res, ptr addrspace(1) %out
  ret void
}
157+
; bfloat operand. The checks below expect a single v_mov_b32_dpp on v0 and a
; 16-bit global store of v0.
; GFX10PLUS-LABEL: {{^}}dpp8_bfloat:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) {
  %res = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1)
  store bfloat %res, ptr addrspace(1) %out
  ret void
}
166+
; <4 x bfloat> operand. The checks below expect two v_mov_b32_dpp
; instructions (v0 and v1) and a single 64-bit global store of v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
  %res = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1)
  store <4 x bfloat> %res, ptr addrspace(1) %out
  ret void
}
176+
; double operand. The checks below expect two v_mov_b32_dpp instructions
; (v0 and v1) and a single 64-bit global store of v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_double:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
  %res = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1)
  store double %res, ptr addrspace(1) %out
  ret void
}
186+
; Explicit declaration of the i32 overload of the dpp8 mov intrinsic.
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0

; Attribute group referenced by the declaration above.
attributes #0 = { convergent nounwind readnone }
0 commit comments