@@ -71,6 +71,104 @@ define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vsc
   ret <vscale x 2 x i32> %data
 }
 
+define <vscale x 4 x half> @masked_gather_nxv4f16(<vscale x 4 x half*> %ptrs, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %data = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %data
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i16> %indices, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxth z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i16> %indices
+  %data = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %data
+}
+
+define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x half*> %ptrs, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p3.s, p2.s, p1.s
+; CHECK-NEXT:    zip1 p2.s, p2.s, p1.s
+; CHECK-NEXT:    ld1h { z3.d }, p3/z, [z3.d]
+; CHECK-NEXT:    ld1h { z2.d }, p2/z, [z2.d]
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %data = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16(<vscale x 8 x half*> %ptrs, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
+  ret <vscale x 8 x half> %data
+}
+
+define <vscale x 8 x bfloat> @masked_gather_nxv8bf16(bfloat* %base, <vscale x 8 x i16> %indices, <vscale x 8 x i1> %mask) #0 {
+; CHECK-LABEL: masked_gather_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    sunpkhi z1.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT:    ld1h { z1.s }, p2/z, [x0, z1.s, sxtw #1]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr bfloat, bfloat* %base, <vscale x 8 x i16> %indices
+  %data = call <vscale x 8 x bfloat> @llvm.masked.gather.nxv8bf16(<vscale x 8 x bfloat*> %ptrs, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
+  ret <vscale x 8 x bfloat> %data
+}
+
+define <vscale x 4 x double> @masked_gather_nxv4f64(double* %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) {;
+; CHECK-LABEL: masked_gather_nxv4f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    pfalse p2.b
+; CHECK-NEXT:    sxth z0.s, p1/m, z0.s
+; CHECK-NEXT:    zip1 p1.s, p0.s, p2.s
+; CHECK-NEXT:    zip2 p0.s, p0.s, p2.s
+; CHECK-NEXT:    sunpklo z1.d, z0.s
+; CHECK-NEXT:    sunpkhi z2.d, z0.s
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0, z1.d, lsl #3]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, z2.d, lsl #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, double* %base, <vscale x 4 x i16> %indices
+  %data = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> undef)
+  ret <vscale x 4 x double> %data
+}
+
+define <vscale x 8 x float> @masked_gather_nxv8f32(float* %base, <vscale x 8 x i32> %offsets, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv8f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p0.h, p0.h, p1.h
+; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, z1.s, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 8 x i32> %offsets to <vscale x 8 x i64>
+  %ptrs = getelementptr float, float* %base, <vscale x 8 x i64> %offsets.zext
+  %vals = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*> %ptrs, i32 4, <vscale x 8 x i1> %mask, <vscale x 8 x float> undef)
+  ret <vscale x 8 x float> %vals
+}
+
 ; Code generate the worst case scenario when all vector types are legal.
 define <vscale x 16 x i8> @masked_gather_nxv16i8(i8* %base, <vscale x 16 x i8> %indices, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv16i8:
@@ -128,3 +226,11 @@ declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32,
 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
 declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
 declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)
+
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.masked.gather.nxv8f16(<vscale x 8 x half*>, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.masked.gather.nxv8bf16(<vscale x 8 x bfloat*>, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x float*>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
+declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x double*>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)
+attributes #0 = { "target-features"="+sve,+bf16" }