@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
 }
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
-                      X86MemOperand x86memop, string OpcodeStr,
-                      Domain d, Predicate pred> {
+                      X86MemOperand x86memop, string OpcodeStr, Domain d> {
   // AVX
-  let Predicates = [UseAVX, OptForSize] in
   defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                               VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
-    let Predicates = [pred, NoSSE41_Or_OptForSize] in
     defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $dst|$dst, $src2}", d>;
   }
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
 }
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
-                        SSEPackedSingle, UseSSE1>, TB, XS;
+                        SSEPackedSingle>, TB, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
-                        SSEPackedDouble, UseSSE2>, TB, XD;
+                        SSEPackedDouble>, TB, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-}
 
-let Predicates = [UseAVX, OptForSize] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
 }
 
-let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
-  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
-  // MOVSS to the lower bits.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-}
-
 let Predicates = [UseSSE2] in
 def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
           (MOVSDrm addr:$src)>;
 
-let Predicates = [UseSSE1] in
-def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
-          (MOVSSrm addr:$src)>;
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (MOVSSrm addr:$src)>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
             (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [HasAVX, OptForSpeed] in {
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+// TODO: Remove these and let foldMemoryOperandCustom handle it?
+let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
-                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
-                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
-                    (i8 3))), sub_xmm)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41, OptForSpeed] in {
-  // With SSE41 we can use blends for these patterns.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+let Predicates = [UseSSE41] in {
   def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
             (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
             (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),