@@ -67,10 +67,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
6767 // The libc version is likely to be faster for these cases. It can use the
6868 // address value and run time information about the CPU.
6969 if (Alignment < Align (4 ) || !ConstantSize ||
70- ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())
70+ ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())
7171 return SDValue ();
7272
73+ // If we have minsize, then ignore the destination alignment.
74+ // On x86, string stores work at any alignment, so it is not needed here.
75+ // As long as the size is a multiple of the store width, we can use the
76+ // minimum number of instructions without always having to resort to stosb.
77+ //
78+ // Because this is a feature specific to x86, we must handle it here.
7379 uint64_t SizeVal = ConstantSize->getZExtValue ();
80+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
81+ if ((SizeVal & 7 ) == 0 && Subtarget.is64Bit ())
82+ Alignment = Align (8 );
83+ else if ((SizeVal & 3 ) == 0 )
84+ Alignment = Align (4 );
85+ else if ((SizeVal & 1 ) == 0 )
86+ Alignment = Align (2 );
87+ else
88+ Alignment = Align (1 );
89+ }
90+
7491 SDValue InGlue;
7592 EVT AVT;
7693 SDValue Count;
@@ -86,7 +103,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
86103 ValReg = X86::EAX;
87104 Val = (Val << 8 ) | Val;
88105 Val = (Val << 16 ) | Val;
89- if (Subtarget.is64Bit () && Alignment > Align (8 )) { // QWORD aligned
106+ if (Subtarget.is64Bit () && Alignment > Align (4 )) { // QWORD aligned
90107 AVT = MVT::i64 ;
91108 ValReg = X86::RAX;
92109 Val = (Val << 32 ) | Val;
@@ -103,12 +120,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103120 Count = DAG.getIntPtrConstant (SizeVal, dl);
104121 }
105122
106- if (AVT.bitsGT (MVT::i8 )) {
107- unsigned UBytes = AVT.getSizeInBits () / 8 ;
108- Count = DAG.getIntPtrConstant (SizeVal / UBytes, dl);
109- BytesLeft = SizeVal % UBytes;
110- }
111-
123+ const uint64_t BlockBytes = AVT.getSizeInBits () / 8 ;
124+ const uint64_t BlockCount = SizeVal / BlockBytes;
125+ Count = DAG.getIntPtrConstant (BlockCount, dl);
126+ BytesLeft = SizeVal % BlockBytes;
112127 Chain = DAG.getCopyToReg (Chain, dl, ValReg, DAG.getConstant (Val, dl, AVT),
113128 InGlue);
114129 InGlue = Chain.getValue (1 );
@@ -120,34 +135,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120135 }
121136
122137 bool Use64BitRegs = Subtarget.isTarget64BitLP64 ();
123- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124- Count, InGlue);
138+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
139+ InGlue);
125140 InGlue = Chain.getValue (1 );
126- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127- Dst, InGlue);
141+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
142+ InGlue);
128143 InGlue = Chain.getValue (1 );
129144
130145 SDVTList Tys = DAG.getVTList (MVT::Other, MVT::Glue);
131- SDValue Ops[] = { Chain, DAG.getValueType (AVT), InGlue };
132- Chain = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
133-
134- if (BytesLeft) {
135- // Handle the last 1 - 7 bytes.
136- unsigned Offset = SizeVal - BytesLeft;
137- EVT AddrVT = Dst.getValueType ();
138- EVT SizeVT = Size.getValueType ();
139-
140- Chain =
141- DAG.getMemset (Chain, dl,
142- DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
143- DAG.getConstant (Offset, dl, AddrVT)),
144- Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
145- isVolatile, AlwaysInline,
146- /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset));
147- }
146+ SDValue Ops[] = {Chain, DAG.getValueType (AVT), InGlue};
147+ SDValue RepStos = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
148+
149+ // / RepStos can process the whole length.
150+ //
151+ // When we have the minsize attribute, the alignment was changed earlier in
152+ // the function to follow the size, so BytesLeft is guaranteed to be 0 in
153+ // that case.
154+ if (BytesLeft == 0 )
155+ return RepStos;
148156
149- // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150- return Chain;
157+ // Handle the last 1 - 7 bytes.
158+ SmallVector<SDValue, 4 > Results;
159+ Results.push_back (RepStos);
160+ unsigned Offset = SizeVal - BytesLeft;
161+ EVT AddrVT = Dst.getValueType ();
162+ EVT SizeVT = Size.getValueType ();
163+
164+ Results.push_back (
165+ DAG.getMemset (Chain, dl,
166+ DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
167+ DAG.getConstant (Offset, dl, AddrVT)),
168+ Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
169+ isVolatile, /* isAlwaysInline */ true ,
170+ /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset)));
171+
172+ return DAG.getNode (ISD::TokenFactor, dl, MVT::Other, Results);
151173}
152174
153175// / Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +242,42 @@ static SDValue emitConstantSizeRepmov(
220242 assert (!Subtarget.hasERMSB () && " No efficient RepMovs" );
221243 // / We assume runtime memcpy will do a better job for unaligned copies when
222244 // / ERMS is not present.
223- if (!AlwaysInline && (Alignment. value () & 3 ) != 0 )
245+ if (!AlwaysInline && (Alignment < Align ( 4 )) )
224246 return SDValue ();
225247
248+ // If we have minsize, then ignore the alignment of the operands.
249+ // On x86, string moves work at any alignment, so it is not needed here.
250+ // As long as the size is a multiple of the block width, we can use the
251+ // minimum number of instructions without always having to resort to movsb.
252+ //
253+ // Because this is a feature specific to x86, we must handle it here.
254+
255+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
256+ if ((Size & 15 ) == 0 && Subtarget.is64Bit ())
257+ Alignment = Align (16 );
258+ else if ((Size & 7 ) == 0 )
259+ Alignment = Align (8 );
260+ else if ((Size & 3 ) == 0 )
261+ Alignment = Align (4 );
262+ else if ((Size & 1 ) == 0 )
263+ Alignment = Align (2 );
264+ else
265+ Alignment = Align (1 );
266+ }
267+
226268 const MVT BlockType = getOptimalRepmovsType (Subtarget, Alignment);
227269 const uint64_t BlockBytes = BlockType.getSizeInBits () / 8 ;
228270 const uint64_t BlockCount = Size / BlockBytes;
229271 const uint64_t BytesLeft = Size % BlockBytes;
272+
273+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
274+ // Use the one instruction determined. Because we changed the alignment
275+ // earlier in the function to work on size when we have the minsize
276+ // attribute, it is guaranteed to process the entire length.
277+ return emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
278+ DAG.getIntPtrConstant (BlockCount, dl), BlockType);
279+ }
280+
230281 SDValue RepMovs =
231282 emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
232283 DAG.getIntPtrConstant (BlockCount, dl), BlockType);
@@ -237,11 +288,6 @@ static SDValue emitConstantSizeRepmov(
237288
238289 assert (BytesLeft && " We have leftover at this point" );
239290
240- // / In case we optimize for size we use repmovsb even if it's less efficient
241- // / so we can save the loads/stores of the leftover.
242- if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
243- return emitRepmovsB (Subtarget, DAG, dl, Chain, Dst, Src, Size);
244-
245291 // Handle the last 1 - 7 bytes.
246292 SmallVector<SDValue, 4 > Results;
247293 Results.push_back (RepMovs);
0 commit comments