@@ -28,23 +28,6 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
-/// Returns the best type to use with repmovs/repstos depending on alignment.
-static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
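The getOptimalRepType helper removed above maps the proven pointer alignment to the widest REP element it permits: i8 for alignment 1, i16 for 2, i32 for 4, and i64 for anything larger on 64-bit targets. A minimal standalone sketch of the same mapping, using plain integers instead of LLVM's MVT and Align types (repElementBytes is an illustrative name, not LLVM API):

    #include <cassert>
    #include <cstdint>

    // Widest REP STOS/MOVS element, in bytes, usable at a given alignment.
    static unsigned repElementBytes(uint64_t AlignVal, bool Is64Bit) {
      assert(AlignVal != 0 && (AlignVal & (AlignVal - 1)) == 0 &&
             "alignment must be a nonzero power of two");
      switch (AlignVal) {
      case 1: return 1;                // REP STOSB / MOVSB
      case 2: return 2;                // REP STOSW / MOVSW
      case 4: return 4;                // REP STOSD / MOVSD
      default: return Is64Bit ? 8 : 4; // REP STOSQ needs a 64-bit target
      }
    }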
@@ -61,127 +44,102 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-/// Emit a single REP STOSB instruction for a particular constant size.
-static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
-                           const SDLoc &dl, SDValue Chain, SDValue Dst,
-                           SDValue Val, SDValue Size, MVT AVT) {
-  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  unsigned AX = X86::AL;
-  switch (AVT.getSizeInBits()) {
-  case 8:
-    AX = X86::AL;
-    break;
-  case 16:
-    AX = X86::AX;
-    break;
-  case 32:
-    AX = X86::EAX;
-    break;
-  default:
-    AX = X86::RAX;
-    break;
-  }
-
-  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
-  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
-
-  SDValue InGlue;
-  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
-  InGlue = Chain.getValue(1);
-
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
-}
-
-/// Emit a single REP STOSB instruction for a particular constant size.
-static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
-                            const SDLoc &dl, SDValue Chain, SDValue Dst,
-                            SDValue Val, uint64_t Size) {
-  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
-}
-
-/// Returns a REP STOS instruction, possibly with a few load/stores to implement
-/// a constant size memory set. In some cases where we know REP MOVS is
-/// inefficient we return an empty SDValue so the calling code can either
-/// generate a store sequence or call the runtime memset function.
-static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
-                                       const X86Subtarget &Subtarget,
-                                       const SDLoc &dl, SDValue Chain,
-                                       SDValue Dst, SDValue Val, uint64_t Size,
-                                       EVT SizeVT, Align Alignment,
-                                       bool isVolatile, bool AlwaysInline,
-                                       MachinePointerInfo DstPtrInfo) {
-  /// In case we optimize for size, we use repstosb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
-    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-      // Special case 0 because otherwise we get large literals,
-      // which causes larger encoding.
-      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
-        MVT BlockType = MVT::i32;
-        const uint64_t BlockBits = BlockType.getSizeInBits();
-        const uint64_t BlockBytes = BlockBits / 8;
-        const uint64_t BlockCount = Size / BlockBytes;
-
-        Val = DAG.getConstant(0, dl, BlockType);
-        // repstosd is same size as repstosb
-        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
-      }
-    }
-    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
-  }
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
 
-  if (Size > Subtarget.getMaxInlineSizeThreshold())
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
     return SDValue();
 
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4))
+  if (Alignment < Align(4) || !ConstantSize ||
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
 
-  MVT BlockType = MVT::i8;
-  uint64_t BlockCount = Size;
-  uint64_t BytesLeft = 0;
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InGlue;
+  EVT AVT;
+  SDValue Count;
+  unsigned BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    BlockType = getOptimalRepType(Subtarget, Alignment);
-    uint64_t Value = ValC->getZExtValue() & 255;
-    const uint64_t BlockBits = BlockType.getSizeInBits();
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    if (Alignment >= Align(4)) {
+      // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8) | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+    } else if (Alignment == Align(2)) {
+      // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+    } else {
+      // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+    }
 
-    if (BlockBits >= 16)
-      Value = (Value << 8) | Value;
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
+      BytesLeft = SizeVal % UBytes;
+    }
 
-    if (BlockBits >= 32)
-      Value = (Value << 16) | Value;
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
+                             InGlue);
+    InGlue = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count = DAG.getIntPtrConstant(SizeVal, dl);
+    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
+    InGlue = Chain.getValue(1);
+  }
 
-    if (BlockBits >= 64)
-      Value = (Value << 32) | Value;
+  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
+                           Count, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
+                           Dst, InGlue);
+  InGlue = Chain.getValue(1);
 
-    const uint64_t BlockBytes = BlockBits / 8;
-    BlockCount = Size / BlockBytes;
-    BytesLeft = Size % BlockBytes;
-    Val = DAG.getConstant(Value, dl, BlockType);
-  }
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
 
-  SDValue RepStos =
-      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
-                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
 
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = Size - BytesLeft;
+  unsigned Offset = SizeVal - BytesLeft;
   EVT AddrVT = Dst.getValueType();
+  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
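Two pieces of arithmetic in the restored body above are worth spelling out: the constant fill byte is replicated across the chosen element width by shift-or steps (Val = (Val << 8) | Val, then << 16, then << 32), and the total size is split into a REP STOS iteration count plus a 1-7 byte tail that the trailing DAG.getMemset covers. The glued CopyToReg nodes simply pin the splat, count, and destination into RAX/RCX/RDI (or their 32-bit halves) immediately before the REP_STOS node. A plain C++ sketch of just the arithmetic, outside SelectionDAG (struct and function names are illustrative):

    #include <cstdint>

    struct RepStosPlan {
      uint64_t Splat; // value loaded into AL/AX/EAX/RAX
      uint64_t Count; // iteration count loaded into RCX/ECX
      uint64_t Tail;  // leftover bytes stored by the trailing memset
    };

    // Splat a fill byte across an element of 1, 2, 4, or 8 bytes and
    // split Size into a REP STOS count plus tail, as the lowering does.
    static RepStosPlan planRepStos(uint8_t Byte, uint64_t Size,
                                   unsigned ElemBytes) {
      uint64_t V = Byte;
      if (ElemBytes >= 2) V = (V << 8) | V;  // 0xAB -> 0xABAB
      if (ElemBytes >= 4) V = (V << 16) | V; // -> 0xABABABAB
      if (ElemBytes >= 8) V = (V << 32) | V; // -> 0xABABABABABABABAB
      return {V, Size / ElemBytes, Size % ElemBytes};
    }

    // planRepStos(0xAB, 30, 4) -> {0xABABABAB, 7, 2}: seven dword stores
    // via REP STOSD, then a two-byte memset for the tail.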
@@ -194,31 +152,6 @@ static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
-
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
-
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  if (!ConstantSize)
-    return SDValue();
-
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
-  return emitConstantSizeRepstos(
-      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
-      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
-}
-
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
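The deleted wrapper's two early exits survive inline in EmitTargetCodeForMemset above. The first matters because REP STOS hard-codes its addressing: value in RAX, count in RCX, destination through plain [RDI] with no segment override, so destinations in x86's segment-relative address spaces must take the generic lowering. A tiny guard sketching that check, assuming the conventional x86 IR numbering (GS = 256, FS = 257, SS = 258); the function is illustrative, not LLVM API:

    // Segment-relative destinations cannot be reached through a bare [RDI].
    static bool repStosCanAddress(unsigned AddrSpace) {
      return AddrSpace < 256; // mirrors the getAddrSpace() >= 256 bail-out
    }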
@@ -249,6 +182,24 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                             DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
+/// Returns the best type to use with repmovs depending on alignment.
+static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
+                                 Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -258,10 +209,6 @@ static SDValue emitConstantSizeRepmov(
     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
-  /// In case we optimize for size, we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
 
   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
   /// efficient.
@@ -275,10 +222,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment < Align(4)))
+  if (!AlwaysInline && (Alignment.value() & 3) != 0)
     return SDValue();
 
-  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
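The rewritten alignment test changes spelling, not behavior: Align values are always powers of two, so (Alignment.value() & 3) != 0 holds exactly for alignments 1 and 2, the same cases as Alignment < Align(4). A standalone check of that equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      // For every power-of-two alignment, "low two bits set" and "< 4" agree.
      for (uint64_t A = 1; A <= 4096; A <<= 1)
        assert(((A & 3) != 0) == (A < 4));
    }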
@@ -292,6 +239,11 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
+  /// In case we optimize for size we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
+
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
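Relocating the hasMinSize check here (it previously sat at the top of emitConstantSizeRepmov, as the earlier hunk shows) narrows it to the only case where it pays off: when BytesLeft is zero the wide REP MOVS has already been returned, so byte-wise REP MOVSB (a two-byte F3 A4 encoding) is substituted only when a 1-7 byte tail would otherwise cost extra load/store instructions. A hypothetical helper expressing the restored ordering:

    #include <cstdint>

    // At minsize, prefer whole-length REP MOVSB only when the wide copy
    // would leave a tail needing separate loads and stores.
    static bool preferRepMovsbAtMinSize(bool HasMinSize, uint64_t BytesLeft) {
      return HasMinSize && BytesLeft != 0;
    }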
@@ -330,7 +282,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes
+  /// Handle constant sizes,
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
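For context, the surrounding EmitTargetCodeForMemcpy dispatches in two tiers: with the hidden -x86-use-fsrm-for-memcpy flag on an FSRM-capable CPU it emits REP MOVSB unconditionally, even for runtime sizes; otherwise only constant sizes reach emitConstantSizeRepmov, which may still return an empty SDValue to defer to the generic store-sequence or libcall lowering. A sketch of that decision with hypothetical names:

    enum class MemcpyLowering { RepMovsB, ConstantRepMovs, Generic };

    static MemcpyLowering pickMemcpyLowering(bool UseFSRMFlag, bool HasFSRM,
                                             bool SizeIsConstant) {
      if (UseFSRMFlag && HasFSRM)
        return MemcpyLowering::RepMovsB; // fast short REP MOV covers any size
      if (SizeIsConstant)
        return MemcpyLowering::ConstantRepMovs; // may still bail to SDValue()
      return MemcpyLowering::Generic;
    }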