@@ -28,6 +28,23 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
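+  // Pick the widest store unit that the guaranteed alignment lets us use.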
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -44,102 +61,127 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOS{B,W,D,Q} instruction.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
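+  // Select the accumulator register matching the element width
+  // (AL/AX/EAX/RAX).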
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }
 
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
 
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
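+  // rep stos implicitly takes the value in rAX, the count in rCX and the
+  // destination in rDI, so wire all three up as glued register copies.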
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                            const SDLoc &dl, SDValue Chain, SDValue Dst,
+                            SDValue Val, uint64_t Size) {
+  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP STOS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+  /// In case we optimize for size, we use repstosb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
+      // Special case 0 because otherwise we get large literals,
+      // which causes larger encoding.
+      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
+        MVT BlockType = MVT::i32;
+        const uint64_t BlockBits = BlockType.getSizeInBits();
+        const uint64_t BlockBytes = BlockBits / 8;
+        const uint64_t BlockCount = Size / BlockBytes;
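+        // e.g. a 64-byte zero memset becomes 16 i32 blocks for rep stosd.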
+
+        Val = DAG.getConstant(0, dl, BlockType);
+        // repstosd is same size as repstosb
+        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+      }
+    }
+    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
+  }
+
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
 
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;
 
-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;
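+    // The byte is now splatted across the whole block,
+    // e.g. 0xAB -> 0xABABABABABABABAB when BlockType is i64.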
 
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }
 
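+  // Emit the bulk of the set as a single REP STOS of BlockCount units.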
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
 
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
   EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
@@ -152,6 +194,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +249,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -209,6 +258,10 @@ static SDValue emitConstantSizeRepmov(
     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
 
   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
   /// efficient.
@@ -222,10 +275,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
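+  // Note: 4-byte alignment still selects i32 blocks; only 8+ gives i64.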
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,11 +292,6 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
@@ -282,7 +330,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes,
+  /// Handle constant sizes
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),