Skip to content

Commit 9e606b5

Browse files
Ensure that we lookup the correct instruction for embedded masking/broadcast scenarios (#117828)
1 parent 97d5ac8 commit 9e606b5

File tree

7 files changed

+290
-84
lines changed

7 files changed

+290
-84
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5806,9 +5806,37 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
58065806
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType, compiler);
58075807
attr = emitActualTypeSize(Compiler::getSIMDTypeForSize(hwintrinsic->GetSimdSize()));
58085808

5809-
if (intrinsicId == NI_X86Base_Extract)
5809+
// We may have opportunistically selected an EVEX only instruction
5810+
// that isn't actually required, so fallback to the VEX compatible
5811+
// encoding to potentially save on the number of bytes emitted.
5812+
5813+
switch (ins)
58105814
{
5811-
ins = INS_pextrw_sse42;
5815+
case INS_pextrw:
5816+
{
5817+
// The encoding which supports containment is SSE4.1+ only
5818+
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42));
5819+
5820+
ins = INS_pextrw_sse42;
5821+
break;
5822+
}
5823+
5824+
case INS_vextractf64x2:
5825+
{
5826+
ins = INS_vextractf32x4;
5827+
break;
5828+
}
5829+
5830+
case INS_vextracti64x2:
5831+
{
5832+
ins = INS_vextracti32x4;
5833+
break;
5834+
}
5835+
5836+
default:
5837+
{
5838+
break;
5839+
}
58125840
}
58135841

58145842
// The hardware intrinsics take unsigned bytes between [0, 255].

src/coreclr/jit/gentree.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20539,7 +20539,7 @@ bool GenTree::isEmbeddedBroadcastCompatibleHWIntrinsic(Compiler* comp) const
2053920539
{
2054020540
NamedIntrinsic intrinsicId = AsHWIntrinsic()->GetHWIntrinsicId();
2054120541
var_types simdBaseType = AsHWIntrinsic()->GetSimdBaseType();
20542-
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, simdBaseType, nullptr);
20542+
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, simdBaseType, comp);
2054320543

2054420544
if (comp->codeGen->instIsEmbeddedBroadcastCompatible(ins))
2054520545
{
@@ -20784,7 +20784,12 @@ bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize,
2078420784

2078520785
if (tgtSimdBaseJitType != CORINFO_TYPE_UNDEF)
2078620786
{
20787-
ins = HWIntrinsicInfo::lookupIns(intrinsic, simdBaseType, comp);
20787+
var_types tgtSimdBaseType = JitType2PreciseVarType(tgtSimdBaseJitType);
20788+
20789+
instruction tgtIns = HWIntrinsicInfo::lookupIns(intrinsic, tgtSimdBaseType, comp);
20790+
assert(ins != tgtIns);
20791+
20792+
ins = tgtIns;
2078820793
maskBaseSize = CodeGenInterface::instKMaskBaseSize(ins);
2078920794
}
2079020795

src/coreclr/jit/hwintrinsic.cpp

Lines changed: 103 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -87,108 +87,147 @@ instruction HWIntrinsicInfo::lookupIns(NamedIntrinsic id, var_types type, Compil
8787
#endif // TARGET_X86
8888

8989
#if defined(TARGET_XARCH)
90-
instruction evexIns = ins;
91-
92-
switch (ins)
90+
if (comp != nullptr)
9391
{
94-
case INS_movdqa32:
92+
instruction evexIns = ins;
93+
94+
switch (ins)
9595
{
96-
if (varTypeIsLong(type))
96+
case INS_movdqa32:
9797
{
98-
evexIns = INS_vmovdqa64;
98+
if (varTypeIsLong(type))
99+
{
100+
evexIns = INS_vmovdqa64;
101+
}
102+
break;
99103
}
100-
break;
101-
}
102104

103-
case INS_movdqu32:
104-
{
105-
if (varTypeIsLong(type))
105+
case INS_movdqu32:
106106
{
107-
evexIns = INS_vmovdqu64;
107+
if (varTypeIsLong(type))
108+
{
109+
evexIns = INS_vmovdqu64;
110+
}
111+
break;
108112
}
109-
break;
110-
}
111113

112-
case INS_vbroadcastf32x4:
113-
{
114-
if (type == TYP_DOUBLE)
114+
case INS_pandd:
115115
{
116-
evexIns = INS_vbroadcastf64x2;
116+
if (varTypeIsLong(type))
117+
{
118+
evexIns = INS_vpandq;
119+
}
120+
break;
117121
}
118-
break;
119-
}
120122

121-
case INS_vbroadcasti32x4:
122-
{
123-
if (varTypeIsLong(type))
123+
case INS_pandnd:
124124
{
125-
evexIns = INS_vbroadcasti64x2;
125+
if (varTypeIsLong(type))
126+
{
127+
evexIns = INS_vpandnq;
128+
}
129+
break;
126130
}
127-
break;
128-
}
129131

130-
case INS_vextractf32x4:
131-
{
132-
if (type == TYP_DOUBLE)
132+
case INS_pord:
133133
{
134-
evexIns = INS_vextractf64x2;
134+
if (varTypeIsLong(type))
135+
{
136+
evexIns = INS_vporq;
137+
}
138+
break;
135139
}
136-
else if (varTypeIsInt(type))
140+
141+
case INS_pxord:
137142
{
138-
evexIns = INS_vextracti32x4;
143+
if (varTypeIsLong(type))
144+
{
145+
evexIns = INS_vpxorq;
146+
}
147+
break;
139148
}
140-
else if (varTypeIsLong(type))
149+
150+
case INS_vbroadcastf32x4:
141151
{
142-
evexIns = INS_vextracti64x2;
152+
if (type == TYP_DOUBLE)
153+
{
154+
evexIns = INS_vbroadcastf64x2;
155+
}
156+
break;
143157
}
144-
break;
145-
}
146158

147-
case INS_vextracti32x4:
148-
{
149-
if (varTypeIsLong(type))
159+
case INS_vbroadcasti32x4:
160+
{
161+
if (varTypeIsLong(type))
162+
{
163+
evexIns = INS_vbroadcasti64x2;
164+
}
165+
break;
166+
}
167+
168+
case INS_vextractf32x4:
150169
{
151-
evexIns = INS_vextracti64x2;
170+
if (type == TYP_DOUBLE)
171+
{
172+
evexIns = INS_vextractf64x2;
173+
}
174+
else if (varTypeIsInt(type))
175+
{
176+
evexIns = INS_vextracti32x4;
177+
}
178+
else if (varTypeIsLong(type))
179+
{
180+
evexIns = INS_vextracti64x2;
181+
}
182+
break;
152183
}
153-
break;
154-
}
155184

156-
case INS_vinsertf32x4:
157-
{
158-
if (type == TYP_DOUBLE)
185+
case INS_vextracti32x4:
159186
{
160-
evexIns = INS_vinsertf64x2;
187+
if (varTypeIsLong(type))
188+
{
189+
evexIns = INS_vextracti64x2;
190+
}
191+
break;
161192
}
162-
else if (varTypeIsInt(type))
193+
194+
case INS_vinsertf32x4:
163195
{
164-
evexIns = INS_vinserti32x4;
196+
if (type == TYP_DOUBLE)
197+
{
198+
evexIns = INS_vinsertf64x2;
199+
}
200+
else if (varTypeIsInt(type))
201+
{
202+
evexIns = INS_vinserti32x4;
203+
}
204+
else if (varTypeIsLong(type))
205+
{
206+
evexIns = INS_vinserti64x2;
207+
}
208+
break;
165209
}
166-
else if (varTypeIsLong(type))
210+
211+
case INS_vinserti32x4:
167212
{
168-
evexIns = INS_vinserti64x2;
213+
if (varTypeIsLong(type))
214+
{
215+
evexIns = INS_vinserti64x2;
216+
}
217+
break;
169218
}
170-
break;
171-
}
172219

173-
case INS_vinserti32x4:
174-
{
175-
if (varTypeIsLong(type))
220+
default:
176221
{
177-
evexIns = INS_vinserti64x2;
222+
break;
178223
}
179-
break;
180224
}
181225

182-
default:
226+
if ((evexIns != ins) && comp->canUseEvexEncoding())
183227
{
184-
break;
228+
ins = evexIns;
185229
}
186230
}
187-
188-
if ((evexIns != ins) && (comp != nullptr) && comp->canUseEvexEncoding())
189-
{
190-
ins = evexIns;
191-
}
192231
#endif // TARGET_XARCH
193232

194233
return ins;

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,6 +1059,44 @@ void CodeGen::genHWIntrinsic_R_RM(
10591059
{
10601060
instOptions = AddEmbBroadcastMode(instOptions);
10611061
}
1062+
else if ((instOptions == INS_OPTS_NONE) && !GetEmitter()->IsVexEncodableInstruction(ins))
1063+
{
1064+
// We may have opportunistically selected an EVEX only instruction
1065+
// that isn't actually required, so fallback to the VEX compatible
1066+
// encoding to potentially save on the number of bytes emitted.
1067+
1068+
switch (ins)
1069+
{
1070+
case INS_vbroadcastf64x2:
1071+
{
1072+
ins = INS_vbroadcastf32x4;
1073+
break;
1074+
}
1075+
1076+
case INS_vbroadcasti64x2:
1077+
{
1078+
ins = INS_vbroadcasti32x4;
1079+
break;
1080+
}
1081+
1082+
case INS_vmovdqa64:
1083+
{
1084+
ins = INS_movdqa32;
1085+
break;
1086+
}
1087+
1088+
case INS_vmovdqu64:
1089+
{
1090+
ins = INS_movdqu32;
1091+
break;
1092+
}
1093+
1094+
default:
1095+
{
1096+
break;
1097+
}
1098+
}
1099+
}
10621100

10631101
OperandDesc rmOpDesc = genOperandDesc(ins, rmOp);
10641102

0 commit comments

Comments
 (0)