Commit 574b967

JIT: Emulate missing x86 shift instructions for xplat intrinsics (dotnet#111108)
* emulate missing x86 shift instructions
* disable vpsraq emulation on 32-bit
* use logical shift for mask
* fix disasm for shift instructions
* allow vpsraq emulation on 32-bit for const shift amount
* review feedback

---------

Co-authored-by: Tanner Gooding <[email protected]>
1 parent 289aa17 commit 574b967
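In scalar terms, the emulation this commit performs for arithmetic right shift can be sketched as follows (an illustrative C++ sketch of the identity, not code from the commit):

#include <cstdint>

// Arithmetic right shift rebuilt from a logical shift plus a sign select --
// the same identity gtNewSimdBinOpNode applies per SIMD lane (see the
// gentree.cpp diff below).
static int64_t EmulatedSar64(int64_t x, unsigned n) // 0 <= n <= 63
{
    uint64_t logical = static_cast<uint64_t>(x) >> n; // vpsrlq analogue
    uint64_t mask    = ~0ULL >> n;                    // bits that came from x
    uint64_t sign    = (x < 0) ? ~0ULL : 0;           // vpcmpgtq(0, x) analogue
    // ConditionalSelect(mask, logical, sign): take shifted bits where the
    // mask is set, sign bits where it is not.
    return static_cast<int64_t>((logical & mask) | (sign & ~mask));
}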

4 files changed, +78 -27 lines changed

src/coreclr/jit/emitxarch.cpp

Lines changed: 9 additions & 1 deletion
@@ -12358,10 +12358,18 @@ void emitter::emitDispIns(
                 reg2 = reg3;
                 reg3 = tmp;
             }
+
+            emitAttr attr3 = attr;
+            if (hasTupleTypeInfo(ins) && ((insTupleTypeInfo(ins) & INS_TT_MEM128) != 0))
+            {
+                // Shift instructions take xmm for the 3rd operand regardless of instruction size.
+                attr3 = EA_16BYTE;
+            }
+
             printf("%s", emitRegName(id->idReg1(), attr));
             emitDispEmbMasking(id);
             printf(", %s, ", emitRegName(reg2, attr));
-            printf("%s", emitRegName(reg3, attr));
+            printf("%s", emitRegName(reg3, attr3));
             emitDispEmbRounding(id);
             break;
         }
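For context (an illustration, not part of the diff): the vector shift instructions take their count in a 128-bit register even when the data operands are wider, so a correct listing reads vpsraq zmm1, zmm2, xmm3 rather than vpsraq zmm1, zmm2, zmm3. The attr3 override above makes the disassembler print the xmm form for that third operand.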

src/coreclr/jit/gentree.cpp

Lines changed: 62 additions & 16 deletions
@@ -7936,12 +7936,26 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type)

     switch (type)
     {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            return gtNewIconNode(0xFF);
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            return gtNewIconNode(0xFFFF);
+        }
+
         case TYP_INT:
+        case TYP_UINT:
         {
             return gtNewIconNode(-1);
         }

         case TYP_LONG:
+        case TYP_ULONG:
         {
             return gtNewLconNode(-1);
         }
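The small-type constants above are the all-bits-set pattern kept zero-extended to fit the lane; a one-liner capturing the rule (illustrative only, the helper name is hypothetical):

#include <cstdint>

// All bits set in a lane of `laneBytes` bytes: 0xFF, 0xFFFF, 0xFFFFFFFF, ...
// For TYP_INT/TYP_LONG the JIT returns -1 instead, which is the same bit
// pattern at full register width.
static uint64_t AllBitsSet(unsigned laneBytes) // 1, 2, 4, or 8
{
    return ~0ULL >> (64 - 8 * laneBytes);
}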
@@ -20925,8 +20939,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(

             unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

-            GenTree* nonConstantByteShiftCountOp = NULL;
-
             if (op2->IsCnsIntOrI())
             {
                 op2->AsIntCon()->gtIconVal &= shiftCountMask;
@@ -21090,39 +21102,73 @@
         }

 #if defined(TARGET_XARCH)
-        case GT_RSZ:
         case GT_LSH:
+        case GT_RSH:
+        case GT_RSZ:
         {
-            // We don't have actual instructions for shifting bytes, so we'll emulate them
-            // by shifting 32-bit values and masking off the bits that should be zeroed.
+            // This emulates byte shift instructions, which don't exist in x86 SIMD,
+            // plus arithmetic shift of qwords, which did not exist before AVX-512.

-            assert(varTypeIsByte(simdBaseType));
+            assert(varTypeIsByte(simdBaseType) || (varTypeIsLong(simdBaseType) && (op == GT_RSH)));

-            intrinsic =
-                GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, TYP_INT, simdSize, false);
+            // We will emulate arithmetic shift by using logical shift and then masking in the sign bits.
+            genTreeOps instrOp = op == GT_RSH ? GT_RSZ : op;
+            intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, instrOp, op1, op2ForLookup,
+                                                                     genActualType(simdBaseType), simdSize, false);
             assert(intrinsic != NI_Illegal);

             GenTree* maskAmountOp;

             if (op2->IsCnsIntOrI())
             {
                 ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
-                ssize_t mask       = op == GT_RSZ ? (255 >> shiftCount) : ((255 << shiftCount) & 0xFF);
-
-                maskAmountOp = gtNewIconNode(mask, type);
+                if (varTypeIsByte(simdBaseType))
+                {
+                    ssize_t mask = op == GT_LSH ? ((0xFF << shiftCount) & 0xFF) : (0xFF >> shiftCount);
+                    maskAmountOp = gtNewIconNode(mask, type);
+                }
+                else
+                {
+                    int64_t mask = static_cast<int64_t>(0xFFFFFFFFFFFFFFFFULL >> shiftCount);
+                    maskAmountOp = gtNewLconNode(mask);
+                }
             }
             else
             {
                 assert(op2->OperIsHWIntrinsic(NI_Vector128_CreateScalar));

-                GenTree* nonConstantByteShiftCountOp = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
-                maskAmountOp = gtNewOperNode(op, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+                GenTree* shiftCountDup = fgMakeMultiUse(&op2->AsHWIntrinsic()->Op(1));
+                if (op == GT_RSH)
+                {
+                    // For arithmetic shift, we will be using ConditionalSelect to mask in the sign bits, which means
+                    // the mask will be evaluated before the shift. We swap the copied operand with the shift amount
+                    // operand here in order to preserve correct evaluation order for the masked shift count.
+                    std::swap(shiftCountDup, op2->AsHWIntrinsic()->Op(1));
+                }
+
+                maskAmountOp = gtNewOperNode(instrOp, genActualType(simdBaseType), gtNewAllBitsSetConNode(simdBaseType),
+                                             shiftCountDup);
             }

-            GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
-            GenTree* maskOp  = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+            if (op == GT_RSH)
+            {
+                GenTree* op1Dup = fgMakeMultiUse(&op1);
+                GenTree* signOp =
+                    gtNewSimdCmpOpNode(GT_GT, type, gtNewZeroConNode(type), op1Dup, simdBaseJitType, simdSize);
+
+                CorInfoType shiftType = varTypeIsSmall(simdBaseType) ? CORINFO_TYPE_INT : simdBaseJitType;
+                GenTree*    shiftOp   = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, shiftType, simdSize);
+                GenTree*    maskOp    = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+
+                return gtNewSimdCndSelNode(type, maskOp, shiftOp, signOp, simdBaseJitType, simdSize);
+            }
+            else
+            {
+                GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
+                GenTree* maskOp  = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

-            return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+                return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+            }
         }
 #endif // TARGET_XARCH

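For the byte case, the masking trick above can be shown concretely with SSE2 intrinsics (a minimal sketch of the same idea, not the JIT's output; ShiftLeftBytes is a hypothetical helper name):

#include <emmintrin.h> // SSE2

// Left-shift each byte lane by n (0-7): shift the 32-bit lanes, then clear
// the bits that crossed in from the neighboring byte below.
static __m128i ShiftLeftBytes(__m128i v, int n)
{
    __m128i shifted = _mm_sll_epi32(v, _mm_cvtsi32_si128(n));
    __m128i mask    = _mm_set1_epi8((char)((0xFF << n) & 0xFF));
    return _mm_and_si128(shifted, mask);
}

Logical right shift of bytes works the same way with _mm_srl_epi32 and the mask 0xFF >> n, matching the constant masks computed in the diff.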

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 6 additions & 9 deletions
@@ -3443,20 +3443,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         {
             assert(sig->numArgs == 2);

-            if (varTypeIsByte(simdBaseType))
-            {
-                // byte and sbyte would require more work to support
-                break;
-            }
-
-            if (varTypeIsLong(simdBaseType) || (simdBaseType == TYP_DOUBLE))
+#if defined(TARGET_X86)
+            if ((simdBaseType == TYP_LONG) || (simdBaseType == TYP_DOUBLE))
             {
-                if (!compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+                if (!compOpportunisticallyDependsOn(InstructionSet_EVEX) && !impStackTop(0).val->IsCnsIntOrI())
                 {
-                    // long, ulong, and double would require more work to support
+                    // If vpsraq is available, we can use that. We can also trivially emulate arithmetic shift by const
+                    // amount. Otherwise, more work is required for long types, so we fall back to managed for now.
                     break;
                 }
             }
+#endif // TARGET_X86

             if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
             {
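The diff does not say why the non-constant case stays blocked on 32-bit targets; a plausible reading (an assumption, not stated in the commit) is that the sign-bit mask requires a 64-bit logical shift by a runtime amount, which 32-bit x86 must lower to a multi-instruction sequence or helper call, whereas a constant amount folds to a literal:

#include <cstdint>

// With a constant count the mask is a compile-time literal; with a variable
// count, ~0ULL >> n is a genuine 64-bit shift that 32-bit x86 cannot do in
// a single instruction.
constexpr uint64_t maskConst = ~0ULL >> 5; // folds to 0x07FFFFFFFFFFFFFF

static uint64_t MaskVar(unsigned n) // shrd/shr sequence or helper on x86-32
{
    return ~0ULL >> n;
}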

src/coreclr/jit/importercalls.cpp

Lines changed: 1 addition & 1 deletion
@@ -3319,7 +3319,7 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd,

     bool betterToExpand = false;

-    // Allow some lighweight intrinsics in Tier0 which can improve throughput
+    // Allow some lightweight intrinsics in Tier0 which can improve throughput
     // we're fine if intrinsic decides to not expand itself in this case unlike mustExpand.
     if (!mustExpand && opts.Tier0OptimizationEnabled())
     {
