Skip to content

Commit 8b00880

Browse files
JIT: Shrink data section for const vector loads (#114040)
* shrink data for scalar instructions with contained const vectors
* allow CreateScalarUnsafe to remain scalar const
* formatting
* revert CreateScalarUnsafe changes
* shrink constants in genSetRegToConst
* fix build
* fix throughput regression
* use correct instruction for SIMD8
* update compress logic
* add comment

---------

Co-authored-by: Bruce Forstall <[email protected]>
1 parent 97ad3a7 commit 8b00880

File tree

15 files changed

+177
-231
lines changed

15 files changed

+177
-231
lines changed

src/coreclr/jit/codegen.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1539,7 +1539,7 @@ class CodeGen final : public CodeGenInterface
15391539
}
15401540
};
15411541

1542-
OperandDesc genOperandDesc(GenTree* op);
1542+
OperandDesc genOperandDesc(instruction ins, GenTree* op);
15431543

15441544
void inst_TT(instruction ins, emitAttr size, GenTree* op1);
15451545
void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2);

src/coreclr/jit/codegenxarch.cpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -437,8 +437,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
437437
}
438438
else
439439
{
440-
CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8);
441-
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
440+
emit->emitSimdConstCompressedLoad(val, attr, targetReg);
442441
}
443442
break;
444443
}
@@ -465,10 +464,9 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
465464
}
466465
else
467466
{
468-
simd16_t val16 = {};
467+
simd_t val16 = {};
469468
memcpy(&val16, &val12, sizeof(val12));
470-
CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
471-
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
469+
emit->emitSimdConstCompressedLoad(val, EA_16BYTE, targetReg);
472470
}
473471
break;
474472
}
@@ -495,8 +493,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
495493
}
496494
else
497495
{
498-
CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
499-
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
496+
emit->emitSimdConstCompressedLoad(val, attr, targetReg);
500497
}
501498
break;
502499
}
@@ -523,8 +520,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
523520
}
524521
else
525522
{
526-
CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32);
527-
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
523+
emit->emitSimdConstCompressedLoad(val, attr, targetReg);
528524
}
529525
break;
530526
}
@@ -549,8 +545,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t
549545
}
550546
else
551547
{
552-
CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64);
553-
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
548+
emit->emitSimdConstCompressedLoad(val, attr, targetReg);
554549
}
555550
break;
556551
}

src/coreclr/jit/emit.cpp

Lines changed: 120 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8195,35 +8195,142 @@ CORINFO_FIELD_HANDLE emitter::emitSimd16Const(simd16_t constValue)
81958195
return emitComp->eeFindJitDataOffs(cnum);
81968196
}
81978197

8198-
#if defined(TARGET_XARCH)
8199-
CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue)
8198+
#ifdef TARGET_XARCH
8199+
//------------------------------------------------------------------------
8200+
// emitSimdConst: Create a simd data section constant.
8201+
//
8202+
// Arguments:
8203+
// constValue - constant value
8204+
// attr - The EA_SIZE for the constant type
8205+
//
8206+
// Return Value:
8207+
// A field handle representing the data offset to access the constant.
8208+
//
8209+
// Note:
8210+
// Access to inline data is 'abstracted' by a special type of static member
8211+
// (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
8212+
// to constant data, not a real static field.
8213+
//
8214+
CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr)
82008215
{
8201-
unsigned cnsSize = 32;
8202-
unsigned cnsAlign = cnsSize;
8216+
unsigned cnsSize = EA_SIZE(attr);
8217+
unsigned cnsAlign = cnsSize;
8218+
var_types dataType = (cnsSize >= 8) ? emitComp->getSIMDTypeForSize(cnsSize) : TYP_FLOAT;
82038219

8220+
#ifdef TARGET_XARCH
82048221
if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
82058222
{
82068223
cnsAlign = dataSection::MIN_DATA_ALIGN;
82078224
}
8225+
#endif // TARGET_XARCH
82088226

8209-
UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD32);
8227+
UNATIVE_OFFSET cnum = emitDataConst(constValue, cnsSize, cnsAlign, dataType);
82108228
return emitComp->eeFindJitDataOffs(cnum);
82118229
}
82128230

8213-
CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue)
8231+
//------------------------------------------------------------------------
8232+
// emitSimdConstCompressedLoad: Create a simd data section constant,
8233+
// compressing it if possible, and emit an appropriate instruction
8234+
// to load or broadcast the constant to a register.
8235+
//
8236+
// Arguments:
8237+
// constValue - constant value
8238+
// attr - The EA_SIZE for the constant type
8239+
// targetReg - The target register
8240+
//
8241+
void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg)
82148242
{
8215-
unsigned cnsSize = 64;
8216-
unsigned cnsAlign = cnsSize;
8243+
assert(EA_SIZE(attr) >= 8 && EA_SIZE(attr) <= 64);
82178244

8218-
if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
8245+
unsigned cnsSize = EA_SIZE(attr);
8246+
unsigned dataSize = cnsSize;
8247+
instruction ins = (cnsSize == 8) ? INS_movsd_simd : INS_movups;
8248+
8249+
// Most constant vectors tend to have repeated values, so we will first check to see if
8250+
// we can replace a full vector load with a smaller broadcast.
8251+
8252+
if ((dataSize == 64) && (constValue->v256[1] == constValue->v256[0]))
82198253
{
8220-
cnsAlign = dataSection::MIN_DATA_ALIGN;
8254+
assert(emitComp->IsBaselineVector512IsaSupportedDebugOnly());
8255+
dataSize = 32;
8256+
ins = INS_vbroadcastf32x8;
82218257
}
82228258

8223-
UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64);
8224-
return emitComp->eeFindJitDataOffs(cnum);
8225-
}
8259+
if ((dataSize == 32) && (constValue->v128[1] == constValue->v128[0]))
8260+
{
8261+
assert(emitComp->IsBaselineVector256IsaSupportedDebugOnly());
8262+
dataSize = 16;
8263+
ins = INS_vbroadcastf128;
8264+
}
82268265

8266+
if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0]))
8267+
{
8268+
if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) ||
8269+
emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX))
8270+
{
8271+
dataSize = 8;
8272+
ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd;
8273+
}
8274+
}
8275+
8276+
// `vbroadcastss` fills the full SIMD register, so we can't do this last step if the
8277+
// original constant was smaller than a full reg (e.g. TYP_SIMD8)
8278+
8279+
if ((dataSize == 8) && (cnsSize >= 16) && (constValue->u32[1] == constValue->u32[0]))
8280+
{
8281+
if (emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX))
8282+
{
8283+
dataSize = 4;
8284+
ins = INS_vbroadcastss;
8285+
}
8286+
}
8287+
8288+
if (dataSize < cnsSize)
8289+
{
8290+
// We found a broadcast match, so emit the broadcast instruction and return.
8291+
// Here we use the original emitAttr for the instruction, because we need to
8292+
// produce a register of the original constant's size, filled with the pattern.
8293+
8294+
CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, EA_ATTR(dataSize));
8295+
emitIns_R_C(ins, attr, targetReg, hnd, 0);
8296+
return;
8297+
}
8298+
8299+
// Otherwise, if the upper lanes and/or elements of the constant are zero, we can use a
8300+
// smaller load, because all scalar and vector memory load instructions zero the uppers.
8301+
8302+
simd32_t zeroValue = {};
8303+
8304+
if ((dataSize == 64) && (constValue->v256[1] == zeroValue))
8305+
{
8306+
dataSize = 32;
8307+
}
8308+
8309+
if ((dataSize == 32) && (constValue->v128[1] == zeroValue.v128[0]))
8310+
{
8311+
dataSize = 16;
8312+
}
8313+
8314+
if ((dataSize == 16) && (constValue->u64[1] == 0))
8315+
{
8316+
dataSize = 8;
8317+
ins = INS_movsd_simd;
8318+
}
8319+
8320+
if ((dataSize == 8) && (constValue->u32[1] == 0))
8321+
{
8322+
dataSize = 4;
8323+
ins = INS_movss;
8324+
}
8325+
8326+
// Here we set the emitAttr to the size of the actual load. It will zero extend
8327+
// up to the native SIMD register size.
8328+
8329+
attr = EA_ATTR(dataSize);
8330+
8331+
CORINFO_FIELD_HANDLE hnd = emitSimdConst(constValue, attr);
8332+
emitIns_R_C(ins, attr, targetReg, hnd, 0);
8333+
}
82278334
#endif // TARGET_XARCH
82288335

82298336
#if defined(FEATURE_MASKED_HW_INTRINSICS)

src/coreclr/jit/emit.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2658,10 +2658,9 @@ class emitter
26582658
CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue);
26592659
CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue);
26602660
#if defined(TARGET_XARCH)
2661-
CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue);
2662-
CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue);
2661+
CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr);
2662+
void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg);
26632663
#endif // TARGET_XARCH
2664-
26652664
#if defined(FEATURE_MASKED_HW_INTRINSICS)
26662665
CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue);
26672666
#endif // FEATURE_MASKED_HW_INTRINSICS

src/coreclr/jit/emitxarch.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7331,6 +7331,7 @@ bool emitter::IsMovInstruction(instruction ins)
73317331
case INS_vmovdqu8:
73327332
case INS_vmovdqu16:
73337333
case INS_vmovdqu64:
7334+
case INS_movq:
73347335
case INS_movsd_simd:
73357336
case INS_movss:
73367337
case INS_movsx:
@@ -7350,7 +7351,6 @@ bool emitter::IsMovInstruction(instruction ins)
73507351
}
73517352

73527353
#if defined(TARGET_AMD64)
7353-
case INS_movq:
73547354
case INS_movsxd:
73557355
{
73567356
return true;
@@ -7501,14 +7501,14 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
75017501
break;
75027502
}
75037503

7504-
#if defined(TARGET_AMD64)
75057504
case INS_movq:
75067505
{
75077506
// Clears the upper bits
75087507
hasSideEffect = true;
75097508
break;
75107509
}
75117510

7511+
#if defined(TARGET_AMD64)
75127512
case INS_movsxd:
75137513
{
75147514
// Sign-extends the source
@@ -7781,13 +7781,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN
77817781
break;
77827782
}
77837783

7784-
#if defined(TARGET_AMD64)
77857784
case INS_movq:
77867785
{
77877786
assert(isFloatReg(dstReg) && isFloatReg(srcReg));
77887787
break;
77897788
}
77907789

7790+
#if defined(TARGET_AMD64)
77917791
case INS_movsxd:
77927792
{
77937793
assert(isGeneralRegister(dstReg) && isGeneralRegister(srcReg));

src/coreclr/jit/gentree.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28323,9 +28323,6 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
2832328323
case NI_AVX2_ConvertToVector256Int16:
2832428324
case NI_AVX2_ConvertToVector256Int32:
2832528325
case NI_AVX2_ConvertToVector256Int64:
28326-
case NI_AVX2_BroadcastVector128ToVector256:
28327-
case NI_AVX512F_BroadcastVector128ToVector512:
28328-
case NI_AVX512F_BroadcastVector256ToVector512:
2832928326
if (GetAuxiliaryJitType() == CORINFO_TYPE_PTR)
2833028327
{
2833128328
addr = Op(1);

src/coreclr/jit/hwintrinsic.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2078,9 +2078,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
20782078
case NI_AVX2_ConvertToVector256Int16:
20792079
case NI_AVX2_ConvertToVector256Int32:
20802080
case NI_AVX2_ConvertToVector256Int64:
2081-
case NI_AVX2_BroadcastVector128ToVector256:
2082-
case NI_AVX512F_BroadcastVector128ToVector512:
2083-
case NI_AVX512F_BroadcastVector256ToVector512:
20842081
{
20852082
// These intrinsics have both pointer and vector overloads
20862083
// We want to be able to differentiate between them so lets

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,7 +1076,7 @@ void CodeGen::genHWIntrinsic_R_RM(
10761076
instOptions = AddEmbBroadcastMode(instOptions);
10771077
}
10781078

1079-
OperandDesc rmOpDesc = genOperandDesc(rmOp);
1079+
OperandDesc rmOpDesc = genOperandDesc(ins, rmOp);
10801080

10811081
if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (rmOpDesc.GetKind() == OperandKind::Reg))
10821082
{
@@ -1361,7 +1361,7 @@ void CodeGen::genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins,
13611361
instOptions = AddEmbBroadcastMode(instOptions);
13621362
}
13631363

1364-
OperandDesc op2Desc = genOperandDesc(op2);
1364+
OperandDesc op2Desc = genOperandDesc(ins, op2);
13651365

13661366
if (op2Desc.IsContained())
13671367
{
@@ -1431,7 +1431,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM(instruction ins,
14311431
instOptions = AddEmbBroadcastMode(instOptions);
14321432
}
14331433

1434-
OperandDesc op3Desc = genOperandDesc(op3);
1434+
OperandDesc op3Desc = genOperandDesc(ins, op3);
14351435

14361436
if (((instOptions & INS_OPTS_EVEX_b_MASK) != 0) && (op3Desc.GetKind() == OperandKind::Reg))
14371437
{
@@ -1547,7 +1547,7 @@ void CodeGen::genHWIntrinsic_R_R_R_RM_I(
15471547
instOptions = AddEmbBroadcastMode(instOptions);
15481548
}
15491549

1550-
OperandDesc op3Desc = genOperandDesc(op3);
1550+
OperandDesc op3Desc = genOperandDesc(ins, op3);
15511551

15521552
switch (op3Desc.GetKind())
15531553
{
@@ -1898,11 +1898,15 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
18981898
op1 = loPart;
18991899
}
19001900

1901-
ins = INS_movq;
19021901
baseAttr = EA_8BYTE;
19031902
}
19041903
#endif // TARGET_X86
19051904

1905+
if (op1->isUsedFromMemory() && (baseAttr == EA_8BYTE))
1906+
{
1907+
ins = INS_movq;
1908+
}
1909+
19061910
genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, op1, instOptions);
19071911
}
19081912
else
@@ -1952,7 +1956,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
19521956
else
19531957
{
19541958
// `movq xmm xmm` zeroes the upper 64 bits.
1955-
genHWIntrinsic_R_RM(node, INS_movq, attr, targetReg, op1, instOptions);
1959+
emit->emitIns_Mov(INS_movq, attr, targetReg, op1Reg, /* canSkip */ false);
19561960
}
19571961
break;
19581962
}
@@ -2281,10 +2285,8 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
22812285
{
22822286
minValueInt.i32[i] = INT_MIN;
22832287
}
2284-
CORINFO_FIELD_HANDLE minValueFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(minValueInt.v128[0])
2285-
: emit->emitSimd32Const(minValueInt.v256[0]);
2286-
CORINFO_FIELD_HANDLE negOneFld = typeSize == EA_16BYTE ? emit->emitSimd16Const(negOneIntVec.v128[0])
2287-
: emit->emitSimd32Const(negOneIntVec.v256[0]);
2288+
CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize);
2289+
CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize);
22882290

22892291
// div-by-zero check
22902292
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions);

0 commit comments

Comments
 (0)