Skip to content

Commit b99b43a

Browse files
lrhnCommit Queue
authored and committed
Tweak some X64 assembler instructions to save bytes.
Changes `xorpd(r,r)` and `xorq(r, r)` to `xorps(r,r)` and `xorl(r, r)`, which saves one (REX prefix) byte and still clears the register. (At least if the register is in the first 8, otherwise a REX byte is needed just to name the register. Still better to let the compiler worry about that than hardwiring a 64-bit `q` or `pd` operation.) Changes `andq` to `AndImmediate` which recognizes that a 32-bit or smaller immediate only needs an `andl` (again potentially saving a REX prefix). Simplifies the code for clamping an Uint8Clamped value to have fewer branches. (If we know the value is not in the 0..255 range, two instructions can convert all negative values to zero, and all positive values to a value with the lower 8 bit set: `(~v)>>63`.) Simplifies the code for "is infinite" to use one less 64-bit immediate (by multiplying by 2 to shift out the sign, rather than masking it out using a 63-bit mask.) Tested: No new behavior, only optimization. Covered by existing tests. Change-Id: I94e66c2ff39f0a207649f657e4da1ed43e4e819e Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/385741 Commit-Queue: Lasse Nielsen <[email protected]> Reviewed-by: Martin Kustermann <[email protected]>
1 parent a6d675a commit b99b43a

File tree

2 files changed

+56
-50
lines changed

2 files changed

+56
-50
lines changed

runtime/vm/compiler/assembler/assembler_x64.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,11 @@ class Assembler : public AssemblerBase {
724724
void jmp(const ExternalLabel* label);
725725
void jmp(const Code& code);
726726

727+
/// Moves an XMM register's content to a 64-bit register.
728+
void MoveFpuRegisterToRegister(Register dst, FpuRegister src) {
729+
movq(dst, src);
730+
}
731+
727732
// Issue memory to memory move through a TMP register.
728733
// TODO(koda): Assert that these are not used for heap objects.
729734
void MoveMemoryToMemory(const Address& dst, const Address& src) {
@@ -835,6 +840,14 @@ class Assembler : public AssemblerBase {
835840
void LoadDImmediate(FpuRegister dst, double immediate);
836841
void LoadQImmediate(FpuRegister dst, simd128_value_t immediate);
837842

843+
// Sets register to zero.
844+
// Affects flags (sets zero and parity flags; clears sign, carry and overflow).
845+
void ClearRegister(Register reg) { xorl(reg, reg); }
846+
847+
// Sets XMM register to zero.
848+
// Does not affect CPU flags (XORPS does not modify EFLAGS).
849+
void ClearFpuRegister(FpuRegister reg) { xorps(reg, reg); }
850+
838851
void LoadIsolate(Register dst);
839852
void LoadIsolateGroup(Register dst);
840853
void LoadDispatchTable(Register dst);

runtime/vm/compiler/backend/il_x64.cc

Lines changed: 43 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ void IfThenElseInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
544544

545545
// Clear upper part of the out register. We are going to use setcc on it
546546
// which is a byte move.
547-
__ xorq(RDX, RDX);
547+
__ ClearRegister(RDX);
548548

549549
// Emit comparison code. This must not overwrite the result register.
550550
// IfThenElseInstr::Supports() should prevent EmitConditionCode from using
@@ -650,7 +650,7 @@ void ConstantInstr::EmitMoveToLocation(FlowGraphCompiler* compiler,
650650
if (RepresentationUtils::IsUnboxedInteger(representation())) {
651651
const int64_t value = Integer::Cast(value_).Value();
652652
if (value == 0) {
653-
__ xorl(destination.reg(), destination.reg());
653+
__ ClearRegister(destination.reg());
654654
} else {
655655
__ movq(destination.reg(), compiler::Immediate(value));
656656
}
@@ -1506,11 +1506,11 @@ void NativeEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
15061506
compiler::Address(
15071507
THR, compiler::target::Thread::global_object_pool_offset()));
15081508
} else {
1509-
__ xorq(PP, PP); // GC-safe value into PP.
1509+
__ ClearRegister(PP); // GC-safe value into PP.
15101510
}
15111511

15121512
// Load a GC-safe value for arguments descriptor (unused but tagged).
1513-
__ xorq(ARGS_DESC_REG, ARGS_DESC_REG);
1513+
__ ClearRegister(ARGS_DESC_REG);
15141514

15151515
// Push a dummy return address which suggests that we are inside of
15161516
// InvokeDartCodeStub. This is how the stack walker detects an entry frame.
@@ -1657,8 +1657,8 @@ void Utf8ScanInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
16571657
__ leaq(bytes_end_minus_16_reg, compiler::Address(bytes_end_reg, -16));
16581658

16591659
// Initialize size and flags.
1660-
__ xorq(size_reg, size_reg);
1661-
__ xorq(flags_reg, flags_reg);
1660+
__ ClearRegister(size_reg);
1661+
__ ClearRegister(flags_reg);
16621662

16631663
__ jmp(&scan_ascii, compiler::Assembler::kNearJump);
16641664

@@ -1700,7 +1700,7 @@ void Utf8ScanInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
17001700
table_reg, temp_reg, TIMES_1,
17011701
compiler::target::OneByteString::data_offset()));
17021702
__ orq(flags_reg, temp_reg);
1703-
__ andq(temp_reg, compiler::Immediate(kSizeMask));
1703+
__ AndImmediate(temp_reg, compiler::Immediate(kSizeMask));
17041704
__ addq(size_reg, temp_reg);
17051705

17061706
// Stop if end is reached.
@@ -1735,7 +1735,7 @@ void Utf8ScanInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
17351735
table_reg, temp_reg, TIMES_1,
17361736
compiler::target::OneByteString::data_offset()));
17371737
__ orq(flags_reg, temp_reg);
1738-
__ andq(temp_reg, compiler::Immediate(kSizeMask));
1738+
__ AndImmediate(temp_reg, compiler::Immediate(kSizeMask));
17391739
__ addq(size_reg, temp_reg);
17401740

17411741
// Stop if end is reached.
@@ -1745,7 +1745,7 @@ void Utf8ScanInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
17451745
__ Bind(&done);
17461746

17471747
// Write flags to field.
1748-
__ andq(flags_reg, compiler::Immediate(kFlagsMask));
1748+
__ AndImmediate(flags_reg, compiler::Immediate(kFlagsMask));
17491749
if (!IsScanFlagsUnboxed()) {
17501750
__ SmiTag(flags_reg);
17511751
}
@@ -2042,15 +2042,12 @@ void StoreIndexedInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
20422042
__ movb(element_address, compiler::Immediate(static_cast<int8_t>(value)));
20432043
} else {
20442044
const Register value = locs()->in(2).reg();
2045-
compiler::Label store_value, store_0xff;
2045+
compiler::Label store_value;
20462046
__ CompareImmediate(value, compiler::Immediate(0xFF));
2047-
__ j(BELOW_EQUAL, &store_value, compiler::Assembler::kNearJump);
2047+
__ j(UNSIGNED_LESS_EQUAL, &store_value, compiler::Assembler::kNearJump);
20482048
// Clamp to 0x0 or 0xFF respectively.
2049-
__ j(GREATER, &store_0xff);
2050-
__ xorq(value, value);
2051-
__ jmp(&store_value, compiler::Assembler::kNearJump);
2052-
__ Bind(&store_0xff);
2053-
__ LoadImmediate(value, compiler::Immediate(0xFF));
2049+
__ notq(value);
2050+
__ sarq(value, compiler::Immediate(63));
20542051
__ Bind(&store_value);
20552052
__ movb(element_address, ByteRegisterOf(value));
20562053
}
@@ -2971,7 +2968,7 @@ static void EmitSmiShiftLeft(FlowGraphCompiler* compiler,
29712968
compiler::Label done, is_not_zero;
29722969
__ CompareObject(right, Smi::ZoneHandle(Smi::New(Smi::kBits)));
29732970
__ j(BELOW, &is_not_zero, compiler::Assembler::kNearJump);
2974-
__ xorq(left, left);
2971+
__ ClearRegister(left);
29752972
__ jmp(&done, compiler::Assembler::kNearJump);
29762973
__ Bind(&is_not_zero);
29772974
__ SmiUntag(right);
@@ -3505,7 +3502,7 @@ void BinarySmiOpInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
35053502
compiler::kObjectBytes);
35063503
compiler::Label count_ok;
35073504
__ j(LESS_EQUAL, &count_ok, compiler::Assembler::kNearJump);
3508-
__ xorq(left, left);
3505+
__ ClearRegister(left);
35093506
__ jmp(&done, compiler::Assembler::kNearJump);
35103507
__ Bind(&count_ok);
35113508
}
@@ -3704,7 +3701,7 @@ void UnboxInstr::EmitSmiConversion(FlowGraphCompiler* compiler) {
37043701
// bits intact. This creates false dependency and causes performance
37053702
// problems for subsequent uses of the XMM register. To break the
37063703
// dependency XORPS is recommended.
3707-
__ xorps(result, result);
3704+
__ ClearFpuRegister(result);
37083705
__ OBJ(cvtsi2sd)(result, box);
37093706
break;
37103707
}
@@ -4046,21 +4043,18 @@ Condition DoubleTestOpInstr::EmitConditionCode(FlowGraphCompiler* compiler,
40464043
}
40474044
case MethodRecognizer::kDouble_getIsInfinite: {
40484045
const Register temp = locs()->temp(0).reg();
4049-
__ AddImmediate(RSP, compiler::Immediate(-kDoubleSize));
4050-
__ movsd(compiler::Address(RSP, 0), value);
4051-
__ movq(temp, compiler::Address(RSP, 0));
4052-
__ AddImmediate(RSP, compiler::Immediate(kDoubleSize));
4053-
// Mask off the sign.
4054-
__ AndImmediate(temp, compiler::Immediate(0x7FFFFFFFFFFFFFFFLL));
4055-
// Compare with +infinity.
4056-
__ CompareImmediate(temp, compiler::Immediate(0x7FF0000000000000LL));
4046+
__ MoveFpuRegisterToRegister(temp, value);
4047+
// Shift out the sign.
4048+
__ addq(temp, temp);
4049+
// Compare with +/-infinity << 1.
4050+
__ CompareImmediate(temp, compiler::Immediate(0xFFE0000000000000LL));
40574051
return is_negated ? NOT_EQUAL : EQUAL;
40584052
}
40594053
case MethodRecognizer::kDouble_getIsNegative: {
40604054
const Register temp = locs()->temp(0).reg();
40614055
const FpuRegister temp_fpu = locs()->temp(1).fpu_reg();
40624056
compiler::Label not_zero;
4063-
__ xorpd(temp_fpu, temp_fpu);
4057+
__ ClearFpuRegister(temp_fpu);
40644058
__ comisd(value, temp_fpu);
40654059
// If it's NaN, it's not negative.
40664060
__ j(PARITY_EVEN, is_negated ? labels.true_label : labels.false_label);
@@ -4272,11 +4266,11 @@ DEFINE_EMIT(
42724266
}
42734267

42744268
DEFINE_EMIT(Float32x4Zero, (XmmRegister value)) {
4275-
__ xorps(value, value);
4269+
__ ClearFpuRegister(value);
42764270
}
42774271

42784272
DEFINE_EMIT(Float64x2Zero, (XmmRegister value)) {
4279-
__ xorpd(value, value);
4273+
__ ClearFpuRegister(value);
42804274
}
42814275

42824276
DEFINE_EMIT(Float32x4Clamp,
@@ -4319,7 +4313,7 @@ DEFINE_EMIT(Int32x4FromBools,
43194313
__ SubImmediate(RSP, compiler::Immediate(kSimd128Size));
43204314
for (intptr_t i = 0; i < 4; i++) {
43214315
compiler::Label done, load_false;
4322-
__ xorq(temp, temp);
4316+
__ ClearRegister(temp);
43234317
__ CompareObject(instr->locs()->in(i).reg(), Bool::True());
43244318
__ setcc(EQUAL, ByteRegisterOf(temp));
43254319
__ negl(temp); // temp = input ? -1 : 0
@@ -4341,15 +4335,15 @@ static void EmitToBoolean(FlowGraphCompiler* compiler, Register out) {
43414335
DEFINE_EMIT(Int32x4GetFlagZorW,
43424336
(Register out, XmmRegister value, Temp<XmmRegister> temp)) {
43434337
__ movhlps(temp, value); // extract upper half.
4344-
__ movq(out, temp);
4338+
__ MoveFpuRegisterToRegister(out, temp);
43454339
if (instr->kind() == SimdOpInstr::kInt32x4GetFlagW) {
43464340
__ shrq(out, compiler::Immediate(32)); // extract upper 32bits.
43474341
}
43484342
EmitToBoolean(compiler, out);
43494343
}
43504344

43514345
DEFINE_EMIT(Int32x4GetFlagXorY, (Register out, XmmRegister value)) {
4352-
__ movq(out, value);
4346+
__ MoveFpuRegisterToRegister(out, value);
43534347
if (instr->kind() == SimdOpInstr::kInt32x4GetFlagY) {
43544348
__ shrq(out, compiler::Immediate(32)); // extract upper 32bits.
43554349
}
@@ -4370,7 +4364,7 @@ DEFINE_EMIT(
43704364
__ movups(compiler::Address(RSP, 0), mask);
43714365

43724366
// temp = flag == true ? -1 : 0
4373-
__ xorq(temp, temp);
4367+
__ ClearRegister(temp);
43744368
__ CompareObject(flag, Bool::True());
43754369
__ setcc(EQUAL, ByteRegisterOf(temp));
43764370
__ negl(temp);
@@ -4699,7 +4693,7 @@ void Int32ToDoubleInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
46994693
// bits intact. This creates false dependency and causes performance
47004694
// problems for subsequent uses of the XMM register. To break the
47014695
// dependency XORPS is recommended.
4702-
__ xorps(result, result);
4696+
__ ClearFpuRegister(result);
47034697
__ cvtsi2sdl(result, value);
47044698
}
47054699

@@ -4722,7 +4716,7 @@ void SmiToDoubleInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
47224716
// bits intact. This creates false dependency and causes performance
47234717
// problems for subsequent uses of the XMM register. To break the
47244718
// dependency XORPS is recommended.
4725-
__ xorps(result, result);
4719+
__ ClearFpuRegister(result);
47264720
__ OBJ(cvtsi2sd)(result, value);
47274721
}
47284722

@@ -4731,7 +4725,7 @@ DEFINE_BACKEND(Int64ToDouble, (FpuRegister result, Register value)) {
47314725
// bits intact. This creates false dependency and causes performance
47324726
// problems for subsequent uses of the XMM register. To break the
47334727
// dependency XORPS is recommended.
4734-
__ xorps(result, result);
4728+
__ ClearFpuRegister(result);
47354729
__ cvtsi2sdq(result, value);
47364730
}
47374731

@@ -4776,7 +4770,7 @@ void DoubleToIntegerInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
47764770
compiler::Immediate(0));
47774771
__ j(EQUAL, slow_path->entry_label());
47784772

4779-
__ xorps(FpuTMP, FpuTMP);
4773+
__ ClearFpuRegister(FpuTMP);
47804774
switch (recognized_kind()) {
47814775
case MethodRecognizer::kDoubleFloorToInt:
47824776
__ roundsd(FpuTMP, value_double, compiler::Assembler::kRoundDown);
@@ -4934,7 +4928,7 @@ static void InvokeDoublePow(FlowGraphCompiler* compiler,
49344928
XmmRegister zero_temp =
49354929
locs->temp(InvokeMathCFunctionInstr::kDoubleTempIndex).fpu_reg();
49364930

4937-
__ xorps(zero_temp, zero_temp);
4931+
__ ClearFpuRegister(zero_temp);
49384932
__ LoadDImmediate(result, 1.0);
49394933

49404934
compiler::Label check_base, skip_call;
@@ -5242,7 +5236,7 @@ static void EmitHashIntegerCodeSequence(FlowGraphCompiler* compiler) {
52425236
__ movq(RDX, RAX);
52435237
__ shrq(RDX, compiler::Immediate(32));
52445238
__ xorq(RAX, RDX);
5245-
__ andq(RAX, compiler::Immediate(0x3fffffff));
5239+
__ AndImmediate(RAX, compiler::Immediate(0x3fffffff));
52465240
}
52475241

52485242
LocationSummary* HashDoubleOpInstr::MakeLocationSummary(Zone* zone,
@@ -5269,8 +5263,8 @@ void HashDoubleOpInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
52695263
// cvtsi2sd only writes to the lower part of the register and leaves upper
52705264
// bits intact. This creates false dependency and causes performance
52715265
// problems for subsequent uses of the XMM register. To break the
5272-
// dependency XORPS is recommended.
5273-
__ xorps(temp_fpu_reg, temp_fpu_reg);
5266+
// dependency XORPS is recommended (which is what ClearFpuRegister does).
5267+
__ ClearFpuRegister(temp_fpu_reg);
52745268
__ cvttsd2siq(RAX, value);
52755269
__ cvtsi2sdq(temp_fpu_reg, RAX);
52765270
__ comisd(value, temp_fpu_reg);
@@ -5285,11 +5279,11 @@ void HashDoubleOpInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
52855279

52865280
__ Bind(&hash_double);
52875281
// Convert the double bits to a hash code that fits in a Smi.
5288-
__ movq(RAX, value);
5282+
__ MoveFpuRegisterToRegister(RAX, value);
52895283
__ movq(RDX, RAX);
52905284
__ shrq(RDX, compiler::Immediate(32));
52915285
__ xorq(RAX, RDX);
5292-
__ andq(RAX, compiler::Immediate(compiler::target::kSmiMax));
5286+
__ AndImmediate(RAX, compiler::Immediate(compiler::target::kSmiMax));
52935287

52945288
__ Bind(&done);
52955289
}
@@ -5572,7 +5566,7 @@ class Int64DivideSlowPath : public ThrowErrorSlowPathCode {
55725566
if (has_divide_by_minus_one()) {
55735567
__ Bind(div_by_minus_one_label());
55745568
if (is_mod_) {
5575-
__ xorq(RDX, RDX); // x % -1 = 0
5569+
__ ClearRegister(RDX); // x % -1 = 0
55765570
} else {
55775571
__ negq(RAX); // x / -1 = -x
55785572
}
@@ -5691,7 +5685,6 @@ static void EmitInt64ModTruncDiv(FlowGraphCompiler* compiler,
56915685
__ imulq(RDX, TMP);
56925686
__ subq(RAX, RDX);
56935687
// Compensate for Dart's Euclidean view of MOD.
5694-
__ testq(RAX, RAX);
56955688
__ j(GREATER_EQUAL, &pos);
56965689
if (divisor > 0) {
56975690
__ addq(RAX, TMP);
@@ -5968,7 +5961,7 @@ static void EmitShiftUint32ByConstant(FlowGraphCompiler* compiler,
59685961
}
59695962

59705963
if (shift >= 32) {
5971-
__ xorl(left, left);
5964+
__ ClearRegister(left);
59725965
} else {
59735966
switch (op_kind) {
59745967
case Token::kSHR:
@@ -6026,7 +6019,7 @@ class ShiftInt64OpSlowPath : public ThrowErrorSlowPathCode {
60266019
break;
60276020
case Token::kUSHR:
60286021
case Token::kSHL:
6029-
__ xorq(out, out);
6022+
__ ClearRegister(out);
60306023
break;
60316024
default:
60326025
UNREACHABLE();
@@ -6096,7 +6089,7 @@ void BinaryUint32OpInstr::EmitShiftUint32(FlowGraphCompiler* compiler) {
60966089
compiler::Label done;
60976090
__ cmpl(RCX, compiler::Immediate(kUint32ShiftCountLimit));
60986091
__ j(UNSIGNED_LESS_EQUAL, &done);
6099-
__ xorl(out, out);
6092+
__ ClearRegister(out);
61006093
__ Bind(&done);
61016094
}
61026095
}
@@ -6416,7 +6409,7 @@ void ClosureCallInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
64166409
// RCX: instructions entry point.
64176410
if (!FLAG_precompiled_mode) {
64186411
// RBX: Smi 0 (no IC data; the lazy-compile stub expects a GC-safe value).
6419-
__ xorq(IC_DATA_REG, IC_DATA_REG);
6412+
__ ClearRegister(IC_DATA_REG);
64206413
}
64216414
__ call(RCX);
64226415
compiler->EmitCallsiteMetadata(source(), deopt_id(),

0 commit comments

Comments
 (0)