Skip to content

Commit 8a74b97

Browse files
authored
cpu/uml.cpp: Added UML bit field extract instructions. (#14467)
cpu/drcbex64.cpp: Also added LZCNT implementation using x86 LZCNT instruction and optimised the BSR-based implementation. cpu/e132xs: Use bit field extract instructions.
1 parent 6523b65 commit 8a74b97

File tree

9 files changed

+1439
-361
lines changed

9 files changed

+1439
-361
lines changed

docs/source/techspecs/uml_instructions.rst

Lines changed: 92 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3502,6 +3502,94 @@ Simplification rules
35023502
* Immediate values for the ``count`` operand are truncated to five or
35033503
six bits for 32-bit or 64-bit operands, respectively.
35043504

3505+
.. _umlinst-bfx:
3506+
3507+
BFX
3508+
~~~
3509+
3510+
Extract a contiguous bit field from an integer value.
3511+
3512+
+---------------------------------+-----------------------------------------------+
3513+
| Disassembly | Usage |
3514+
+=================================+===============================================+
3515+
| .. code-block:: | .. code-block:: C++ |
3516+
| | |
3517+
| bfxu dst,src,shift,width | UML_BFXU(block, dst, src, shift, width); |
3518+
| bfxs dst,src,shift,width | UML_BFXS(block, dst, src, shift, width); |
3519+
| dbfxu dst,src,shift,width | UML_DBFXU(block, dst, src, shift, width); |
3520+
| dbfxs dst,src,shift,width | UML_DBFXS(block, dst, src, shift, width); |
3521+
+---------------------------------+-----------------------------------------------+
3522+
3523+
Extracts and right-aligns a contiguous bit field from the value of
3524+
``src``, specified by its least significant bit position and width in
3525+
bits. The field must be narrower than the ``src`` operand, but it may
3526+
wrap around from the most significant bit position to the least
3527+
significant bit position. BFXU and DBFXU zero-extend an unsigned field,
3528+
while BFXS and DBFXS sign-extend a signed field.
3529+
3530+
Back-ends may be able to optimise some forms of this instruction, for
3531+
example when the ``shift`` and ``width`` operands are both immediate
3532+
values.
3533+
3534+
Operands
3535+
^^^^^^^^
3536+
3537+
dst (32-bit or 64-bit – memory, integer register)
3538+
The destination where the extracted field will be stored.
3539+
src (32-bit or 64-bit – memory, integer register, immediate, map variable)
3540+
The value to extract a contiguous bit field from.
3541+
shift (32-bit or 64-bit – memory, integer register, immediate, map variable)
3542+
The position of the least significant bit of the field to extract,
3543+
where zero is the least significant bit position, and bit numbers
3544+
increase toward the most significant bit position. Only the least
3545+
significant five bits or six bits of this operand are used,
3546+
depending on the instruction size.
3547+
width (32-bit or 64-bit – memory, integer register, immediate, map variable)
3548+
The width of the field to extract in bits. Only the least
3549+
significant five bits or six bits of this operand are used,
3550+
depending on the instruction size. The result is undefined if the
3551+
width modulo the instruction size in bits is zero.
3552+
3553+
Flags
3554+
^^^^^
3555+
3556+
carry (C)
3557+
Undefined.
3558+
overflow (V)
3559+
Undefined.
3560+
zero (Z)
3561+
Set if the result is zero, or cleared otherwise.
3562+
sign (S)
3563+
Set to the value of the most significant bit of the result (set if
3564+
the result is a negative signed integer value, or cleared
3565+
otherwise).
3566+
unordered (U)
3567+
Undefined.
3568+
3569+
Simplification rules
3570+
^^^^^^^^^^^^^^^^^^^^
3571+
3572+
* Converted to :ref:`MOV <umlinst-mov>`, :ref:`AND <umlinst-and>` or
3573+
:ref:`OR <umlinst-or>` if the ``src``, ``shift`` and ``width``
3574+
operands are all immediate values, or if the ``width`` operand is the
3575+
immediate value zero.
3576+
* Converted to :ref:`SHR <umlinst-shr>` or :ref:`SAR <umlinst-sar>` if
3577+
the ``src`` operand is not an immediate value, the ``shift`` and
3578+
``width`` operands are both immediate values, and the sum of the value
3579+
of the ``shift`` operand and the value of the ``width`` operand is
3580+
equal to the instruction size in bits.
3581+
* BFXU and DBFXU are converted to :ref:`AND <umlinst-and>` if the
3582+
``shift`` operand is the immediate value zero and the ``width`` operand is
3583+
an immediate value.
3584+
* BFXS and DBFXS are converted to :ref:`SEXT <umlinst-sext>` if the
3585+
``shift`` operand is the immediate value zero and the ``width`` operand is
3586+
the immediate value 8, 16 or 32.
3587+
* Immediate values for the ``src`` operand are truncated to the
3588+
instruction size.
3589+
* Immediate values for the ``shift`` and ``width`` operands are
3590+
truncated to five or six bits for 32-bit or 64-bit operands,
3591+
respectively.
3592+
35053593
.. _umlinst-roland:
35063594

35073595
ROLAND
@@ -3572,10 +3660,10 @@ Simplification rules
35723660
immediate value and the ``mask`` operand is an immediate value
35733661
containing a single contiguous left-aligned sequence of set bits of
35743662
the appropriate length for the value of the ``count`` operand.
3575-
* Converted to :ref:`SHR <umlinst-shr>` if the ``count`` operand is an
3576-
immediate value and the ``mask`` operand is an immediate value
3577-
containing a single contiguous right-aligned sequence of set bits of
3578-
the appropriate length for the value of the ``count`` operand.
3663+
* Converted to :ref:`SHR <umlinst-shr>` or :ref:`BFX <umlinst-bfx>` if
3664+
the ``count`` operand is an immediate value and the ``mask`` operand
3665+
is an immediate value containing a single contiguous right-aligned
3666+
sequence of set bits.
35793667
* Immediate values for the ``src`` and ``mask`` operands are truncated
35803668
to the instruction size.
35813669
* Immediate values for the ``count`` operand are truncated to five or

src/devices/cpu/drcbearm64.cpp

Lines changed: 174 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,8 @@ class drcbe_arm64 : public drcbe_interface
553553
void op_set(a64::Assembler &a, const uml::instruction &inst);
554554
void op_mov(a64::Assembler &a, const uml::instruction &inst);
555555
void op_sext(a64::Assembler &a, const uml::instruction &inst);
556+
void op_bfxu(a64::Assembler &a, const uml::instruction &inst);
557+
void op_bfxs(a64::Assembler &a, const uml::instruction &inst);
556558
void op_roland(a64::Assembler &a, const uml::instruction &inst);
557559
void op_rolins(a64::Assembler &a, const uml::instruction &inst);
558560
template <bool CarryIn> void op_add(a64::Assembler &a, const uml::instruction &inst);
@@ -710,8 +712,10 @@ inline void drcbe_arm64::generate_one(a64::Assembler &a, const uml::instruction
710712
case uml::OP_SET: op_set(a, inst); break; // SET dst,c
711713
case uml::OP_MOV: op_mov(a, inst); break; // MOV dst,src[,c]
712714
case uml::OP_SEXT: op_sext(a, inst); break; // SEXT dst,src
713-
case uml::OP_ROLAND: op_roland(a, inst); break; // ROLAND dst,src1,src2,src3
714-
case uml::OP_ROLINS: op_rolins(a, inst); break; // ROLINS dst,src1,src2,src3
715+
case uml::OP_BFXU: op_bfxu(a, inst); break; // BFXU dst,src,shift,width
716+
case uml::OP_BFXS: op_bfxs(a, inst); break; // BFXS dst,src,shift,width
717+
case uml::OP_ROLAND: op_roland(a, inst); break; // ROLAND dst,src,count,mask
718+
case uml::OP_ROLINS: op_rolins(a, inst); break; // ROLINS dst,src,count,mask
715719
case uml::OP_ADD: op_add<false>(a, inst); break; // ADD dst,src1,src2[,f]
716720
case uml::OP_ADDC: op_add<true>(a, inst); break; // ADDC dst,src1,src2[,f]
717721
case uml::OP_SUB: op_sub<false>(a, inst); break; // SUB dst,src1,src2[,f]
@@ -3223,6 +3227,173 @@ void drcbe_arm64::op_sext(a64::Assembler &a, const uml::instruction &inst)
32233227
}
32243228
}
32253229

3230+
// Emit A64 code for the UML BFXU instruction (BFXU dst,src,shift,width):
// extract a bit field of `width` bits starting at bit `shift` of `src`,
// right-align it and zero-extend it into `dst`.  When the instruction
// requests flags, the emitted code finishes with a flag-setting operation on
// the result (ANDS or TST).
//
//   a    - assembler that receives the generated instructions
//   inst - the UML instruction being translated (4-byte or 8-byte form)
void drcbe_arm64::op_bfxu(a64::Assembler &a, const uml::instruction &inst)
{
    assert(inst.size() == 4 || inst.size() == 8);
    assert_no_condition(inst);
    assert_flags(inst, FLAG_S | FLAG_Z);

    be_parameter dstp(*this, inst.param(0), PTYPE_MR);
    be_parameter srcp(*this, inst.param(1), PTYPE_MRI);
    be_parameter shiftp(*this, inst.param(2), PTYPE_MRI);
    be_parameter widthp(*this, inst.param(3), PTYPE_MRI);

    const a64::Gp dst = dstp.select_register(TEMP_REG1, inst.size());
    const a64::Gp source = srcp.select_register(TEMP_REG2, inst.size());
    const uint64_t bits = inst.size() * 8;
    const bool fixedshift = shiftp.is_immediate();

    // masking can update the flags for free by using ANDS rather than AND
    const a64::Inst::Id andop = inst.flags() ? a64::Inst::kIdAnds : a64::Inst::kIdAnd;

    if (widthp.is_immediate_value(0))
    {
        // a zero-width field is undefined behaviour - just produce zero
        const a64::Gp zr = select_register(a64::xzr, inst.size());

        if (!inst.flags())
            a.mov(dst, zr);
        else
            a.ands(dst, zr, zr);
    }
    else if (widthp.is_immediate())
    {
        // width is known when generating code, so the field mask is a constant
        const auto width(widthp.immediate() & (bits - 1));
        const auto mask(util::make_bitmask<uint64_t>(width));
        const uint64_t lsb = fixedshift ? (shiftp.immediate() & (bits - 1)) : 0;

        mov_reg_param(a, inst.size(), source, srcp);

        if (fixedshift && ((lsb + width) <= bits))
        {
            // field does not wrap - a single UBFX extracts it, but UBFX
            // does not affect the flags so they need a separate TST
            a.ubfx(dst, source, lsb, width);
            if (inst.flags())
                a.tst(dst, dst);
        }
        else
        {
            // field wraps around from the MSB to the LSB, or the shift is
            // only known at run time - rotate the field down to bit zero
            // and mask off everything above it
            if (fixedshift)
            {
                a.ror(dst, source, lsb);
            }
            else
            {
                const a64::Gp count = shiftp.select_register(TEMP_REG3, inst.size());

                mov_reg_param(a, inst.size(), count, shiftp);

                a.ror(dst, source, count);
            }
            a.emit(andop, dst, dst, mask);
        }
    }
    else
    {
        // width is only known at run time
        //
        // If width and dst are the same parameter, width must not share the
        // destination register: dst is written by the ROR below before
        // width is read for the last time by the NEG.
        const a64::Gp width = (widthp != dstp) ? widthp.select_register(TEMP_REG3, inst.size()) : select_register(TEMP_REG3, inst.size());
        const a64::Gp scratch = select_register(FUNC_SCRATCH_REG, inst.size());

        mov_reg_param(a, inst.size(), width, widthp);
        if (!fixedshift)
            mov_reg_param(a, inst.size(), scratch, shiftp);
        mov_reg_param(a, inst.size(), source, srcp);

        // rotating right by (shift + width) places the field in the most
        // significant bits of the register
        if (!fixedshift)
            a.add(scratch, width, scratch);
        else
            a.add(scratch, width, shiftp.immediate() & (bits - 1));
        a.ror(dst, source, scratch);

        // shifting right by -width (register shift counts are taken modulo
        // the register size) right-aligns the field and fills with zeroes
        a.neg(scratch, width);
        a.lsr(dst, dst, scratch);
        if (inst.flags())
            a.tst(dst, dst);
    }

    mov_param_reg(a, inst.size(), dstp, dst);
}
3314+
3315+
void drcbe_arm64::op_bfxs(a64::Assembler &a, const uml::instruction &inst)
3316+
{
3317+
assert(inst.size() == 4 || inst.size() == 8);
3318+
assert_no_condition(inst);
3319+
assert_flags(inst, FLAG_S | FLAG_Z);
3320+
3321+
be_parameter dstp(*this, inst.param(0), PTYPE_MR);
3322+
be_parameter srcp(*this, inst.param(1), PTYPE_MRI);
3323+
be_parameter shiftp(*this, inst.param(2), PTYPE_MRI);
3324+
be_parameter widthp(*this, inst.param(3), PTYPE_MRI);
3325+
3326+
const a64::Gp output = dstp.select_register(TEMP_REG1, inst.size());
3327+
const a64::Gp src = srcp.select_register(TEMP_REG2, inst.size());
3328+
const uint64_t instbits = inst.size() * 8;
3329+
3330+
if (widthp.is_immediate_value(0))
3331+
{
3332+
// undefined behaviour - do something
3333+
const a64::Gp zero = select_register(a64::xzr, inst.size());
3334+
3335+
if (inst.flags())
3336+
a.ands(output, zero, zero);
3337+
else
3338+
a.mov(output, zero);
3339+
}
3340+
else if (widthp.is_immediate())
3341+
{
3342+
const auto width(widthp.immediate() & (instbits - 1));
3343+
3344+
mov_reg_param(a, inst.size(), src, srcp);
3345+
3346+
if (shiftp.is_immediate())
3347+
{
3348+
const auto shift(shiftp.immediate() & (instbits - 1));
3349+
3350+
if ((shift + width) <= instbits)
3351+
{
3352+
// contiguous bit field
3353+
a.sbfx(output, src, shift, width);
3354+
}
3355+
else
3356+
{
3357+
// bit field wraps from LSB to MSB
3358+
a.ror(output, src, shift);
3359+
a.sbfx(output, output, 0, width);
3360+
}
3361+
}
3362+
else
3363+
{
3364+
const a64::Gp shift = shiftp.select_register(TEMP_REG3, inst.size());
3365+
3366+
mov_reg_param(a, inst.size(), shift, shiftp);
3367+
3368+
a.ror(output, src, shift);
3369+
a.sbfx(output, output, 0, width);
3370+
}
3371+
}
3372+
else
3373+
{
3374+
const a64::Gp width = (widthp != dstp) ? widthp.select_register(TEMP_REG3, inst.size()) : select_register(TEMP_REG3, inst.size());
3375+
const a64::Gp temp = select_register(FUNC_SCRATCH_REG, inst.size());
3376+
3377+
mov_reg_param(a, inst.size(), src, srcp);
3378+
if (!shiftp.is_immediate())
3379+
mov_reg_param(a, inst.size(), temp, shiftp);
3380+
mov_reg_param(a, inst.size(), width, widthp);
3381+
3382+
if (shiftp.is_immediate())
3383+
a.add(temp, width, shiftp.immediate() & (instbits - 1));
3384+
else
3385+
a.add(temp, width, temp);
3386+
a.ror(output, src, temp);
3387+
a.neg(temp, width);
3388+
a.asr(output, output, temp);
3389+
}
3390+
3391+
mov_param_reg(a, inst.size(), dstp, output);
3392+
3393+
if (inst.flags())
3394+
a.tst(output, output);
3395+
}
3396+
32263397
void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
32273398
{
32283399
assert(inst.size() == 4 || inst.size() == 8);
@@ -3246,11 +3417,10 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
32463417
const auto pop = population_count_64(maskp.immediate());
32473418
const auto lz = count_leading_zeros_64(maskp.immediate()) & (instbits - 1);
32483419
const auto invlamask = ~(maskp.immediate() << lz) & instmask;
3249-
const bool is_right_aligned = (maskp.immediate() & (maskp.immediate() + 1)) == 0;
32503420
const bool is_contiguous = (invlamask & (invlamask + 1)) == 0;
32513421
const auto s = shiftp.immediate() & (instbits - 1);
32523422

3253-
if (is_right_aligned || is_contiguous)
3423+
if (is_contiguous)
32543424
{
32553425
mov_reg_param(a, inst.size(), src, srcp);
32563426
optimized = true;
@@ -3260,25 +3430,6 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
32603430
{
32613431
a.mov(output, select_register(a64::xzr, inst.size()));
32623432
}
3263-
else if (is_right_aligned)
3264-
{
3265-
// Optimize a contiguous right-aligned mask
3266-
const auto s2 = -int(s) & (instbits - 1);
3267-
3268-
if (s >= pop)
3269-
{
3270-
a.ubfx(output, src, s2, pop);
3271-
}
3272-
else if (s2 > 0)
3273-
{
3274-
a.ror(output, src, s2);
3275-
a.bfc(output, pop, instbits - pop);
3276-
}
3277-
else
3278-
{
3279-
a.and_(output, src, ~maskp.immediate() & instmask);
3280-
}
3281-
}
32823433
else if (is_contiguous)
32833434
{
32843435
// Optimize a contiguous mask

0 commit comments

Comments
 (0)