mamedev · cuavas · Nov 1, 2025 · Nov 1, 2025
diff --git a/docs/source/techspecs/uml_instructions.rst b/docs/source/techspecs/uml_instructions.rst
@@ -3502,6 +3502,94 @@ Simplification rules
 * Immediate values for the ``count`` operand are truncated to five or
   six bits for 32-bit or 64-bit operands, respectively.
 
+.. _umlinst-bfx:
+
+BFX
+~~~
+
+Extract a contiguous bit field from an integer value.
+
++---------------------------------+-----------------------------------------------+
+| Disassembly                     | Usage                                         |
++=================================+===============================================+
+| .. code-block::                 | .. code-block:: C++                           |
+|                                 |                                               |
+|     bfxu    dst,src,shift,width |     UML_BFXU(block, dst, src, shift, width);  |
+|     bfxs    dst,src,shift,width |     UML_BFXS(block, dst, src, shift, width);  |
+|     dbfxu   dst,src,shift,width |     UML_DBFXU(block, dst, src, shift, width); |
+|     dbfxs   dst,src,shift,width |     UML_DBFXS(block, dst, src, shift, width); |
++---------------------------------+-----------------------------------------------+
+
+Extracts and right-aligns a contiguous bit field from the value of
+``src``, specified by its least significant bit position and width in
+bits.  The field must be narrower than the ``src`` operand, but it may
+wrap around from the most significant bit position to the least
+significant bit position.  BFXU and DBFXU zero-extend an unsigned field,
+while BFXS and DBFXS sign-extend a signed field.
+
+Back-ends may be able to optimise some forms of this instruction, for
+example when the ``shift`` and ``width`` operands are both immediate
+values.
+
+Operands
+^^^^^^^^
+
+dst (32-bit or 64-bit – memory, integer register)
+    The destination where the extracted field will be stored.
+src (32-bit or 64-bit – memory, integer register, immediate, map variable)
+    The value to extract a contiguous bit field from.
+shift (32-bit or 64-bit – memory, integer register, immediate, map variable)
+    The position of the least significant bit of the field to extract,
+    where zero is the least significant bit position, and bit numbers
+    increase toward the most significant bit position.  Only the least
+    significant five bits or six bits of this operand are used,
+    depending on the instruction size.
+width (32-bit or 64-bit – memory, integer register, immediate, map variable)
+    The width of the field to extract in bits.  Only the least
+    significant five bits or six bits of this operand are used,
+    depending on the instruction size.  The result is undefined if the
+    width modulo the instruction size in bits is zero.
+
+Flags
+^^^^^
+
+carry (C)
+    Undefined.
+overflow (V)
+    Undefined.
+zero (Z)
+    Set if the result is zero, or cleared otherwise.
+sign (S)
+    Set to the value of the most significant bit of the result (set if
+    the result is a negative signed integer value, or cleared
+    otherwise).
+unordered (U)
+    Undefined.
+
+Simplification rules
+^^^^^^^^^^^^^^^^^^^^
+
+* Converted to :ref:`MOV <umlinst-mov>`, :ref:`AND <umlinst-and>` or
+  :ref:`OR <umlinst-or>` if the ``src``, ``shift`` and ``width``
+  operands are all immediate values, or if the ``width`` operand is the
+  immediate value zero.
+* Converted to :ref:`SHR <umlinst-shr>` or :ref:`SAR <umlinst-sar>` if
+  the ``src`` operand is not an immediate value, the ``shift`` and
+  ``width`` operands are both immediate values, and the sum of the value
+  of the ``shift`` operand and the value of the ``width`` operand is
+  equal to the instruction size in bits.
+* BFXU and DBFXU are converted to :ref:`AND <umlinst-and>` if the
+  ``shift`` operand is the immediate value zero and the ``width``
+  operand is an immediate value.
+* BFXS and DBFXS are converted to :ref:`SEXT <umlinst-sext>` if the
+  ``shift`` operand is the immediate value zero and the ``width``
+  operand is the immediate value 8, 16 or 32.
+* Immediate values for the ``src`` operand are truncated to the
+  instruction size.
+* Immediate values for the ``shift`` and ``width`` operands are
+  truncated to five or six bits for 32-bit or 64-bit operands,
+  respectively.
+
 .. _umlinst-roland:
 
 ROLAND
@@ -3572,10 +3660,10 @@ Simplification rules
   immediate value and the ``mask`` operand is an immediate value
   containing a single contiguous left-aligned sequence of set bits of
   the appropriate length for the value of the ``count`` operand.
-* Converted to :ref:`SHR <umlinst-shr>` if the ``count`` operand is an
-  immediate value and the ``mask`` operand is an immediate value
-  containing a single contiguous right-aligned sequence of set bits of
-  the appropriate length for the value of the ``count`` operand.
+* Converted to :ref:`SHR <umlinst-shr>` or :ref:`BFX <umlinst-bfx>` if
+  the ``count`` operand is an immediate value and the ``mask`` operand
+  is an immediate value containing a single contiguous right-aligned
+  sequence of set bits.
 * Immediate values for the ``src`` and ``mask`` operands are truncated
   to the instruction size.
 * Immediate values for the ``count`` operand are truncated to five or

diff --git a/src/devices/cpu/drcbearm64.cpp b/src/devices/cpu/drcbearm64.cpp
@@ -553,6 +553,8 @@ class drcbe_arm64 : public drcbe_interface
 	void op_set(a64::Assembler &a, const uml::instruction &inst);
 	void op_mov(a64::Assembler &a, const uml::instruction &inst);
 	void op_sext(a64::Assembler &a, const uml::instruction &inst);
+	void op_bfxu(a64::Assembler &a, const uml::instruction &inst);
+	void op_bfxs(a64::Assembler &a, const uml::instruction &inst);
 	void op_roland(a64::Assembler &a, const uml::instruction &inst);
 	void op_rolins(a64::Assembler &a, const uml::instruction &inst);
 	template <bool CarryIn> void op_add(a64::Assembler &a, const uml::instruction &inst);
@@ -710,8 +712,10 @@ inline void drcbe_arm64::generate_one(a64::Assembler &a, const uml::instruction
 	case uml::OP_SET:     op_set(a, inst);                      break; // SET     dst,c
 	case uml::OP_MOV:     op_mov(a, inst);                      break; // MOV     dst,src[,c]
 	case uml::OP_SEXT:    op_sext(a, inst);                     break; // SEXT    dst,src
-	case uml::OP_ROLAND:  op_roland(a, inst);                   break; // ROLAND  dst,src1,src2,src3
-	case uml::OP_ROLINS:  op_rolins(a, inst);                   break; // ROLINS  dst,src1,src2,src3
+	case uml::OP_BFXU:    op_bfxu(a, inst);                     break; // BFXU    dst,src,shift,width
+	case uml::OP_BFXS:    op_bfxs(a, inst);                     break; // BFXS    dst,src,shift,width
+	case uml::OP_ROLAND:  op_roland(a, inst);                   break; // ROLAND  dst,src,count,mask
+	case uml::OP_ROLINS:  op_rolins(a, inst);                   break; // ROLINS  dst,src,count,mask
 	case uml::OP_ADD:     op_add<false>(a, inst);               break; // ADD     dst,src1,src2[,f]
 	case uml::OP_ADDC:    op_add<true>(a, inst);                break; // ADDC    dst,src1,src2[,f]
 	case uml::OP_SUB:     op_sub<false>(a, inst);               break; // SUB     dst,src1,src2[,f]
@@ -3223,6 +3227,173 @@ void drcbe_arm64::op_sext(a64::Assembler &a, const uml::instruction &inst)
 	}
 }
 
+void drcbe_arm64::op_bfxu(a64::Assembler &a, const uml::instruction &inst) // BFXU dst,src,shift,width - emit code to extract a zero-extended bit field
+{
+	assert(inst.size() == 4 || inst.size() == 8); // 32-bit and 64-bit forms only
+	assert_no_condition(inst);
+	assert_flags(inst, FLAG_S | FLAG_Z);
+
+	be_parameter dstp(*this, inst.param(0), PTYPE_MR);
+	be_parameter srcp(*this, inst.param(1), PTYPE_MRI);
+	be_parameter shiftp(*this, inst.param(2), PTYPE_MRI);
+	be_parameter widthp(*this, inst.param(3), PTYPE_MRI);
+
+	const a64::Gp output = dstp.select_register(TEMP_REG1, inst.size()); // presumably dst's own register when it has one, otherwise TEMP_REG1
+	const a64::Gp src = srcp.select_register(TEMP_REG2, inst.size());
+	const a64::Inst::Id maskop = inst.flags() ? a64::Inst::kIdAnds : a64::Inst::kIdAnd; // use the flag-setting AND only when flags are requested
+	const uint64_t instbits = inst.size() * 8; // operand size in bits (32 or 64)
+
+	if (widthp.is_immediate_value(0))
+	{
+		// a width of zero is undefined behaviour - just produce zero
+		const a64::Gp zero = select_register(a64::xzr, inst.size());
+
+		if (inst.flags())
+			a.ands(output, zero, zero); // zero result, and flags updated to match
+		else
+			a.mov(output, zero);
+	}
+	else if (widthp.is_immediate())
+	{
+		const auto width(widthp.immediate() & (instbits - 1)); // only the low five or six bits of width are used
+		const auto mask(util::make_bitmask<uint64_t>(width)); // mask used to isolate the field once it has been rotated down to bit zero
+
+		mov_reg_param(a, inst.size(), src, srcp);
+
+		if (shiftp.is_immediate())
+		{
+			const auto shift(shiftp.immediate() & (instbits - 1)); // only the low five or six bits of shift are used
+
+			if ((shift + width) <= instbits)
+			{
+				// field lies entirely within the operand - a single UBFX extracts it
+				a.ubfx(output, src, shift, width);
+				if (inst.flags())
+					a.tst(output, output); // UBFX leaves the flags alone, so test the result explicitly
+			}
+			else
+			{
+				// field runs past the MSB and continues at the LSB - rotate it down to bit zero, then mask
+				a.ror(output, src, shift);
+				a.emit(maskop, output, output, mask);
+			}
+		}
+		else
+		{
+			const a64::Gp shift = shiftp.select_register(TEMP_REG3, inst.size());
+
+			mov_reg_param(a, inst.size(), shift, shiftp);
+
+			a.ror(output, src, shift); // shift not known until run time - rotate handles both the contiguous and the wrapping case
+			a.emit(maskop, output, output, mask); // keep only the low width bits (setting flags if requested)
+		}
+	}
+	else
+	{
+		const a64::Gp width = (widthp != dstp) ? widthp.select_register(TEMP_REG3, inst.size()) : select_register(TEMP_REG3, inst.size());
+		const a64::Gp temp = select_register(FUNC_SCRATCH_REG, inst.size());
+
+		mov_reg_param(a, inst.size(), width, widthp); // width is forced into TEMP_REG3 when it aliases dst, because output is written (ror) before width is last read (neg)
+		if (!shiftp.is_immediate())
+			mov_reg_param(a, inst.size(), temp, shiftp);
+		mov_reg_param(a, inst.size(), src, srcp);
+
+		if (shiftp.is_immediate())
+			a.add(temp, width, shiftp.immediate() & (instbits - 1)); // temp = width + shift
+		else
+			a.add(temp, width, temp); // temp = width + shift
+		a.ror(output, src, temp); // rotating right by shift + width leaves the field in the most significant width bits
+		a.neg(temp, width); // -width, which as a register shift amount acts as (operand size - width)
+		a.lsr(output, output, temp); // right-align the field, filling the upper bits with zeroes
+		if (inst.flags())
+			a.tst(output, output); // the shift does not update flags, so test the result explicitly
+	}
+
+	mov_param_reg(a, inst.size(), dstp, output); // write the result to the destination parameter
+}
+
+void drcbe_arm64::op_bfxs(a64::Assembler &a, const uml::instruction &inst) // BFXS dst,src,shift,width - emit code to extract a sign-extended bit field
+{
+	assert(inst.size() == 4 || inst.size() == 8); // 32-bit and 64-bit forms only
+	assert_no_condition(inst);
+	assert_flags(inst, FLAG_S | FLAG_Z);
+
+	be_parameter dstp(*this, inst.param(0), PTYPE_MR);
+	be_parameter srcp(*this, inst.param(1), PTYPE_MRI);
+	be_parameter shiftp(*this, inst.param(2), PTYPE_MRI);
+	be_parameter widthp(*this, inst.param(3), PTYPE_MRI);
+
+	const a64::Gp output = dstp.select_register(TEMP_REG1, inst.size()); // presumably dst's own register when it has one, otherwise TEMP_REG1
+	const a64::Gp src = srcp.select_register(TEMP_REG2, inst.size());
+	const uint64_t instbits = inst.size() * 8; // operand size in bits (32 or 64)
+
+	if (widthp.is_immediate_value(0))
+	{
+		// a width of zero is undefined behaviour - just produce zero
+		const a64::Gp zero = select_register(a64::xzr, inst.size());
+
+		if (inst.flags())
+			a.ands(output, zero, zero); // zero result with flags updated (the common test at the end runs as well)
+		else
+			a.mov(output, zero);
+	}
+	else if (widthp.is_immediate())
+	{
+		const auto width(widthp.immediate() & (instbits - 1)); // only the low five or six bits of width are used
+
+		mov_reg_param(a, inst.size(), src, srcp);
+
+		if (shiftp.is_immediate())
+		{
+			const auto shift(shiftp.immediate() & (instbits - 1)); // only the low five or six bits of shift are used
+
+			if ((shift + width) <= instbits)
+			{
+				// field lies entirely within the operand - a single SBFX extracts and sign-extends it
+				a.sbfx(output, src, shift, width);
+			}
+			else
+			{
+				// field runs past the MSB and continues at the LSB - rotate it down to bit zero, then sign-extend
+				a.ror(output, src, shift);
+				a.sbfx(output, output, 0, width);
+			}
+		}
+		else
+		{
+			const a64::Gp shift = shiftp.select_register(TEMP_REG3, inst.size());
+
+			mov_reg_param(a, inst.size(), shift, shiftp);
+
+			a.ror(output, src, shift); // shift not known until run time - rotate handles both the contiguous and the wrapping case
+			a.sbfx(output, output, 0, width); // sign-extend from the low width bits
+		}
+	}
+	else
+	{
+		const a64::Gp width = (widthp != dstp) ? widthp.select_register(TEMP_REG3, inst.size()) : select_register(TEMP_REG3, inst.size());
+		const a64::Gp temp = select_register(FUNC_SCRATCH_REG, inst.size());
+
+		mov_reg_param(a, inst.size(), src, srcp);
+		if (!shiftp.is_immediate())
+			mov_reg_param(a, inst.size(), temp, shiftp);
+		mov_reg_param(a, inst.size(), width, widthp); // width is forced into TEMP_REG3 when it aliases dst, because output is written (ror) before width is last read (neg)
+
+		if (shiftp.is_immediate())
+			a.add(temp, width, shiftp.immediate() & (instbits - 1)); // temp = width + shift
+		else
+			a.add(temp, width, temp); // temp = width + shift
+		a.ror(output, src, temp); // rotating right by shift + width leaves the field in the most significant width bits
+		a.neg(temp, width); // -width, which as a register shift amount acts as (operand size - width)
+		a.asr(output, output, temp); // right-align the field, replicating its top bit into the upper bits
+	}
+
+	mov_param_reg(a, inst.size(), dstp, output); // write the result to the destination parameter
+
+	if (inst.flags())
+		a.tst(output, output); // none of the extraction paths above set flags from the final result, so test it here
+}
+
 void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 {
 	assert(inst.size() == 4 || inst.size() == 8);
@@ -3246,11 +3417,10 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 		const auto pop = population_count_64(maskp.immediate());
 		const auto lz = count_leading_zeros_64(maskp.immediate()) & (instbits - 1);
 		const auto invlamask = ~(maskp.immediate() << lz) & instmask;
-		const bool is_right_aligned = (maskp.immediate() & (maskp.immediate() + 1)) == 0;
 		const bool is_contiguous = (invlamask & (invlamask + 1)) == 0;
 		const auto s = shiftp.immediate() & (instbits - 1);
 
-		if (is_right_aligned || is_contiguous)
+		if (is_contiguous)
 		{
 			mov_reg_param(a, inst.size(), src, srcp);
 			optimized = true;
@@ -3260,25 +3430,6 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 		{
 			a.mov(output, select_register(a64::xzr, inst.size()));
 		}
-		else if (is_right_aligned)
-		{
-			// Optimize a contiguous right-aligned mask
-			const auto s2 = -int(s) & (instbits - 1);
-
-			if (s >= pop)
-			{
-				a.ubfx(output, src, s2, pop);
-			}
-			else if (s2 > 0)
-			{
-				a.ror(output, src, s2);
-				a.bfc(output, pop, instbits - pop);
-			}
-			else
-			{
-				a.and_(output, src, ~maskp.immediate() & instmask);
-			}
-		}
 		else if (is_contiguous)
 		{
 			// Optimize a contiguous mask