Skip to content

Commit a7f7754

Browse files
committed
Merge tag 'pull-tcg-20241224' of https://gitlab.com/rth7680/qemu into staging
tcg/optimize: Remove in-flight mask data from OptContext fpu: Add float*_muladd_scalbn fpu: Remove float_muladd_halve_result fpu: Add float_round_nearest_even_max fpu: Add float_muladd_suppress_add_product_zero target/hexagon: Use float32_muladd accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core # -----BEGIN PGP SIGNATURE----- # # iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmdrE7QdHHJpY2hhcmQu # aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV+l2Qf/aECUfMn07wns7WjX # ebWxzIRKp//ktsIJg9InL8zrCStyRqrBj0VQE9LUfO2Vhvqf8faUdh+uh2ek/Ewa # f1hfo0kDK7e7oWnCicSbHmdC0FQIrKpg2i+YXIsbd4XWOkmFAhkNenISuQfCrL3k # 3UYAA12seK9uCls+fljvhK6iid3h+4ReDFW7DPg7mumFCCz6CwzYYW/4cnhcAmOn # qVehtts8W+6SFMjTE04S8NV8OBaMisf8AbCcZf2PedRl1cHGSumLOjvjOxcQU8Hw # nGUjL8/hYWkEetzU4YzJyfHOe6F9lPJBMnDattwIswwYrTOD/Sq7VbBWFbW0EwUy # 7XIZ8Q== # =DZgo # -----END PGP SIGNATURE----- # gpg: Signature made Tue 24 Dec 2024 15:04:04 EST # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "[email protected]" # gpg: Good signature from "Richard Henderson <[email protected]>" [full] # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * tag 'pull-tcg-20241224' of https://gitlab.com/rth7680/qemu: (72 commits) accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core target/hexagon: Simplify internal_mpyhh setup target/hexagon: Use mulu64 for int128_mul_6464 target/hexagon: Remove Double target/hexagon: Remove Float target/hexagon: Expand GEN_XF_ROUND target/hexagon: Remove internal_fmafx target/hexagon: Use float32_muladd for helper_sffm[as]_lib target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc target/hexagon: Use float32_muladd for helper_sffms target/hexagon: Use float32_muladd for helper_sffma target/hexagon: Use float32_mul in helper_sfmpy softfloat: Add float_muladd_suppress_add_product_zero softfloat: Add float_round_nearest_even_max softfloat: Remove float_muladd_halve_result target/sparc: Use float*_muladd_scalbn target/arm: Use 
float*_muladd_scalbn softfloat: Add float{16,32,64}_muladd_scalbn tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort tcg/optimize: Move fold_bitsel_vec into alphabetic sort ... Signed-off-by: Stefan Hajnoczi <[email protected]>
2 parents aa3a285 + e4a8e09 commit a7f7754

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+848
-991
lines changed

accel/tcg/cpu-exec.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,11 +1088,13 @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp)
10881088

10891089
if (!tcg_target_initialized) {
10901090
/* Check mandatory TCGCPUOps handlers */
1091+
const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;
10911092
#ifndef CONFIG_USER_ONLY
1092-
assert(cpu->cc->tcg_ops->cpu_exec_halt);
1093-
assert(cpu->cc->tcg_ops->cpu_exec_interrupt);
1093+
assert(tcg_ops->cpu_exec_halt);
1094+
assert(tcg_ops->cpu_exec_interrupt);
10941095
#endif /* !CONFIG_USER_ONLY */
1095-
cpu->cc->tcg_ops->initialize();
1096+
assert(tcg_ops->translate_code);
1097+
tcg_ops->initialize();
10961098
tcg_target_initialized = true;
10971099
}
10981100

accel/tcg/plugin-gen.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,15 @@ static void gen_disable_mem_helper(void)
102102

103103
static TCGv_i32 gen_cpu_index(void)
104104
{
105+
/*
106+
* Optimize when we run with a single vcpu. All values using cpu_index,
107+
* including scoreboard index, will be optimized out.
108+
* User-mode calls tb_flush when setting CF_PARALLEL. In system-mode, all
109+
* vcpus are created before generating code.
110+
*/
111+
if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) {
112+
return tcg_constant_i32(current_cpu->cpu_index);
113+
}
105114
TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
106115
tcg_gen_ld_i32(cpu_index, tcg_env,
107116
-offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));

accel/tcg/translate-all.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,10 @@ static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
276276

277277
tcg_func_start(tcg_ctx);
278278

279-
tcg_ctx->cpu = env_cpu(env);
280-
gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
279+
CPUState *cs = env_cpu(env);
280+
tcg_ctx->cpu = cs;
281+
cs->cc->tcg_ops->translate_code(cs, tb, max_insns, pc, host_pc);
282+
281283
assert(tb->size != 0);
282284
tcg_ctx->cpu = NULL;
283285
*max_insns = tb->icount;
@@ -364,7 +366,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
364366
/*
365367
* Overflow of code_gen_buffer, or the current slice of it.
366368
*
367-
* TODO: We don't need to re-do gen_intermediate_code, nor
369+
* TODO: We don't need to re-do tcg_ops->translate_code, nor
368370
* should we re-do the tcg optimization currently hidden
369371
* inside tcg_gen_code. All that should be required is to
370372
* flush the TBs, allocate a new TB, re-initialize it per

fpu/softfloat-parts.c.inc

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
241241
int exp, flags = 0;
242242

243243
switch (s->float_rounding_mode) {
244+
case float_round_nearest_even_max:
245+
overflow_norm = true;
246+
/* fall through */
244247
case float_round_nearest_even:
245248
if (N > 64 && frac_lsb == 0) {
246249
inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
@@ -562,8 +565,9 @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
562565
* Requires A and C extracted into a double-sized structure to provide the
563566
* extra space for the widening multiply.
564567
*/
565-
static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
566-
FloatPartsN *c, int flags, float_status *s)
568+
static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
569+
FloatPartsN *c, int scale,
570+
int flags, float_status *s)
567571
{
568572
int ab_mask, abc_mask;
569573
FloatPartsW p_widen, c_widen;
@@ -611,7 +615,9 @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
611615
goto return_normal;
612616
}
613617
if (c->cls == float_class_zero) {
614-
if (a->sign != c->sign) {
618+
if (flags & float_muladd_suppress_add_product_zero) {
619+
a->sign = c->sign;
620+
} else if (a->sign != c->sign) {
615621
goto return_sub_zero;
616622
}
617623
goto return_zero;
@@ -652,9 +658,7 @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
652658
a->exp = p_widen.exp;
653659

654660
return_normal:
655-
if (flags & float_muladd_halve_result) {
656-
a->exp -= 1;
657-
}
661+
a->exp += scale;
658662
finish_sign:
659663
if (flags & float_muladd_negate_result) {
660664
a->sign ^= 1;

fpu/softfloat.c

Lines changed: 34 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -789,15 +789,15 @@ static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
789789
#define parts_mul(A, B, S) \
790790
PARTS_GENERIC_64_128(mul, A)(A, B, S)
791791

792-
static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
793-
FloatParts64 *c, int flags,
794-
float_status *s);
795-
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
796-
FloatParts128 *c, int flags,
797-
float_status *s);
792+
static FloatParts64 *parts64_muladd_scalbn(FloatParts64 *a, FloatParts64 *b,
793+
FloatParts64 *c, int scale,
794+
int flags, float_status *s);
795+
static FloatParts128 *parts128_muladd_scalbn(FloatParts128 *a, FloatParts128 *b,
796+
FloatParts128 *c, int scale,
797+
int flags, float_status *s);
798798

799-
#define parts_muladd(A, B, C, Z, S) \
800-
PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
799+
#define parts_muladd_scalbn(A, B, C, Z, Y, S) \
800+
PARTS_GENERIC_64_128(muladd_scalbn, A)(A, B, C, Z, Y, S)
801801

802802
static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
803803
float_status *s);
@@ -2212,43 +2212,50 @@ floatx80_mul(floatx80 a, floatx80 b, float_status *status)
22122212
* Fused multiply-add
22132213
*/
22142214

2215-
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
2216-
int flags, float_status *status)
2215+
float16 QEMU_FLATTEN
2216+
float16_muladd_scalbn(float16 a, float16 b, float16 c,
2217+
int scale, int flags, float_status *status)
22172218
{
22182219
FloatParts64 pa, pb, pc, *pr;
22192220

22202221
float16_unpack_canonical(&pa, a, status);
22212222
float16_unpack_canonical(&pb, b, status);
22222223
float16_unpack_canonical(&pc, c, status);
2223-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2224+
pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
22242225

22252226
return float16_round_pack_canonical(pr, status);
22262227
}
22272228

2228-
static float32 QEMU_SOFTFLOAT_ATTR
2229-
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
2230-
float_status *status)
2229+
float16 float16_muladd(float16 a, float16 b, float16 c,
2230+
int flags, float_status *status)
2231+
{
2232+
return float16_muladd_scalbn(a, b, c, 0, flags, status);
2233+
}
2234+
2235+
float32 QEMU_SOFTFLOAT_ATTR
2236+
float32_muladd_scalbn(float32 a, float32 b, float32 c,
2237+
int scale, int flags, float_status *status)
22312238
{
22322239
FloatParts64 pa, pb, pc, *pr;
22332240

22342241
float32_unpack_canonical(&pa, a, status);
22352242
float32_unpack_canonical(&pb, b, status);
22362243
float32_unpack_canonical(&pc, c, status);
2237-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2244+
pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
22382245

22392246
return float32_round_pack_canonical(pr, status);
22402247
}
22412248

2242-
static float64 QEMU_SOFTFLOAT_ATTR
2243-
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
2244-
float_status *status)
2249+
float64 QEMU_SOFTFLOAT_ATTR
2250+
float64_muladd_scalbn(float64 a, float64 b, float64 c,
2251+
int scale, int flags, float_status *status)
22452252
{
22462253
FloatParts64 pa, pb, pc, *pr;
22472254

22482255
float64_unpack_canonical(&pa, a, status);
22492256
float64_unpack_canonical(&pb, b, status);
22502257
float64_unpack_canonical(&pc, c, status);
2251-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2258+
pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
22522259

22532260
return float64_round_pack_canonical(pr, status);
22542261
}
@@ -2267,7 +2274,7 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
22672274
if (unlikely(!can_use_fpu(s))) {
22682275
goto soft;
22692276
}
2270-
if (unlikely(flags & float_muladd_halve_result)) {
2277+
if (unlikely(flags & float_muladd_suppress_add_product_zero)) {
22712278
goto soft;
22722279
}
22732280

@@ -2323,7 +2330,7 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
23232330
return ur.s;
23242331

23252332
soft:
2326-
return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
2333+
return float32_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
23272334
}
23282335

23292336
float64 QEMU_FLATTEN
@@ -2338,9 +2345,6 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
23382345
if (unlikely(!can_use_fpu(s))) {
23392346
goto soft;
23402347
}
2341-
if (unlikely(flags & float_muladd_halve_result)) {
2342-
goto soft;
2343-
}
23442348

23452349
float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
23462350
if (unlikely(!f64_is_zon3(ua, ub, uc))) {
@@ -2394,7 +2398,7 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
23942398
return ur.s;
23952399

23962400
soft:
2397-
return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
2401+
return float64_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
23982402
}
23992403

24002404
float64 float64r32_muladd(float64 a, float64 b, float64 c,
@@ -2405,7 +2409,7 @@ float64 float64r32_muladd(float64 a, float64 b, float64 c,
24052409
float64_unpack_canonical(&pa, a, status);
24062410
float64_unpack_canonical(&pb, b, status);
24072411
float64_unpack_canonical(&pc, c, status);
2408-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2412+
pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
24092413

24102414
return float64r32_round_pack_canonical(pr, status);
24112415
}
@@ -2418,7 +2422,7 @@ bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
24182422
bfloat16_unpack_canonical(&pa, a, status);
24192423
bfloat16_unpack_canonical(&pb, b, status);
24202424
bfloat16_unpack_canonical(&pc, c, status);
2421-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2425+
pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
24222426

24232427
return bfloat16_round_pack_canonical(pr, status);
24242428
}
@@ -2431,7 +2435,7 @@ float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
24312435
float128_unpack_canonical(&pa, a, status);
24322436
float128_unpack_canonical(&pb, b, status);
24332437
float128_unpack_canonical(&pc, c, status);
2434-
pr = parts_muladd(&pa, &pb, &pc, flags, status);
2438+
pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
24352439

24362440
return float128_round_pack_canonical(pr, status);
24372441
}
@@ -5249,8 +5253,9 @@ float32 float32_exp2(float32 a, float_status *status)
52495253

52505254
float64_unpack_canonical(&rp, float64_one, status);
52515255
for (i = 0 ; i < 15 ; i++) {
5256+
52525257
float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status);
5253-
rp = *parts_muladd(&tp, &xnp, &rp, 0, status);
5258+
rp = *parts_muladd_scalbn(&tp, &xnp, &rp, 0, 0, status);
52545259
xnp = *parts_mul(&xnp, &xp, status);
52555260
}
52565261

include/exec/translator.h

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,6 @@
2121
#include "qemu/bswap.h"
2222
#include "exec/vaddr.h"
2323

24-
/**
25-
* gen_intermediate_code
26-
* @cpu: cpu context
27-
* @tb: translation block
28-
* @max_insns: max number of instructions to translate
29-
* @pc: guest virtual program counter address
30-
* @host_pc: host physical program counter address
31-
*
32-
* This function must be provided by the target, which should create
33-
* the target-specific DisasContext, and then invoke translator_loop.
34-
*/
35-
void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
36-
vaddr pc, void *host_pc);
37-
3824
/**
3925
* DisasJumpType:
4026
* @DISAS_NEXT: Next instruction in program order.

include/fpu/softfloat-types.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ typedef enum __attribute__((__packed__)) {
140140
float_round_to_odd = 5,
141141
/* Not an IEEE rounding mode: round to closest odd, overflow to inf */
142142
float_round_to_odd_inf = 6,
143+
/* Not an IEEE rounding mode: round to nearest even, overflow to max */
144+
float_round_nearest_even_max = 7,
143145
} FloatRoundMode;
144146

145147
/*

include/fpu/softfloat.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,16 @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
120120
| Using these differs from negating an input or output before calling
121121
| the muladd function in that this means that a NaN doesn't have its
122122
| sign bit inverted before it is propagated.
123-
| We also support halving the result before rounding, as a special
124-
| case to support the ARM fused-sqrt-step instruction FRSQRTS.
123+
|
124+
| With float_muladd_suppress_add_product_zero, if A or B is zero
125+
| such that the product is a true zero, then return C without addition.
126+
| This preserves the sign of C when C is +/- 0. Used for Hexagon.
125127
*----------------------------------------------------------------------------*/
126128
enum {
127129
float_muladd_negate_c = 1,
128130
float_muladd_negate_product = 2,
129131
float_muladd_negate_result = 4,
130-
float_muladd_halve_result = 8,
132+
float_muladd_suppress_add_product_zero = 8,
131133
};
132134

133135
/*----------------------------------------------------------------------------
@@ -238,6 +240,8 @@ float16 float16_add(float16, float16, float_status *status);
238240
float16 float16_sub(float16, float16, float_status *status);
239241
float16 float16_mul(float16, float16, float_status *status);
240242
float16 float16_muladd(float16, float16, float16, int, float_status *status);
243+
float16 float16_muladd_scalbn(float16, float16, float16,
244+
int, int, float_status *status);
241245
float16 float16_div(float16, float16, float_status *status);
242246
float16 float16_scalbn(float16, int, float_status *status);
243247
float16 float16_min(float16, float16, float_status *status);
@@ -597,6 +601,8 @@ float32 float32_mul(float32, float32, float_status *status);
597601
float32 float32_div(float32, float32, float_status *status);
598602
float32 float32_rem(float32, float32, float_status *status);
599603
float32 float32_muladd(float32, float32, float32, int, float_status *status);
604+
float32 float32_muladd_scalbn(float32, float32, float32,
605+
int, int, float_status *status);
600606
float32 float32_sqrt(float32, float_status *status);
601607
float32 float32_exp2(float32, float_status *status);
602608
float32 float32_log2(float32, float_status *status);
@@ -792,6 +798,8 @@ float64 float64_mul(float64, float64, float_status *status);
792798
float64 float64_div(float64, float64, float_status *status);
793799
float64 float64_rem(float64, float64, float_status *status);
794800
float64 float64_muladd(float64, float64, float64, int, float_status *status);
801+
float64 float64_muladd_scalbn(float64, float64, float64,
802+
int, int, float_status *status);
795803
float64 float64_sqrt(float64, float_status *status);
796804
float64 float64_log2(float64, float_status *status);
797805
FloatRelation float64_compare(float64, float64, float_status *status);

include/hw/core/tcg-cpu-ops.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,19 @@ struct TCGCPUOps {
2424
* Called when the first CPU is realized.
2525
*/
2626
void (*initialize)(void);
27+
/**
28+
* @translate_code: Translate guest instructions to TCGOps
29+
* @cpu: cpu context
30+
* @tb: translation block
31+
* @max_insns: max number of instructions to translate
32+
* @pc: guest virtual program counter address
33+
* @host_pc: host physical program counter address
34+
*
35+
* This function must be provided by the target, which should create
36+
* the target-specific DisasContext, and then invoke translator_loop.
37+
*/
38+
void (*translate_code)(CPUState *cpu, TranslationBlock *tb,
39+
int *max_insns, vaddr pc, void *host_pc);
2740
/**
2841
* @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
2942
*

target/alpha/cpu.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
224224

225225
static const TCGCPUOps alpha_tcg_ops = {
226226
.initialize = alpha_translate_init,
227+
.translate_code = alpha_translate_code,
227228
.synchronize_from_tb = alpha_cpu_synchronize_from_tb,
228229
.restore_state_to_opc = alpha_restore_state_to_opc,
229230

0 commit comments

Comments
 (0)