Avoid modifying the src reg first when the src reg and dst reg are the same (#1220)

lum1n0us · web-flow · commit b84f11724bbb · 2022-06-10T15:33:16.000+08:00
diff --git a/core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp b/core/iwasm/fast-jit/cg/x86-64/jit_codegen_x86_64.cpp
@@ -3014,7 +3014,6 @@ alu_r_imm_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
     mov_imm_to_m(a, cache, imm, 4);
 
     mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
-
     return alu_r_m_float(a, op, reg_no_dst, cache, true);
 }
 
@@ -3033,40 +3032,52 @@ static bool
 alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
                  int32 reg_no1_src, int32 reg_no2_src)
 {
+    bool store_result = false;
+
+    /**
+     * - op r0,r0,r1. do nothing since instructions always store results in
+     *   the first register
+     *
+     * - op r1,r0,r1. use FREE_REG to cache and replace r0, and then store
+     *   results in r1
+     *
+     * - op r0,r1,r2. use r0 to cache and replace r1, and accept the result
+     *   naturally
+     **/
+    if (reg_no_dst == reg_no2_src) {
+        store_result = true;
+        reg_no_dst = REG_F32_FREE_IDX;
+    }
+    mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
+
     switch (op) {
         case ADD:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.addss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case SUB:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.subss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MUL:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.mulss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case DIV_S:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.divss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MAX:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.maxss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MIN:
         {
-            mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);
             a.minss(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
@@ -3076,6 +3087,10 @@ alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
             return false;
         }
     }
+
+    if (store_result)
+        mov_r_to_r_f32(a, reg_no2_src, REG_F32_FREE_IDX);
+
     return true;
 }
 
@@ -3188,7 +3203,6 @@ alu_r_imm_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
     mov_imm_to_m(a, cache, imm, 8);
 
     mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
-
     return alu_r_m_float(a, op, reg_no_dst, cache, false);
 }
 
@@ -3207,40 +3221,52 @@ static bool
 alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
                  int32 reg_no1_src, int32 reg_no2_src)
 {
+    bool store_result = false;
+
+    /**
+     * - op r0,r0,r1. do nothing since instructions always store results in
+     *   the first register
+     *
+     * - op r1,r0,r1. use FREE_REG to cache and replace r0, and then store
+     *   results in r1
+     *
+     * - op r0,r1,r2. use r0 to cache and replace r1, and accept the result
+     *   naturally
+     **/
+    if (reg_no_dst == reg_no2_src) {
+        store_result = true;
+        reg_no_dst = REG_F64_FREE_IDX;
+    }
+    mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
+
     switch (op) {
         case ADD:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.addsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case SUB:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.subsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MUL:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.mulsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case DIV_S:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.divsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MAX:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.maxsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
         case MIN:
         {
-            mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);
             a.minsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);
             break;
         }
@@ -3250,6 +3276,10 @@ alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
             return false;
         }
     }
+
+    if (store_result)
+        mov_r_to_r_f64(a, reg_no2_src, REG_F64_FREE_IDX);
+
     return true;
 }
 
diff --git a/core/iwasm/fast-jit/fe/jit_emit_numberic.c b/core/iwasm/fast-jit/fe/jit_emit_numberic.c
@@ -1379,9 +1379,11 @@ compile_op_float_math(JitCompContext *cc, FloatMath math_op, bool is_f32)
 
     switch (math_op) {
         case FLOAT_ABS:
+            /* TODO: andps 0x7fffffffffffffff */
             func = is_f32 ? (void *)fabsf : (void *)fabs;
             break;
         case FLOAT_NEG:
+            /* TODO: xorps 0x8000000000000000 */
             func = is_f32 ? (void *)negf : (void *)neg;
             break;
         case FLOAT_CEIL:

Original file line number	Diff line number	Diff line change
`@@ -3014,7 +3014,6 @@ alu_r_imm_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3014`	`3014`	`mov_imm_to_m(a, cache, imm, 4);`
`3015`	`3015`
`3016`	`3016`	`mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3017`		`-`
`3018`	`3017`	`return alu_r_m_float(a, op, reg_no_dst, cache, true);`
`3019`	`3018`	`}`
`3020`	`3019`
`@@ -3033,40 +3032,52 @@ static bool`
`3033`	`3032`	`alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3034`	`3033`	`int32 reg_no1_src, int32 reg_no2_src)`
`3035`	`3034`	`{`
	`3035`	`+ bool store_result = false;`
	`3036`	`+`
	`3037`	`+ /**`
	`3038`	`+ * - op r0,r0,r1. do nothing since instructions always store results in`
	`3039`	`+ * the first register`
	`3040`	`+ *`
	`3041`	`+ * - op r1,r0,r1. use FREE_REG to cache and replace r0, and then store`
	`3042`	`+ * results in r1`
	`3043`	`+ *`
	`3044`	`+ * - op r0,r1,r2. use r0 to cache and replace r1, and accept the result`
	`3045`	`+ * naturally`
	`3046`	`+ **/`
	`3047`	`+ if (reg_no_dst == reg_no2_src) {`
	`3048`	`+ store_result = true;`
	`3049`	`+ reg_no_dst = REG_F32_FREE_IDX;`
	`3050`	`+ }`
	`3051`	`+ mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
	`3052`	`+`
`3036`	`3053`	`switch (op) {`
`3037`	`3054`	`case ADD:`
`3038`	`3055`	`{`
`3039`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3040`	`3056`	`a.addss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3041`	`3057`	`break;`
`3042`	`3058`	`}`
`3043`	`3059`	`case SUB:`
`3044`	`3060`	`{`
`3045`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3046`	`3061`	`a.subss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3047`	`3062`	`break;`
`3048`	`3063`	`}`
`3049`	`3064`	`case MUL:`
`3050`	`3065`	`{`
`3051`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3052`	`3066`	`a.mulss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3053`	`3067`	`break;`
`3054`	`3068`	`}`
`3055`	`3069`	`case DIV_S:`
`3056`	`3070`	`{`
`3057`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3058`	`3071`	`a.divss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3059`	`3072`	`break;`
`3060`	`3073`	`}`
`3061`	`3074`	`case MAX:`
`3062`	`3075`	`{`
`3063`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3064`	`3076`	`a.maxss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3065`	`3077`	`break;`
`3066`	`3078`	`}`
`3067`	`3079`	`case MIN:`
`3068`	`3080`	`{`
`3069`		`- mov_r_to_r_f32(a, reg_no_dst, reg_no1_src);`
`3070`	`3081`	`a.minss(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3071`	`3082`	`break;`
`3072`	`3083`	`}`
`@@ -3076,6 +3087,10 @@ alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3076`	`3087`	`return false;`
`3077`	`3088`	`}`
`3078`	`3089`	`}`
	`3090`	`+`
	`3091`	`+ if (store_result)`
	`3092`	`+ mov_r_to_r_f32(a, reg_no2_src, REG_F32_FREE_IDX);`
	`3093`	`+`
`3079`	`3094`	`return true;`
`3080`	`3095`	`}`
`3081`	`3096`
`@@ -3188,7 +3203,6 @@ alu_r_imm_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3188`	`3203`	`mov_imm_to_m(a, cache, imm, 8);`
`3189`	`3204`
`3190`	`3205`	`mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3191`		`-`
`3192`	`3206`	`return alu_r_m_float(a, op, reg_no_dst, cache, false);`
`3193`	`3207`	`}`
`3194`	`3208`
`@@ -3207,40 +3221,52 @@ static bool`
`3207`	`3221`	`alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3208`	`3222`	`int32 reg_no1_src, int32 reg_no2_src)`
`3209`	`3223`	`{`
	`3224`	`+ bool store_result = false;`
	`3225`	`+`
	`3226`	`+ /**`
	`3227`	`+ * - op r0,r0,r1. do nothing since instructions always store results in`
	`3228`	`+ * the first register`
	`3229`	`+ *`
	`3230`	`+ * - op r1,r0,r1. use FREE_REG to cache and replace r0, and then store`
	`3231`	`+ * results in r1`
	`3232`	`+ *`
	`3233`	`+ * - op r0,r1,r2. use r0 to cache and replace r1, and accept the result`
	`3234`	`+ * naturally`
	`3235`	`+ **/`
	`3236`	`+ if (reg_no_dst == reg_no2_src) {`
	`3237`	`+ store_result = true;`
	`3238`	`+ reg_no_dst = REG_F64_FREE_IDX;`
	`3239`	`+ }`
	`3240`	`+ mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
	`3241`	`+`
`3210`	`3242`	`switch (op) {`
`3211`	`3243`	`case ADD:`
`3212`	`3244`	`{`
`3213`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3214`	`3245`	`a.addsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3215`	`3246`	`break;`
`3216`	`3247`	`}`
`3217`	`3248`	`case SUB:`
`3218`	`3249`	`{`
`3219`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3220`	`3250`	`a.subsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3221`	`3251`	`break;`
`3222`	`3252`	`}`
`3223`	`3253`	`case MUL:`
`3224`	`3254`	`{`
`3225`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3226`	`3255`	`a.mulsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3227`	`3256`	`break;`
`3228`	`3257`	`}`
`3229`	`3258`	`case DIV_S:`
`3230`	`3259`	`{`
`3231`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3232`	`3260`	`a.divsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3233`	`3261`	`break;`
`3234`	`3262`	`}`
`3235`	`3263`	`case MAX:`
`3236`	`3264`	`{`
`3237`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3238`	`3265`	`a.maxsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3239`	`3266`	`break;`
`3240`	`3267`	`}`
`3241`	`3268`	`case MIN:`
`3242`	`3269`	`{`
`3243`		`- mov_r_to_r_f64(a, reg_no_dst, reg_no1_src);`
`3244`	`3270`	`a.minsd(regs_float[reg_no_dst], regs_float[reg_no2_src]);`
`3245`	`3271`	`break;`
`3246`	`3272`	`}`
`@@ -3250,6 +3276,10 @@ alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,`
`3250`	`3276`	`return false;`
`3251`	`3277`	`}`
`3252`	`3278`	`}`
	`3279`	`+`
	`3280`	`+ if (store_result)`
	`3281`	`+ mov_r_to_r_f64(a, reg_no2_src, REG_F64_FREE_IDX);`
	`3282`	`+`
`3253`	`3283`	`return true;`
`3254`	`3284`	`}`
`3255`	`3285`