@@ -3014,7 +3014,6 @@ alu_r_imm_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
30143014 mov_imm_to_m (a, cache, imm, 4 );
30153015
30163016 mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
3017-
30183017 return alu_r_m_float (a, op, reg_no_dst, cache, true );
30193018}
30203019
@@ -3033,40 +3032,52 @@ static bool
30333032alu_r_r_to_r_f32 (x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
30343033 int32 reg_no1_src, int32 reg_no2_src)
30353034{
3035+ bool store_result = false ;
3036+
3037+ /**
3038+ * - op r0,r0,r1: nothing special to do, since the instruction always
3039+ * stores its result in the first register
3040+ *
3041+ * - op r1,r0,r1: copy r0 into FREE_REG, operate on FREE_REG, and then
3042+ * move the result into r1
3043+ *
3044+ * - op r0,r1,r2: copy r1 into r0, operate on r0, and leave the result
3045+ * there
3046+ **/
3047+ if (reg_no_dst == reg_no2_src) {
3048+ store_result = true ;
3049+ reg_no_dst = REG_F32_FREE_IDX;
3050+ }
3051+ mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
3052+
30363053 switch (op) {
30373054 case ADD:
30383055 {
3039- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30403056 a.addss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30413057 break ;
30423058 }
30433059 case SUB:
30443060 {
3045- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30463061 a.subss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30473062 break ;
30483063 }
30493064 case MUL:
30503065 {
3051- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30523066 a.mulss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30533067 break ;
30543068 }
30553069 case DIV_S:
30563070 {
3057- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30583071 a.divss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30593072 break ;
30603073 }
30613074 case MAX:
30623075 {
3063- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30643076 a.maxss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30653077 break ;
30663078 }
30673079 case MIN:
30683080 {
3069- mov_r_to_r_f32 (a, reg_no_dst, reg_no1_src);
30703081 a.minss (regs_float[reg_no_dst], regs_float[reg_no2_src]);
30713082 break ;
30723083 }
@@ -3076,6 +3087,10 @@ alu_r_r_to_r_f32(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
30763087 return false ;
30773088 }
30783089 }
3090+
3091+ if (store_result)
3092+ mov_r_to_r_f32 (a, reg_no2_src, REG_F32_FREE_IDX);
3093+
30793094 return true ;
30803095}
30813096
@@ -3188,7 +3203,6 @@ alu_r_imm_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
31883203 mov_imm_to_m (a, cache, imm, 8 );
31893204
31903205 mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
3191-
31923206 return alu_r_m_float (a, op, reg_no_dst, cache, false );
31933207}
31943208
@@ -3207,40 +3221,52 @@ static bool
32073221alu_r_r_to_r_f64 (x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
32083222 int32 reg_no1_src, int32 reg_no2_src)
32093223{
3224+ bool store_result = false ;
3225+
3226+ /**
3227+ * - op r0,r0,r1: nothing special to do, since the instruction always
3228+ * stores its result in the first register
3229+ *
3230+ * - op r1,r0,r1: copy r0 into FREE_REG, operate on FREE_REG, and then
3231+ * move the result into r1
3232+ *
3233+ * - op r0,r1,r2: copy r1 into r0, operate on r0, and leave the result
3234+ * there
3235+ **/
3236+ if (reg_no_dst == reg_no2_src) {
3237+ store_result = true ;
3238+ reg_no_dst = REG_F64_FREE_IDX;
3239+ }
3240+ mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
3241+
32103242 switch (op) {
32113243 case ADD:
32123244 {
3213- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32143245 a.addsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32153246 break ;
32163247 }
32173248 case SUB:
32183249 {
3219- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32203250 a.subsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32213251 break ;
32223252 }
32233253 case MUL:
32243254 {
3225- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32263255 a.mulsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32273256 break ;
32283257 }
32293258 case DIV_S:
32303259 {
3231- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32323260 a.divsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32333261 break ;
32343262 }
32353263 case MAX:
32363264 {
3237- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32383265 a.maxsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32393266 break ;
32403267 }
32413268 case MIN:
32423269 {
3243- mov_r_to_r_f64 (a, reg_no_dst, reg_no1_src);
32443270 a.minsd (regs_float[reg_no_dst], regs_float[reg_no2_src]);
32453271 break ;
32463272 }
@@ -3250,6 +3276,10 @@ alu_r_r_to_r_f64(x86::Assembler &a, ALU_OP op, int32 reg_no_dst,
32503276 return false ;
32513277 }
32523278 }
3279+
3280+ if (store_result)
3281+ mov_r_to_r_f64 (a, reg_no2_src, REG_F64_FREE_IDX);
3282+
32533283 return true ;
32543284}
32553285
0 commit comments