|
@@ -2167,13 +2167,13 @@
 ;; The above rules automatically sink loads for rhs operands, so additionally
 ;; add rules for sinking loads with lhs operands.
 (rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y)))
-      (x64_addss y (sink_load x)))
+      (x64_addss y x))
 (rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y)))
-      (x64_addsd y (sink_load x)))
+      (x64_addsd y x))
 (rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y)))
-      (x64_addps y (sink_load x)))
+      (x64_addps y x))
 (rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y)))
-      (x64_addpd y (sink_load x)))
+      (x64_addpd y x))
 
 ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
|
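The rhs sinking referenced in these comments falls out of the base rules for each instruction (outside this diff's context window): helpers like `x64_addss` take a memory-capable `XmmMem` as their second operand, so a load feeding that position folds into the instruction automatically. As a sketch, assuming the base rules in this file have their usual shape:

    ;; Sketch only, not part of this change: the approximate shape of the
    ;; existing rhs rules that the lhs rules above mirror.
    (rule (lower (has_type $F32 (fadd x y)))
          (x64_addss x y))

The lhs rules in this hunk handle the mirrored case by swapping the operands, which is valid only because `fadd` is commutative.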
|
@@ -2200,13 +2200,13 @@
 ;; The above rules automatically sink loads for rhs operands, so additionally
 ;; add rules for sinking loads with lhs operands.
 (rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y)))
-      (x64_mulss y (sink_load x)))
+      (x64_mulss y x))
 (rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y)))
-      (x64_mulsd y (sink_load x)))
+      (x64_mulsd y x))
 (rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y)))
-      (x64_mulps y (sink_load x)))
+      (x64_mulps y x))
 (rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y)))
-      (x64_mulpd y (sink_load x)))
+      (x64_mulpd y x))
 
 ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
|
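The same pattern repeats here for `fmul` since multiplication is also commutative. Nothing analogous is added under the `fsub` or `fdiv` headers: those operations aren't commutative, so the operand swap these rules depend on would compute the wrong value, and a lhs load has to go through a register as usual.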
|
@@ -2438,18 +2438,83 @@
 
 ;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; The base case for `fma` is to call out to one of two libcalls. Vectors are
+;; decomposed, each lane is handled individually, and the results recomposed.
+
 (rule (lower (has_type $F32 (fma x y z)))
       (libcall_3 (LibCall.FmaF32) x y z))
 (rule (lower (has_type $F64 (fma x y z)))
       (libcall_3 (LibCall.FmaF64) x y z))
-(rule 1 (lower (has_type (and (use_fma $true) $F32) (fma x y z)))
-      (x64_vfmadd213ss x y z))
-(rule 1 (lower (has_type (and (use_fma $true) $F64) (fma x y z)))
-      (x64_vfmadd213sd x y z))
-(rule (lower (has_type (and (use_fma $true) $F32X4) (fma x y z)))
-      (x64_vfmadd213ps x y z))
-(rule (lower (has_type (and (use_fma $true) $F64X2) (fma x y z)))
-      (x64_vfmadd213pd x y z))
+
+(rule (lower (has_type $F32X4 (fma x y z)))
+      (let (
+             (x Xmm (put_in_xmm x))
+             (y Xmm (put_in_xmm y))
+             (z Xmm (put_in_xmm z))
+             (x0 Xmm (libcall_3 (LibCall.FmaF32) x y z))
+             (x1 Xmm (libcall_3 (LibCall.FmaF32)
+                                (x64_pshufd x 1)
+                                (x64_pshufd y 1)
+                                (x64_pshufd z 1)))
+             (x2 Xmm (libcall_3 (LibCall.FmaF32)
+                                (x64_pshufd x 2)
+                                (x64_pshufd y 2)
+                                (x64_pshufd z 2)))
+             (x3 Xmm (libcall_3 (LibCall.FmaF32)
+                                (x64_pshufd x 3)
+                                (x64_pshufd y 3)
+                                (x64_pshufd z 3)))
+
+             (tmp Xmm (vec_insert_lane $F32X4 x0 x1 1))
+             (tmp Xmm (vec_insert_lane $F32X4 tmp x2 2))
+             (tmp Xmm (vec_insert_lane $F32X4 tmp x3 3))
+           )
+        tmp))
+(rule (lower (has_type $F64X2 (fma x y z)))
+      (let (
+             (x Xmm (put_in_xmm x))
+             (y Xmm (put_in_xmm y))
+             (z Xmm (put_in_xmm z))
+             (x0 Xmm (libcall_3 (LibCall.FmaF64) x y z))
+             (x1 Xmm (libcall_3 (LibCall.FmaF64)
+                                (x64_pshufd x 0xee)
+                                (x64_pshufd y 0xee)
+                                (x64_pshufd z 0xee)))
+           )
+        (vec_insert_lane $F64X2 x0 x1 1)))
+
+
+;; Special case for when the `fma` feature is active and a native instruction
+;; can be used.
+(rule 1 (lower (has_type ty (fma x y z)))
+      (if-let $true (use_fma))
+      (fmadd ty x y z))
+
+(decl fmadd (Type Value Value Value) Xmm)
+(decl fnmadd (Type Value Value Value) Xmm)
+
+;; Base case. Note that this will automatically sink a load with `z`, the
+;; value to add.
+(rule (fmadd ty x y z) (x64_vfmadd213 ty x y z))
+
+;; Allow sinking loads with one of the two values being multiplied, in
+;; addition to the value being added. Rules exist for both `x` and `y`
+;; since multiplication is commutative.
+(rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x))
+(rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y))
+
+;; If one of the values being multiplied is negated, use a `vfnmadd*`
+;; instruction instead.
+(rule 3 (fmadd ty (fneg x) y z) (fnmadd ty x y z))
+(rule 4 (fmadd ty x (fneg y) z) (fnmadd ty x y z))
+
+(rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z))
+(rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x))
+(rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y))
+
+;; As with `fmadd`, a negated multiplicand switches helpers: the negations cancel.
+(rule 3 (fnmadd ty (fneg x) y z) (fmadd ty x y z))
+(rule 4 (fnmadd ty x (fneg y) z) (fmadd ty x y z))
 
 ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
|
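A note on the `213`/`132` suffixes in the new `fmadd`/`fnmadd` helpers: in the x86 FMA3 naming scheme the digits indicate which sources feed the multiply and the add, and the last source operand is the one that may be a memory operand. `vfmadd213 x y z` computes `x*y + z` with the addend in the sinkable slot, while the `132` form puts a multiplicand there instead, which is why the load-sinking rules switch encodings. An illustrative trace for a scalar `f32` (register assignments hypothetical), in Intel syntax:

    ;; (fma (load addr) y z), where the load is sinkable:
    ;;   (fmadd ty (sinkable_load addr) y z)      ; matches rule 1 above
    ;;   => (x64_vfmadd132 ty y z addr)
    ;;   => vfmadd132ss xmm_y, xmm_z, [addr]      ; xmm_y := xmm_y*[addr] + xmm_z

The `fneg` rules rely on `vfnmadd*` computing exactly `-(x*y) + z`, so negating one multiplicand flips an `fmadd` into an `fnmadd` and vice versa, with a double negation canceling out.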
|
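Likewise, the `pshufd` immediates in the vector libcall fallback: each two-bit field of the immediate selects a 32-bit source lane for the corresponding destination lane, and only lane 0 matters here because the libcall operates on the scalar in the low lane. Immediates 1, 2, and 3 therefore move `f32` lanes 1 through 3 into position 0, and `0xee` moves the upper `f64` into the low half. Decoding `0xee` as a sketch:

    ;; 0xee = 0b11_10_11_10 -> destination 32-bit lanes [src2, src3, src2, src3]
    ;; low 64 bits of the result = source lanes {2, 3} = the source's upper f64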