Skip to content

Commit d5593b9

Browse files
committed
[AMDGPU] Use native sqrt when flushing denorm is allowed
1 parent 13a3c4f commit d5593b9

File tree

5 files changed

+102
-953
lines changed

5 files changed

+102
-953
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11080,7 +11080,8 @@ SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
1108011080
MVT VT = Op.getValueType().getSimpleVT();
1108111081
const SDValue X = Op.getOperand(0);
1108211082

11083-
if (allowApproxFunc(DAG, Flags)) {
11083+
if (allowApproxFunc(DAG, Flags) ||
11084+
denormalModeIsFlushAllF32(DAG.getMachineFunction())) {
1108411085
// Instruction is 1ulp but ignores denormals.
1108511086
return DAG.getNode(
1108611087
ISD::INTRINSIC_WO_CHAIN, DL, VT,

llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll

Lines changed: 5 additions & 85 deletions
Original file line number | Diff line number | Diff line change
@@ -442,23 +442,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
442442
; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
443443
; CODEGEN-DAZ-SDAG: ; %bb.0:
444444
; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445-
; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
446-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
447-
; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
448-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
449-
; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
450-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
451-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
452-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
453-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
454-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
455-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
456-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
457-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
458-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
459-
; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
460-
; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
461-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
445+
; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
462446
; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
463447
; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
464448
; CODEGEN-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -512,23 +496,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
512496
; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
513497
; IR-DAZ-SDAG: ; %bb.0:
514498
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515-
; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
516-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
517-
; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
518-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
519-
; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
520-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
521-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
522-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
523-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
524-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
525-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
526-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
527-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
528-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
529-
; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
530-
; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
531-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
499+
; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
532500
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
533501
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
534502
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1086,23 +1054,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
10861054
; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
10871055
; CODEGEN-DAZ-SDAG: ; %bb.0:
10881056
; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1089-
; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
1090-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1091-
; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1092-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1093-
; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
1094-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
1095-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
1096-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
1097-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
1098-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
1099-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
1100-
; CODEGEN-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
1101-
; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1102-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1103-
; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
1104-
; CODEGEN-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1105-
; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1057+
; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
11061058
; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0
11071059
; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
11081060
;
@@ -1132,23 +1084,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
11321084
; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
11331085
; IR-DAZ-SDAG: ; %bb.0:
11341086
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1135-
; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
1136-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1137-
; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1138-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1139-
; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
1140-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
1141-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
1142-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
1143-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
1144-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
1145-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
1146-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
1147-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1148-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1149-
; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
1150-
; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1151-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1087+
; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
11521088
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0
11531089
; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
11541090
;
@@ -1714,23 +1650,7 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
17141650
; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
17151651
; IR-DAZ-SDAG: ; %bb.0:
17161652
; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1717-
; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
1718-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
1719-
; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
1720-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1721-
; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
1722-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
1723-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
1724-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
1725-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
1726-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
1727-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
1728-
; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
1729-
; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
1730-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1731-
; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
1732-
; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
1733-
; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
1653+
; IR-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
17341654
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
17351655
; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
17361656
; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0

0 commit comments

Comments
 (0)