Skip to content

Commit 5b3f666

Browse files
committed
Update comment, apply same fix to global isel, and fix lit tests
1 parent 2861db0 commit 5b3f666

File tree

4 files changed

+39
-65
lines changed

4 files changed

+39
-65
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2805,8 +2805,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
28052805

28062806
SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
28072807
SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2808-
// Our implementation of LOG is not contract safe, so disable instruction
2809-
// contraction.
2808+
// Our implementation of LOG is not contract safe because we generate
2809+
// error-correcting summations based on the rounding error of the first
2810+
// multiplication below, so contracting the multiply with the final add will
2811+
// lead to inaccurate final results. Disable contraction for the expanded
2812+
// instructions.
28102813
Flags.setAllowContract(false);
28112814
R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
28122815
SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
@@ -2830,8 +2833,11 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
28302833
SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
28312834
SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
28322835
SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2833-
// Our implementation of LOG is not contract safe, so disable instruction
2834-
// contraction.
2836+
// Our implementation of LOG is not contract safe because we generate
2837+
// error-correcting summations based on the rounding error of the first
2838+
// multiplication below, so contracting the multiply with the final add will
2839+
// lead to inaccurate final results. Disable contraction for the expanded
2840+
// instructions.
28352841
Flags.setAllowContract(false);
28362842
SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
28372843
SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3508,9 +3508,6 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
35083508
MachineRegisterInfo &MRI = *B.getMRI();
35093509
Register Dst = MI.getOperand(0).getReg();
35103510
Register X = MI.getOperand(1).getReg();
3511-
// Our implementation of LOG is not contract safe, so disable contraction in
3512-
// the flags before reading the field.
3513-
MI.clearFlags(MachineInstr::FmContract);
35143511
unsigned Flags = MI.getFlags();
35153512
const LLT Ty = MRI.getType(X);
35163513
MachineFunction &MF = B.getMF();
@@ -3554,12 +3551,17 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
35543551

35553552
auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
35563553
auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3557-
3558-
R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3559-
auto NegR = B.buildFNeg(Ty, R, Flags);
3560-
auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3561-
auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3562-
R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3554+
// Our implementation of LOG is not contract safe because we generate
3555+
// error-correcting summations based on the rounding error of the first
3556+
// multiplication below, so contracting the multiply with the final add will
3557+
// lead to inaccurate final results. Disable contraction for the expanded
3558+
// instructions.
3559+
auto NewFlags = Flags & ~(MachineInstr::FmContract);
3560+
R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3561+
auto NegR = B.buildFNeg(Ty, R, NewFlags);
3562+
auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3563+
auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3564+
R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
35633565
} else {
35643566
// ch+ct is ln(2)/ln(10) to more than 36 bits
35653567
const float ch_log10 = 0x1.344000p-2f;
@@ -3575,12 +3577,18 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
35753577
auto MaskConst = B.buildConstant(Ty, 0xfffff000);
35763578
auto YH = B.buildAnd(Ty, Y, MaskConst);
35773579
auto YT = B.buildFSub(Ty, Y, YH, Flags);
3578-
auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3580+
// Our implementation of LOG is not contract safe because we generate
3581+
// error-correcting summations based on the rounding error of the first
3582+
// multiplication below, so contracting the multiply with the final add will
3583+
// lead to inaccurate final results. Disable contraction for the expanded
3584+
// instructions.
3585+
auto NewFlags = Flags & ~(MachineInstr::FmContract);
3586+
auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
35793587

35803588
Register Mad0 =
3581-
getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3582-
Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3583-
R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3589+
getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3590+
Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3591+
R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
35843592
}
35853593

35863594
const bool IsFiniteOnly =

llvm/test/CodeGen/AMDGPU/llvm.log.ll

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6742,8 +6742,6 @@ define half @v_log_f16_fast(half %in) {
67426742
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
67436743
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
67446744
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
6745-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
6746-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
67476745
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
67486746
;
67496747
; SI-GISEL-LABEL: v_log_f16_fast:
@@ -7405,10 +7403,6 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
74057403
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
74067404
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
74077405
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
7408-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7409-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7410-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7411-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
74127406
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
74137407
;
74147408
; SI-GISEL-LABEL: v_log_v2f16_fast:
@@ -7674,12 +7668,6 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
76747668
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
76757669
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
76767670
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
7677-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7678-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7679-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7680-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7681-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7682-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
76837671
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
76847672
;
76857673
; SI-GISEL-LABEL: v_log_v3f16_fast:
@@ -8006,28 +7994,20 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
80067994
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80077995
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
80087996
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8009-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
80107997
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7998+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
80117999
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
80128000
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8013-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
80148001
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8002+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
80158003
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
80168004
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
8017-
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
80188005
; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
8006+
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
80198007
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
80208008
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
8021-
; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
80228009
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
8023-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
8024-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8025-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
8026-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8027-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
8028-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8029-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8030-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
8010+
; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
80318011
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
80328012
;
80338013
; SI-GISEL-LABEL: v_log_v4f16_fast:

llvm/test/CodeGen/AMDGPU/llvm.log10.ll

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6742,8 +6742,6 @@ define half @v_log10_f16_fast(half %in) {
67426742
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
67436743
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
67446744
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
6745-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
6746-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
67476745
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
67486746
;
67496747
; SI-GISEL-LABEL: v_log10_f16_fast:
@@ -7405,10 +7403,6 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
74057403
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
74067404
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
74077405
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
7408-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7409-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7410-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7411-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
74127406
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
74137407
;
74147408
; SI-GISEL-LABEL: v_log10_v2f16_fast:
@@ -7674,12 +7668,6 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
76747668
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
76757669
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
76767670
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
7677-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7678-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7679-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7680-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7681-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7682-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
76837671
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
76847672
;
76857673
; SI-GISEL-LABEL: v_log10_v3f16_fast:
@@ -8006,28 +7994,20 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
80067994
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80077995
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
80087996
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8009-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
80107997
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7998+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
80117999
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
80128000
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8013-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
80148001
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8002+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
80158003
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
80168004
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
8017-
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
80188005
; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
8006+
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
80198007
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
80208008
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v1
8021-
; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3
80228009
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a209b, v2
8023-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
8024-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8025-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
8026-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8027-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
8028-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8029-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8030-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
8010+
; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a209b, v3
80318011
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
80328012
;
80338013
; SI-GISEL-LABEL: v_log10_v4f16_fast:

0 commit comments

Comments
 (0)