Skip to content

Commit bb3f8dd

Browse files
authored
Merge branch 'main' into issue_147819
2 parents b62e283 + 400ce1a commit bb3f8dd

File tree

20 files changed

+318
-159
lines changed

20 files changed

+318
-159
lines changed

llvm/include/llvm/Support/DebugLog.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ class LogWithNewline {
3838
raw_ostream &os)
3939
: os(os) {
4040
if (debug_type)
41-
os << debug_type << " ";
42-
os << "[" << file << ":" << line << "] ";
41+
os << "[" << debug_type << "] ";
42+
os << file << ":" << line << " ";
4343
}
4444
~LogWithNewline() { os << '\n'; }
4545
template <typename T> raw_ostream &operator<<(const T &t) && {

llvm/lib/Analysis/ConstantFolding.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -929,12 +929,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
929929
if (!AllConstantInt)
930930
break;
931931

932-
// TODO: Try to intersect two inrange attributes?
933-
if (!InRange) {
934-
InRange = GEP->getInRange();
935-
if (InRange)
936-
// Adjust inrange by offset until now.
937-
InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset);
932+
// Adjust inrange offset and intersect inrange attributes
933+
if (auto GEPRange = GEP->getInRange()) {
934+
auto AdjustedGEPRange = GEPRange->sextOrTrunc(BitWidth).subtract(Offset);
935+
InRange =
936+
InRange ? InRange->intersectWith(AdjustedGEPRange) : AdjustedGEPRange;
938937
}
939938

940939
Ptr = cast<Constant>(GEP->getOperand(0));

llvm/lib/Target/AMDGPU/GCNRegPressure.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
3838

3939
unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
4040
const SIRegisterInfo *STI) {
41-
return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
41+
return STI->isSGPRClass(RC)
42+
? SGPR
43+
: (STI->isAGPRClass(RC)
44+
? AGPR
45+
: (STI->isVectorSuperClass(RC) ? AVGPR : VGPR));
4246
}
4347

4448
void GCNRegPressure::inc(unsigned Reg,

llvm/lib/Target/AMDGPU/GCNRegPressure.h

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,43 +29,57 @@ class raw_ostream;
2929
class SlotIndex;
3030

3131
struct GCNRegPressure {
32-
enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS };
32+
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
3333

3434
GCNRegPressure() {
3535
clear();
3636
}
3737

38-
bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; }
38+
bool empty() const {
39+
return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
40+
}
3941

4042
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
4143

4244
/// \returns the SGPR32 pressure
4345
unsigned getSGPRNum() const { return Value[SGPR]; }
44-
/// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
45-
/// UnifiedVGPRFile
46+
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
47+
/// dependent upon \p UnifiedVGPRFile
4648
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
4749
if (UnifiedVGPRFile) {
48-
return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR])
49-
: Value[VGPR];
50+
return Value[AGPR]
51+
? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
52+
: Value[VGPR] + Value[AVGPR];
5053
}
51-
return std::max(Value[VGPR], Value[AGPR]);
54+
// AVGPR assignment priority is based on the width of the register. Account
55+
// AVGPR pressure as VGPR.
56+
return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
5257
}
5358

5459
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
55-
/// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
60+
/// \p NumAGPRs AGPRs, and \p NumAVGPRs AVGPRs for a target with a unified
61+
/// VGPR file.
5662
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
57-
unsigned NumAGPRs) {
58-
return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
63+
unsigned NumAGPRs,
64+
unsigned NumAVGPRs) {
65+
66+
// Assume AVGPRs will be assigned as VGPRs.
67+
return alignTo(NumArchVGPRs + NumAVGPRs,
68+
AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
5969
NumAGPRs;
6070
}
6171

62-
/// \returns the ArchVGPR32 pressure
63-
unsigned getArchVGPRNum() const { return Value[VGPR]; }
72+
/// \returns the ArchVGPR32 pressure, plus the AVGPRs which we assume will be
73+
/// allocated as VGPR
74+
unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
6475
/// \returns the AccVGPR32 pressure
6576
unsigned getAGPRNum() const { return Value[AGPR]; }
77+
/// \returns the AVGPR32 pressure
78+
unsigned getAVGPRNum() const { return Value[AVGPR]; }
6679

6780
unsigned getVGPRTuplesWeight() const {
68-
return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]);
81+
return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
82+
Value[TOTAL_KINDS + AGPR]);
6983
}
7084
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
7185

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
1417914179
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
1418014180
(VT == MVT::f32 || VT == MVT::f64 ||
1418114181
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14182+
(VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14183+
(VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
1418214184
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
1418314185
Op0.hasOneUse()) {
1418414186
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
25082508
.addReg(DstHi);
25092509
}
25102510
break;
2511+
2512+
case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2513+
assert(ST.hasBF16PackedInsts());
2514+
MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
2515+
MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
2516+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
2517+
MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
2518+
auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2519+
Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
2520+
auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2521+
Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
2522+
break;
25112523
}
2524+
25122525
return true;
25132526
}
25142527

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
28652865
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
28662866
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
28672867
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
2868+
def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
28682869

28692870
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
28702871
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
18941894
def : ClampPat<V_MAX_F16_t16_e64, f16>;
18951895
let SubtargetPredicate = UseFakeTrue16Insts in
18961896
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
1897+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1898+
let True16Predicate = UseFakeTrue16Insts in
1899+
def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
18971900

18981901
let SubtargetPredicate = HasVOP3PInsts in {
18991902
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
19031906
>;
19041907
}
19051908

1909+
let SubtargetPredicate = HasBF16PackedInsts in {
1910+
def : GCNPat <
1911+
(v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
1912+
(V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
1913+
$src0_modifiers, $src0, DSTCLAMP.ENABLE)
1914+
>;
1915+
} // End SubtargetPredicate = HasBF16PackedInsts
19061916

19071917
/********** ================================ **********/
19081918
/********** Floating point absolute/negative **********/

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
12361236
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
12371237
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
12381238
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
1239+
1240+
// Scalar pseudo used to emulate AMDGPUClamp.
1241+
// Expanded to V_PK_MAX_NUM_BF16 with unused high half.
1242+
// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
1243+
let True16Predicate = UseFakeTrue16Insts in
1244+
defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
12391245
}
12401246
} // End isCommutable = 1, isReMaterializable = 1
12411247

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
323323
ret void
324324
}
325325

326+
define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
327+
; GCN-LABEL: test_clamp_bf16:
328+
; GCN: ; %bb.0:
329+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
330+
; GCN-NEXT: ; return to shader part epilog
331+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
332+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
333+
ret bfloat %clamp
334+
}
335+
336+
define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
337+
; GCN-LABEL: test_clamp_bf16_s:
338+
; GCN: ; %bb.0:
339+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
340+
; GCN-NEXT: ; return to shader part epilog
341+
%max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
342+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
343+
ret bfloat %clamp
344+
}
345+
346+
define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
347+
; GCN-LABEL: test_clamp_v2bf16:
348+
; GCN: ; %bb.0:
349+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
350+
; GCN-NEXT: ; return to shader part epilog
351+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
352+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
353+
%ret = bitcast <2 x bfloat> %clamp to float
354+
ret float %ret
355+
}
356+
357+
define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
358+
; GCN-LABEL: test_clamp_v2bf16_s:
359+
; GCN: ; %bb.0:
360+
; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
361+
; GCN-NEXT: ; return to shader part epilog
362+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
363+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
364+
%ret = bitcast <2 x bfloat> %clamp to float
365+
ret float %ret
366+
}
367+
368+
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
369+
; GCN-LABEL: test_clamp_bf16_folding:
370+
; GCN: ; %bb.0:
371+
; GCN-NEXT: v_exp_bf16_e32 v0, v0
372+
; GCN-NEXT: v_nop
373+
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
374+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
375+
; GCN-NEXT: ; return to shader part epilog
376+
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
377+
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
378+
%clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
379+
ret bfloat %clamp
380+
}
381+
382+
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
383+
; GCN-LABEL: test_clamp_v2bf16_folding:
384+
; GCN: ; %bb.0:
385+
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
386+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
387+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
388+
; GCN-NEXT: ; return to shader part epilog
389+
%mul = fmul <2 x bfloat> %src0, %src1
390+
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
391+
%clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
392+
%ret = bitcast <2 x bfloat> %clamp to float
393+
ret float %ret
394+
}
395+
396+
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
397+
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
398+
; GCN: ; %bb.0:
399+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
400+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
401+
; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
402+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
403+
; GCN-NEXT: s_endpgm
404+
%mul = fmul contract <2 x bfloat> %a, %b
405+
%add = fadd contract <2 x bfloat> %mul, %c
406+
store <2 x bfloat> %add, ptr addrspace(1) %out
407+
ret void
408+
}
409+
410+
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
411+
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
412+
; GCN: ; %bb.0:
413+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
414+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
415+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
416+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
417+
; GCN-NEXT: s_endpgm
418+
%mul = fmul contract <2 x bfloat> %a, %b
419+
%add = fadd contract <2 x bfloat> %mul, %c
420+
store <2 x bfloat> %add, ptr addrspace(1) %out
421+
ret void
422+
}
423+
424+
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
425+
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
426+
; GCN: ; %bb.0:
427+
; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
428+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
429+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
430+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
431+
; GCN-NEXT: s_endpgm
432+
%mul = fmul contract <2 x bfloat> %a, %b
433+
%add = fadd contract <2 x bfloat> %mul, %c
434+
store <2 x bfloat> %add, ptr addrspace(1) %out
435+
ret void
436+
}
437+
438+
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
439+
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
440+
; GCN: ; %bb.0:
441+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
442+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
443+
; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
444+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
445+
; GCN-NEXT: s_endpgm
446+
%mul = fmul contract <2 x bfloat> %a, %b
447+
%add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
448+
store <2 x bfloat> %add, ptr addrspace(1) %out
449+
ret void
450+
}
451+
452+
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
453+
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
454+
; GCN: ; %bb.0:
455+
; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
456+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
457+
; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
458+
; GCN-NEXT: global_store_b32 v[0:1], v2, off
459+
; GCN-NEXT: s_endpgm
460+
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
461+
%add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
462+
store <2 x bfloat> %add, ptr addrspace(1) %out
463+
ret void
464+
}
465+
326466
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
327467
; GCN-LABEL: v_test_fma_v2bf16_vvv:
328468
; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
426566
ret void
427567
}
428568

569+
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
570+
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
429571
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
430572
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
431573
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)

0 commit comments

Comments
 (0)