Skip to content

Commit 6b5afdc

Browse files
authored
[AMDGPU] Support bfloat comparison for ballot intrinsic (llvm#165495)
There are no native instructions for comparing bfloat values directly. Instead, we extend the bfloat operands to float and perform a float comparison. TODO: Handle bfloat comparison for the ballot intrinsic on the GlobalISel path. Fixes: SWDEV-563403
1 parent 88cee4c commit 6b5afdc

File tree

3 files changed

+41
-2
lines changed

3 files changed

+41
-2
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7035,9 +7035,15 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
70357035
SDLoc SL(N);
70367036

70377037
if (Src.getOpcode() == ISD::SETCC) {
7038+
SDValue Op0 = Src.getOperand(0);
7039+
SDValue Op1 = Src.getOperand(1);
7040+
// Need to expand bfloat to float for comparison (setcc).
7041+
if (Op0.getValueType() == MVT::bf16) {
7042+
Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7043+
Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7044+
}
70387045
// (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7039-
return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7040-
Src.getOperand(1), Src.getOperand(2));
7046+
return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
70417047
}
70427048
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
70437049
// (ballot 0) -> 0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,3 +591,24 @@ exit:
591591
store i32 %ballot, ptr addrspace(1) %out
592592
ret void
593593
}
594+
595+
define amdgpu_cs i32 @compare_bfloats(bfloat %x, bfloat %y) {
596+
; GFX10-LABEL: compare_bfloats:
597+
; GFX10: ; %bb.0:
598+
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
599+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
600+
; GFX10-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
601+
; GFX10-NEXT: ; return to shader part epilog
602+
;
603+
; GFX11-LABEL: compare_bfloats:
604+
; GFX11: ; %bb.0:
605+
; GFX11-NEXT: v_mov_b16_e32 v2.l, 0
606+
; GFX11-NEXT: v_mov_b16_e32 v2.h, v1.l
607+
; GFX11-NEXT: v_mov_b16_e32 v1.h, v0.l
608+
; GFX11-NEXT: v_mov_b16_e32 v1.l, v2.l
609+
; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v2
610+
; GFX11-NEXT: ; return to shader part epilog
611+
%cmp = fcmp ogt bfloat %x, %y
612+
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
613+
ret i32 %ballot
614+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,3 +557,15 @@ exit:
557557
store i64 %ballot, ptr addrspace(1) %out
558558
ret void
559559
}
560+
561+
define amdgpu_cs i64 @compare_bfloats(bfloat %x, bfloat %y) {
562+
; CHECK-LABEL: compare_bfloats:
563+
; CHECK: ; %bb.0:
564+
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v1
565+
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0
566+
; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
567+
; CHECK-NEXT: ; return to shader part epilog
568+
%cmp = fcmp ogt bfloat %x, %y
569+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
570+
ret i64 %ballot
571+
}

0 commit comments

Comments
 (0)