Skip to content

Commit 8f96880

Browse files
authored
Merge branch 'main' into clang/update-arm-cross-cmake
2 parents 9fe3b6d + 9270328 commit 8f96880

27 files changed

+1572
-238
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-conversion-
437437
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc", "gfx950-insts")
438438
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
439439

440+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
441+
440442
//===----------------------------------------------------------------------===//
441443
// GFX12+ only builtins.
442444
//===----------------------------------------------------------------------===//

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ typedef short v8s __attribute__((ext_vector_type(8)));
2424
typedef short v16s __attribute__((ext_vector_type(16)));
2525
typedef short v32s __attribute__((ext_vector_type(32)));
2626
typedef double v4d __attribute__((ext_vector_type(4)));
27+
typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
2728

2829

2930
#ifdef MFMA_GFX908_TESTS
@@ -424,5 +425,10 @@ v16f test_mfma_f32_32x32x16_f16(v8h a, v8h b, v16f c)
424425
return __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 1, 2, 3);
425426
}
426427

428+
// CHECK-GFX950-LABEL: @test_mfma_f32_32x32x16_bf16(
429+
// CHECK-GFX950: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %a, <8 x bfloat> %b, <16 x float> %c, i32 1, i32 2, i32 3)
430+
v16f test_mfma_f32_32x32x16_bf16(v8bf16 a, v8bf16 b, v16f c) {
431+
return __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 1, 2, 3);
432+
}
427433

428434
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
typedef float float4 __attribute__((ext_vector_type(4)));
55
typedef float float16 __attribute__((ext_vector_type(16)));
66
typedef half half8 __attribute__((ext_vector_type(8)));
7+
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
78

89

910
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -19,3 +20,9 @@ void test_mfma_f32_32x32x16_f16(__global float16* out, half8 a, half8 b, float16
1920
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
2021
*out = __builtin_amdgcn_mfma_f32_32x32x16_f16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_f16' must be a constant integer}}
2122
}
23+
24+
void test_mfma_f32_32x32x16_bf16(__global float16* out, bfloat8 a, bfloat8 b, float16 c, int X) {
25+
*out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
26+
*out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
27+
*out = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x16_bf16' must be a constant integer}}
28+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
typedef float float4 __attribute__((ext_vector_type(4)));
55
typedef float float16 __attribute__((ext_vector_type(16)));
66
typedef half half8 __attribute__((ext_vector_type(8)));
7+
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
78

89
void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
9-
__global float16* out1, half8 a1, half8 b1, float16 c1) {
10+
__global float16* out1, half8 a1, half8 b1, float16 c1,
11+
__global float16* out2, bfloat8 a2, bfloat8 b2, float16 c2) {
1012
*out0 = __builtin_amdgcn_mfma_f32_16x16x32_f16(a0, b0, c0, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_f16' needs target feature gfx950-insts}}
1113
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
14+
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
1215
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic <
16741674
[],
16751675
[llvm_v4i32_ty, // rsrc(SGPR)
16761676
LLVMQualPointerType<3>, // LDS base offset
1677-
llvm_i32_ty, // Data byte size: 1/2/4
1677+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
16781678
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
16791679
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
16801680
llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
@@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic <
16931693
[],
16941694
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
16951695
LLVMQualPointerType<3>, // LDS base offset
1696-
llvm_i32_ty, // Data byte size: 1/2/4
1696+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
16971697
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
16981698
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
16991699
llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling)
@@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
17151715
[],
17161716
[llvm_v4i32_ty, // rsrc(SGPR)
17171717
LLVMQualPointerType<3>, // LDS base offset
1718-
llvm_i32_ty, // Data byte size: 1/2/4
1718+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
17191719
llvm_i32_ty, // vindex(VGPR)
17201720
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
17211721
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
17351735
[],
17361736
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
17371737
LLVMQualPointerType<3>, // LDS base offset
1738-
llvm_i32_ty, // Data byte size: 1/2/4
1738+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
17391739
llvm_i32_ty, // vindex(VGPR)
17401740
llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling)
17411741
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
@@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
24522452
[],
24532453
[LLVMQualPointerType<1>, // Base global pointer to load from
24542454
LLVMQualPointerType<3>, // LDS base pointer to store to
2455-
llvm_i32_ty, // Data byte size: 1/2/4
2455+
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
24562456
llvm_i32_ty, // imm offset (applied to both global and LDS address)
24572457
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
24582458
// bit 1 = sc1,
@@ -3117,6 +3117,8 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
31173117
defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
31183118
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
31193119
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
3120+
3121+
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
31203122
}
31213123

31223124
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3240,6 +3240,24 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
32403240
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
32413241
: AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
32423242
break;
3243+
case 12:
3244+
if (!Subtarget->hasLDSLoadB96_B128())
3245+
return false;
3246+
3247+
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3248+
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3249+
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3250+
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3251+
break;
3252+
case 16:
3253+
if (!Subtarget->hasLDSLoadB96_B128())
3254+
return false;
3255+
3256+
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3257+
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3258+
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3259+
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3260+
break;
32433261
}
32443262

32453263
MachineBasicBlock *MBB = MI.getParent();
@@ -3329,6 +3347,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
33293347
case 4:
33303348
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
33313349
break;
3350+
case 12:
3351+
if (!Subtarget->hasLDSLoadB96_B128())
3352+
return false;
3353+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3354+
break;
3355+
case 16:
3356+
if (!Subtarget->hasLDSLoadB96_B128())
3357+
return false;
3358+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3359+
break;
33323360
}
33333361

33343362
MachineBasicBlock *MBB = MI.getParent();

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -573,9 +573,17 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
573573
}
574574
}
575575

576-
multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
576+
multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> {
577577
defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
578-
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
578+
579+
if !ne(LDSPred, TruePredicate) then {
580+
let SubtargetPredicate = LDSPred in {
581+
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
582+
}
583+
} else {
584+
defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>;
585+
}
586+
579587
}
580588

581589
multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName,
@@ -956,11 +964,11 @@ defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
956964
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
957965
"buffer_load_dwordx2", v2i32
958966
>;
959-
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
960-
"buffer_load_dwordx3", v3i32
967+
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads_Lds <
968+
"buffer_load_dwordx3", v3i32, /*LDSPred=*/HasGFX950Insts
961969
>;
962-
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
963-
"buffer_load_dwordx4", v4i32
970+
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads_Lds <
971+
"buffer_load_dwordx4", v4i32, /*LDSPred=*/HasGFX950Insts
964972
>;
965973

966974
defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc <
@@ -3231,8 +3239,8 @@ defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>;
32313239
defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>;
32323240
defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>;
32333241
defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>;
3234-
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>;
3235-
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>;
3242+
defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>;
3243+
defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>;
32363244
defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>;
32373245
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>;
32383246
defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>;

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
934934
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
935935
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
936936

937+
let SubtargetPredicate = HasGFX950Insts in {
938+
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
939+
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
940+
}
941+
937942
let SubtargetPredicate = isGFX12Plus in {
938943
defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
939944
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
@@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
19801985
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
19811986
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;
19821987

1988+
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
1989+
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;
1990+
1991+
19831992
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
19841993
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
19851994
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12891289
// hasGFX940Insts and hasGFX90AInsts are also true.
12901290
bool hasGFX950Insts() const { return GFX950Insts; }
12911291

1292+
/// Returns true if the target supports
1293+
/// global_load_lds_dwordx3/global_load_lds_dwordx4 or
1294+
/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
1295+
bool hasLDSLoadB96_B128() const {
1296+
return hasGFX950Insts();
1297+
}
1298+
12921299
bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
12931300

12941301
bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
889889

890890
setOperationAction(ISD::MUL, MVT::i1, Promote);
891891

892+
if (Subtarget->hasBF16ConversionInsts()) {
893+
setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal);
894+
setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal);
895+
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
896+
}
897+
892898
setTargetDAGCombine({ISD::ADD,
893899
ISD::UADDO_CARRY,
894900
ISD::SUB,
@@ -9819,6 +9825,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98199825
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
98209826
: AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
98219827
break;
9828+
case 12:
9829+
if (!Subtarget->hasLDSLoadB96_B128())
9830+
return SDValue();
9831+
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
9832+
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
9833+
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
9834+
: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
9835+
break;
9836+
case 16:
9837+
if (!Subtarget->hasLDSLoadB96_B128())
9838+
return SDValue();
9839+
Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
9840+
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
9841+
: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
9842+
: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
9843+
break;
98229844
}
98239845

98249846
SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
@@ -9888,6 +9910,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98889910
case 4:
98899911
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
98909912
break;
9913+
case 12:
9914+
if (!Subtarget->hasLDSLoadB96_B128())
9915+
return SDValue();
9916+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
9917+
break;
9918+
case 16:
9919+
if (!Subtarget->hasLDSLoadB96_B128())
9920+
return SDValue();
9921+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
9922+
break;
98919923
}
98929924

98939925
auto *M = cast<MemSDNode>(Op);

0 commit comments

Comments
 (0)