-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Implement builtins for gfx1250 wmma instructions #148991
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Co-Authored-by: Stanislav Mekhanoshin <[email protected]>
Co-Authored-by: Shilei Tian <[email protected]>
Member
|
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-clang Author: Changpeng Fang (changpeng) Changes: Patch is 64.31 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/148991.diff 4 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 71e4b3486167a..29e1e99bba9ef 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -676,5 +676,45 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
+// GFX1250 WMMA builtins
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x32_bf16, "V8yIbV16yIbV16yIsV8yIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16, "V8yIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 0d8c2ed284994..e1f9cbe7aea26 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -822,7 +822,46 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
- case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+ // GFX1250 WMMA builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
// These operations perform a matrix multiplication and accumulation of
// the form:
@@ -837,6 +876,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
// "false".
bool AppendFalseForOpselArg = false;
unsigned BuiltinWMMAOp;
+ // Need return type when D and C are of different types.
+ bool NeedReturnType = false;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -975,6 +1016,160 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
break;
+ // GFX1250 WMMA builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+ NeedReturnType = true;
+ ArgsForMatchingMatrixTypes = {1, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+ ArgsForMatchingMatrixTypes = {4, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
+ break;
}
SmallVector<Value *, 6> Args;
@@ -984,6 +1179,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Args.push_back(Builder.getFalse());
SmallVector<llvm::Type *, 6> ArgTypes;
+ if (NeedReturnType)
+ ArgTypes.push_back(ConvertType(E->getType()));
for (auto ArgIdx : ArgsForMatchingMatrixTypes)
ArgTypes.push_back(Args[ArgIdx]->getType());
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
new file mode 100644
index 0000000000000..e4ef3defdb341
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -0,0 +1,433 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef float v16f __attribute__((ext_vector_type(16)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef float v2f __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef half v32h __attribute__((ext_vector_type(32)));
+typedef __bf16 v32bf16 __attribute__((ext_vector_type(32)));
+typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
+typedef int v16i __attribute__((ext_vector_type(16)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 false)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, true, false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false)
+// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GF...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Fix a format error
shiltian
approved these changes
Jul 15, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:codegen
IR generation bugs: mangling, exceptions, etc.
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.