Skip to content

Commit b261de3

Browse files
rampitecgithub-actions[bot]
authored andcommitted
Automerge: [AMDGPU] Add v_cvt_scale_pk8_* gfx1250 instructions (#151616)
2 parents a31aa04 + c7bb105 commit b261de3

File tree

17 files changed

+730
-4
lines changed

17 files changed

+730
-4
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
707707
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
708708
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
709709
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
710+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp8, "V8hV2UiUiIUi", "nc", "gfx1250-insts")
711+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp8, "V8yV2UiUiIUi", "nc", "gfx1250-insts")
712+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_bf8, "V8hV2UiUiIUi", "nc", "gfx1250-insts")
713+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_bf8, "V8yV2UiUiIUi", "nc", "gfx1250-insts")
714+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp4, "V8hUiUiIUi", "nc", "gfx1250-insts")
715+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx1250-insts")
716+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
717+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
718+
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts")
710719
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
711720
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
712721
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")

clang/lib/Sema/SemaAMDGPU.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
8484
return checkMovDPPFunctionCall(TheCall, 2, 1);
8585
case AMDGPU::BI__builtin_amdgcn_update_dpp: {
8686
return checkMovDPPFunctionCall(TheCall, 6, 2);
87+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp8:
88+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp8:
89+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_bf8:
90+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_bf8:
91+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp4:
92+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp4:
93+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp8:
94+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_bf8:
95+
case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp4:
96+
return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7);
8797
}
8898
default:
8999
return false;

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,20 @@
77
typedef unsigned int uint;
88
typedef unsigned short int ushort;
99
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
10+
typedef unsigned int __attribute__((ext_vector_type(3))) uint3;
11+
typedef unsigned int __attribute__((ext_vector_type(4))) uint4;
1012
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
13+
typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8;
14+
typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16;
15+
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
1116
typedef half __attribute__((ext_vector_type(2))) half2;
17+
typedef half __attribute__((ext_vector_type(8))) half8;
18+
typedef half __attribute__((ext_vector_type(16))) half16;
19+
typedef half __attribute__((ext_vector_type(32))) half32;
20+
typedef float __attribute__((ext_vector_type(8))) float8;
21+
typedef float __attribute__((ext_vector_type(16))) float16;
22+
typedef float __attribute__((ext_vector_type(32))) float32;
23+
typedef short __attribute__((ext_vector_type(2))) short2;
1224

1325
// CHECK-LABEL: @test_setprio_inc_wg(
1426
// CHECK-NEXT: entry:
@@ -563,6 +575,105 @@ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
563575
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
564576
}
565577

578+
// CHECK-LABEL: @test_cvt_scale_pk(
579+
// CHECK-NEXT: entry:
580+
// CHECK-NEXT: [[OUTH8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
581+
// CHECK-NEXT: [[OUTY8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
582+
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
583+
// CHECK-NEXT: [[OUTF32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
584+
// CHECK-NEXT: [[OUTF8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
585+
// CHECK-NEXT: [[OUTH16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
586+
// CHECK-NEXT: [[OUTY16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
587+
// CHECK-NEXT: [[OUTF16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
588+
// CHECK-NEXT: [[SRC3_ADDR:%.*]] = alloca <3 x i32>, align 16, addrspace(5)
589+
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
590+
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
591+
// CHECK-NEXT: [[OUTH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH8_ADDR]] to ptr
592+
// CHECK-NEXT: [[OUTY8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY8_ADDR]] to ptr
593+
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
594+
// CHECK-NEXT: [[OUTF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF32_ADDR]] to ptr
595+
// CHECK-NEXT: [[OUTF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF8_ADDR]] to ptr
596+
// CHECK-NEXT: [[OUTH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH16_ADDR]] to ptr
597+
// CHECK-NEXT: [[OUTY16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY16_ADDR]] to ptr
598+
// CHECK-NEXT: [[OUTF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF16_ADDR]] to ptr
599+
// CHECK-NEXT: [[SRC3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC3_ADDR]] to ptr
600+
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
601+
// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
602+
// CHECK-NEXT: store ptr addrspace(1) [[OUTH8:%.*]], ptr [[OUTH8_ADDR_ASCAST]], align 8
603+
// CHECK-NEXT: store ptr addrspace(1) [[OUTY8:%.*]], ptr [[OUTY8_ADDR_ASCAST]], align 8
604+
// CHECK-NEXT: store <2 x i32> [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 8
605+
// CHECK-NEXT: store ptr addrspace(1) [[OUTF32:%.*]], ptr [[OUTF32_ADDR_ASCAST]], align 8
606+
// CHECK-NEXT: store ptr addrspace(1) [[OUTF8:%.*]], ptr [[OUTF8_ADDR_ASCAST]], align 8
607+
// CHECK-NEXT: store ptr addrspace(1) [[OUTH16:%.*]], ptr [[OUTH16_ADDR_ASCAST]], align 8
608+
// CHECK-NEXT: store ptr addrspace(1) [[OUTY16:%.*]], ptr [[OUTY16_ADDR_ASCAST]], align 8
609+
// CHECK-NEXT: store ptr addrspace(1) [[OUTF16:%.*]], ptr [[OUTF16_ADDR_ASCAST]], align 8
610+
// CHECK-NEXT: store <3 x i32> [[SRC3:%.*]], ptr [[SRC3_ADDR_ASCAST]], align 16
611+
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
612+
// CHECK-NEXT: store i32 [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
613+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
614+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
615+
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> [[TMP0]], i32 [[TMP1]], i32 4)
616+
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
617+
// CHECK-NEXT: store <8 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16
618+
// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
619+
// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
620+
// CHECK-NEXT: [[TMP6:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> [[TMP4]], i32 [[TMP5]], i32 5)
621+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
622+
// CHECK-NEXT: store <8 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 16
623+
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
624+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
625+
// CHECK-NEXT: [[TMP10:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> [[TMP8]], i32 [[TMP9]], i32 6)
626+
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
627+
// CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 16
628+
// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
629+
// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
630+
// CHECK-NEXT: [[TMP14:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> [[TMP12]], i32 [[TMP13]], i32 7)
631+
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
632+
// CHECK-NEXT: store <8 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 16
633+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
634+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
635+
// CHECK-NEXT: [[TMP18:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 [[TMP16]], i32 [[TMP17]], i32 1)
636+
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
637+
// CHECK-NEXT: store <8 x half> [[TMP18]], ptr addrspace(1) [[TMP19]], align 16
638+
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
639+
// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
640+
// CHECK-NEXT: [[TMP22:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 [[TMP20]], i32 [[TMP21]], i32 2)
641+
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
642+
// CHECK-NEXT: store <8 x bfloat> [[TMP22]], ptr addrspace(1) [[TMP23]], align 16
643+
// CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
644+
// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
645+
// CHECK-NEXT: [[TMP26:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> [[TMP24]], i32 [[TMP25]], i32 5)
646+
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
647+
// CHECK-NEXT: store <8 x float> [[TMP26]], ptr addrspace(1) [[TMP27]], align 32
648+
// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
649+
// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
650+
// CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> [[TMP28]], i32 [[TMP29]], i32 6)
651+
// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
652+
// CHECK-NEXT: store <8 x float> [[TMP30]], ptr addrspace(1) [[TMP31]], align 32
653+
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
654+
// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
655+
// CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7)
656+
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
657+
// CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32
658+
// CHECK-NEXT: ret void
659+
//
660+
void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
661+
global float32 *outf32, global float8 *outf8,
662+
global half16 *outh16, global bfloat16 *outy16,
663+
global float16 *outf16, uint3 src3,
664+
uint src1, uint scale)
665+
{
666+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 4);
667+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 5);
668+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 6);
669+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 7);
670+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 1);
671+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 2);
672+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5);
673+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6);
674+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
675+
}
676+
566677
// CHECK-LABEL: @test_sat_pk4_i4_i8(
567678
// CHECK-NEXT: entry:
568679
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
11
// REQUIRES: amdgpu-registered-target
22
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
33

4+
typedef unsigned int uint;
5+
typedef unsigned short int ushort;
46
typedef int v2i __attribute__((ext_vector_type(2)));
7+
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
8+
typedef unsigned int __attribute__((ext_vector_type(3))) uint3;
9+
typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8;
10+
typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16;
11+
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
12+
typedef half __attribute__((ext_vector_type(8))) half8;
13+
typedef half __attribute__((ext_vector_type(16))) half16;
14+
typedef half __attribute__((ext_vector_type(32))) half32;
15+
typedef float __attribute__((ext_vector_type(8))) float8;
16+
typedef float __attribute__((ext_vector_type(16))) float16;
17+
typedef float __attribute__((ext_vector_type(32))) float32;
18+
519
typedef int v4i __attribute__((ext_vector_type(4)));
620
typedef int v8i __attribute__((ext_vector_type(8)));
721

@@ -29,6 +43,32 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
2943
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
3044
}
3145

46+
void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
47+
global float32 *outf32, global half16 *outh16, global bfloat16 *outy16,
48+
global float16 *outf16, uint3 src3,
49+
global float8 *outf8, uint src1, uint scale, uint scale_sel)
50+
{
51+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp8' must be a constant integer}}
52+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp8' must be a constant integer}}
53+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_bf8' must be a constant integer}}
54+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_bf8' must be a constant integer}}
55+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp4' must be a constant integer}}
56+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp4' must be a constant integer}}
57+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp8' must be a constant integer}}
58+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_bf8' must be a constant integer}}
59+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp4' must be a constant integer}}
60+
61+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
62+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
63+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
64+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
65+
*outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
66+
*outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
67+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
68+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
69+
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
70+
}
71+
3272
void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr,
3373
global int* b32out, global v2i* b64out, global v4i* b128out, int cpol)
3474
{

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -633,18 +633,33 @@ def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic<
633633
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
634634
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">;
635635

636-
class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
637-
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
636+
// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..7]
637+
class AMDGPUCvtScaleIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
638+
[DstTy], [Src0Ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
638639
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
639640

640-
class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
641-
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
641+
class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
642+
[DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
642643
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
643644

644645
class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
645646
[DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
646647
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
647648

649+
def int_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_fp8">;
650+
def int_amdgcn_cvt_scale_pk8_bf16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_fp8">;
651+
def int_amdgcn_cvt_scale_pk8_f16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_bf8">;
652+
def int_amdgcn_cvt_scale_pk8_bf16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_bf8">;
653+
def int_amdgcn_cvt_scale_pk8_f16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_i32_ty, "cvt_scale_pk8_f16_fp4">;
654+
def int_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_i32_ty, "cvt_scale_pk8_bf16_fp4">;
655+
def int_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_fp8">;
656+
def int_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_bf8">;
657+
def int_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_i32_ty, "cvt_scale_pk8_f32_fp4">;
658+
659+
class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
660+
[DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
661+
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
662+
648663
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
649664
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
650665
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4582,6 +4582,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45824582
case Intrinsic::amdgcn_cvt_pk_bf8_f16:
45834583
case Intrinsic::amdgcn_cvt_sr_fp8_f16:
45844584
case Intrinsic::amdgcn_cvt_sr_bf8_f16:
4585+
case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
4586+
case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
4587+
case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
4588+
case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
4589+
case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
4590+
case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
4591+
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
4592+
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
4593+
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
45854594
case Intrinsic::amdgcn_sat_pk4_i4_i8:
45864595
case Intrinsic::amdgcn_sat_pk4_u4_u8:
45874596
case Intrinsic::amdgcn_fmed3:

0 commit comments

Comments
 (0)