@@ -1556,6 +1556,52 @@ llvm.func @rocdl.cvt.scale.pk16(%v3xi32: vector<3xi32>, %scale:i32) {
15561556 llvm.return
15571557}
15581558
// Translation test for the rocdl.cvt.scalef32.pk16.* ops.
// The function takes one 16-element vector per source element type
// (f32, f16, bf16) plus an f32 %scale. Each op below is fed the matching
// vector and %scale and yields a 3 x i32 vector; the FileCheck directive
// directly above each op expects a call to the llvm.amdgcn intrinsic with
// the same suffix and the same operands. The op results (%0..%5) are never
// used, so only the emitted calls are verified.
// NOTE(review): the spaces inside the types (e.g. `vector <16 xf32 >`) look
// like an artifact of how this diff was rendered - confirm against the
// actual test file.
1559+ // CHECK-LABEL: rocdl.cvt.scalef32.pk16
1560+ // CHECK-SAME:(<16 x float> %[[V16F32:.+]], <16 x half> %[[V16F16:.+]], <16 x bfloat> %[[V16BF16:.+]], float %[[SCALE:.+]])
1561+ llvm.func @rocdl.cvt.scalef32.pk16 (%v16xf32: vector <16 xf32 >, %v16xf16: vector <16 xf16 >, %v16xbf16: vector <16 xbf16 >, %scale: f32 ) {
1562+
// fp6 variants: one op per source element type (f16, bf16, f32).
1563+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %[[V16F16]], float %[[SCALE]])
1564+ %0 = rocdl.cvt.scalef32.pk16.fp6.f16 %v16xf16 , %scale : vector <3 xi32 >
1565+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %[[V16BF16]], float %[[SCALE]])
1566+ %1 = rocdl.cvt.scalef32.pk16.fp6.bf16 %v16xbf16 , %scale : vector <3 xi32 >
1567+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %[[V16F32]], float %[[SCALE]])
1568+ %2 = rocdl.cvt.scalef32.pk16.fp6.f32 %v16xf32 , %scale : vector <3 xi32 >
1569+
// bf6 variants: same three source element types, same operand order.
1570+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %[[V16F16]], float %[[SCALE]])
1571+ %3 = rocdl.cvt.scalef32.pk16.bf6.f16 %v16xf16 , %scale : vector <3 xi32 >
1572+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %[[V16BF16]], float %[[SCALE]])
1573+ %4 = rocdl.cvt.scalef32.pk16.bf6.bf16 %v16xbf16 , %scale : vector <3 xi32 >
1574+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %[[V16F32]], float %[[SCALE]])
1575+ %5 = rocdl.cvt.scalef32.pk16.bf6.f32 %v16xf32 , %scale : vector <3 xi32 >
1576+
1577+ llvm.return
1578+ }
1579+
// Translation test for the rocdl.cvt.scalef32.sr.pk16.* ops.
// The function takes one 16-element vector per source element type
// (f32, f16, bf16), an i32 %seed and an f32 %scale. Each op below is fed
// the matching vector, then %seed, then %scale, and yields a 3 x i32
// vector; the FileCheck directive directly above each op expects a call to
// the llvm.amdgcn intrinsic with the same suffix and the same operand order.
// The op results (%0..%5) are never used, so only the emitted calls are
// verified.
// NOTE(review): the spaces inside the types (e.g. `vector <16 xf32 >`) look
// like an artifact of how this diff was rendered - confirm against the
// actual test file.
1580+ // CHECK-LABEL: rocdl.cvt.scalef32.sr.pk16
1581+ // CHECK-SAME:(<16 x float> %[[V16F32:.+]], <16 x half> %[[V16F16:.+]], <16 x bfloat> %[[V16BF16:.+]], i32 %[[SEED:.+]], float %[[SCALE:.+]])
1582+ llvm.func @rocdl.cvt.scalef32.sr.pk16 (%v16xf32: vector <16 xf32 >,
1583+ %v16xf16: vector <16 xf16 >,
1584+ %v16xbf16: vector <16 xbf16 >,
1585+ %seed: i32 ,
1586+ %scale: f32 ) {
1587+
// fp6 variants: one op per source element type (f16, bf16, f32).
1588+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %[[V16F16]], i32 %[[SEED]], float %[[SCALE]])
1589+ %0 = rocdl.cvt.scalef32.sr.pk16.fp6.f16 %v16xf16 , %seed , %scale : vector <3 xi32 >
1590+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %[[V16BF16]], i32 %[[SEED]], float %[[SCALE]])
1591+ %1 = rocdl.cvt.scalef32.sr.pk16.fp6.bf16 %v16xbf16 , %seed , %scale : vector <3 xi32 >
1592+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %[[V16F32]], i32 %[[SEED]], float %[[SCALE]])
1593+ %2 = rocdl.cvt.scalef32.sr.pk16.fp6.f32 %v16xf32 , %seed , %scale : vector <3 xi32 >
1594+
// bf6 variants: same three source element types, same operand order.
1595+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %[[V16F16]], i32 %[[SEED]], float %[[SCALE]])
1596+ %3 = rocdl.cvt.scalef32.sr.pk16.bf6.f16 %v16xf16 , %seed , %scale : vector <3 xi32 >
1597+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %[[V16BF16]], i32 %[[SEED]], float %[[SCALE]])
1598+ %4 = rocdl.cvt.scalef32.sr.pk16.bf6.bf16 %v16xbf16 , %seed , %scale : vector <3 xi32 >
1599+ // CHECK: call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %[[V16F32]], i32 %[[SEED]], float %[[SCALE]])
1600+ %5 = rocdl.cvt.scalef32.sr.pk16.bf6.f32 %v16xf32 , %seed , %scale : vector <3 xi32 >
1601+
1602+ llvm.return
1603+ }
1604+
15591605// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
15601606// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
15611607// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
0 commit comments