@@ -373,3 +373,120 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) ->
373373 %ret = amdgpu.scaled_ext_packed %v [0 ], %scale : vector <2 xf4 E2 M1 FN> to vector <2 xbf16 >
374374 func.return %ret : vector <2 xbf16 >
375375}
376+
// Single-element f8E4M3FN source extended to a packed pair of f32.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// fp8 -> f32 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
func.func @scaled_ext_one_f8e4m3_f32(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf32>
  func.return %ret : vector<2xf32>
}
389+
// Single-element f8E4M3FN source extended to a packed pair of f16.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// fp8 -> f16 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
func.func @scaled_ext_one_f8e4m3_f16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf16>
  func.return %ret : vector<2xf16>
}
402+
// Single-element f8E4M3FN source extended to a packed pair of bf16.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// fp8 -> bf16 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
func.func @scaled_ext_one_f8e4m3_bf16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xbf16>
  func.return %ret : vector<2xbf16>
}
415+
// Single-element f8E5M2 source extended to a packed pair of f32.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// bf8 -> f32 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
func.func @scaled_ext_one_f8e5m2_f32(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf32> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf32>
  func.return %ret : vector<2xf32>
}
428+
// Single-element f8E5M2 source extended to a packed pair of f16.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// bf8 -> f16 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
func.func @scaled_ext_one_f8e5m2_f16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf16>
  func.return %ret : vector<2xf16>
}
441+
// Single-element f8E5M2 source extended to a packed pair of bf16.
// The directives below expect the lone i8 lane to be inserted at index 0 of a
// zero-initialized vector<4xi8>, bitcast to i32, and then passed to the packed
// bf8 -> bf16 scaled conversion together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
// CHECK: rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
func.func @scaled_ext_one_f8e5m2_bf16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xbf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xbf16>
  func.return %ret : vector<2xbf16>
}
454+
// Single-element f4E2M1FN source extended to a packed pair of f32.
// The directives below expect the lone i4 lane to be inserted at index 0 of a
// zero-initialized vector<8xi4>, bitcast to i32, and then passed to the packed
// fp4 -> f32 scaled conversion (selector 0) together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f32
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
// CHECK: rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
func.func @scaled_ext_one_f4e2m1_f32(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf32>
  func.return %ret : vector<2xf32>
}
467+
// Single-element f4E2M1FN source extended to a packed pair of f16.
// The directives below expect the lone i4 lane to be inserted at index 0 of a
// zero-initialized vector<8xi4>, bitcast to i32, and then passed to the packed
// fp4 -> f16 scaled conversion (selector 0) together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
// CHECK: rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
func.func @scaled_ext_one_f4e2m1_f16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf16>
  func.return %ret : vector<2xf16>
}
480+
// Single-element f4E2M1FN source extended to a packed pair of bf16.
// The directives below expect the lone i4 lane to be inserted at index 0 of a
// zero-initialized vector<8xi4>, bitcast to i32, and then passed to the packed
// fp4 -> bf16 scaled conversion (selector 0) together with the scale operand.
// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_bf16
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
// CHECK-DAG: [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
// CHECK-DAG: [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
// CHECK: [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
// CHECK: [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
// CHECK: [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
// CHECK: rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
func.func @scaled_ext_one_f4e2m1_bf16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xbf16>
  func.return %ret : vector<2xbf16>
}
0 commit comments