@@ -457,117 +457,3 @@ func.func @sched_barrier() {
457457 func.return
458458}
459459
460- // CHECK-LABEL: @scaled_ext_packed816_fp4
461- // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf4E2M1FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
462- func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
463-   // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
464-   // CHECK: %[[SOURCE_8xi4:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf4E2M1FN> to vector<8xi4>
465-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
466-   // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32
467-   // CHECK: rocdl.cvt.scale.pk8.f16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf16>
468-   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
469-   // Same source and scale operands, this time producing vector<8xbf16>.
470-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
471-   // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32
472-   // CHECK: rocdl.cvt.scale.pk8.bf16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xbf16>
473-   %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16>
474-   // Same source and scale operands, this time producing vector<8xf32>.
475-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
476-   // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32
477-   // CHECK: rocdl.cvt.scale.pk8.f32.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf32>
478-   %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
479-   func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
480- }
481-
482- // CHECK-LABEL: @scaled_ext_packed816_fp8
483- // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E4M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
484- func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
485-   // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
486-   // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E4M3FN> to vector<8xi8>
487-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
488-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
489-   // CHECK: rocdl.cvt.scale.pk8.f16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16>
490-   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
491-   // Same source and scale operands, this time producing vector<8xbf16>.
492-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
493-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
494-   // CHECK: rocdl.cvt.scale.pk8.bf16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16>
495-   %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16>
496-   // Same source and scale operands, this time producing vector<8xf32>.
497-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
498-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
499-   // CHECK: rocdl.cvt.scale.pk8.f32.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32>
500-   %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
501-
502-   func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
503- }
504-
505- // CHECK-LABEL: @scaled_ext_packed816_bf8
506- // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E5M2>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
507- func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
508-   // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
509-   // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E5M2> to vector<8xi8>
510-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
511-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
512-   // CHECK: rocdl.cvt.scale.pk8.f16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16>
513-   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
514-   // Same source and scale operands, this time producing vector<8xbf16>.
515-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
516-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
517-   // CHECK: rocdl.cvt.scale.pk8.bf16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16>
518-   %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>
519-   // Same source and scale operands, this time producing vector<8xf32>.
520-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
521-   // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32>
522-   // CHECK: rocdl.cvt.scale.pk8.f32.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32>
523-   %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32>
524-   func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32>
525- }
526-
527-
528- // CHECK-LABEL: @scaled_ext_packed816_fp6
529- // CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E2M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
530- func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
531-   // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
532-   // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E2M3FN> to vector<16xi6>
533-   // The two argument casts above are matched order-independently; the per-result patterns below are ordered.
534-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
535-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
536-   // CHECK: rocdl.cvt.scale.pk16.f16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16>
537-   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
538-   // Same source and scale operands, this time producing vector<16xbf16>.
539-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
540-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
541-   // CHECK: rocdl.cvt.scale.pk16.bf16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16>
542-   %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16>
543-   // Same source and scale operands, this time producing vector<16xf32>.
544-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
545-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
546-   // CHECK: rocdl.cvt.scale.pk16.f32.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32>
547-   %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
548-   func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
549- }
550-
551- // CHECK-LABEL: @scaled_ext_packed816_bf6
552- // CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E3M2FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
553- func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) {
554-   // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
555-   // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E3M2FN> to vector<16xi6>
556-   // The two argument casts above are matched order-independently; the per-result patterns below are ordered.
557-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
558-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
559-   // CHECK: rocdl.cvt.scale.pk16.f16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16>
560-   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
561-   // Same source and scale operands, this time producing vector<16xbf16>.
562-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
563-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
564-   // CHECK: rocdl.cvt.scale.pk16.bf16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16>
565-   %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16>
566-   // Same source and scale operands, this time producing vector<16xf32>.
567-   // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
568-   // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32>
569-   // CHECK: rocdl.cvt.scale.pk16.f32.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32>
570-   %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
571-   func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32>
572- }
573-
0 commit comments