@@ -53,6 +53,31 @@ func.func @vector_load_i4(%arg1: index, %arg2: index) -> vector<3x8xi4> {
5353
5454// -----
5555
// Emulation of a 2-D f4E2M1FN load: the sub-byte float memref is backed by a
// flattened i8 (or i32 for CHECK32) memref; the row is loaded as bytes and
// bitcast back to the f4E2M1FN vector.
func.func @vector_load_f4(%arg1: index, %arg2: index) -> vector<3x8xf4E2M1FN> {
  %0 = memref.alloc() : memref<3x8xf4E2M1FN>
  %cst = arith.constant dense<0.0> : vector<3x8xf4E2M1FN>
  %1 = vector.load %0[%arg1, %arg2] : memref<3x8xf4E2M1FN>, vector<8xf4E2M1FN>
  %2 = vector.insert %1, %cst [0] : vector<8xf4E2M1FN> into vector<3x8xf4E2M1FN>
  return %2 : vector<3x8xf4E2M1FN>
}
// 2 f4 elements per i8 byte: linearized index = s0 * 4 + s1 floordiv 2.
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
// CHECK: func @vector_load_f4
// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<12xi8>, vector<4xi8>
// CHECK: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xf4E2M1FN>

// 8 f4 elements per i32 word: linearized index = s0 + s1 floordiv 8.
// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
// CHECK32: func @vector_load_f4
// CHECK32-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK32: %[[VEC:.+]] = vector.load %[[ALLOC]][%[[INDEX]]] : memref<3xi32>, vector<1xi32>
// CHECK32: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xf4E2M1FN>

// -----
80+
5681func.func @vector_load_i4_dynamic (%arg0 : index , %arg1 : index , %arg2 : index , %arg3 : index ) -> vector <8 xi4 > {
5782 %0 = memref.alloc (%arg0 , %arg1 ) : memref <?x?xi4 >
5883 %1 = vector.load %0 [%arg2 , %arg3 ] : memref <?x?xi4 >, vector <8 xi4 >
@@ -119,6 +144,37 @@ func.func @vector_transfer_read_i4(%arg1: index, %arg2: index) -> vector<8xi4> {
119144
120145// -----
121146
// Emulation of an in-bounds f4E2M1FN transfer_read: the f4 padding value is
// bitcast to i4 and zero-extended to the backing element type before the
// byte-level transfer_read, then the result is bitcast back to f4E2M1FN.
func.func @vector_transfer_read_f4(%arg1: index, %arg2: index) -> vector<8xf4E2M1FN> {
  %c0 = arith.constant 0.0 : f4E2M1FN
  %0 = memref.alloc() : memref<3x8xf4E2M1FN>
  %1 = vector.transfer_read %0[%arg1, %arg2], %c0 {in_bounds = [true]} :
    memref<3x8xf4E2M1FN>, vector<8xf4E2M1FN>
  return %1 : vector<8xf4E2M1FN>
}
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
// CHECK: func @vector_transfer_read_f4
// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
// CHECK: %[[CONST:.+]] = arith.constant 0.{{0+}}e+00 : f4E2M1FN
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
// CHECK: %[[BC:.+]] = arith.bitcast %[[CONST]] : f4E2M1FN to i4
// CHECK: %[[PAD:.+]] = arith.extui %[[BC]] : i4 to i8
// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK: %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<12xi8>, vector<4xi8>
// CHECK: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<4xi8> to vector<8xf4E2M1FN>

// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
// CHECK32: func @vector_transfer_read_f4
// CHECK32-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index)
// CHECK32: %[[CONST:.+]] = arith.constant 0.{{0+}}e+00 : f4E2M1FN
// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<3xi32>
// CHECK32: %[[BC:.+]] = arith.bitcast %[[CONST]] : f4E2M1FN to i4
// CHECK32: %[[PAD:.+]] = arith.extui %[[BC]] : i4 to i32
// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG0]], %[[ARG1]]]
// CHECK32: %[[VEC:.+]] = vector.transfer_read %[[ALLOC]][%[[INDEX]]], %[[PAD]] : memref<3xi32>, vector<1xi32>
// CHECK32: %[[VEC_F4:.+]] = vector.bitcast %[[VEC]] : vector<1xi32> to vector<8xf4E2M1FN>

// -----
177+
122178///----------------------------------------------------------------------------------------
123179/// vector.maskedload
124180///----------------------------------------------------------------------------------------
@@ -439,6 +495,28 @@ func.func @vector_store_i4(%arg0: vector<8xi4>, %arg1: index, %arg2: index) {
439495
440496// -----
441497
// Emulation of an f4E2M1FN store: the value is bitcast to the backing
// integer vector (4xi8, or 1xi32 for CHECK32) and stored at the linearized
// byte/word index.
func.func @vector_store_f4(%arg0: vector<8xf4E2M1FN>, %arg1: index, %arg2: index) {
  %0 = memref.alloc() : memref<4x8xf4E2M1FN>
  vector.store %arg0, %0[%arg1, %arg2] : memref<4x8xf4E2M1FN>, vector<8xf4E2M1FN>
  return
}

// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
// CHECK: func @vector_store_f4
// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xf4E2M1FN>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<16xi8>
// CHECK: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
// CHECK: %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xf4E2M1FN> to vector<4xi8>
// CHECK: vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<16xi8>, vector<4xi8>

// CHECK32-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
// CHECK32: func @vector_store_f4
// CHECK32-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: vector<8xf4E2M1FN>, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<4xi32>
// CHECK32: %[[INDEX:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG2]]]
// CHECK32: %[[VEC_I32:.+]] = vector.bitcast %[[ARG0]] : vector<8xf4E2M1FN> to vector<1xi32>
// CHECK32: vector.store %[[VEC_I32]], %[[ALLOC]][%[[INDEX]]] : memref<4xi32>, vector<1xi32>

// -----
519+
442520// FIXME: This example assumes that the store happens at a byte boundary, but
443521// that's not guaranteed. Below is a counter-example with specific dimensions:
444522// vector.store %arg0, %0[0, 3] : memref<2x13xi4>, vector<8xi4>
0 commit comments