@@ -86,72 +86,72 @@ void matmul_vectorized_2x2_bfp16(const bfp16ebs8 *__restrict pA,
8686 AIE_PREPARE_FOR_PIPELINING
8787 AIE_LOOP_MIN_ITERATION_COUNT (4 )
8888 for (unsigned z = 0 ; z < rowA; z += 2 ) {
89- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pC1In (pC);
90- pC1In.seek (z * colB);
91- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pC2In (pC);
92- pC2In.seek ((z + 1 ) * colB);
93- aie::block_vector_output_buffer_stream<bfp16ebs8, 64 > pC1Out (pC);
94- pC1Out.seek (z * colB);
95- aie::block_vector_output_buffer_stream<bfp16ebs8, 64 > pC2Out (pC);
96- pC2Out.seek ((z + 1 ) * colB);
97-
98- for (unsigned j = 0 ; j < colB; j += 2 )
89+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pC1In (pC);
90+ pC1In.seek (z * colB);
91+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pC2In (pC);
92+ pC2In.seek ((z + 1 ) * colB);
93+ aie::block_vector_output_buffer_stream<bfp16ebs8, 64 > pC1Out (pC);
94+ pC1Out.seek (z * colB);
95+ aie::block_vector_output_buffer_stream<bfp16ebs8, 64 > pC2Out (pC);
96+ pC2Out.seek ((z + 1 ) * colB);
97+
98+ for (unsigned j = 0 ; j < colB; j += 2 )
9999#ifdef OPT_PERF_ENABLED
100100 AIE_LOOP_FLATTEN
101101#endif
102- {
103- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pA1bfp16 (pA);
104- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pA2bfp16 (pA);
105- pA1bfp16.seek (z * colA);
106- pA2bfp16.seek ((z + 1 ) * colA);
107-
108- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pB1bfp16 (pB);
109- aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pB2bfp16 (pB);
110- // For non transposed matrix
111- // pB1bfp16.seek(j);
112- // pB2bfp16.seek(j + 1);
113- pB1bfp16.seek (j * colA);
114- pB2bfp16.seek ((j + 1 ) * colA);
115-
116- aie::block_vector<bfp16ebs8, sizeA> A0;
117- aie::block_vector<bfp16ebs8, sizeA> A1;
118- aie::block_vector<bfp16ebs8, sizeB> B0;
119- aie::block_vector<bfp16ebs8, sizeB> B1;
120-
121- // Note that unlike the example mentioned above, we need
122- // to use a mac to take into account results from previous kernel
123- // calls but this is completely unrelated to the block datatype.
124- aie::accum<accfloat, sizeC> accC00 (pC1In.pop ());
125- aie::accum<accfloat, sizeC> accC01 (pC1In.pop ());
126- aie::accum<accfloat, sizeC> accC10 (pC2In.pop ());
127- aie::accum<accfloat, sizeC> accC11 (pC2In.pop ());
128-
129- for (unsigned i = 0 ; i < colA; ++i)
102+ {
103+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pA1bfp16 (pA);
104+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pA2bfp16 (pA);
105+ pA1bfp16.seek (z * colA);
106+ pA2bfp16.seek ((z + 1 ) * colA);
107+
108+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pB1bfp16 (pB);
109+ aie::block_vector_input_buffer_stream<bfp16ebs8, 64 > pB2bfp16 (pB);
110+ // For non transposed matrix
111+ // pB1bfp16.seek(j);
112+ // pB2bfp16.seek(j + 1);
113+ pB1bfp16.seek (j * colA);
114+ pB2bfp16.seek ((j + 1 ) * colA);
115+
116+ aie::block_vector<bfp16ebs8, sizeA> A0;
117+ aie::block_vector<bfp16ebs8, sizeA> A1;
118+ aie::block_vector<bfp16ebs8, sizeB> B0;
119+ aie::block_vector<bfp16ebs8, sizeB> B1;
120+
121+ // Note that unlike the example mentioned above, we need
122+ // to use a mac to take into account results from previous kernel
123+ // calls but this is completely unrelated to the block datatype.
124+ aie::accum<accfloat, sizeC> accC00 (pC1In.pop ());
125+ aie::accum<accfloat, sizeC> accC01 (pC1In.pop ());
126+ aie::accum<accfloat, sizeC> accC10 (pC2In.pop ());
127+ aie::accum<accfloat, sizeC> accC11 (pC2In.pop ());
128+
129+ for (unsigned i = 0 ; i < colA; ++i)
130130#ifdef OPT_PERF_ENABLED
131- AIE_LOOP_FLATTEN
131+ AIE_LOOP_FLATTEN
132132#endif
133- {
134- A0 = pA1bfp16.pop ();
135- A1 = pA2bfp16.pop ();
136-
137- // For non transposed matrix
138- // B0 = pB1bfp16.pop_seek(colB - 1);
139- // B1 = pB2bfp16.pop_seek(colB - 1);
140- B0 = pB1bfp16.pop ();
141- B1 = pB2bfp16.pop ();
142-
143- accC00 = mac_8x8_8x8T (A0, B0, accC00);
144- accC01 = mac_8x8_8x8T (A0, B1, accC01);
145- accC10 = mac_8x8_8x8T (A1, B0, accC10);
146- accC11 = mac_8x8_8x8T (A1, B1, accC11);
147- }
148-
149- pC1Out.push (accC00.template to_vector <bfp16ebs8>());
150- pC1Out.push (accC01.template to_vector <bfp16ebs8>());
151- pC2Out.push (accC10.template to_vector <bfp16ebs8>());
152- pC2Out.push (accC11.template to_vector <bfp16ebs8>());
153- }
154- }
133+ {
134+ A0 = pA1bfp16.pop ();
135+ A1 = pA2bfp16.pop ();
136+
137+ // For non transposed matrix
138+ // B0 = pB1bfp16.pop_seek(colB - 1);
139+ // B1 = pB2bfp16.pop_seek(colB - 1);
140+ B0 = pB1bfp16.pop ();
141+ B1 = pB2bfp16.pop ();
142+
143+ accC00 = mac_8x8_8x8T (A0, B0, accC00);
144+ accC01 = mac_8x8_8x8T (A0, B1, accC01);
145+ accC10 = mac_8x8_8x8T (A1, B0, accC10);
146+ accC11 = mac_8x8_8x8T (A1, B1, accC11);
147+ }
148+
149+ pC1Out.push (accC00.template to_vector <bfp16ebs8>());
150+ pC1Out.push (accC01.template to_vector <bfp16ebs8>());
151+ pC2Out.push (accC10.template to_vector <bfp16ebs8>());
152+ pC2Out.push (accC11.template to_vector <bfp16ebs8>());
153+ }
154+ }
155155}
156156
157157extern " C" {
0 commit comments