Skip to content

Commit d10d27d

Browse files
Run clang-format (#2553)
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 45c368a commit d10d27d

File tree

35 files changed

+783
-647
lines changed

35 files changed

+783
-647
lines changed

.github/workflows/lintAndFormat.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -153,7 +153,7 @@ jobs:
153153
tool_name: clang-format
154154
level: error
155155
cleanup: true
156-
fail_on_error: true
156+
fail_level: any
157157

158158
- name: Run black format
159159
if: success() || failure()
@@ -175,7 +175,7 @@ jobs:
175175
with:
176176
tool_name: black
177177
level: error
178-
fail_on_error: true
178+
fail_level: any
179179

180180
code-coverage:
181181

aie_kernels/aie2p/layer_norm.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ void layer_norm(const T *restrict input, T *restrict output, int32_t cols) {
4343

4444
::aie::vector<T, N> mean_v = ::aie::broadcast<T, N>(mean);
4545
::aie::vector<T, N> inv_std_v = ::aie::broadcast<T, N>(inv_std);
46-
46+
4747
for (int i = 0; i < vector_chunks; i++) {
4848
::aie::vector<T, N> reg_a = ::aie::load_v<N>(input + i * N);
4949
::aie::vector<T, N> diff_v = ::aie::sub(reg_a, mean_v);

aie_kernels/aie2p/mm.cc

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -342,7 +342,8 @@ matmul_vectorized_8x8x8_bf16_f32(const bfloat16 *__restrict pA,
342342
static_assert(n % (2 * t) == 0);
343343

344344
return matmul_vectorized_2x2_mmul<bfloat16, float, (m / r), (k / s), (n / t),
345-
r, s, t, is_b_row_maj, is_c_row_maj>(pA, pB, pC);
345+
r, s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
346+
pC);
346347
}
347348

348349
template <unsigned m, unsigned k, unsigned n>
@@ -481,8 +482,8 @@ extern "C" {
481482
r, s, t) \
482483
void matmul_scalar_##mlir_type_in##_##mlir_type_out( \
483484
ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \
484-
matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N, is_b_row_maj, is_c_row_maj>(a_in, b_in, \
485-
c_out); \
485+
matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N, is_b_row_maj, \
486+
is_c_row_maj>(a_in, b_in, c_out); \
486487
}
487488

488489
#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \

aie_kernels/aie2p/mm_bfp.cc

Lines changed: 61 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -86,72 +86,72 @@ void matmul_vectorized_2x2_bfp16(const bfp16ebs8 *__restrict pA,
8686
AIE_PREPARE_FOR_PIPELINING
8787
AIE_LOOP_MIN_ITERATION_COUNT(4)
8888
for (unsigned z = 0; z < rowA; z += 2) {
89-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pC1In(pC);
90-
pC1In.seek(z * colB);
91-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pC2In(pC);
92-
pC2In.seek((z + 1) * colB);
93-
aie::block_vector_output_buffer_stream<bfp16ebs8, 64> pC1Out(pC);
94-
pC1Out.seek(z * colB);
95-
aie::block_vector_output_buffer_stream<bfp16ebs8, 64> pC2Out(pC);
96-
pC2Out.seek((z + 1) * colB);
97-
98-
for (unsigned j = 0; j < colB; j += 2)
89+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pC1In(pC);
90+
pC1In.seek(z * colB);
91+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pC2In(pC);
92+
pC2In.seek((z + 1) * colB);
93+
aie::block_vector_output_buffer_stream<bfp16ebs8, 64> pC1Out(pC);
94+
pC1Out.seek(z * colB);
95+
aie::block_vector_output_buffer_stream<bfp16ebs8, 64> pC2Out(pC);
96+
pC2Out.seek((z + 1) * colB);
97+
98+
for (unsigned j = 0; j < colB; j += 2)
9999
#ifdef OPT_PERF_ENABLED
100100
AIE_LOOP_FLATTEN
101101
#endif
102-
{
103-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pA1bfp16(pA);
104-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pA2bfp16(pA);
105-
pA1bfp16.seek(z * colA);
106-
pA2bfp16.seek((z + 1) * colA);
107-
108-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB1bfp16(pB);
109-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB2bfp16(pB);
110-
// For non transposed matrix
111-
// pB1bfp16.seek(j);
112-
// pB2bfp16.seek(j + 1);
113-
pB1bfp16.seek(j * colA);
114-
pB2bfp16.seek((j + 1) * colA);
115-
116-
aie::block_vector<bfp16ebs8, sizeA> A0;
117-
aie::block_vector<bfp16ebs8, sizeA> A1;
118-
aie::block_vector<bfp16ebs8, sizeB> B0;
119-
aie::block_vector<bfp16ebs8, sizeB> B1;
120-
121-
// Note that unlike the example mentioned above, we need
122-
// to use a mac to take into account results from previous kernel
123-
// calls but this is completely unrelated to the block datatype.
124-
aie::accum<accfloat, sizeC> accC00(pC1In.pop());
125-
aie::accum<accfloat, sizeC> accC01(pC1In.pop());
126-
aie::accum<accfloat, sizeC> accC10(pC2In.pop());
127-
aie::accum<accfloat, sizeC> accC11(pC2In.pop());
128-
129-
for (unsigned i = 0; i < colA; ++i)
102+
{
103+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pA1bfp16(pA);
104+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pA2bfp16(pA);
105+
pA1bfp16.seek(z * colA);
106+
pA2bfp16.seek((z + 1) * colA);
107+
108+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB1bfp16(pB);
109+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB2bfp16(pB);
110+
// For non transposed matrix
111+
// pB1bfp16.seek(j);
112+
// pB2bfp16.seek(j + 1);
113+
pB1bfp16.seek(j * colA);
114+
pB2bfp16.seek((j + 1) * colA);
115+
116+
aie::block_vector<bfp16ebs8, sizeA> A0;
117+
aie::block_vector<bfp16ebs8, sizeA> A1;
118+
aie::block_vector<bfp16ebs8, sizeB> B0;
119+
aie::block_vector<bfp16ebs8, sizeB> B1;
120+
121+
// Note that unlike the example mentioned above, we need
122+
// to use a mac to take into account results from previous kernel
123+
// calls but this is completely unrelated to the block datatype.
124+
aie::accum<accfloat, sizeC> accC00(pC1In.pop());
125+
aie::accum<accfloat, sizeC> accC01(pC1In.pop());
126+
aie::accum<accfloat, sizeC> accC10(pC2In.pop());
127+
aie::accum<accfloat, sizeC> accC11(pC2In.pop());
128+
129+
for (unsigned i = 0; i < colA; ++i)
130130
#ifdef OPT_PERF_ENABLED
131-
AIE_LOOP_FLATTEN
131+
AIE_LOOP_FLATTEN
132132
#endif
133-
{
134-
A0 = pA1bfp16.pop();
135-
A1 = pA2bfp16.pop();
136-
137-
// For non transposed matrix
138-
// B0 = pB1bfp16.pop_seek(colB - 1);
139-
// B1 = pB2bfp16.pop_seek(colB - 1);
140-
B0 = pB1bfp16.pop();
141-
B1 = pB2bfp16.pop();
142-
143-
accC00 = mac_8x8_8x8T(A0, B0, accC00);
144-
accC01 = mac_8x8_8x8T(A0, B1, accC01);
145-
accC10 = mac_8x8_8x8T(A1, B0, accC10);
146-
accC11 = mac_8x8_8x8T(A1, B1, accC11);
147-
}
148-
149-
pC1Out.push(accC00.template to_vector<bfp16ebs8>());
150-
pC1Out.push(accC01.template to_vector<bfp16ebs8>());
151-
pC2Out.push(accC10.template to_vector<bfp16ebs8>());
152-
pC2Out.push(accC11.template to_vector<bfp16ebs8>());
153-
}
154-
}
133+
{
134+
A0 = pA1bfp16.pop();
135+
A1 = pA2bfp16.pop();
136+
137+
// For non transposed matrix
138+
// B0 = pB1bfp16.pop_seek(colB - 1);
139+
// B1 = pB2bfp16.pop_seek(colB - 1);
140+
B0 = pB1bfp16.pop();
141+
B1 = pB2bfp16.pop();
142+
143+
accC00 = mac_8x8_8x8T(A0, B0, accC00);
144+
accC01 = mac_8x8_8x8T(A0, B1, accC01);
145+
accC10 = mac_8x8_8x8T(A1, B0, accC10);
146+
accC11 = mac_8x8_8x8T(A1, B1, accC11);
147+
}
148+
149+
pC1Out.push(accC00.template to_vector<bfp16ebs8>());
150+
pC1Out.push(accC01.template to_vector<bfp16ebs8>());
151+
pC2Out.push(accC10.template to_vector<bfp16ebs8>());
152+
pC2Out.push(accC11.template to_vector<bfp16ebs8>());
153+
}
154+
}
155155
}
156156

157157
extern "C" {

aie_kernels/aie2p/mm_bfp_mixed.cc

Lines changed: 65 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -38,77 +38,77 @@ void matmul_vectorized_2x2_bfp16_bf16(const bfloat16 *__restrict pA,
3838
AIE_PREPARE_FOR_PIPELINING
3939
AIE_LOOP_MIN_ITERATION_COUNT(4)
4040
for (unsigned z = 0; z < rowA; z += 2) {
41-
bfloat16 *__restrict pC1 = pC + (z * colB + 0) * sizeC;
42-
bfloat16 *__restrict pC2 = pC + ((z + 1) * colB + 0) * sizeC;
41+
bfloat16 *__restrict pC1 = pC + (z * colB + 0) * sizeC;
42+
bfloat16 *__restrict pC2 = pC + ((z + 1) * colB + 0) * sizeC;
4343

44-
for (unsigned j = 0; j < colB; j += 2)
44+
for (unsigned j = 0; j < colB; j += 2)
4545
#ifdef OPT_PERF_ENABLED
4646
AIE_LOOP_FLATTEN
4747
#endif
48-
{
49-
const bfloat16 *__restrict pA1 = pA + (z * colA + 0) * sizeA;
50-
const bfloat16 *__restrict pA2 = pA + ((z + 1) * colA + 0) * sizeA;
51-
52-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB1bfp16(pB);
53-
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB2bfp16(pB);
54-
// For non transposed matrix
55-
// pB1bfp16.seek(j);
56-
// pB2bfp16.seek(j + 1);
57-
pB1bfp16.seek(j * colA);
58-
pB2bfp16.seek((j + 1) * colA);
59-
60-
aie::vector<bfloat16, sizeA> A0;
61-
aie::vector<bfloat16, sizeA> A1;
62-
aie::block_vector<bfp16ebs8, sizeB> B0;
63-
aie::block_vector<bfp16ebs8, sizeB> B1;
64-
65-
aie::accum<accfloat, sizeC> accC00(aie::load_v<sizeC>(pC1));
66-
aie::accum<accfloat, sizeC> accC01(aie::load_v<sizeC>(pC1 + sizeC));
67-
aie::accum<accfloat, sizeC> accC10(aie::load_v<sizeC>(pC2));
68-
aie::accum<accfloat, sizeC> accC11(aie::load_v<sizeC>(pC2 + sizeC));
69-
70-
aie::accum<accfloat, 64> accA0;
71-
aie::accum<accfloat, 64> accA1;
72-
73-
for (unsigned i = 0; i < colA; ++i)
48+
{
49+
const bfloat16 *__restrict pA1 = pA + (z * colA + 0) * sizeA;
50+
const bfloat16 *__restrict pA2 = pA + ((z + 1) * colA + 0) * sizeA;
51+
52+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB1bfp16(pB);
53+
aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB2bfp16(pB);
54+
// For non transposed matrix
55+
// pB1bfp16.seek(j);
56+
// pB2bfp16.seek(j + 1);
57+
pB1bfp16.seek(j * colA);
58+
pB2bfp16.seek((j + 1) * colA);
59+
60+
aie::vector<bfloat16, sizeA> A0;
61+
aie::vector<bfloat16, sizeA> A1;
62+
aie::block_vector<bfp16ebs8, sizeB> B0;
63+
aie::block_vector<bfp16ebs8, sizeB> B1;
64+
65+
aie::accum<accfloat, sizeC> accC00(aie::load_v<sizeC>(pC1));
66+
aie::accum<accfloat, sizeC> accC01(aie::load_v<sizeC>(pC1 + sizeC));
67+
aie::accum<accfloat, sizeC> accC10(aie::load_v<sizeC>(pC2));
68+
aie::accum<accfloat, sizeC> accC11(aie::load_v<sizeC>(pC2 + sizeC));
69+
70+
aie::accum<accfloat, 64> accA0;
71+
aie::accum<accfloat, 64> accA1;
72+
73+
for (unsigned i = 0; i < colA; ++i)
7474
#ifdef OPT_PERF_ENABLED
75-
AIE_LOOP_FLATTEN
75+
AIE_LOOP_FLATTEN
7676
#endif
77-
{
78-
A0 = aie::load_v<sizeA>(pA1);
79-
pA1 += sizeA;
80-
A1 = aie::load_v<sizeA>(pA2);
81-
pA2 += sizeA;
82-
83-
// Convert A0 into bfp16
84-
accA0 = A0;
85-
// Convert A1 into bfp16 through a different path (see bfp
86-
// conversion example)
87-
accA1 = mul_elem_64(A1, concat(broadcast_one_to_v32bfloat16(),
88-
broadcast_one_to_v32bfloat16()));
89-
90-
// For non transposed matrix
91-
// B0 = pB1bfp16.pop_seek(colB - 1);
92-
// B1 = pB2bfp16.pop_seek(colB - 1);
93-
B0 = pB1bfp16.pop();
94-
B1 = pB2bfp16.pop();
95-
96-
accC00 = mac_8x8_8x8T(accA0.to_vector<bfp16ebs8>(), B0, accC00);
97-
accC01 = mac_8x8_8x8T(accA0.to_vector<bfp16ebs8>(), B1, accC01);
98-
accC10 = mac_8x8_8x8T(accA1.to_vector<bfp16ebs8>(), B0, accC10);
99-
accC11 = mac_8x8_8x8T(accA1.to_vector<bfp16ebs8>(), B1, accC11);
100-
}
101-
102-
aie::store_v(pC1, accC00.template to_vector<bfloat16>());
103-
pC1 += sizeC;
104-
aie::store_v(pC1, accC01.template to_vector<bfloat16>());
105-
pC1 += sizeC;
106-
aie::store_v(pC2, accC10.template to_vector<bfloat16>());
107-
pC2 += sizeC;
108-
aie::store_v(pC2, accC11.template to_vector<bfloat16>());
109-
pC2 += sizeC;
110-
}
111-
}
77+
{
78+
A0 = aie::load_v<sizeA>(pA1);
79+
pA1 += sizeA;
80+
A1 = aie::load_v<sizeA>(pA2);
81+
pA2 += sizeA;
82+
83+
// Convert A0 into bfp16
84+
accA0 = A0;
85+
// Convert A1 into bfp16 through a different path (see bfp
86+
// conversion example)
87+
accA1 = mul_elem_64(A1, concat(broadcast_one_to_v32bfloat16(),
88+
broadcast_one_to_v32bfloat16()));
89+
90+
// For non transposed matrix
91+
// B0 = pB1bfp16.pop_seek(colB - 1);
92+
// B1 = pB2bfp16.pop_seek(colB - 1);
93+
B0 = pB1bfp16.pop();
94+
B1 = pB2bfp16.pop();
95+
96+
accC00 = mac_8x8_8x8T(accA0.to_vector<bfp16ebs8>(), B0, accC00);
97+
accC01 = mac_8x8_8x8T(accA0.to_vector<bfp16ebs8>(), B1, accC01);
98+
accC10 = mac_8x8_8x8T(accA1.to_vector<bfp16ebs8>(), B0, accC10);
99+
accC11 = mac_8x8_8x8T(accA1.to_vector<bfp16ebs8>(), B1, accC11);
100+
}
101+
102+
aie::store_v(pC1, accC00.template to_vector<bfloat16>());
103+
pC1 += sizeC;
104+
aie::store_v(pC1, accC01.template to_vector<bfloat16>());
105+
pC1 += sizeC;
106+
aie::store_v(pC2, accC10.template to_vector<bfloat16>());
107+
pC2 += sizeC;
108+
aie::store_v(pC2, accC11.template to_vector<bfloat16>());
109+
pC2 += sizeC;
110+
}
111+
}
112112
}
113113

114114
extern "C" {

aie_kernels/aie2p/rope.cc

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,14 +9,14 @@
99
//===----------------------------------------------------------------------===//
1010

1111
#include <aie_api/aie.hpp>
12+
#include <math.h>
1213
#include <stdint.h>
1314
#include <stdio.h>
1415
#include <stdlib.h>
15-
#include <math.h>
1616

1717
template <typename T, int N>
1818
void rope_kernel(const T *restrict input, const T *restrict lut,
19-
T *restrict output, int32_t dims) {
19+
T *restrict output, int32_t dims) {
2020
event0();
2121

2222
for (int v = 0; v < dims; v += N) {

0 commit comments

Comments
 (0)