Commit c5222f5

Disambiguate python buffers (#2724)
1 parent 19cce64 commit c5222f5

34 files changed (+290, -392 lines)

programming_examples/basic/tiling_exploration/per_tile/per_tile.py

Lines changed: 9 additions & 13 deletions
@@ -7,9 +7,8 @@
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 import argparse
 import numpy as np
-import sys
 
-from aie.iron import LocalBuffer, ObjectFifo, Program, Runtime, Worker
+from aie.iron import Buffer, ObjectFifo, Program, Runtime, Worker
 from aie.iron.placers import SequentialPlacer
 from aie.iron.device import NPU1Col1
 from aie.iron.controlflow import range_
@@ -39,21 +38,18 @@ def generate_module(
 
     # Use an ObjectFifo for dataflow
     of_out = ObjectFifo(flattened_tile)
+    access_counter = Buffer(initial_value=np.array([0], dtype=dtype))
 
     # The task a core will run
-    def access_order(of_out):
-        access_counter = LocalBuffer(initial_value=np.array([0], dtype=dtype))
-
-        for _ in range_(sys.maxsize):
-            elemOut = of_out.acquire(1)
-            for i in range_(tile_size):
-                elemOut[i] = access_counter[0]
-                access_counter[0] += 1
-            of_out.release(1)
-        pass
+    def access_order(of_out, counter_buf):
+        elemOut = of_out.acquire(1)
+        for i in range_(tile_size):
+            elemOut[i] = counter_buf[0]
+            counter_buf[0] += 1
+        of_out.release(1)
 
     # Create a worker (which will be placed on a core) to run the task
-    worker = Worker(access_order, [of_out.prod()], while_true=False)
+    worker = Worker(access_order, [of_out.prod(), access_counter])
 
     # Runtime operations to move data to/from the AIE-array
     rt = Runtime()
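The per_tile.py change above is the simplest instance of the pattern this commit applies throughout: per-core state that used to be declared as a LocalBuffer inside the task body is now declared once as a Buffer at design scope and handed to the Worker as an explicit argument, next to the ObjectFifo handles. A minimal sketch of the new style, distilled from the diff above (the concrete dtype and tile_size values are illustrative placeholders, not taken from this commit):

import numpy as np
from aie.iron import Buffer, ObjectFifo, Worker
from aie.iron.controlflow import range_

dtype, tile_size = np.int32, 32
of_out = ObjectFifo(np.ndarray[(tile_size,), np.dtype[dtype]])

# The counter is now a design-level Buffer rather than a LocalBuffer created
# implicitly inside the task body.
access_counter = Buffer(initial_value=np.array([0], dtype=dtype))

def access_order(of_out, counter_buf):
    elemOut = of_out.acquire(1)
    for i in range_(tile_size):
        elemOut[i] = counter_buf[0]
        counter_buf[0] += 1
    of_out.release(1)

# The buffer is passed to the Worker alongside the fifo endpoint, so the task
# receives it as a normal argument.
worker = Worker(access_order, [of_out.prod(), access_counter])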

programming_examples/basic/vector_reduce_max/multi_column_designs/col_wise_vector_reduce_max.py

Lines changed: 26 additions & 14 deletions
@@ -16,7 +16,7 @@
     Program,
     Runtime,
     Worker,
-    LocalBuffer,
+    Buffer,
     str_to_dtype,
 )
 from aie.iron.placers import SequentialPlacer
@@ -72,30 +72,40 @@ def my_reduce_max(dev, in1_size, out_size, num_cores, dtype_str, trace_size):
         else np.array([np.iinfo(dtype).min], dtype=dtype)
     )
 
-    def core_body(*args):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_num_elements,), np.dtype[dtype]],
-            initial_value=min_val,
+    nextC_buffers = []
+    tmp_buffers = []
+    for i in range(num_cores):
+        nextC_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_num_elements,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_num_elements,), np.dtype[dtype]],
-            initial_value=min_val,
+        tmp_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_num_elements,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
+
+    def core_body(*args):
         # Extract fixed arguments from end of args list
         compute_max = args[-1]
         reduce_max_vector = args[-2]
+        tmp_buffer = args[-3]
+        c_buffer = args[-4]
 
         # Extract object fifos from start of args list
         of_in1 = args[0]
         of_out = args[1]
         neighbor_of_in1s = args[
-            2:-2
+            2:-4
         ]  # Variable number of input fifos based on num_cores
 
         for _ in range_(N_div_n):
             elem_in1 = of_in1.acquire(1)
             reduce_max_vector(elem_in1, tmp_buffer, tile_size)
-            compute_max(nextC_buffer, tmp_buffer, nextC_buffer)
+            compute_max(c_buffer, tmp_buffer, c_buffer)
             of_in1.release(1)
 
         elem_out = of_out.acquire(1)
@@ -107,14 +117,14 @@ def core_body(*args):
 
             # Compute max across all inputs
             for elem in elem_in1s[:-1]:
-                compute_max(elem, nextC_buffer, nextC_buffer)
-            compute_max(elem_in1s[-1], nextC_buffer, elem_out)
+                compute_max(elem, c_buffer, c_buffer)
+            compute_max(elem_in1s[-1], c_buffer, elem_out)
 
             # Release all inputs
             for neighbor_of in neighbor_of_in1s:
                 neighbor_of.release(1)
         else:
-            elem_out[0] = nextC_buffer[0]
+            elem_out[0] = c_buffer[0]
         of_out.release(1)
 
     # Define a worker to run the task on a core
@@ -126,7 +136,9 @@ def core_body(*args):
         if num_cores - cores_per_col < i:
            fifo_args.append(of_outs[i - 1].cons())
 
-        fifo_args.extend([reduce_max_vector, compute_max])
+        fifo_args.extend(
+            [nextC_buffers[i], tmp_buffers[i], reduce_max_vector, compute_max]
+        )
         my_workers.append(
             Worker(
                 core_body,
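In this design core_body still takes *args, so the two new buffers ride along at the end of fn_args and the neighbor-fifo slice shifts from args[2:-2] to args[2:-4]. A plain-Python toy (no aie imports, all names illustrative only) of how that unpacking convention works:

def unpack(*args):
    compute_max = args[-1]        # fixed trailing arguments, counted from the end
    reduce_max_vector = args[-2]
    tmp_buffer = args[-3]         # newly appended Buffer
    c_buffer = args[-4]           # newly appended Buffer
    neighbor_fifos = args[2:-4]   # everything between the two fixed fifos and the tail
    return neighbor_fifos, c_buffer, tmp_buffer, reduce_max_vector, compute_max

args = ["of_in1", "of_out", "nbr0", "nbr1", "nextC", "tmp", "rmv", "cmax"]
print(unpack(*args))  # (('nbr0', 'nbr1'), 'nextC', 'tmp', 'rmv', 'cmax')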

programming_examples/basic/vector_reduce_max/multi_column_designs/row_wise_vector_reduce_max.py

Lines changed: 23 additions & 11 deletions
@@ -16,7 +16,7 @@
     Program,
     Runtime,
     Worker,
-    LocalBuffer,
+    Buffer,
     str_to_dtype,
 )
 from aie.iron.placers import SequentialPlacer
@@ -73,6 +73,21 @@ def my_reduce_max(dev, in1_size, out_size, dtype_str, trace_size):
         if dtype_str == "bf16"
         else np.array([np.iinfo(dtype).min], dtype=dtype)
     )
+    nextC_buffers = []
+    tmp_buffers = []
+    for i in range(n_cores):
+        nextC_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
+        )
+        tmp_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
+        )
 
     taps = [
         TensorAccessPattern(
@@ -85,22 +100,16 @@ def my_reduce_max(dev, in1_size, out_size, dtype_str, trace_size):
     ]
 
     def core_body(*args):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
         # Extract fixed arguments from end of args list
         compute_max = args[-1]
         reduce_max_vector = args[-2]
+        nextC_buffer = args[-3]
+        tmp_buffer = args[-4]
 
         # Extract object fifos from start of args list
         of_in = args[0]
         of_out = args[1]
-        in_fifos = args[2:-2]  # Variable number of input fifos based on n_cores
+        in_fifos = args[2:-4]  # Variable number of input fifos based on n_cores
 
         for _ in range_(num_iter):
             elem_in = of_in.acquire(1)
@@ -150,7 +159,10 @@ def core_body(*args):
         fifo_args.append(out_fifos[4].cons())
         fifo_args.extend(out_fifos[j].cons() for j in range(6, n_cores))
 
-        fifo_args.extend([reduce_max_vector, compute_max])
+        fifo_args.extend(
+            [tmp_buffers[i], nextC_buffers[i], reduce_max_vector, compute_max]
+        )
+
         workers.append(
             Worker(
                 core_body,

programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_chained.py

Lines changed: 26 additions & 19 deletions
@@ -15,7 +15,7 @@
     Program,
     Runtime,
     Worker,
-    LocalBuffer,
+    Buffer,
     str_to_dtype,
 )
 from aie.iron.placers import SequentialPlacer
@@ -84,16 +84,26 @@ def my_reduce_max(dev, in1_size, out_size, dtype_str, trace_size):
         else np.array([np.iinfo(dtype).min], dtype=dtype)
     )
 
-    # Define a task to run
-    def start_core_body(of_in, of_out, reduce_max_vector, compute_max):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
+    nextC_buffers = []
+    tmp_buffers = []
+    for i in range(n_cores):
+        nextC_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
+        tmp_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
+
+    # Define a task to run
+    def start_core_body(
+        of_in, of_out, reduce_max_vector, compute_max, nextC_buffer, tmp_buffer
+    ):
         elem_out = of_out.acquire(1)
         for _ in range_(num_iter):
             elem_in = of_in.acquire(1)
@@ -103,16 +113,9 @@ def start_core_body(of_in, of_out, reduce_max_vector, compute_max):
         elem_out[0] = nextC_buffer[0]
         of_out.release(1)
 
-    def core_body(of_in, of_out, in0, reduce_max_vector, compute_max):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
-
+    def core_body(
+        of_in, of_out, in0, reduce_max_vector, compute_max, nextC_buffer, tmp_buffer
+    ):
         for _ in range_(num_iter):
             elem_in = of_in.acquire(1)
             reduce_max_vector(elem_in, tmp_buffer, elems_per_core)
@@ -138,6 +141,8 @@ def core_body(of_in, of_out, in0, reduce_max_vector, compute_max):
                 out_fifos[i + 1].cons(),
                 reduce_max_vector,
                 compute_max,
+                nextC_buffers[i],
+                tmp_buffers[i],
             ],
             trace=enable_trace,
         )
@@ -151,6 +156,8 @@ def core_body(of_in, of_out, in0, reduce_max_vector, compute_max):
                 out_fifos[i].prod(),
                 reduce_max_vector,
                 compute_max,
+                nextC_buffers[i],
+                tmp_buffers[i],
             ],
             trace=enable_trace,
         )

programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_memtile.py

Lines changed: 25 additions & 17 deletions
@@ -15,7 +15,7 @@
     Program,
     Runtime,
     Worker,
-    LocalBuffer,
+    Buffer,
     str_to_dtype,
 )
 from aie.iron.placers import SequentialPlacer
@@ -93,16 +93,26 @@ def my_reduce_max(dev, in1_size, out_size, dtype_str, trace_size):
         else np.array([np.iinfo(dtype).min], dtype=dtype)
     )
 
-    # Define a task to run
-    def start_core_body(of_in, of_out, reduce_max_vector, compute_max):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
+    nextC_buffers = []
+    tmp_buffers = []
+    for i in range(n_cores):
+        nextC_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
+        tmp_buffers.append(
+            Buffer(
+                type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
+                initial_value=min_val,
+            )
         )
+
+    # Define a task to run
+    def start_core_body(
+        of_in, of_out, reduce_max_vector, compute_max, nextC_buffer, tmp_buffer
+    ):
        elem_out = of_out.acquire(1)
        for _ in range_(num_iter):
            elem_in = of_in.acquire(1)
@@ -120,15 +130,9 @@ def core_body(
         reduce_max_vector,
         reduce_max_scalar,
         compute_max,
+        nextC_buffer,
+        tmp_buffer,
     ):
-        nextC_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
-        tmp_buffer = LocalBuffer(
-            type=np.ndarray[(out_tensor_size,), np.dtype[dtype]],
-            initial_value=min_val,
-        )
         elem_out = elemC_out.acquire(1)
         for _ in range_(num_iter):
             elem_in = of_in.acquire(1)
@@ -156,6 +160,8 @@ def core_body(
                 out_fifos[i].prod(),
                 reduce_max_vector,
                 compute_max,
+                nextC_buffers[i],
+                tmp_buffers[i],
             ],
             trace=True if i == 1 else None,
         )
@@ -169,6 +175,8 @@ def core_body(
             reduce_max_vector,
             reduce_max_scalar,
             compute_max,
+            nextC_buffers[i],
+            tmp_buffers[i],
         ]
         workers.append(Worker(core_body, fn_args=fifo_args, trace=None))
 