Description
After discussion with @lukamac, we identified that the current DMA transfer scheduling synchronizes more strictly than necessary. For example, we always wait for the transfer of input tile t to complete before issuing the transfer of the next tile t+1, even though the hardware could already handle the two in parallel. This conservative scheduling guarantees correctness, but it prevents us from taking full advantage of double-buffering.
With proper double-buffering, we should instead keep two futures for inputs and two for outputs. That way, the DMA engine can already start moving data for tile t+1 while the compute kernel is still working on tile t, and simultaneously flush out results from tile t-1. This enables true overlap of compute, prefetch, and drain.
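The sketch below shows this steady state in a minimal form, reusing the mchan channel API from the listings further down; `load_tile`, `compute_tile`, and `store_tile` are hypothetical placeholders for the concrete transfer commands and the forked kernel, so only the channel-ownership pattern is the point.

```c
#include <stdint.h>

// mchan channel API as used in the listings below.
extern uint32_t mchan_channel_alloc(void);
extern void mchan_channel_wait(uint32_t channel);
extern void mchan_channel_free(uint32_t channel);

// Hypothetical stand-ins for the concrete transfer commands and kernel call.
void load_tile(int t);
void store_tile(int t);
void compute_tile(int t);

// Minimal double-buffered schedule, assuming numTiles >= 1.
void double_buffered_loop(int numTiles) {
  // One DMA channel per direction and per buffer parity (tile % 2).
  uint32_t ch_in[2]  = {(uint32_t)-1, (uint32_t)-1};
  uint32_t ch_out[2] = {(uint32_t)-1, (uint32_t)-1};

  ch_in[0] = mchan_channel_alloc();
  load_tile(0); // prefetch tile 0 into the parity-0 buffers

  for (int t = 0; t < numTiles; t++) {
    int p = t % 2;
    // Prefetch: start moving tile t+1 into the other parity's buffers
    // while tile t is still being computed.
    if (t + 1 < numTiles) {
      ch_in[1 - p] = mchan_channel_alloc();
      load_tile(t + 1);
    }
    // Wait before reuse: the parity-p output buffer was last drained
    // for tile t-2, so that transfer must be done before we overwrite it.
    if (t > 1) {
      mchan_channel_wait(ch_out[p]);
      mchan_channel_free(ch_out[p]);
    }
    // Tile t's input data must have landed before the kernel reads it.
    mchan_channel_wait(ch_in[p]);
    mchan_channel_free(ch_in[p]);
    compute_tile(t);           // overlaps with the prefetch of tile t+1
    ch_out[p] = mchan_channel_alloc();
    store_tile(t);             // drain tile t while t+1 is computed
  }
  // Flush whatever outputs are still in flight (up to two).
  for (int p = 0; p < 2; p++) {
    if (ch_out[p] != (uint32_t)-1) {
      mchan_channel_wait(ch_out[p]);
      mchan_channel_free(ch_out[p]);
    }
  }
}
```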
See #116, which previously attempted to implement this scheduling scheme. This PR shows the correct approach:
- Introduces ping-pong buffers for A/B/C and outputs.
- Uses distinct channels per parity (0/1) to track DMA ownership.
- Moves synchronization points from “wait before start” to “wait before reuse,” ensuring correctness while maximizing overlap.
In principle, this is the optimal scheduling scheme. For most current workloads, however, the DMA transfers take no longer than the control overhead plus kernel execution, so runtime performance is unchanged; additionally, MCHAN only supports one in-flight transaction per direction. The benefit is therefore primarily architectural cleanliness and readiness for cases where transfer time becomes a bottleneck. The snippet below contrasts the two synchronization placements.
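Here, `issue_prefetch` is a hypothetical stand-in for the concrete mchan transfer calls; `ch_in` and `p` follow the parity convention of the sketch above.

```c
// Wait before start (current code): one channel per direction, so the
// prefetch of tile t+1 cannot be issued until tile t's transfer is drained.
mchan_channel_wait(channel_input);
mchan_channel_free(channel_input);
channel_input = mchan_channel_alloc();
issue_prefetch(t + 1);

// Wait before reuse (this scheme): issue the prefetch into the other
// parity's buffers right away; only block once tile t's data is needed.
ch_in[1 - p] = mchan_channel_alloc();
issue_prefetch(t + 1);          // overlaps with compute on tile t
mchan_channel_wait(ch_in[p]);   // tile t is read only after this point
mchan_channel_free(ch_in[p]);
```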
Manual Improved Code

```c
static void _closure(void *_closure_args) {
// CLOSURE ARG CAST
_closure_args_t *args = (_closure_args_t *)_closure_args;
uint8_t *DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr = args->DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr;
// CLOSURE FUNCTION CALL
uint16_t *DeeployNetwork_TILING_CODEGEN_L1__M_ref = (uint16_t *)DeeployNetwork_TILING_CODEGEN_L1__M + 0;
float32_t *DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 5504);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 1408);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 8320);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 0);
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_ref = (void *)DeeployNetwork_input_0 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__A_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__A_ref + 1408;
void *DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref = (void *)DeeployNetwork_weight_tensor + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__B_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__B_ref + 2048;
void *DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref = (void *)DeeployNetwork_bias_tensor + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__C_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__C_ref + 704;
void *DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_ref = (void *)DeeployNetwork_output_0 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__data_out_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__data_out_ref + 704;
// Initialize DMA future
uint32_t channel_input_0 = (uint32_t)-1;
uint32_t channel_output_0 = (uint32_t)-1;
uint32_t channel_input_1 = (uint32_t)-1;
uint32_t channel_output_1 = (uint32_t)-1;
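// Channel ownership follows tile parity (see the alloc sites below): odd
// tiles are tracked by channel_input_0/channel_output_0, even tiles by
// channel_input_1/channel_output_1, so every wait targets exactly the
// transfer that last touched the buffers about to be reused.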
// Transfer initial input tile
channel_input_1 = mchan_channel_alloc(); // first tile (assumed even parity) is waited on via channel_input_1 below
mchan_transfer_1d(1443200, DeeployNetwork_TILING_CODEGEN_L1__A_ref, DeeployNetwork_TILING_CODEGEN_L1__input_0_ref);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__input_0_ref
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref += DeeployNetwork_TILING_CODEGEN_L1__A_relativeOffset[0];
// Transfer initial weight tile
mchan_transfer_2d_ext_strided(1968128, DeeployNetwork_TILING_CODEGEN_L1__B_ref, DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__B_relativeOffset[0];
// Transfer initial bias tile
mchan_transfer_2d_ext_strided(1966784, DeeployNetwork_TILING_CODEGEN_L1__C_ref, DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__C_relativeOffset[0];
// TILING LOOP
for (int TILING_I = DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr];
TILING_I < DeeployNetwork_TILING_CODEGEN_L1__numTiles[(*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr) + 1]; TILING_I++) {
switch ((TILING_I) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_1;
break;
}
// DOUBLE BUFFERING CHECK TILE LOAD: prefetch next input tile
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0;
channel_input_1 = mchan_channel_alloc();
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1;
channel_input_0 = mchan_channel_alloc();
break;
}
// Transfer next input tile
mchan_transfer_1d(DeeployNetwork_TILING_CODEGEN_L1__A_cmd[TILING_I + 1], DeeployNetwork_TILING_CODEGEN_L1__A_next,
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__input_0_ref
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref += DeeployNetwork_TILING_CODEGEN_L1__A_relativeOffset[TILING_I + 1];
}
// DOUBLE BUFFERING CHECK TILE LOAD: prefetch next weight tile
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1;
break;
}
// Transfer next weight tile
mchan_transfer_2d_ext_strided(1968128, DeeployNetwork_TILING_CODEGEN_L1__B_next, DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__B_relativeOffset[TILING_I + 1];
}
// DOUBLE BUFFERING CHECK TILE LOAD: prefetch next bias tile
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1;
break;
}
// Transfer next bias tile
mchan_transfer_2d_ext_strided(DeeployNetwork_TILING_CODEGEN_L1__C_cmd[TILING_I + 1], DeeployNetwork_TILING_CODEGEN_L1__C_next,
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__C_relativeOffset[TILING_I + 1];
}
if (TILING_I > 1) {
// Wait for the tile t-2 output transfer before reusing this parity's output buffer
if (TILING_I % 2) {
mchan_channel_wait(channel_output_0);
mchan_channel_free(channel_output_0);
} else {
mchan_channel_wait(channel_output_1);
mchan_channel_free(channel_output_1);
}
}
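// Wait for the current input tile (its transfer was issued one iteration earlier)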
if (TILING_I % 2) {
mchan_channel_wait(channel_input_0);
mchan_channel_free(channel_input_0);
} else {
mchan_channel_wait(channel_input_1);
mchan_channel_free(channel_input_1);
}
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__M_ref
*DeeployNetwork_TILING_CODEGEN_L1__M_ref = DeeployNetwork_TILING_CODEGEN_L1__M[TILING_I];
_cluster_fork_args_t DeeployNetwork__cluster_fork_args =
(_cluster_fork_args_t){.DeeployNetwork_TILING_CODEGEN_L1__M_ref = DeeployNetwork_TILING_CODEGEN_L1__M_ref,
.DeeployNetwork_TILING_CODEGEN_L1__A_ref = DeeployNetwork_TILING_CODEGEN_L1__A_ref,
.DeeployNetwork_TILING_CODEGEN_L1__B_ref = DeeployNetwork_TILING_CODEGEN_L1__B_ref,
.DeeployNetwork_TILING_CODEGEN_L1__C_ref = DeeployNetwork_TILING_CODEGEN_L1__C_ref,
.DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = DeeployNetwork_TILING_CODEGEN_L1__data_out_ref};
pi_cl_team_fork(NUM_CORES, (void *)_cluster_fork, &DeeployNetwork__cluster_fork_args);
// Transfer current output tile
if (TILING_I % 2) {
channel_output_0 = mchan_channel_alloc();
} else {
channel_output_1 = mchan_channel_alloc();
}
mchan_transfer_2d_ext_strided(DeeployNetwork_TILING_CODEGEN_L1__data_out_cmd[TILING_I], DeeployNetwork_TILING_CODEGEN_L1__data_out_ref,
DeeployNetwork_TILING_CODEGEN_L1__output_0_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__output_0_ref
DeeployNetwork_TILING_CODEGEN_L1__output_0_ref += DeeployNetwork_TILING_CODEGEN_L1__data_out_relativeOffset[TILING_I];
// CLOSE TILING LOOP
}
*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr += 1;
// Wait for final output tile
mchan_channel_wait(channel_output_0);
mchan_channel_free(channel_output_0);
mchan_channel_wait(channel_output_1);
mchan_channel_free(channel_output_1);
// Deinitialize DMA future
// CLOSURE ARG WRITEBACK
}
```

Current Code

```c
static void _closure(void *_closure_args) {
// CLOSURE ARG CAST
_closure_args_t *args = (_closure_args_t *)_closure_args;
uint8_t *DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr = args->DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr;
// CLOSURE FUNCTION CALL
uint16_t *DeeployNetwork_TILING_CODEGEN_L1__M_ref = (uint16_t *)DeeployNetwork_TILING_CODEGEN_L1__M + 0;
float32_t *DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 5504);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 1408);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 8320);
float32_t *DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)((char *)DeeployNetwork_MEMORYARENA_L1 + 0);
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_ref = (void *)DeeployNetwork_input_0 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__A_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__A_ref + 1408;
void *DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref = (void *)DeeployNetwork_weight_tensor + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__B_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__B_ref + 2048;
void *DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref = (void *)DeeployNetwork_bias_tensor + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__C_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__C_ref + 704;
void *DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_ref = (void *)DeeployNetwork_output_0 + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_0 = (void *)DeeployNetwork_TILING_CODEGEN_L1__data_out_ref + 0;
void *DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_1 = (void *)DeeployNetwork_TILING_CODEGEN_L1__data_out_ref + 704;
// Initialize DMA future
uint32_t channel_output = (uint32_t)-1;
uint32_t channel_input = (uint32_t)-1;
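// Only one channel per direction: each transfer must be fully waited on
// before the next one in the same direction can be issued, so the prefetch
// of tile t+1 serializes behind the wait on tile t.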
// Transfer initial input tile
channel_input = mchan_channel_alloc();
mchan_transfer_1d(1443200, DeeployNetwork_TILING_CODEGEN_L1__A_ref, DeeployNetwork_TILING_CODEGEN_L1__input_0_ref);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__input_0_ref
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref += DeeployNetwork_TILING_CODEGEN_L1__A_relativeOffset[0];
// Transfer initial weight tile
mchan_transfer_2d_ext_strided(1968128, DeeployNetwork_TILING_CODEGEN_L1__B_ref, DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__B_relativeOffset[0];
// Transfer initial bias tile
mchan_transfer_2d_ext_strided(1966784, DeeployNetwork_TILING_CODEGEN_L1__C_ref, DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__C_relativeOffset[0];
// TILING LOOP
for (int TILING_I = DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr];
TILING_I < DeeployNetwork_TILING_CODEGEN_L1__numTiles[(*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr) + 1]; TILING_I++) {
switch ((TILING_I) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0;
DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__A_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__B_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__C_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1;
DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = (float32_t *)DeeployNetwork_TILING_CODEGEN_L1__output_0_buffer_1;
break;
}
// Wait for current input tile
mchan_channel_wait(channel_input);
mchan_channel_free(channel_input);
// DOUBLE BUFFERING CHECK TILE LOAD
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__A_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__input_0_buffer_1;
break;
}
// Transfer next input tile
channel_input = mchan_channel_alloc();
mchan_transfer_1d(DeeployNetwork_TILING_CODEGEN_L1__A_cmd[TILING_I + 1], DeeployNetwork_TILING_CODEGEN_L1__A_next,
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__input_0_ref
DeeployNetwork_TILING_CODEGEN_L1__input_0_ref += DeeployNetwork_TILING_CODEGEN_L1__A_relativeOffset[TILING_I + 1];
}
// DOUBLE BUFFERING CHECK TILE LOAD: prefetch next weight tile
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__B_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_buffer_1;
break;
}
// Transfer next weight tile
mchan_transfer_2d_ext_strided(1968128, DeeployNetwork_TILING_CODEGEN_L1__B_next, DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__weight_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__B_relativeOffset[TILING_I + 1];
}
// DOUBLE BUFFERING CHECK TILE LOAD: prefetch next bias tile
if ((TILING_I + 1) < DeeployNetwork_TILING_CODEGEN_L1__numTiles[*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr + 1]) {
switch ((TILING_I + 1) % 2) {
case 0:
DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_0;
break;
case 1:
DeeployNetwork_TILING_CODEGEN_L1__C_next = (void *)DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_buffer_1;
break;
}
// Transfer next bias tile
mchan_transfer_2d_ext_strided(DeeployNetwork_TILING_CODEGEN_L1__C_cmd[TILING_I + 1], DeeployNetwork_TILING_CODEGEN_L1__C_next,
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref
DeeployNetwork_TILING_CODEGEN_L1__bias_tensor_ref += DeeployNetwork_TILING_CODEGEN_L1__C_relativeOffset[TILING_I + 1];
}
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__M_ref
*DeeployNetwork_TILING_CODEGEN_L1__M_ref = DeeployNetwork_TILING_CODEGEN_L1__M[TILING_I];
_cluster_fork_args_t DeeployNetwork__cluster_fork_args =
(_cluster_fork_args_t){.DeeployNetwork_TILING_CODEGEN_L1__M_ref = DeeployNetwork_TILING_CODEGEN_L1__M_ref,
.DeeployNetwork_TILING_CODEGEN_L1__A_ref = DeeployNetwork_TILING_CODEGEN_L1__A_ref,
.DeeployNetwork_TILING_CODEGEN_L1__B_ref = DeeployNetwork_TILING_CODEGEN_L1__B_ref,
.DeeployNetwork_TILING_CODEGEN_L1__C_ref = DeeployNetwork_TILING_CODEGEN_L1__C_ref,
.DeeployNetwork_TILING_CODEGEN_L1__data_out_ref = DeeployNetwork_TILING_CODEGEN_L1__data_out_ref};
pi_cl_team_fork(NUM_CORES, (void *)_cluster_fork, &DeeployNetwork__cluster_fork_args);
// Wait for previous output tile
mchan_channel_wait(channel_output);
mchan_channel_free(channel_output);
// Transfer current output tile
channel_output = mchan_channel_alloc();
mchan_transfer_2d_ext_strided(DeeployNetwork_TILING_CODEGEN_L1__data_out_cmd[TILING_I], DeeployNetwork_TILING_CODEGEN_L1__data_out_ref,
DeeployNetwork_TILING_CODEGEN_L1__output_0_ref, 64, 128);
// UPDATE VARIABLE DeeployNetwork_TILING_CODEGEN_L1__output_0_ref
DeeployNetwork_TILING_CODEGEN_L1__output_0_ref += DeeployNetwork_TILING_CODEGEN_L1__data_out_relativeOffset[TILING_I];
// CLOSE TILING LOOP
}
*DeeployNetwork_TILING_CODEGEN_L1__tileIdxPtr += 1;
// Wait for final output tile
mchan_channel_wait(channel_output);
mchan_channel_free(channel_output);
// Deinitialize DMA future
// CLOSURE ARG WRITEBACK
}
```

Originally posted by @Xeratec in #114 (comment)