# Patch summary (leading text before the first "diff --git" header; the patch
# itself is unchanged below).
#
# CMakeLists.txt
#   - `pulp-nnx` becomes an INTERFACE library that links `pulp-nnx-hal` and,
#     when ENABLE_BSP is on, `pulp-nnx-bsp`.
#   - New STATIC library `pulp-nnx-hal` receives the util/ sources, the
#     `inc`/`util` include directories and, per selected accelerator, the
#     <accel>/hal sources plus the <accel>/hal and <accel>/gvsoc include dirs.
#   - New option ENABLE_BSP. When on, STATIC library `pulp-nnx-bsp` is created
#     and receives, per selected accelerator, the <accel>/bsp source, the
#     matching src/pulp_nnx_<accel>.c file and the <accel>/bsp include dir.
#   - NOTE(review): the neureka_v2 BSP source changes from
#     neureka_v2/bsp/neureka_v2_siracusa_bsp.c to
#     neureka_v2/bsp/neureka_v2_pulp_bsp.c, but no rename of that file is part
#     of this patch -- confirm the new path exists.
#   - NOTE(review): `pulp-nnx-bsp` only gets the */bsp include directories and
#     is not linked against `pulp-nnx-hal` here; presumably its sources need
#     headers from inc/, util/ and */hal -- verify they still resolve.
#
# inc/pulp_nnx_neureka.h
#   - Drops the include of "neureka_siracusa_bsp.h".
#
# neureka/hal/neureka_task.c (neureka_task_set_strides)
#   - weights_stride.d0 is now assigned inside each kernel-mode branch from
#     NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 or NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3
#     instead of once from NEUREKA_WEIGHT_BANDWIDTH_BYTES.
#   - NOTE(review): the 1x1 d1 stride changes from
#     NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in to
#     (NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in; with the
#     default values in this patch (256 / 8 = 32 bytes) the two agree only
#     when task->qw == 8 -- confirm this is intended.
#
# neureka/hal/neureka_task_defs.h
#   - Adds NNX_NEUREKA_PE_H, NNX_NEUREKA_PE_W and NNX_NEUREKA_BANDWIDTH_3x3
#     behind #ifndef guards (so a compiler flag can predefine them) and an
#     unguarded NNX_NEUREKA_BANDWIDTH_1x1.
#   - Subtile input/output heights and widths are derived from the PE macros
#     (3x3 input uses PE + 2); NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 gets an
#     #ifndef guard.
#   - NEUREKA_WEIGHT_BANDWIDTH_BYTES is replaced by the _1x1 and _3x3 variants,
#     each defined as the corresponding NNX_NEUREKA_BANDWIDTH_* value / 8.
#
# util/hwpe.c, util/hwpe.h
#   - Every hwpe_* function now takes `const hwpe_dev_t *dev` instead of
#     `hwpe_dev_t *dev`; bodies are otherwise unchanged.
#
diff --git a/CMakeLists.txt b/CMakeLists.txt index ccc952d..324a091 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,70 +1,109 @@ cmake_minimum_required(VERSION 3.18) project(pulp-nnx - VERSION 0.3.0 - DESCRIPTION "Kernel library for PULP-based NN accelerators." - LANGUAGES C) + VERSION 0.3.0 + DESCRIPTION "Kernel library for PULP-based NN accelerators." + LANGUAGES C) -add_library(pulp-nnx STATIC) +add_library(pulp-nnx INTERFACE) -target_sources(pulp-nnx PRIVATE util/pulp_nnx_util.c util/hwpe.c) -target_include_directories(pulp-nnx PUBLIC inc util) +add_library(pulp-nnx-hal STATIC) + +target_sources(pulp-nnx-hal PRIVATE util/pulp_nnx_util.c util/hwpe.c) +target_include_directories(pulp-nnx-hal PUBLIC inc util) option(USE_NE16 "Use the NE16 accelerator.") option(USE_NEUREKA "Use the N-EUREKA accelerator.") option(USE_NEUREKA_V2 "Use the N-EUREKA v2 accelerator.") +option(ENABLE_BSP "Enable the build of the BSP for your chosen accelerator. Requires the PULP-SDK.") if (NOT ${USE_NE16} AND NOT ${USE_NEUREKA} AND NOT ${USE_NEUREKA_V2}) - message(FATAL_ERROR "[PULP-NNX] No accelerator in use. Please set an appropriate USE_ option.") + message(FATAL_ERROR "[PULP-NNX] No accelerator in use. 
Please set an appropriate USE_ option.") +endif() + +if(${ENABLE_BSP}) + add_library(pulp-nnx-bsp STATIC) endif() if (${USE_NE16}) - message(STATUS "[PULP-NNX] Using the NE16 accelerator.") - target_sources(pulp-nnx - PRIVATE - ne16/bsp/ne16_pulp_bsp.c - ne16/hal/ne16.c - ne16/hal/ne16_task.c - src/pulp_nnx_ne16.c - ) - target_include_directories(pulp-nnx - PUBLIC - ne16/bsp - ne16/hal - ne16/gvsoc - ) + message(STATUS "[PULP-NNX] Using the NE16 accelerator.") + target_sources(pulp-nnx-hal + PRIVATE + ne16/hal/ne16.c + ne16/hal/ne16_task.c + ) + target_include_directories(pulp-nnx-hal + PUBLIC + ne16/hal + ne16/gvsoc + ) + if(${ENABLE_BSP}) + target_sources(pulp-nnx-bsp + PRIVATE + ne16/bsp/ne16_pulp_bsp.c + src/pulp_nnx_ne16.c + ) + target_include_directories(pulp-nnx-bsp + PUBLIC + ne16/bsp + ) + endif() + endif() if (${USE_NEUREKA}) - message(STATUS "[PULP-NNX] Using the N-EUREKA accelerator.") - target_sources(pulp-nnx - PRIVATE - neureka/bsp/neureka_siracusa_bsp.c - neureka/hal/neureka.c - neureka/hal/neureka_task.c - src/pulp_nnx_neureka.c - ) - target_include_directories(pulp-nnx - PUBLIC - neureka/bsp - neureka/hal - neureka/gvsoc - ) + message(STATUS "[PULP-NNX] Using the N-EUREKA accelerator.") + target_sources(pulp-nnx-hal + PRIVATE + neureka/hal/neureka.c + neureka/hal/neureka_task.c + ) + target_include_directories(pulp-nnx-hal + PUBLIC + neureka/hal + neureka/gvsoc + ) + if(${ENABLE_BSP}) + target_sources(pulp-nnx-bsp + PRIVATE + neureka/bsp/neureka_siracusa_bsp.c + src/pulp_nnx_neureka.c + ) + target_include_directories(pulp-nnx-bsp + PUBLIC + neureka/bsp + ) + endif() + endif() if (${USE_NEUREKA_V2}) - message(STATUS "[PULP-NNX] Using the N-EUREKA v2 accelerator.") - target_sources(pulp-nnx - PRIVATE - neureka_v2/bsp/neureka_v2_siracusa_bsp.c - neureka_v2/hal/neureka_v2.c - neureka_v2/hal/neureka_v2_task.c - src/pulp_nnx_neureka_v2.c - ) - target_include_directories(pulp-nnx - PUBLIC - neureka_v2/bsp - neureka_v2/hal - neureka_v2/gvsoc - ) + 
message(STATUS "[PULP-NNX] Using the N-EUREKA v2 accelerator.") + target_sources(pulp-nnx-hal + PRIVATE + neureka_v2/hal/neureka_v2.c + neureka_v2/hal/neureka_v2_task.c + ) + target_include_directories(pulp-nnx-hal + PUBLIC + neureka_v2/hal + neureka_v2/gvsoc + ) + if(${ENABLE_BSP}) + target_sources(pulp-nnx-bsp + PRIVATE + neureka_v2/bsp/neureka_v2_pulp_bsp.c + src/pulp_nnx_neureka_v2.c + ) + target_include_directories(pulp-nnx-bsp + PUBLIC + neureka_v2/bsp + ) + endif() + +endif() + +target_link_libraries(pulp-nnx INTERFACE pulp-nnx-hal) +if(${ENABLE_BSP}) + target_link_libraries(pulp-nnx INTERFACE pulp-nnx-bsp) endif() diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h index fea4bb4..0ddef59 100644 --- a/inc/pulp_nnx_neureka.h +++ b/inc/pulp_nnx_neureka.h @@ -19,7 +19,6 @@ */ #include "neureka.h" -#include "neureka_siracusa_bsp.h" #include "neureka_task.h" #include diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c index 9a311dc..18939b8 100644 --- a/neureka/hal/neureka_task.c +++ b/neureka/hal/neureka_task.c @@ -165,14 +165,17 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in, .d2 = h_out_stride}; task->data.cfg.output_stride = output_stride; - task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES; if (task->kernel_shape == 1) { // 1x1 + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1; task->data.cfg.weights_stride.d1 = - NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in; + (NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in; } else if (!task->depthwise) { // 3x3 + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3; task->data.cfg.weights_stride.d1 = - NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in; + NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 * task->qw * num_k_in; + } else { // 3x3 depthwise + task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3; task->data.cfg.weights_stride.d1 = 0; } task->data.cfg.weights_stride.d2 = 0; diff --git 
a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h index fa08289..b48f78e 100644 --- a/neureka/hal/neureka_task_defs.h +++ b/neureka/hal/neureka_task_defs.h @@ -21,22 +21,38 @@ #ifndef __NEUREKA_DEFS_H__ #define __NEUREKA_DEFS_H__ -/* ARHITECTURE */ - -#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (6) -#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (6) +/* ARCHITECTURE */ +// The definitions wrapped in #ifndefs can be overwritten with compiler flags +// for different parametrizations of the Neureka architecture +#ifndef NNX_NEUREKA_PE_H +#define NNX_NEUREKA_PE_H (6) +#endif +#ifndef NNX_NEUREKA_PE_W +#define NNX_NEUREKA_PE_W (6) +#endif +#define NNX_NEUREKA_BANDWIDTH_1x1 (256) + +#ifndef NNX_NEUREKA_BANDWIDTH_3x3 +#define NNX_NEUREKA_BANDWIDTH_3x3 (256) +#endif + +#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (NNX_NEUREKA_PE_H) +#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (NNX_NEUREKA_PE_W) #define NEUREKA_SUBTILE_INPUT_CHANNEL_1x1 (32) -#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (8) -#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (8) +#define NEUREKA_SUBTILE_INPUT_HEIGHT_3x3 (NNX_NEUREKA_PE_H + 2) +#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (NNX_NEUREKA_PE_W + 2) +#ifndef NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 #define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (28) +#endif -#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (6) -#define NEUREKA_SUBTILE_OUTPUT_WIDTH (6) +#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (NNX_NEUREKA_PE_H) +#define NEUREKA_SUBTILE_OUTPUT_WIDTH (NNX_NEUREKA_PE_W) #define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32) #define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32) -#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 (NNX_NEUREKA_BANDWIDTH_1x1 / 8) +#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 (NNX_NEUREKA_BANDWIDTH_3x3 / 8) /* TASK REGISTERS */ diff --git a/util/hwpe.c b/util/hwpe.c index 0430081..1f99c1f 100644 --- a/util/hwpe.c +++ b/util/hwpe.c @@ -30,33 +30,34 @@ #define HWPE_SWSYNC 6 #define HWPE_TASK_REG_OFFSET 8 -inline void hwpe_reg_write(hwpe_dev_t *dev, int 
reg, uint32_t value) { +inline void hwpe_reg_write(const hwpe_dev_t *dev, int reg, uint32_t value) { dev->base_addr[reg] = value; } -inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) { +inline uint32_t hwpe_reg_read(const hwpe_dev_t *dev, int reg) { return dev->base_addr[reg]; } -inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) { +inline void hwpe_task_reg_write(const hwpe_dev_t *dev, int reg, + uint32_t value) { hwpe_reg_write(dev, HWPE_TASK_REG_OFFSET + reg, value); } -inline uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg) { +inline uint32_t hwpe_task_reg_read(const hwpe_dev_t *dev, int reg) { return hwpe_reg_read(dev, HWPE_TASK_REG_OFFSET + reg); } -void hwpe_soft_clear(hwpe_dev_t *dev) { +void hwpe_soft_clear(const hwpe_dev_t *dev) { hwpe_reg_write(dev, HWPE_SOFT_CLEAR, 0); for (volatile int i = 0; i < 10; i++) ; } -uint32_t hwpe_task_queue_status(hwpe_dev_t *dev) { +uint32_t hwpe_task_queue_status(const hwpe_dev_t *dev) { return hwpe_reg_read(dev, HWPE_STATUS); } -int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id) { +int hwpe_task_queue_acquire_task(const hwpe_dev_t *dev, uint8_t *id) { uint32_t read_value = (int32_t)hwpe_reg_read(dev, HWPE_ACQUIRE); if (read_value >= 256) { return 1; @@ -66,20 +67,21 @@ int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id) { } } -void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len) { +void hwpe_task_queue_write_task(const hwpe_dev_t *dev, uint32_t *data, + int len) { for (int i = 0; i < len; i++) { hwpe_task_reg_write(dev, i, data[i]); } } -void hwpe_task_queue_release_and_run(hwpe_dev_t *dev) { +void hwpe_task_queue_release_and_run(const hwpe_dev_t *dev) { hwpe_reg_write(dev, HWPE_TRIGGER, 0); } -void hwpe_task_queue_release(hwpe_dev_t *dev) { +void hwpe_task_queue_release(const hwpe_dev_t *dev) { hwpe_reg_write(dev, HWPE_TRIGGER, 1); } -uint8_t hwpe_last_task_id(hwpe_dev_t *dev) { +uint8_t hwpe_last_task_id(const hwpe_dev_t *dev) { return 
(uint8_t)hwpe_reg_read(dev, HWPE_RUNNING_JOB); } diff --git a/util/hwpe.h b/util/hwpe.h index 52bf912..7b7f65c 100644 --- a/util/hwpe.h +++ b/util/hwpe.h @@ -28,16 +28,16 @@ typedef struct hwpe_dev_t { volatile uint32_t *base_addr; } hwpe_dev_t; -void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value); -uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg); -void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value); -uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg); -void hwpe_soft_clear(hwpe_dev_t *dev); -uint32_t hwpe_task_queue_status(hwpe_dev_t *dev); -int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id); -void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len); -void hwpe_task_queue_release_and_run(hwpe_dev_t *dev); -void hwpe_task_queue_release(hwpe_dev_t *dev); -uint8_t hwpe_last_task_id(hwpe_dev_t *dev); +void hwpe_reg_write(const hwpe_dev_t *dev, int reg, uint32_t value); +uint32_t hwpe_reg_read(const hwpe_dev_t *dev, int reg); +void hwpe_task_reg_write(const hwpe_dev_t *dev, int reg, uint32_t value); +uint32_t hwpe_task_reg_read(const hwpe_dev_t *dev, int reg); +void hwpe_soft_clear(const hwpe_dev_t *dev); +uint32_t hwpe_task_queue_status(const hwpe_dev_t *dev); +int hwpe_task_queue_acquire_task(const hwpe_dev_t *dev, uint8_t *id); +void hwpe_task_queue_write_task(const hwpe_dev_t *dev, uint32_t *data, int len); +void hwpe_task_queue_release_and_run(const hwpe_dev_t *dev); +void hwpe_task_queue_release(const hwpe_dev_t *dev); +uint8_t hwpe_last_task_id(const hwpe_dev_t *dev); #endif // !__HWPE_H__