Performance improvements (~50%) running the model.

microbit-carlos · microbit-carlos · commit 2a93b24f21bb · 2024-06-11T20:20:06.000+01:00
Cached some values and removed some checks that were performed
more than once. Before, the model would take ~150 microseconds to run;
after these changes it takes ~75 microseconds.
diff --git a/mlrunner/ml4f.c b/mlrunner/ml4f.c
@@ -124,6 +124,17 @@ int ml4f_full_invoke(const ml4f_header_t *model, const float *input, float *outp
     return r;
 }
 
+int ml4f_full_invoke_arena(const ml4f_header_t *model, uint8_t *arena, const float *input, float *output) {
+    // `arena` is supplied by the caller; reject NULL pointers before any of them is dereferenced.
+    if (model == NULL || arena == NULL || input == NULL || output == NULL || !ml4f_is_valid_header(model))
+        return -1;
+    memcpy(arena + model->input_offset, input, ml4f_shape_size(ml4f_input_shape(model), model->input_type));
+    int r = ml4f_invoke(model, arena);
+    // The output is copied even when `r` is non-zero; callers must check the return value.
+    memcpy(output, arena + model->output_offset, ml4f_shape_size(ml4f_output_shape(model), model->output_type));
+    return r;
+}
+
 int ml4f_full_invoke_argmax(const ml4f_header_t *model, const float *input) {
     if (!ml4f_is_valid_header(model))
         return -1;
diff --git a/mlrunner/ml4f.h b/mlrunner/ml4f.h
@@ -53,6 +53,7 @@ uint32_t ml4f_shape_size(const uint32_t *shape, uint32_t type);
 int ml4f_argmax(const float *data, uint32_t size);
 
 int ml4f_full_invoke(const ml4f_header_t *model, const float *input, float *output);
+int ml4f_full_invoke_arena(const ml4f_header_t *model, uint8_t *arena, const float *input, float *output);
 int ml4f_full_invoke_argmax(const ml4f_header_t *model, const float *input);
 
 #ifdef __cplusplus
diff --git a/mlrunner/mlrunner.c b/mlrunner/mlrunner.c
@@ -4,15 +4,18 @@
 #include "mlrunner.h"
 
 // Pointer to the model in flash
-static uint32_t* MODEL_ADDRESS = NULL;
+static uint32_t *MODEL_ADDRESS = NULL;
+static uint8_t *model_arena = NULL;
+static size_t input_length = 0;
+static size_t output_length = 0;
 
 /*****************************************************************************/
 /* Private API                                                               */
 /*****************************************************************************/
 /**
  * @return True if the model header is valid, False otherwise.
  */
-static bool is_model_valid(const void* model_address) {
+static bool is_model_valid(const void *model_address) {
     ml_model_header_t *model_header = (ml_model_header_t *)model_address;
     if (model_header->magic0 != MODEL_HEADER_MAGIC0) {
         return false;
@@ -39,8 +42,8 @@ static bool is_model_valid(const void* model_address) {
  *
  * @return The ML4F model or NULL if the model is not present or invalid.
  */
-static ml4f_header_t* get_ml4f_model() {
-    if (MODEL_ADDRESS == NULL || !is_model_valid(MODEL_ADDRESS)) {
+static inline ml4f_header_t* get_ml4f_model() {
+    if (MODEL_ADDRESS == NULL) {
         return NULL;
     }
     ml_model_header_t *model_header = (ml_model_header_t *)MODEL_ADDRESS;
@@ -56,13 +59,41 @@ bool ml_setModel(const void *model_address) {
         return false;
     }
     MODEL_ADDRESS = (uint32_t *)model_address;
+
+    // Drop the lengths cached for any previously loaded model: the getters only
+    // recompute while the cache is 0, so stale values would otherwise survive.
+    input_length = 0;
+    output_length = 0;
+
+    // (Re)allocate the model arena; free(NULL) is a no-op so no guard is needed
+    const int model_arena_size = ml_getArenaSize();
+    free(model_arena);
+    model_arena = (model_arena_size > 0) ? malloc((size_t)model_arena_size) : NULL;
+    if (model_arena == NULL) {
+        // No usable arena: leave the runner in the "no model" state
+        MODEL_ADDRESS = NULL;
+        return false;
+    }
+
+    // Populate the cached input and output lengths for the new model
+    ml_getInputLength();
+    ml_getOutputLength();
+
     return true;
 }
 
 bool ml_isModelPresent() {
     return MODEL_ADDRESS != NULL;
 }
 
+int ml_getArenaSize() {
+    const ml4f_header_t *const header = get_ml4f_model();
+    if (header != NULL) {
+        return header->arena_bytes;
+    }
+    return -1;  // no model available
+}
+
 int ml_getSamplesPeriod() {
     const ml_model_header_t* const model_header = (ml_model_header_t*)MODEL_ADDRESS;
     if (model_header == NULL) {
@@ -88,19 +119,27 @@ int ml_getSampleDimensions() {
 }
 
 int ml_getInputLength() {
-    ml4f_header_t *ml4f_model = get_ml4f_model();
-    if (ml4f_model == NULL) {
-        return -1;
+    if (MODEL_ADDRESS == NULL || input_length == 0) {
+        ml4f_header_t *ml4f_model = get_ml4f_model();
+        if (ml4f_model == NULL) {
+            return -1;  // no model: never report a stale cached length
+        }
+        input_length = ml4f_shape_elements(ml4f_input_shape(ml4f_model));
     }
-    return ml4f_shape_elements(ml4f_input_shape(ml4f_model));
+    // Cached so the shape is not recomputed on every call
+    return (int)input_length;
 }
 
 int ml_getOutputLength() {
-    ml4f_header_t *ml4f_model = get_ml4f_model();
-    if (ml4f_model == NULL) {
-        return -1;
+    if (MODEL_ADDRESS == NULL || output_length == 0) {
+        ml4f_header_t *ml4f_model = get_ml4f_model();
+        if (ml4f_model == NULL) {
+            return -1;  // no model: never report a stale cached length
+        }
+        output_length = ml4f_shape_elements(ml4f_output_shape(ml4f_model));
     }
-    return ml4f_shape_elements(ml4f_output_shape(ml4f_model));
+    // Cached so the shape is not recomputed on every call
+    return (int)output_length;
 }
 
 // TODO: Remove this function and use ml_getLabels instead
@@ -241,69 +280,53 @@ ml_predictions_t *ml_allocatePredictions() {
     return predictions;
 }
 
-bool ml_predict(const float *input, const int in_len, const ml_actions_t *actions, ml_predictions_t *predictions_out) {
-    if (input == NULL || in_len <= 0 || actions == NULL || predictions_out == NULL) {
-        return false;
-    }
-
-    int model_output_len = ml_getOutputLength();
-    if (model_output_len <= 0 ||
-            model_output_len != (int)actions->len ||
-            model_output_len != (int)predictions_out->len) {
+bool ml_predict(const float *input, const size_t in_len, const ml_actions_t *actions, ml_predictions_t *predictions_out) {
+    if (output_length == 0 || actions == NULL || actions->len != output_length ||
+            predictions_out == NULL || predictions_out->len != output_length) {  // 0 == nothing cached (no model)
         return false;
     }
 
-    bool success = ml_runModel(input, in_len, &predictions_out->prediction, predictions_out->len);
+    bool success = ml_runModel(input, in_len, (float *)&predictions_out->prediction, output_length);
     if (!success) {
         return false;
     }
-    predictions_out->index = ml_calcPrediction(actions, &predictions_out->prediction, predictions_out->len);
+    predictions_out->index = ml_calcPrediction(actions, (float *)&predictions_out->prediction, output_length);
 
     return true;
 }
 
 
-bool ml_runModel(const float *input, const int in_len, float* individual_predictions, const int out_len) {
-    if (individual_predictions == NULL) {
-        return false;
-    }
-
-    int model_input_len = ml_getInputLength();
-    if (model_input_len <= 0 || model_input_len != in_len) {
-        return false;
-    }
-    int model_output_len = ml_getOutputLength();
-    if (model_output_len <= 0 || model_output_len != out_len) {
+bool ml_runModel(const float *input, const size_t in_len, float* individual_predictions, const size_t out_len) {
+    if (input == NULL || individual_predictions == NULL || input_length != in_len || output_length != out_len) {
         return false;
     }
 
     ml4f_header_t *ml4f_model = get_ml4f_model();
-    int r = ml4f_full_invoke(ml4f_model, input, individual_predictions);
+    int r = (ml4f_model && model_arena) ? ml4f_full_invoke_arena(ml4f_model, model_arena, input, individual_predictions) : -1;  // -1: no model/arena
     if (r != 0) {
         return false;
     }
 
     return true;
 }
 
-int ml_calcPrediction(const ml_actions_t *actions, const float* predictions, const int len) {
-    if (actions == NULL || predictions == NULL || len <= 0 || len != (int)actions->len) {
+int ml_calcPrediction(const ml_actions_t *actions, const float* predictions, const size_t len) {
+    if (actions == NULL || predictions == NULL || len == 0 || len != actions->len) {  // len 0 would declare a zero-length VLA
         return -1;
     }
 
     float predictions_above_threshold[len];
-    for (int i = 0; i < len; i++) {
+    for (size_t i = 0; i < len; i++) {
         if (predictions[i] >= actions->action[i].threshold) {
             predictions_above_threshold[i] = predictions[i];
         } else {
             predictions_above_threshold[i] = 0.0f;
         }
     }
     int max_index = ml4f_argmax(predictions_above_threshold, len);
-    if (max_index < 0 || max_index >= len) {
+    if (max_index < 0) {
         return -1;
     }
-
     // If the max predictionn is 0, then none were above the threshold
     if (predictions_above_threshold[max_index] == 0.0f) {
         max_index = -1;
diff --git a/mlrunner/mlrunner.h b/mlrunner/mlrunner.h
@@ -88,6 +88,14 @@ bool ml_setModel(const void *model_address);
  */
 bool ml_isModelPresent();
 
+/**
+ * @brief Get the arena size that has been allocated to run the loaded model.
+ *
+ * @return The size, in bytes, of the arena required for the model.
+ *         Or -1 if the model is not present.
+ */
+int ml_getArenaSize();
+
 /**
  * @brief Get the period between samples required for the model.
  *
@@ -179,7 +187,7 @@ ml_predictions_t *ml_allocatePredictions();
  *        Or -1 if the model is not present, the actions or input length
  *        doesn't match, or the prediction failed.
  */
-bool ml_predict(const float *input, const int in_len, const ml_actions_t *actions, ml_predictions_t *predictions_out);
+bool ml_predict(const float *input, const size_t in_len, const ml_actions_t *actions, ml_predictions_t *predictions_out);
 
 /**
  * @brief Run the model and return the individual predictions for each action.
@@ -191,7 +199,7 @@ bool ml_predict(const float *input, const int in_len, const ml_actions_t *action
  * @return True if the model is present and the model run was successful,
  *         False otherwise.
  */
-bool ml_runModel(const float *input, const int in_len, float* predictions_out, const int out_len);
+bool ml_runModel(const float *input, const size_t in_len, float* predictions_out, const size_t out_len);
 
 /**
  * @brief Calculate the overall prediction based on the actions thresholds.
@@ -204,7 +212,7 @@ bool ml_runModel(const float *input, const int in_len, float* predictions_out, c
  *        Or -1 if the model is not present, the actions or predictions length
  *        doesn't match, or the prediction failed.
  */
-int ml_calcPrediction(const ml_actions_t *actions, const float* predictions, const int len);
+int ml_calcPrediction(const ml_actions_t *actions, const float* predictions, const size_t len);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/testextension.cpp b/testextension.cpp
@@ -20,6 +20,7 @@
 
 namespace testrunner {
     static ml_actions_t *actions = NULL;
+    static ml_predictions_t *predictions = NULL;
     static bool initialised = false;
     static const uint16_t ML_CODAL_TIMER_VALUE = 1;
 
@@ -40,18 +41,12 @@ namespace testrunner {
     void runModel() {
         if (!initialised) return;
 
-        ml_predictions_t *predictions = ml_allocatePredictions();
-        if (predictions == NULL) {
-            DEBUG_PRINT("Failed to allocate memory for predictions\n");
-            uBit.panic(TEST_RUNNER_ERROR + 9);
-        }
-
         unsigned int time_start = system_timer_current_time_us();
 
         float *modelData = mlDataProcessor.getProcessedData();
         if (modelData == NULL) {
             DEBUG_PRINT("Failed to processed data for the model\n");
-            uBit.panic(TEST_RUNNER_ERROR + 10);
+            uBit.panic(TEST_RUNNER_ERROR + 21);
         }
 
         unsigned int time_mid = system_timer_current_time_us();
@@ -60,7 +55,7 @@ namespace testrunner {
             modelData, mlDataProcessor.getProcessedDataSize(), actions, predictions);
         if (!success) {
             DEBUG_PRINT("Failed to run model\n");
-            uBit.panic(TEST_RUNNER_ERROR + 11);
+            uBit.panic(TEST_RUNNER_ERROR + 22);
         }
 
         unsigned int time_end = system_timer_current_time_us();
@@ -74,7 +69,7 @@ namespace testrunner {
         } else {
             DEBUG_PRINT("None\n");
         }
-        DEBUG_PRINT("\tPredictions:");
+        DEBUG_PRINT("\tIndividual:");
         for (size_t i = 0; i < actions->len; i++) {
             DEBUG_PRINT(" %s [%d]",
                         actions->action[i].label,
@@ -83,8 +78,6 @@ namespace testrunner {
         DEBUG_PRINT("\n\n");
 
         MicroBitEvent evt(TEST_RUNNER_ID_INFERENCE, predictions->index + 2);
-
-        free(predictions);
     }
 
     void recordAccData(MicroBitEvent) {
@@ -172,21 +165,41 @@ namespace testrunner {
             uBit.panic(TEST_RUNNER_ERROR + 6);
         }
 
+        const int modelOutputLen = ml_getOutputLength();  // validate the output side, not the input again
+        DEBUG_PRINT("\tModel output length: %d\n", modelOutputLen);
+        if (modelOutputLen <= 0) {
+            DEBUG_PRINT("Model output length invalid\n");
+            uBit.panic(TEST_RUNNER_ERROR + 7);
+        }
+
+        const int modelArenaSize = ml_getArenaSize();
+        DEBUG_PRINT("\tModel arena size: %d bytes\n", modelArenaSize);
+        if (modelArenaSize <= 0) {
+            DEBUG_PRINT("Model arena size length invalid\n");
+            uBit.panic(TEST_RUNNER_ERROR + 8);
+        }
+
         actions = ml_allocateActions();
         if (actions == NULL) {
             DEBUG_PRINT("Failed to allocate memory for actions\n");
-            uBit.panic(TEST_RUNNER_ERROR + 7);
+            uBit.panic(TEST_RUNNER_ERROR + 9);
         }
         const bool getActionsSuccess = ml_getActions(actions);
         if (!getActionsSuccess) {
             DEBUG_PRINT("Failed to retrieve actions\n");
-            uBit.panic(TEST_RUNNER_ERROR + 8);
+            uBit.panic(TEST_RUNNER_ERROR + 10);
         }
         DEBUG_PRINT("\tActions (%d):\n", actions->len);
         for (size_t i = 0; i < actions->len; i++) {
             DEBUG_PRINT("\t\t'%s' threshold = %d%%\n", actions->action[i].label, (int)(actions->action[i].threshold * 100));
         }
 
+        predictions = ml_allocatePredictions();
+        if (predictions == NULL) {
+            DEBUG_PRINT("Failed to allocate memory for predictions\n");
+            uBit.panic(TEST_RUNNER_ERROR + 11);
+        }
+
         const MlDataProcessorConfig_t mlDataConfig = {
             .samples = samplesLen,
             .dimensions = sampleDimensions,
@@ -198,7 +211,7 @@ namespace testrunner {
         if (mlInitResult != MLDP_SUCCESS) {
             DEBUG_PRINT("Failed to initialise ML data processor (%d)\n", mlInitResult);
             // TODO: Check error type and set panic value accordingly
-            uBit.panic(TEST_RUNNER_ERROR + 8);
+            uBit.panic(TEST_RUNNER_ERROR + 12);
         }
 
         // Set up background timer to collect data and run model