Skip to content

Commit a9ad3c6

Browse files
zingo authored and facebook-github-bot committed
Arm backend: Improve memory config and documentation in the runtime (pytorch#5580)
Summary: Pull Request resolved: pytorch#5580 Reviewed By: digantdesai Differential Revision: D63637392 Pulled By: mergennachin fbshipit-source-id: 527414da91f072456cc49eb20cd68d12dde9e400
1 parent 06ce226 commit a9ad3c6

File tree

1 file changed

+51
-16
lines changed

1 file changed

+51
-16
lines changed

examples/arm/executor_runner/arm_executor_runner.cpp

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,39 @@
2323
#include "arm_perf_monitor.h"
2424

2525
#ifdef SEMIHOSTING
26-
// In our unit test flow, we have the capability to provide an entire model to
27-
// the Corstone-3xx FVP using semi hosting. Hence, the input allocation pool
28-
// needs to be large enough to take an entire model. On the FVP,
29-
// network_model_sec is linked to the DDR, which is large (256MB on
30-
// Corstone-300).
31-
const size_t input_allocation_pool_size = 100 * 1024 * 1024;
26+
27+
/**
28+
* The input_file_allocation_pool should be large enough to fit the various
29+
* input file data used when loading the data files when running semihosting
30+
* e.g. the input file data and the pte file data
31+
* In our unit test flow, we have the capability to provide an entire model to
32+
* the Corstone-3xx FVP using semi hosting. Hence, the input file allocation
33+
* pool needs to be large enough to take an entire model and input. On the FVP,
34+
* network_model_sec is linked to the DDR, which is large (256MB on
35+
* Corstone-300).
36+
* If you use semihosting on your HW this can be lowered to fit your
37+
* files/memory
38+
*/
39+
40+
const size_t input_file_allocation_pool_size = 60 * 1024 * 1024;
3241
unsigned char __attribute__((
3342
section("network_model_sec"),
34-
aligned(16))) input_allocation_pool[input_allocation_pool_size];
35-
// memory for the model will be allocated from the input_allocation_pool
43+
aligned(16))) input_file_allocation_pool[input_file_allocation_pool_size];
3644
char* model_pte = nullptr;
45+
3746
#else
47+
3848
/**
3949
* This header file is generated by the build process based on the .pte file
4050
* specified in the ET_PTE_FILE_PATH variable to the cmake build.
4151
* Control of the action of the .pte, its use of operators and delegates, and
4252
* which are included in the bare metal build are also orchestrated by the
4353
* CMakeLists file. For example use see examples/arm/run.sh
54+
*
55+
* e.g. This includes the pte as a big chunk of data struct into this file
4456
*/
4557
#include "model_pte.h"
58+
4659
#endif
4760

4861
using executorch::aten::ScalarType;
@@ -63,12 +76,34 @@ using executorch::runtime::Span;
6376
using executorch::runtime::Tag;
6477
using executorch::runtime::TensorInfo;
6578

66-
#define METHOD_ALLOCATOR_POOL_SIZE (70 * 1024 * 1024)
79+
/**
80+
* The method_allocation_pool should be large enough to fit the setup, input
81+
* used and other data used like the planned memory pool (e.g. memory-planned
82+
* buffers to use for mutable tensor data) In this example we run on a
83+
* Corstone-3xx FVP so we can use a lot of memory to be able to run and test
84+
* large models. If you run on HW this should be lowered to fit into your
85+
* available memory.
86+
*/
87+
#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
88+
#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (60 * 1024 * 1024)
89+
#endif
90+
const size_t method_allocation_pool_size =
91+
ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE;
6792
unsigned char __attribute__((
6893
section("network_model_sec"),
69-
aligned(16))) method_allocation_pool[METHOD_ALLOCATOR_POOL_SIZE];
94+
aligned(16))) method_allocation_pool[method_allocation_pool_size];
7095

71-
const size_t temp_allocation_pool_size = 1 * 1024 * 1024;
96+
/**
97+
* The temp_allocation_pool is used for allocating temporary data during kernel
98+
* or delegate execution. This will be reset after each kernel or delegate call.
99+
* Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
100+
* a better fit
101+
*/
102+
#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE
103+
#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
104+
#endif
105+
const size_t temp_allocation_pool_size =
106+
ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE;
72107
unsigned char __attribute__((
73108
section("network_model_sec"),
74109
aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
@@ -256,8 +291,8 @@ int main(int argc, const char* argv[]) {
256291

257292
#ifdef SEMIHOSTING
258293
const char* output_basename = nullptr;
259-
MemoryAllocator input_allocator(
260-
input_allocation_pool_size, input_allocation_pool);
294+
MemoryAllocator input_file_allocator(
295+
input_file_allocation_pool_size, input_file_allocation_pool);
261296

262297
/* parse input parameters */
263298
for (int i = 0; i < argc; i++) {
@@ -271,13 +306,13 @@ int main(int argc, const char* argv[]) {
271306
++nbr_inputs,
272307
input_tensor_filename);
273308
auto [buffer, buffer_size] =
274-
read_binary_file(input_tensor_filename, input_allocator);
309+
read_binary_file(input_tensor_filename, input_file_allocator);
275310
input_buffers.push_back(std::make_pair(buffer, buffer_size));
276311
} else if (std::strcmp(argv[i], "-m") == 0) {
277312
const char* pte_filename = argv[++i];
278313
ET_LOG(Info, "Reading pte model from file %s", pte_filename);
279314
auto [buffer, buffer_size] =
280-
read_binary_file(pte_filename, input_allocator);
315+
read_binary_file(pte_filename, input_file_allocator);
281316
// Store the model data with the same variable as if it was loaded
282317
// from compiled in location.
283318
model_pte = buffer;
@@ -320,7 +355,7 @@ int main(int argc, const char* argv[]) {
320355
}
321356

322357
MemoryAllocator method_allocator(
323-
METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool);
358+
method_allocation_pool_size, method_allocation_pool);
324359

325360
std::vector<uint8_t*> planned_buffers; // Owns the memory
326361
std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator

0 commit comments

Comments
 (0)