Skip to content

Commit 8bb3b07

Browse files
Implement experimental intermediate cross CPU EP allocation (microsoft#24371)
### Description <!-- Describe your changes. --> ONNX Runtime manages a number of CPU-based accelerators, i.e. those that can operate on CPU-based inputs. However, several of them, like `Qnn`, `Openvino` and `Vitis`, may require CPU-based inputs to be aligned to 4K so they can be memory mapped, or may prefer to override the device with their own CPU-accessible allocator. To mitigate that, we introduce a new CPU-based allocator that produces 4K-aligned memory. We also adjust the allocation planner to override the plain CPU device. When we detect a compiled CPU-based EP, we adjust the device accordingly by requesting the EP to return `OrtMemType::OrtMemTypeCPUInput`. This gives the EP an opportunity to return either a GPU/NPU device or a CPU device, depending on the mode it is operating in. We select the device with the larger alignment between CPU default devices. We also adjust memory patterns to make sure 4K alignment is respected in the contiguous buffers when appropriate. ### Motivation and Context CPU-based providers accept CPU-based inputs, but some have a requirement of 4K-aligned allocations; otherwise the input incurs an extra copy. This is especially noticeable with intermediate values that are produced by upstream CPU-based nodes. Qnn has its own allocator when it is enabled; we make sure it is correctly advertised to the allocation planner. This PR excludes Qnn allocator usage for intermediate values due to the overhead contributed by memhandle management. Cc: @quic-ashigarg --------- Co-authored-by: edgchen1 <[email protected]>
1 parent 3a7c8b3 commit 8bb3b07

19 files changed

+342
-148
lines changed

cmake/onnxruntime_unittests.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,7 +1291,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
12911291
endif()
12921292
if (CMAKE_SYSTEM_NAME MATCHES "AIX")
12931293
list(APPEND onnxruntime_perf_test_libs onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 gtest absl_failure_signal_handler absl_examine_stack absl_flags_parse absl_flags_usage absl_flags_usage_internal)
1294-
endif()
1294+
endif()
12951295
target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads)
12961296
if(WIN32)
12971297
target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32)
@@ -1301,7 +1301,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
13011301
endif()
13021302
set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
13031303

1304-
endif()
1304+
endif()
13051305

13061306

13071307
if(onnxruntime_USE_QNN)

include/onnxruntime/core/framework/allocator.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ struct OrtArenaCfg {
4141

4242
namespace onnxruntime {
4343
constexpr const char* CPU = "Cpu";
44+
constexpr const char* CPU_ALIGNED_4K = "CpuAligned4K";
4445
constexpr const char* CUDA = "Cuda";
4546
constexpr const char* CUDA_PINNED = "CudaPinned";
4647
constexpr const char* CANN = "Cann";
@@ -57,6 +58,7 @@ constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer";
5758
constexpr const char* WEBNN_TENSOR = "WebNN_Tensor";
5859

5960
constexpr size_t kAllocAlignment = 256;
61+
constexpr const size_t kAlloc4KAlignment = 4096;
6062

6163
class IAllocator;
6264
class Stream;
@@ -270,4 +272,7 @@ using AllocatorMap = std::map<OrtDevice, AllocatorPtr>;
270272

271273
void* AllocatorDefaultAlloc(size_t size);
272274
void AllocatorDefaultFree(void* p);
275+
void* AllocatorDefaultAllocAligned(size_t size, size_t alignment);
276+
void AllocatorDefaultFreeAligned(void* p, size_t alignment);
277+
273278
} // namespace onnxruntime

include/onnxruntime/core/framework/ortdevice.h

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ struct OrtDevice {
1111
using DeviceType = int8_t;
1212
using MemoryType = int8_t;
1313
using DeviceId = int16_t;
14+
using Alignment = size_t;
1415

1516
// Pre-defined device types.
1617
static const DeviceType CPU = 0;
@@ -28,31 +29,40 @@ struct OrtDevice {
2829
static const MemoryType QNN_HTP_SHARED = 4;
2930
};
3031

31-
constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_)
32+
constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_, Alignment alignment) noexcept
3233
: device_type(device_type_),
3334
memory_type(memory_type_),
34-
device_id(device_id_) {}
35+
device_id(device_id_),
36+
alignment(alignment) {}
3537

36-
constexpr OrtDevice() : OrtDevice(CPU, MemType::DEFAULT, 0) {}
38+
constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_) noexcept
39+
: OrtDevice(device_type_, memory_type_, device_id_, 0) {}
3740

38-
DeviceType Type() const {
41+
constexpr OrtDevice() noexcept : OrtDevice(CPU, MemType::DEFAULT, 0) {}
42+
43+
DeviceType Type() const noexcept {
3944
return device_type;
4045
}
4146

42-
MemoryType MemType() const {
47+
MemoryType MemType() const noexcept {
4348
return memory_type;
4449
}
4550

46-
DeviceId Id() const {
51+
DeviceId Id() const noexcept {
4752
return device_id;
4853
}
4954

55+
Alignment GetAlignment() const noexcept {
56+
return alignment;
57+
}
58+
5059
std::string ToString() const {
5160
std::ostringstream ostr;
5261
ostr << "Device:["
5362
<< "DeviceType:" << static_cast<int>(device_type)
5463
<< " MemoryType:" << static_cast<int>(memory_type)
5564
<< " DeviceId:" << device_id
65+
<< " Alignment:" << alignment
5666
<< "]";
5767
return ostr.str();
5868
}
@@ -62,6 +72,7 @@ struct OrtDevice {
6272
auto h = std::hash<int>()(device_type);
6373
onnxruntime::HashCombine(memory_type, h);
6474
onnxruntime::HashCombine(device_id, h);
75+
onnxruntime::HashCombine(alignment, h);
6576
return h;
6677
}
6778

@@ -71,8 +82,10 @@ struct OrtDevice {
7182
return device_type < other.device_type;
7283
if (memory_type != other.memory_type)
7384
return memory_type < other.memory_type;
85+
if (device_id != other.device_id)
86+
return device_id < other.device_id;
7487

75-
return device_id < other.device_id;
88+
return alignment < other.alignment;
7689
}
7790

7891
private:
@@ -84,6 +97,9 @@ struct OrtDevice {
8497

8598
// Device index.
8699
int32_t device_id : 16;
100+
101+
// Required alignment
102+
Alignment alignment;
87103
};
88104

89105
inline bool operator==(const OrtDevice& left, const OrtDevice& other) {

onnxruntime/core/framework/allocation_planner.cc

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <sstream>
99
#include <ctime>
1010
#include <iomanip>
11+
#include <iterator>
1112
#include "core/common/exceptions.h"
1213
#include "core/common/inlined_containers.h"
1314
#include "core/common/safeint.h"
@@ -725,6 +726,25 @@ class PlannerImpl {
725726
ProcessDef(index, graph_viewer_.GetNodeArg(pair.first));
726727
}
727728

729+
// If the suggested_device is also CPU and default mem type, then
730+
// we check which one has higher alignment and use that one if it is so.
731+
// If the suggested device is CPU, but not the default mem type, then
732+
// it is a CPU accessible memory device allocator. They typically have a page aligment
733+
// so that would satisfy the alignment requirement of any other CPU consumers.
734+
// If one device is not on CPU, we default on the one that is CPU.
735+
auto determine_device = [](const OrtDevice& output_device, const OrtDevice& suggested_device) -> OrtDevice {
736+
if (output_device.Type() == OrtDevice::CPU && suggested_device.Type() == OrtDevice::CPU) {
737+
if (output_device.MemType() == OrtDevice::MemType::DEFAULT &&
738+
suggested_device.MemType() == OrtDevice::MemType::DEFAULT) {
739+
return (output_device.GetAlignment() >= suggested_device.GetAlignment()) ? output_device : suggested_device;
740+
} else {
741+
return (output_device.MemType() != OrtDevice::MemType::DEFAULT) ? output_device : suggested_device;
742+
}
743+
} else {
744+
return (output_device.Type() == OrtDevice::CPU) ? output_device : suggested_device;
745+
}
746+
};
747+
728748
InlinedHashSet<OrtValueIndex> set_node_arg_has_explicit_consumer;
729749

730750
InlinedHashMap<OrtValueIndex, const IExecutionProvider*> map_implicitly_consumed_node_arg_to_ep;
@@ -756,6 +776,7 @@ class PlannerImpl {
756776
// Add location information if applicable for the provided input def
757777
auto process_input = [&graph_inputs, &exec_provider, &p_kernel_def, &is_implicit_input,
758778
&set_node_arg_has_explicit_consumer,
779+
&determine_device,
759780
&map_implicitly_consumed_node_arg_to_ep,
760781
&set_implicitly_consumed_node_arg_has_heterogenous_ep_consumers,
761782
this](const NodeArg& input, size_t arg_idx) {
@@ -856,9 +877,12 @@ class PlannerImpl {
856877
// we have seen
857878
plan_.SetLocation(static_cast<size_t>(index), exec_provider->GetOrtDeviceByMemType(OrtMemType::OrtMemTypeDefault));
858879
} else {
859-
// Default the location to CPU
860-
plan_.SetLocation(static_cast<size_t>(index),
861-
execution_providers_.Get(CPU)->GetOrtDeviceByMemType(OrtMemType::OrtMemTypeDefault));
880+
// We want to minimize the amount of copies, so we want at least one
881+
// device to match or match both if they are CPU based.
882+
OrtDevice result = determine_device(
883+
already_seen_ep_for_node_arg->second->GetOrtDeviceByMemType(OrtMemType::OrtMemTypeDefault),
884+
exec_provider->GetOrtDeviceByMemType(OrtMemType::OrtMemTypeDefault));
885+
plan_.SetLocation(static_cast<size_t>(index), result);
862886
set_implicitly_consumed_node_arg_has_heterogenous_ep_consumers.insert(index);
863887
}
864888
}
@@ -881,7 +905,37 @@ class PlannerImpl {
881905
if (!node_output->Exists()) continue;
882906
OrtValueIndex index = Index(node_output->Name());
883907
ProcessDef(index, node_output);
884-
plan_.SetLocation(static_cast<size_t>(index), exec_provider->GetOrtDeviceByMemType(p_kernel_def->OutputMemoryType(i)));
908+
OrtDevice output_device = exec_provider->GetOrtDeviceByMemType(p_kernel_def->OutputMemoryType(i));
909+
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
910+
// Downstream nodes of certain providers may require a CPU accessible location override
911+
// to make sure the EP does not incur an unnecessary copy.
912+
// We only do it for CPU based EPs. We are not likely to encounter
913+
// non CPU devices here since they are already taken care of by using MemCpy nodes earlier.
914+
// However, we still ignore them.
915+
if (output_device.Type() == OrtDevice::CPU &&
916+
output_device.MemType() == OrtDevice::MemType::DEFAULT) {
917+
const auto& output_name = node_output->Name();
918+
const auto consumers = graph_viewer_.GetConsumerNodes(output_name);
919+
for (const auto* consumer : consumers) {
920+
if (consumer != nullptr) {
921+
const auto& ep_type = consumer->GetExecutionProviderType();
922+
auto suggested_device = execution_providers_.Get(ep_type)->GetOrtDeviceByMemType(
923+
OrtMemType::OrtMemTypeCPUInput);
924+
if (suggested_device.Type() == OrtDevice::CPU &&
925+
suggested_device.MemType() == OrtDevice::MemType::DEFAULT) {
926+
output_device = determine_device(output_device, suggested_device);
927+
} else if (suggested_device.Type() == OrtDevice::CPU) {
928+
// Edge case: there are more than one downstream nodes that suggest their own CPU accessible
929+
// memory. In that case, we can not win them all, but the chosen device would still make it run
930+
// and reduce a number of copies for some.
931+
output_device = suggested_device;
932+
break;
933+
}
934+
}
935+
}
936+
}
937+
#endif
938+
plan_.SetLocation(static_cast<size_t>(index), output_device);
885939
}
886940
}
887941
}

onnxruntime/core/framework/allocator.cc

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz
4141
}
4242

4343
#ifdef USE_MIMALLOC
44-
void* AllocatorDefaultAlloc(size_t size) {
45-
const size_t alignment = MlasGetPreferredBufferAlignment();
44+
void* AllocatorDefaultAllocAligned(size_t size, size_t alignment) {
4645
if (size <= 0) return nullptr;
4746
size += MLAS_SYMM_QGEMM_BUF_OVERRUN;
4847
void* p;
@@ -71,10 +70,18 @@ void AllocatorDefaultFree(void* p) {
7170
#endif
7271
}
7372

73+
void AllocatorDefaultFreeAligned(void* p, size_t alignment) {
74+
#if defined(_MSC_VER)
75+
mi_free_aligned(p, alignment);
7476
#else
75-
void* AllocatorDefaultAlloc(size_t size) {
76-
const size_t alignment = MlasGetPreferredBufferAlignment();
77-
if (size <= 0) return nullptr;
77+
mi_free(p);
78+
#endif
79+
}
80+
81+
#else
82+
83+
void* AllocatorDefaultAllocAligned(size_t size, size_t alignment) {
84+
if (size == 0) return nullptr;
7885
size += MLAS_SYMM_QGEMM_BUF_OVERRUN;
7986
void* p;
8087
#if _MSC_VER
@@ -101,14 +108,25 @@ void AllocatorDefaultFree(void* p) {
101108
#endif
102109
}
103110

111+
void AllocatorDefaultFreeAligned(void* p, size_t /* alignment */) {
112+
AllocatorDefaultFree(p);
113+
}
114+
104115
#endif // USE_MIMALLOC
105116

117+
void* AllocatorDefaultAlloc(size_t size) {
118+
const size_t alignment = MlasGetPreferredBufferAlignment();
119+
return AllocatorDefaultAllocAligned(size, alignment);
120+
}
121+
106122
void* CPUAllocator::Alloc(size_t size) {
107-
return AllocatorDefaultAlloc(size);
123+
const auto alignment = std::max(Info().device.GetAlignment(), MlasGetPreferredBufferAlignment());
124+
return AllocatorDefaultAllocAligned(size, alignment);
108125
}
109126

110127
void CPUAllocator::Free(void* p) {
111-
AllocatorDefaultFree(p);
128+
const auto alignment = std::max(Info().device.GetAlignment(), MlasGetPreferredBufferAlignment());
129+
AllocatorDefaultFreeAligned(p, alignment);
112130
}
113131

114132
void* AllocateBufferWithOptions(IAllocator& alloc, size_t size, bool use_reserve, Stream* stream, WaitNotificationFn wait_fn) {
@@ -168,6 +186,11 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
168186
onnxruntime::QNN_HTP_SHARED, type,
169187
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::QNN_HTP_SHARED, static_cast<OrtDevice::DeviceId>(id1)),
170188
id1, mem_type1);
189+
} else if (strcmp(name1, onnxruntime::CPU_ALIGNED_4K) == 0) {
190+
*out = new OrtMemoryInfo(
191+
onnxruntime::CPU_ALIGNED_4K, type,
192+
OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1), onnxruntime::kAlloc4KAlignment),
193+
id1, mem_type1);
171194
} else {
172195
return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Specified device is not supported.");
173196
}

onnxruntime/core/framework/execution_frame.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,8 +529,11 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
529529
return Status(ONNXRUNTIME, FAIL, "Trying to allocate memory for unused optional inputs/outputs");
530530
}
531531

532+
// This alignment is used to properly space out individual chunks in mempatterns memory buffer.
533+
const auto alignment = std::max(location.GetAlignment(), kAllocAlignment);
534+
532535
size_t size = 0;
533-
ORT_RETURN_IF_ERROR(Tensor::CalculateTensorStorageSize(element_type, shape, kAllocAlignment, size));
536+
ORT_RETURN_IF_ERROR(Tensor::CalculateTensorStorageSize(element_type, shape, alignment, size));
534537

535538
// Lazily get the allocator only if needed.
536539
AllocatorPtr alloc = nullptr;

onnxruntime/core/providers/cpu/cpu_execution_provider.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ CPUExecutionProvider::CPUExecutionProvider(const CPUExecutionProviderInfo& info)
3232

3333
std::vector<AllocatorPtr> CPUExecutionProvider::CreatePreferredAllocators() {
3434
const bool create_arena = DoesCpuAllocatorSupportArenaUsage() ? info_.create_arena : false;
35-
AllocatorCreationInfo device_info{[](int) { return std::make_unique<CPUAllocator>(); },
36-
DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
35+
AllocatorCreationInfo device_info_cpu{[](int) { return std::make_unique<CPUAllocator>(); },
36+
DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena};
3737

38-
return std::vector<AllocatorPtr>{CreateAllocator(device_info)};
38+
return std::vector<AllocatorPtr>{CreateAllocator(device_info_cpu)};
3939
}
4040

4141
// Forward declarations of op kernels

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1850,9 +1850,10 @@ Status QnnBackendManager::GetOrRegisterContextMemHandle(Qnn_ContextHandle_t cont
18501850
if (did_register) {
18511851
HtpSharedMemoryAllocator::AllocationCleanUpFn unregister_mem_handle =
18521852
[&logger = *logger_,
1853+
shared_memory_address,
18531854
weak_backend_manager = weak_from_this(),
18541855
weak_context_handle_record = std::weak_ptr{context_handle_record}](
1855-
void* shared_memory_address) {
1856+
void* /* allocation_base_address */) {
18561857
// Lock QnnBackendManager shared_ptr to ensure that QNN interface is still valid.
18571858
auto backend_manager = weak_backend_manager.lock();
18581859
if (!backend_manager) {

onnxruntime/core/providers/qnn/builder/qnn_model.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ static Status BindQnnTensorMemoryToOrtValueMemory(const logging::Logger& logger,
200200
Qnn_ContextHandle_t qnn_context,
201201
Qnn_Tensor_t& qnn_tensor) {
202202
// either set qnn_tensor memHandle or clientBuf
203-
const bool uses_shared_memory = ort_value_memory_info == HtpSharedMemoryAllocator::AssociatedMemoryInfo();
203+
const static auto htp_shared_mem_info = HtpSharedMemoryAllocator::AssociatedMemoryInfo();
204+
const bool uses_shared_memory = (ort_value_memory_info.device.Type() == htp_shared_mem_info.device.Type() &&
205+
ort_value_memory_info.device.MemType() == htp_shared_mem_info.device.MemType());
204206

205207
if (!uses_shared_memory) {
206208
LOGS(logger, VERBOSE) << "Setting Qnn_Tensor_t clientBuf to ORT tensor memory.";

0 commit comments

Comments
 (0)