13 changes: 13 additions & 0 deletions backends/xnnpack/CMakeLists.txt
@@ -41,6 +41,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
# Keeping this OFF by default due to regressions in decode and model load with
# kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)

# Turning this on caches weights across partitions and methods. If weights
# are shared across methods/partitions, this can reduce load time and
# memory usage.

# Keeping this off maintains existing behavior. Turning this on serializes
# execution and initialization of delegates; to be revisited.
option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE
"Enable weights cache to cache and manage all packed weights" OFF)

if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE)
add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE)
endif()
if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
endif()
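
A minimal sketch of opting in from a consuming CMake project (the superproject layout and `add_subdirectory` path are assumptions for illustration, not part of this change):

```cmake
# Illustrative only: set the option before ExecuTorch is configured so the
# XNNPACK backend is built with -DENABLE_XNNPACK_WEIGHTS_CACHE defined.
set(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE ON CACHE BOOL "" FORCE)
add_subdirectory(executorch)
```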
72 changes: 60 additions & 12 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -11,7 +11,9 @@
#include <executorch/backends/xnnpack/serialization/schema_generated.h>
#include <executorch/extension/threadpool/threadpool.h>
#include <executorch/runtime/executor/pte_data_map.h>
#include <string>
#include <unordered_map>
#include <vector>

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wglobal-constructors"
@@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr(
GraphPtr flatbuffer_graph,
const uint8_t* constant_data_ptr,
const NamedDataMap* named_data_map,
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
std::vector<FreeableBuffer>& freeable_buffers,
XNNWeightsCache* weights_cache) {
auto buffer_idx = tensor_value->constant_buffer_idx();
if (buffer_idx) {
if (!constant_data_ptr) {
@@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr(
return constant_data_ptr + offset;
} else {
const std::string& data_name = constant_data_offset->named_key()->str();
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
Result<const uint8_t*> data_ptr =
weights_cache->load_unpacked_data(data_name);
if (!data_ptr.ok()) {
ET_LOG(Error, "Failed to load weights from cache");
return nullptr;
}
return data_ptr.get();
#else
Result<FreeableBuffer> buffer =
named_data_map->get_data(data_name.c_str());
if (!buffer.ok()) {
@@ -198,8 +210,9 @@
}
const uint8_t* data_ptr =
static_cast<const uint8_t*>(buffer.get().data());
loaded_buffers_from_map.push_back(std::move(buffer.get()));
freeable_buffers.push_back(std::move(buffer.get()));
return data_ptr;
#endif
}
}
}
@@ -222,7 +235,8 @@ Error defineTensor(
std::vector<uint32_t>& output_ids,
CompileAllocator& allocator,
const NamedDataMap* named_data_map,
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
std::vector<FreeableBuffer>& freeable_buffers,
XNNWeightsCache* weights_cache) {
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

@@ -264,7 +278,8 @@ Error defineTensor(
flatbuffer_graph,
constant_data_ptr,
named_data_map,
loaded_buffers_from_map);
freeable_buffers,
weights_cache);

xnn_status status;
// The type we might have to convert to
@@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel(
const void* buffer_pointer,
size_t num_bytes,
XNNExecutor* executor,
MemoryAllocator* runtime_allocator,
const NamedDataMap* named_data_map,
xnn_workspace_t workspace) {
XNNWeightsCache* weights_cache,
xnn_workspace_t workspace,
const NamedDataMap* named_data_map) {
Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
const uint8_t* flatbuffer_data = nullptr;
const uint8_t* constant_data = nullptr;
@@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel(
// Invalid ids do not need to be remapped
remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);

// If the weights cache is not enabled, we hold onto all the unpacked
// buffers and free them at the end
std::vector<FreeableBuffer> unpacked_buffers;

// External Ids for inputs and outputs
std::vector<uint32_t> input_ids;
std::vector<uint32_t> output_ids;
Error err = Error::Ok;
std::vector<FreeableBuffer> loaded_buffers_from_map;
for (auto value : *flatbuffer_graph->xvalues()) {
err = defineTensor(
subgraph.get(),
@@ -2081,7 +2099,8 @@
output_ids,
compile_allocator,
named_data_map,
loaded_buffers_from_map);
unpacked_buffers,
weights_cache);

if (err != Error::Ok) {
return err;
@@ -2103,20 +2122,34 @@

xnn_runtime_t runtime_ptr = nullptr;

// If the weights cache is not enabled, then XNNWeightsCache just manages
// the unpacked weights until the runtime is created.
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
ET_CHECK_OR_RETURN_ERROR(
unpacked_buffers.size() == 0,
Internal,
"Weight Cache is enabled, which means unpacked buffers should be owned by the cache");
xnn_weights_cache_t weights_cache_ptr =
weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
: nullptr;
#else
xnn_weights_cache_t weights_cache_ptr = nullptr;
#endif

#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
ET_CHECK_OR_RETURN_ERROR(
workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
status = xnn_create_runtime_v4(
subgraph.get(),
/*weight_cache=*/nullptr, // TODO - support weight cache
weights_cache_ptr,
workspace,
::executorch::extension::threadpool::get_pthreadpool(),
runtime_flags,
&runtime_ptr);
#else
status = xnn_create_runtime_v3(
subgraph.get(),
/*weight_cache=*/nullptr, // TODO - support weight cache
weights_cache_ptr,
::executorch::extension::threadpool::get_pthreadpool(),
runtime_flags,
&runtime_ptr);
@@ -2128,10 +2161,25 @@
"XNN Runtime creation failed with code: %s",
xnn_status_to_string(status));

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
auto packed_weights_names = weights_cache->finalize_for_runtime();
ET_CHECK_OR_RETURN_ERROR(
packed_weights_names.ok(),
Internal,
"Failed to finalize weights cache after creating the xnn runtime")
#else
for (auto& buffer : unpacked_buffers) {
buffer.Free();
}
Result<std::vector<std::string>> packed_weights_names =
std::vector<std::string>();
#endif

err = executor->initialize( // NOLINT: runtime_ptr is non-null
runtime_ptr,
std::move(input_ids),
std::move(output_ids));
std::move(output_ids),
std::move(packed_weights_names.get()));

return err;
};
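
Taken together, the weights-cache calls introduced in this file follow a fixed order. Below is a minimal sketch of that lifecycle using only the `XNNWeightsCache` methods visible in this diff (`initialize_for_runtime`, `load_unpacked_data`, `get_num_unpacked_data`, `get`, `finalize_for_runtime`); the helper function, placeholder key, and null threadpool are assumptions for illustration, and the usual ExecuTorch using-declarations are assumed in scope as they are in XNNCompiler.cpp:

```cpp
#include <string>
#include <vector>

#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <xnnpack.h>

// Hypothetical walkthrough of one init() cycle under
// ENABLE_XNNPACK_WEIGHTS_CACHE; not code from this diff.
Error weights_cache_lifecycle_sketch(
    XNNWeightsCache* weights_cache,
    MemoryAllocator* runtime_allocator,
    const NamedDataMap* named_data_map,
    xnn_subgraph_t subgraph) {
  // 1. Bind the cache to this method's allocator and named data map.
  weights_cache->initialize_for_runtime(runtime_allocator, named_data_map);

  // 2. While defining tensors, unpacked weights are served by the cache
  //    ("example_weight" is a placeholder key).
  Result<const uint8_t*> unpacked =
      weights_cache->load_unpacked_data("example_weight");
  if (!unpacked.ok()) {
    return Error::Internal;
  }

  // 3. Hand the cache to XNNPACK only if this cycle unpacked new data.
  xnn_weights_cache_t cache_ptr =
      weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
                                                 : nullptr;
  xnn_runtime_t runtime = nullptr;
  xnn_status status = xnn_create_runtime_v3(
      subgraph, cache_ptr, /*threadpool=*/nullptr, /*flags=*/0, &runtime);
  if (status != xnn_status_success) {
    return Error::Internal;
  }

  // 4. Packing is final once the runtime exists; the returned names are
  //    what destroy() later passes to delete_packed_data().
  Result<std::vector<std::string>> packed_names =
      weights_cache->finalize_for_runtime();
  return packed_names.ok() ? Error::Ok : Error::Internal;
}
```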
10 changes: 4 additions & 6 deletions backends/xnnpack/runtime/XNNCompiler.h
@@ -9,11 +9,9 @@
#pragma once

#include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <executorch/runtime/platform/compiler.h>

#include <xnnpack.h>
#include <memory>
#include <vector>

namespace executorch {
namespace backends {
@@ -29,9 +27,9 @@ class XNNCompiler {
const void* buffer_pointer,
size_t num_bytes,
XNNExecutor* executor,
executorch::runtime::MemoryAllocator* runtime_allocator,
const executorch::runtime::NamedDataMap* named_data_map,
xnn_workspace_t workspace);
XNNWeightsCache* weights_cache,
xnn_workspace_t workspace,
const NamedDataMap* named_data_map);
};

} // namespace delegate
4 changes: 3 additions & 1 deletion backends/xnnpack/runtime/XNNExecutor.cpp
@@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit;
ET_NODISCARD Error XNNExecutor::initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids) {
std::vector<uint32_t>&& output_ids,
std::vector<std::string>&& packed_data_names) {
runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
runtime, xnn_delete_runtime);

@@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize(
std::sort(output_ids_.begin(), output_ids_.end());

externals_.resize(input_ids_.size() + output_ids_.size());
packed_data_names_ = std::move(packed_data_names);

return Error::Ok;
}
8 changes: 7 additions & 1 deletion backends/xnnpack/runtime/XNNExecutor.h
@@ -34,6 +34,7 @@ class XNNExecutor {
std::vector<uint32_t> input_ids_;
std::vector<uint32_t> output_ids_;
std::vector<xnn_external_value> externals_;
std::vector<std::string> packed_data_names_;

public:
XNNExecutor() = default;
@@ -46,6 +47,10 @@
return output_ids_.size();
}

inline std::vector<std::string> get_packed_data_names() {
return packed_data_names_;
}

/**
* Initialize the XNNExecutor with a given runtime and input/output ids.
* The input/output ids are expected to be sorted in order of their
@@ -54,7 +59,8 @@
ET_NODISCARD executorch::runtime::Error initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids);
std::vector<uint32_t>&& output_ids,
std::vector<std::string>&& packed_data_names);

/**
* Prepares the arguments for runtime graph execution.
42 changes: 35 additions & 7 deletions backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -7,6 +7,7 @@
*/

#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
@@ -20,6 +21,7 @@
namespace executorch {
namespace backends {

using executorch::backends::xnnpack::delegate::XNNWeightsCache;
using executorch::runtime::ArrayRef;
using executorch::runtime::Backend;
using executorch::runtime::BackendExecutionContext;
@@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
}

const NamedDataMap* named_data_map = context.get_named_data_map();

#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
// This is needed to serialize access to xnn_create_runtime which is not
// thread safe. This can happen when multiple threads call init() on
// the same backend instance.
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weight_cache(weights_cache_mutex_);
weights_cache_->initialize_for_runtime(
context.get_runtime_allocator(), named_data_map);
#endif

// Executor has been allocated but not constructed, ensure that runtime_ is
// nullptr by constructing it in place here. NOTE: Since we use placement
// new and since this type is not trivially destructible, we must call the
@@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
processed->data(),
processed->size(),
executor,
context.get_runtime_allocator(),
named_data_map,
workspace_.get());
weights_cache_.get(),
workspace_.get(),
named_data_map);
// This backend does not need its processed data after compiling the model.
processed->Free();

@@ -125,6 +132,10 @@
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weights_cache(weights_cache_mutex_);
#endif

// Prepare Inputs/Outputs and Propagate Input Shapes
Error err = executor->prepare_args(args);
if (err != Error::Ok) {
@@ -145,16 +156,24 @@

void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
// This is needed to serialize access to xnn_delete_runtime which is not
// thread safe. This can happen when multiple threads call destroy() on
// the same backend instance.
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);

#ifdef ENABLE_XNNPACK_PROFILING
executor->print_avg_op_timings();
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weights_cache(
weights_cache_mutex_);
weights_cache_->delete_packed_data(executor->get_packed_data_names());
#endif
// XNNExecutor is not trivially destructible. Since this was constructed
// manually in init(), we must destroy it manually here.
executor->~XNNExecutor();
@@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
nullptr,
&xnn_release_workspace};

// Weights cache is global to all delegate instances.
mutable std::mutex weights_cache_mutex_;
std::unique_ptr<XNNWeightsCache> weights_cache_ =
std::make_unique<XNNWeightsCache>();

// Lock hierarchy for mutexes (see the sketch after this class):
// workspace_mutex_
// weights_cache_mutex_
};
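
A minimal sketch of that lock order (the free function and mutex parameters are illustrative, not code from this change):

```cpp
#include <mutex>

// Illustrative only: any path that needs both locks must take the workspace
// mutex first and the weights-cache mutex second, matching the order used in
// init()/execute()/destroy(), so two threads can never acquire them in
// opposite order and deadlock.
void with_backend_locks(std::mutex& workspace_mutex,
                        std::mutex& weights_cache_mutex) {
  const std::lock_guard<std::mutex> workspace_lock(workspace_mutex);
  const std::lock_guard<std::mutex> weights_cache_lock(weights_cache_mutex);
  // ... operate on the shared workspace and weights cache here ...
}
```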

namespace {
10 changes: 7 additions & 3 deletions backends/xnnpack/targets.bzl
@@ -6,11 +6,15 @@ def _get_preprocessor_flags():
Disabled if someone explicitly specified a config option;
enabled otherwise.
"""
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0":
return []
preprocessor_flags = []
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0":
preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE")

if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0":
preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE")

# Enable if not disabled through config
return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"]
return preprocessor_flags
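
Since the flag is keyed off `read_config("executorch", "xnnpack_weights_cache", "0")`, a Buck build could enable it with a config entry along these lines (the `.buckconfig` placement is an assumption for illustration):

```
# Illustrative .buckconfig snippet: any value other than "0" enables the flag.
[executorch]
  xnnpack_weights_cache = 1
```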

def define_common_targets():
runtime.cxx_library(