Skip to content

Commit abb20d3

Browse files
noemotiovonwangweixuan
authored andcommitted
CANN: implement LRU cache for ACL graphs (ggml-org#15814)
* CANN: implement LRU cache for ACL graphs in CANN backend
  - Introduce ggml_cann_graph_lru_cache to store multiple ggml_cann_graph objects.
  - Graphs are loaded on demand and evicted using an LRU policy when capacity is exceeded.
  - Updated push, move_to_front, and clear methods to manage cached graphs efficiently.
  - Ensures reuse of graphs, reducing graph reconstruction overhead in the CANN backend.
* fix typo
* The LRU cache capacity can be configured via an env variable
  Signed-off-by: noemotiovon <[email protected]>
* refactor ACL graph
* refactor && fix review comments
  Signed-off-by: noemotiovon <[email protected]>
---------
Signed-off-by: noemotiovon <[email protected]>
1 parent 74f52f7 commit abb20d3

File tree

3 files changed

+168
-51
lines changed

3 files changed

+168
-51
lines changed

docs/backend/CANN.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,10 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
314314

315315
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
316316

317+
### GGML_CANN_ACL_GRAPH
318+
319+
Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
320+
321+
### GGML_CANN_GRAPH_CACHE_CAPACITY
322+
323+
Maximum number of compiled CANN graphs kept in the LRU cache; the default is 12. When the cache is full and a new graph has to be added, the least recently used graph is evicted to make room for it.

ggml/src/ggml-cann/common.h

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <unistd.h>
3939
#include <functional>
4040
#include <optional>
41+
#include <list>
4142

4243
#include "../include/ggml-cann.h"
4344
#include "../include/ggml.h"
@@ -106,6 +107,7 @@ int32_t ggml_cann_get_device();
106107

107108
std::optional<std::string> get_env(const std::string& name);
108109
bool parse_bool(const std::string& value);
110+
int parse_integer(const std::string& value);
109111

110112
/**
111113
* @brief Abstract base class for memory pools used by CANN.
@@ -350,14 +352,72 @@ struct ggml_graph_node_properties {
350352
struct ggml_cann_graph {
351353
~ggml_cann_graph() {
352354
if (graph != nullptr) {
353-
aclmdlRIDestroy(graph);
355+
ACL_CHECK(aclmdlRIDestroy(graph));
354356
}
355357
}
356358

357359
aclmdlRI graph = nullptr;
358360

359361
std::vector<ggml_graph_node_properties> ggml_graph_properties;
360362
};
363+
364+
/**
365+
* @brief LRU cache for managing ggml_cann_graph objects.
366+
*
367+
* This class maintains a list of shared_ptr to ggml_cann_graph objects
368+
* and enforces a maximum capacity. It provides methods to push new graphs,
369+
* move existing graphs to the front (most recently used), and clear the cache.
370+
*/
371+
struct ggml_cann_graph_lru_cache {
372+
size_t capacity; /**< Maximum number of graphs in the cache. */
373+
374+
std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
375+
376+
ggml_cann_graph_lru_cache() {
377+
capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
378+
}
379+
380+
/**
381+
* @brief Push a new graph to the front of the cache.
382+
* If the cache exceeds capacity, the least recently used graph is deleted.
383+
* @param new_node Pointer to the new ggml_cann_graph to cache.
384+
* Ownership is transferred to the cache (cache will delete it).
385+
*/
386+
void push(ggml_cann_graph* new_node) {
387+
if (cache_list.size() >= capacity) {
388+
ggml_cann_graph* old = cache_list.back();
389+
cache_list.pop_back();
390+
delete old; // free the old graph
391+
}
392+
cache_list.push_front(new_node);
393+
}
394+
395+
/**
396+
* @brief Move an existing graph to the front of the cache.
397+
* @param node Pointer to the ggml_cann_graph to move.
398+
*/
399+
void move_to_front(ggml_cann_graph* node) {
400+
cache_list.remove(node);
401+
cache_list.push_front(node);
402+
}
403+
404+
/**
405+
* @brief Clear all graphs from the cache (also frees memory).
406+
*/
407+
void clear() {
408+
for (auto ptr : cache_list) {
409+
delete ptr;
410+
}
411+
cache_list.clear();
412+
}
413+
414+
/**
415+
* @brief Destructor that clears the cache and frees all cached graphs.
416+
*/
417+
~ggml_cann_graph_lru_cache() {
418+
clear();
419+
}
420+
};
361421
#endif // USE_ACL_GRAPH
362422

363423
/**
@@ -370,7 +430,8 @@ struct ggml_backend_cann_context {
370430
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
371431
#ifdef USE_ACL_GRAPH
372432
/// Cached CANN ACL graph used for executing the current ggml computation graph.
373-
std::unique_ptr<ggml_cann_graph> cann_graph;
433+
ggml_cann_graph_lru_cache graph_lru_cache;
434+
bool acl_graph_mode = true;
374435
#endif
375436
cann_task_queue task_queue;
376437
bool async_mode;

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 98 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,24 @@ bool parse_bool(const std::string& value) {
116116
return valid_values.find(value) != valid_values.end();
117117
}
118118

119+
/**
 * @brief Parse a string as an integer, returning 0 if invalid.
 *
 * This function attempts to convert the input string `value` to an `int`.
 * The whole string must be consumed by the conversion: input with trailing
 * non-numeric characters (e.g. "12abc") is treated as invalid. If the string
 * is not a valid integer or is out of the `int` range, it returns 0.
 *
 * @param value The string to parse.
 * @return The parsed integer, or 0 if conversion fails.
 */
int parse_integer(const std::string& value) {
    try {
        std::size_t consumed = 0;
        const int parsed = std::stoi(value, &consumed);
        // std::stoi stops at the first character it cannot use and still
        // succeeds; reject the input unless every character was consumed.
        return consumed == value.size() ? parsed : 0;
    } catch (...) {
        // std::stoi reports both "no conversion" and "out of range" by throwing.
        return 0;
    }
}
136+
119137
/**
120138
* @brief Initialize the CANN device information.
121139
*
@@ -2077,30 +2095,52 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
20772095

20782096
#ifdef USE_ACL_GRAPH
20792097
/**
2080-
* @brief Populate the internal CANN graph node properties from the ggml computation graph.
2098+
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
2099+
*
2100+
* This function creates a new ggml_cann_graph object and fills its node properties
2101+
* (operation type, dimensions, strides, input sources, and operation parameters)
2102+
* based on the current ggml computation graph.
20812103
*
2082-
* This function copies all node attributes (operation type, dimensions, strides, input sources,
2083-
* and operation parameters) into the cached CANN graph structure for later reuse or comparison.
2104+
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
2105+
* - node address
2106+
* - operation type
2107+
* - shape (ne) and strides (nb)
2108+
* - source tensor addresses
2109+
* - operation parameters
20842110
*
2085-
* @param cann_ctx The CANN backend context.
2086-
* @param cgraph The ggml computational graph.
2111+
* After initialization, the new graph is pushed into the LRU cache owned by the
2112+
* CANN backend context. The cache takes ownership of the graph and manages its
2113+
* lifetime (including deletion upon eviction).
2114+
*
2115+
* @param cann_ctx The CANN backend context containing the graph cache.
2116+
* @param cgraph The current ggml computation graph.
20872117
*/
2088-
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2089-
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
2118+
static void add_lru_matched_graph_node_properties(
2119+
ggml_backend_cann_context * cann_ctx,
2120+
ggml_cgraph * cgraph) {
2121+
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
2122+
ggml_cann_graph * new_graph = new ggml_cann_graph();
2123+
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2124+
2125+
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
20902126
ggml_tensor * node = cgraph->nodes[node_idx];
2091-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
2092-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
2127+
auto & prop = new_graph->ggml_graph_properties[node_idx];
20932128

2094-
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
2095-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
2096-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
2097-
}
2098-
for (int src = 0; src < GGML_MAX_SRC; src++) {
2099-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
2100-
node->src[src] ? node->src[src]->data : nullptr;
2129+
prop.node_address = node->data;
2130+
prop.node_op = node->op;
2131+
2132+
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
2133+
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
2134+
2135+
for (int src = 0; src < GGML_MAX_SRC; ++src) {
2136+
prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
21012137
}
2102-
memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
2138+
2139+
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
21032140
}
2141+
2142+
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
2143+
cann_ctx->graph_lru_cache.push(new_graph);
21042144
}
21052145

21062146
/**
@@ -2145,30 +2185,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
21452185
}
21462186

21472187
/**
2148-
* @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
2188+
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
2189+
*
2190+
* This function iterates through the cached CANN graphs stored in the LRU cache and
2191+
* compares them against the given ggml computation graph. A match requires that the
2192+
* number of nodes is the same and that each node’s properties (operation type,
2193+
* dimensions, strides, inputs, and operation parameters) are identical.
21492194
*
2150-
* This checks whether the number or properties of ggml graph nodes have changed
2151-
* compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
2195+
* If a matching graph is found, it is promoted to the front of the LRU cache and the
2196+
* function returns true. Otherwise, the function returns false, indicating that a new
2197+
* CANN graph needs to be captured.
21522198
*
2153-
* @param cann_ctx The CANN backend context.
2199+
* @param cann_ctx The CANN backend context containing the graph cache.
21542200
* @param cgraph The current ggml computation graph.
2155-
* @return true if an update is required; false otherwise.
2156-
*/
2157-
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2158-
// The number of nodes is different, so the graph needs to be reconstructed.
2159-
if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
2160-
cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
2161-
return true;
2162-
}
2201+
* @return true if a matching cached graph exists; false otherwise.
2202+
*/
2203+
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
2204+
ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
2205+
for (auto &graph_ptr : lru_cache.cache_list) {
2206+
// Skip graphs with a different number of nodes.
2207+
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
2208+
continue;
2209+
}
21632210

2164-
// The number of nodes is the same; iterate over each node to check whether they match.
2165-
for (int i = 0; i < cgraph->n_nodes; i++) {
2166-
bool has_matching_properties = ggml_graph_node_has_matching_properties(
2167-
cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
2168-
if(!has_matching_properties) {
2211+
// Check if all nodes match.
2212+
bool all_match = true;
2213+
for (int i = 0; i < cgraph->n_nodes; ++i) {
2214+
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
2215+
all_match = false;
2216+
break;
2217+
}
2218+
}
2219+
2220+
if (all_match) {
2221+
// Promote the matched graph to the front of the LRU list and report the cache hit.
2222+
lru_cache.move_to_front(graph_ptr);
21692223
return true;
21702224
}
21712225
}
2226+
21722227
return false;
21732228
}
21742229
#endif // USE_ACL_GRAPH
@@ -2187,17 +2242,13 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx,
21872242
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
21882243
*/
21892244
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
2190-
bool & use_cann_graph, bool & cann_graph_update_required) {
2245+
bool & use_cann_graph, bool & cann_graph_update_required) {
21912246
#ifdef USE_ACL_GRAPH
2247+
ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
21922248
if (use_cann_graph && cann_graph_update_required) {
2193-
if (cann_ctx->cann_graph->graph != nullptr) {
2194-
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
2195-
cann_ctx->cann_graph->graph = nullptr;
2196-
}
21972249
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
21982250
}
21992251
#endif // USE_ACL_GRAPH
2200-
22012252
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
22022253
// With the use of CANN graphs, the execution will be performed by the graph launch.
22032254
if (!use_cann_graph || cann_graph_update_required) {
@@ -2218,12 +2269,12 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
22182269

22192270
#ifdef USE_ACL_GRAPH
22202271
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
2221-
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
2272+
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
22222273
}
22232274

22242275
if (use_cann_graph) {
22252276
// Execute graph
2226-
ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
2277+
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
22272278
}
22282279
#endif // USE_ACL_GRAPH
22292280
}
@@ -2257,19 +2308,17 @@ static enum ggml_status ggml_backend_cann_graph_compute(
22572308
}
22582309

22592310
if (use_cann_graph) {
2260-
if (cann_ctx->cann_graph == nullptr) {
2261-
cann_ctx->cann_graph.reset(new ggml_cann_graph());
2262-
cann_graph_update_required = true;
2311+
// If no matching graph is found, the graph needs to be recaptured.
2312+
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
2313+
if (cann_graph_update_required) {
2314+
// If no matching graph is found, add a new ACL graph.
2315+
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
22632316
}
2264-
2265-
cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
2266-
set_ggml_graph_node_properties(cann_ctx, cgraph);
22672317
}
22682318
#else
22692319
bool use_cann_graph = false;
22702320
bool cann_graph_update_required = false;
22712321
#endif // USE_ACL_GRAPH
2272-
22732322
evaluate_and_capture_cann_graph(
22742323
cann_ctx,
22752324
cgraph,

0 commit comments

Comments
 (0)