From a83dc461a4fcddd3108d48cfeda917c4a70c7f5d Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Wed, 3 Sep 2025 09:30:52 +0000 Subject: [PATCH 1/4] CANN:Refactor ND to NZ workspace to be per-device in Ascend backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replaced the previous single global ND→NZ workspace with a per-device cache using unordered_map keyed by device ID. - Functions `release_nz_workspace`, `relloc_nz_workspace`, and `get_nz_workspace` now manage workspace independently for each device, preventing memory conflicts in multi-device / pipeline parallel scenarios. - This change fixes potential precision issues caused by workspace overwrites when multiple devices perform ND→NZ conversions concurrently. Co-authored-by: hipudding --- ggml/src/ggml-cann/ggml-cann.cpp | 55 ++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 0d9eb8fa1b9ca..cc2ac4e243cc5 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1118,28 +1118,39 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( // ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed namespace { - void* g_nz_workspace = nullptr; - size_t g_nz_workspace_allocated = 0; - - void release_nz_workspace() { - if (g_nz_workspace) { - aclrtFree(g_nz_workspace); - g_nz_workspace = nullptr; - g_nz_workspace_allocated = 0; + + static std::unordered_map g_nz_workspace_map; + static std::unordered_map g_nz_workspace_allocated_map; + + void release_nz_workspace(int device) { + auto it = g_nz_workspace_map.find(device); + if (it != g_nz_workspace_map.end() && it->second) { + aclrtFree(it->second); + g_nz_workspace_map.erase(it); + g_nz_workspace_allocated_map.erase(device); } } - void relloc_nz_workspace(size_t new_size) { - if (new_size > g_nz_workspace_allocated) { - if (g_nz_workspace) { - aclrtFree(g_nz_workspace); - g_nz_workspace = nullptr; + void relloc_nz_workspace(int device, size_t new_size) { + void* &workspace = g_nz_workspace_map[device]; + size_t &allocated = g_nz_workspace_allocated_map[device]; + + if (new_size > allocated) { + if (workspace) { + aclrtFree(workspace); + workspace = nullptr; + } + ACL_CHECK(aclrtMalloc(&workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); + allocated = new_size; } - ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); - g_nz_workspace_allocated = new_size; } + + void* get_nz_workspace(int device) { + auto it = g_nz_workspace_map.find(device); + return (it != g_nz_workspace_map.end()) ? it->second : nullptr; } -} + +} // namespace /** * @brief Convert tensor weights to NZ format using Ascend CANN API. @@ -1149,13 +1160,13 @@ namespace { * improve performance on certain hardware. * * @param tensor Pointer to the input ggml_tensor containing the weights. - * @param data Pointer to the raw data buffer for the tensor weights. * @param offset Byte offset within the tensor data buffer where weights start. + * @param device device id. * * @note The workspace buffer used in this function is managed globally and reused * across calls. This reduces overhead from repeated memory allocation and deallocation. */ -static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) { +static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) { aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset); uint64_t workspaceSize = 0; @@ -1165,7 +1176,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) { ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); // Avoid frequent malloc/free of the workspace. - relloc_nz_workspace(workspaceSize); + relloc_nz_workspace(device, workspaceSize); + + void* g_nz_workspace = get_nz_workspace(device); ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); @@ -1203,7 +1216,7 @@ static void ggml_backend_cann_buffer_set_tensor( if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { GGML_ASSERT(tensor->ne[2] == 1); GGML_ASSERT(tensor->ne[3] == 1); - weight_format_to_nz(tensor, offset); + weight_format_to_nz(tensor, offset, ctx->device); } } else { void *transform_buffer = malloc(size); @@ -2246,7 +2259,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); - release_nz_workspace(); + release_nz_workspace(cann_ctx->device); #ifdef USE_ACL_GRAPH bool use_cann_graph = true; From 540d7b4c8b1923c3e3820c2538192c0a1a1deee5 Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Thu, 4 Sep 2025 02:48:10 +0000 Subject: [PATCH 2/4] refactor Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/ggml-cann.cpp | 81 ++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index cc2ac4e243cc5..4b43e379dcbc4 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1117,40 +1117,61 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( } // ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed -namespace { - - static std::unordered_map g_nz_workspace_map; - static std::unordered_map g_nz_workspace_allocated_map; - - void release_nz_workspace(int device) { - auto it = g_nz_workspace_map.find(device); - if (it != g_nz_workspace_map.end() && it->second) { - aclrtFree(it->second); - g_nz_workspace_map.erase(it); - g_nz_workspace_allocated_map.erase(device); +class NzWorkspace { +public: + // Constructor: initialize with no allocated buffer + NzWorkspace() : ptr_(nullptr), allocated_(0) {} + + // Reset workspace to uninitialized state: + // - Free allocated device memory (if any) + // - Clear internal pointer and size + // Equivalent to release_nz_workspace(device) in old version + void init() { + if (ptr_) { + aclrtFree(ptr_); + ptr_ = nullptr; + allocated_ = 0; } } - void relloc_nz_workspace(int device, size_t new_size) { - void* &workspace = g_nz_workspace_map[device]; - size_t &allocated = g_nz_workspace_allocated_map[device]; - - if (new_size > allocated) { - if (workspace) { - aclrtFree(workspace); - workspace = nullptr; - } - ACL_CHECK(aclrtMalloc(&workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); - allocated = new_size; + // Allocate or reallocate the workspace buffer: + // - If requested size > currently allocated size: + // * Free the old buffer (if any) + // * Allocate a new buffer with requested size on device + // - If requested size <= currently allocated size: + // * Do nothing (reuse existing buffer) + // Equivalent to relloc_nz_workspace(device, new_size) in old version + void realloc(size_t new_size) { + if (new_size > allocated_) { + init(); + ACL_CHECK(aclrtMalloc(&ptr_, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); + allocated_ = new_size; } } - void* get_nz_workspace(int device) { - auto it = g_nz_workspace_map.find(device); - return (it != g_nz_workspace_map.end()) ? it->second : nullptr; + // Return raw device pointer (may be nullptr if not allocated) + // Equivalent to get_nz_workspace(device) in old version + void* get() const { return ptr_; } + +private: + void* ptr_; // Pointer to allocated device buffer + size_t allocated_; // Size of currently allocated buffer (bytes) +}; + +// Global array of NzWorkspace, one per device +// g_nz_workspaces[device] corresponds to workspace of given device +static std::array g_nz_workspaces; + +// Accessor for workspace of a given device +// - Throws std::out_of_range if device index is invalid +// - Caller can then use .init(), .realloc(), .get() +inline NzWorkspace& get_workspace(int device) { + if (device < 0 || device >= static_cast(g_nz_workspaces.size())) { + throw std::out_of_range("device id out of range"); } + return g_nz_workspaces[device]; +} -} // namespace /** * @brief Convert tensor weights to NZ format using Ascend CANN API. @@ -1176,9 +1197,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); // Avoid frequent malloc/free of the workspace. - relloc_nz_workspace(device, workspaceSize); - - void* g_nz_workspace = get_nz_workspace(device); + get_workspace(device).realloc(workspaceSize); + + void* g_nz_workspace = get_workspace(device).get(); ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); @@ -2259,7 +2280,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); - release_nz_workspace(cann_ctx->device); + get_workspace(cann_ctx->device).init(); #ifdef USE_ACL_GRAPH bool use_cann_graph = true; From 600ef99b2fe469d180f6979ed8c5939d05f0ad7b Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Thu, 4 Sep 2025 04:24:08 +0000 Subject: [PATCH 3/4] rename Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/ggml-cann.cpp | 107 ++++++++++++++++++------------- 1 file changed, 64 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 4b43e379dcbc4..b24c33e85767e 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1116,56 +1116,77 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( return GGML_STATUS_SUCCESS; } -// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed -class NzWorkspace { -public: - // Constructor: initialize with no allocated buffer - NzWorkspace() : ptr_(nullptr), allocated_(0) {} - - // Reset workspace to uninitialized state: - // - Free allocated device memory (if any) - // - Clear internal pointer and size - // Equivalent to release_nz_workspace(device) in old version - void init() { - if (ptr_) { - aclrtFree(ptr_); - ptr_ = nullptr; - allocated_ = 0; +/** + * @brief Workspace for caching NZ buffers per device. + * + * This struct manages a device buffer used in NZ computations. It supports + * allocation, reallocation, and clearing of cached memory. The struct is + * designed to be used with a global array, one per device. + */ +struct ggml_cann_nz_workspace { + void* ptr; // Pointer to allocated device buffer + size_t allocated; // Size of currently allocated buffer in bytes + + /** + * @brief Constructor. Initializes the workspace with no allocated memory. + */ + ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {} + + /** + * @brief Free cached memory and reset the workspace. + * + * If a buffer has been allocated, this function releases it using + * aclrtFree and resets internal state. + */ + void clear() { + if (ptr) { + aclrtFree(ptr); + ptr = nullptr; + allocated = 0; } } - // Allocate or reallocate the workspace buffer: - // - If requested size > currently allocated size: - // * Free the old buffer (if any) - // * Allocate a new buffer with requested size on device - // - If requested size <= currently allocated size: - // * Do nothing (reuse existing buffer) - // Equivalent to relloc_nz_workspace(device, new_size) in old version + /** + * @brief Allocate or reallocate the workspace buffer. + * + * If the requested size is larger than the currently allocated size, + * the old buffer will be freed and a new buffer of the requested size + * will be allocated on the device. + * + * @param new_size Size in bytes to allocate for the workspace. + */ void realloc(size_t new_size) { - if (new_size > allocated_) { - init(); - ACL_CHECK(aclrtMalloc(&ptr_, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); - allocated_ = new_size; + if (new_size > allocated) { + clear(); + ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST)); + allocated = new_size; } } - // Return raw device pointer (may be nullptr if not allocated) - // Equivalent to get_nz_workspace(device) in old version - void* get() const { return ptr_; } - -private: - void* ptr_; // Pointer to allocated device buffer - size_t allocated_; // Size of currently allocated buffer (bytes) + /** + * @brief Get the device buffer pointer. + * + * @return Pointer to the allocated buffer, or nullptr if not allocated. + */ + void* get() const { return ptr; } }; -// Global array of NzWorkspace, one per device -// g_nz_workspaces[device] corresponds to workspace of given device -static std::array g_nz_workspaces; +/** + * @brief Global array of NZ workspaces, one per device. + */ +static std::array g_nz_workspaces; -// Accessor for workspace of a given device -// - Throws std::out_of_range if device index is invalid -// - Caller can then use .init(), .realloc(), .get() -inline NzWorkspace& get_workspace(int device) { +/** + * @brief Get the NZ workspace for a specific device. + * + * This function returns a reference to the workspace corresponding to the + * given device index. + * + * @param device Device index (0-based). Must be less than GGML_CANN_MAX_DEVICES. + * @return Reference to the device's NZ workspace. + * @throws std::out_of_range if device index is invalid. + */ +inline ggml_cann_nz_workspace& get_nz_workspace(int device) { if (device < 0 || device >= static_cast(g_nz_workspaces.size())) { throw std::out_of_range("device id out of range"); } @@ -1197,9 +1218,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); // Avoid frequent malloc/free of the workspace. - get_workspace(device).realloc(workspaceSize); + get_nz_workspace(device).realloc(workspaceSize); - void* g_nz_workspace = get_workspace(device).get(); + void* g_nz_workspace = get_nz_workspace(device).get(); ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); @@ -2280,7 +2301,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); - get_workspace(cann_ctx->device).init(); + get_nz_workspace(cann_ctx->device).clear(); #ifdef USE_ACL_GRAPH bool use_cann_graph = true; From 11d6aa7a94fba438ef6477d0604c6c951301729f Mon Sep 17 00:00:00 2001 From: noemotiovon <757486878@qq.com> Date: Thu, 4 Sep 2025 07:06:09 +0000 Subject: [PATCH 4/4] fix review comments Signed-off-by: noemotiovon <757486878@qq.com> --- ggml/src/ggml-cann/ggml-cann.cpp | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index b24c33e85767e..fd82fd27d3e0f 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1140,7 +1140,7 @@ struct ggml_cann_nz_workspace { */ void clear() { if (ptr) { - aclrtFree(ptr); + ACL_CHECK(aclrtFree(ptr)); ptr = nullptr; allocated = 0; } @@ -1174,25 +1174,7 @@ struct ggml_cann_nz_workspace { /** * @brief Global array of NZ workspaces, one per device. */ -static std::array g_nz_workspaces; - -/** - * @brief Get the NZ workspace for a specific device. - * - * This function returns a reference to the workspace corresponding to the - * given device index. - * - * @param device Device index (0-based). Must be less than GGML_CANN_MAX_DEVICES. - * @return Reference to the device's NZ workspace. - * @throws std::out_of_range if device index is invalid. - */ -inline ggml_cann_nz_workspace& get_nz_workspace(int device) { - if (device < 0 || device >= static_cast(g_nz_workspaces.size())) { - throw std::out_of_range("device id out of range"); - } - return g_nz_workspaces[device]; -} - +static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES]; /** * @brief Convert tensor weights to NZ format using Ascend CANN API. @@ -1218,9 +1200,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); // Avoid frequent malloc/free of the workspace. - get_nz_workspace(device).realloc(workspaceSize); + g_nz_workspaces[device].realloc(workspaceSize); - void* g_nz_workspace = get_nz_workspace(device).get(); + void* g_nz_workspace = g_nz_workspaces[device].get(); ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); @@ -2301,7 +2283,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; ggml_cann_set_device(cann_ctx->device); - get_nz_workspace(cann_ctx->device).clear(); + g_nz_workspaces[cann_ctx->device].clear(); #ifdef USE_ACL_GRAPH bool use_cann_graph = true;