From a83dc461a4fcddd3108d48cfeda917c4a70c7f5d Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Wed, 3 Sep 2025 09:30:52 +0000
Subject: [PATCH 1/4] CANN: Refactor ND to NZ workspace to be per-device in
 Ascend backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replaced the previous single global ND→NZ workspace with a per-device
  cache using unordered_map keyed by device ID.
- `release_nz_workspace` and `relloc_nz_workspace` now take a device ID, and
  a new `get_nz_workspace` accessor returns the buffer cached for a device,
  so each device's workspace is managed independently. This prevents memory
  conflicts in multi-device / pipeline parallel scenarios.
- This change fixes potential precision issues caused by workspace
  overwrites when multiple devices perform ND→NZ conversions concurrently.

Co-authored-by: hipudding <huafengchun@gmail.com>
---
 ggml/src/ggml-cann/ggml-cann.cpp | 55 ++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 0d9eb8fa1b9ca..cc2ac4e243cc5 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1118,28 +1118,39 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
 
 // ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
 namespace {
-    void* g_nz_workspace = nullptr;
-    size_t g_nz_workspace_allocated = 0;
-
-    void release_nz_workspace() {
-        if (g_nz_workspace) {
-            aclrtFree(g_nz_workspace);
-            g_nz_workspace = nullptr;
-            g_nz_workspace_allocated = 0;
+
+    static std::unordered_map<int, void*> g_nz_workspace_map;
+    static std::unordered_map<int, size_t> g_nz_workspace_allocated_map;
+
+    void release_nz_workspace(int device) {
+        auto it = g_nz_workspace_map.find(device);
+        if (it != g_nz_workspace_map.end() && it->second) {
+            aclrtFree(it->second);
+            g_nz_workspace_map.erase(it);
+            g_nz_workspace_allocated_map.erase(device);
         }
     }
 
-    void relloc_nz_workspace(size_t new_size) {
-        if (new_size > g_nz_workspace_allocated) {
-        if (g_nz_workspace) {
-            aclrtFree(g_nz_workspace);
-            g_nz_workspace = nullptr;
+    void relloc_nz_workspace(int device, size_t new_size) {
+        void* &workspace = g_nz_workspace_map[device];
+        size_t &allocated = g_nz_workspace_allocated_map[device];
+
+        if (new_size > allocated) {
+            if (workspace) {
+                aclrtFree(workspace);
+                workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            allocated = new_size;
         }
-        ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        g_nz_workspace_allocated = new_size;
     }
+
+    void* get_nz_workspace(int device) {
+        auto it = g_nz_workspace_map.find(device);
+        return (it != g_nz_workspace_map.end()) ? it->second : nullptr;
     }
-}
+
+} // namespace
 
 /**
  * @brief Convert tensor weights to NZ format using Ascend CANN API.
@@ -1149,13 +1160,13 @@ namespace {
  * improve performance on certain hardware.
  *
  * @param tensor Pointer to the input ggml_tensor containing the weights.
- * @param data Pointer to the raw data buffer for the tensor weights.
  * @param offset Byte offset within the tensor data buffer where weights start.
+ * @param device device id.
  *
  * @note The workspace buffer used in this function is managed globally and reused
  *       across calls. This reduces overhead from repeated memory allocation and deallocation.
  */
-static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
+static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
     aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
                                     tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
@@ -1165,7 +1176,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
     ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
     // Avoid frequent malloc/free of the workspace.
-    relloc_nz_workspace(workspaceSize);
+    relloc_nz_workspace(device, workspaceSize);
+    
+    void* g_nz_workspace = get_nz_workspace(device);
 
     ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
@@ -1203,7 +1216,7 @@ static void ggml_backend_cann_buffer_set_tensor(
         if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
             GGML_ASSERT(tensor->ne[2] == 1);
             GGML_ASSERT(tensor->ne[3] == 1);
-            weight_format_to_nz(tensor, offset);
+            weight_format_to_nz(tensor, offset, ctx->device);
         }
     } else {
         void *transform_buffer = malloc(size);
@@ -2246,7 +2259,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ggml_cann_set_device(cann_ctx->device);
-    release_nz_workspace();
+    release_nz_workspace(cann_ctx->device);
 
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;

From 540d7b4c8b1923c3e3820c2538192c0a1a1deee5 Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Thu, 4 Sep 2025 02:48:10 +0000
Subject: [PATCH 2/4] CANN: Replace per-device workspace maps with NzWorkspace class

Signed-off-by: noemotiovon <757486878@qq.com>
---
 ggml/src/ggml-cann/ggml-cann.cpp | 81 ++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cc2ac4e243cc5..4b43e379dcbc4 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1117,40 +1117,61 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
 }
 
 // ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
-namespace {
-
-    static std::unordered_map<int, void*> g_nz_workspace_map;
-    static std::unordered_map<int, size_t> g_nz_workspace_allocated_map;
-
-    void release_nz_workspace(int device) {
-        auto it = g_nz_workspace_map.find(device);
-        if (it != g_nz_workspace_map.end() && it->second) {
-            aclrtFree(it->second);
-            g_nz_workspace_map.erase(it);
-            g_nz_workspace_allocated_map.erase(device);
+class NzWorkspace {
+public:
+    // Constructor: initialize with no allocated buffer
+    NzWorkspace() : ptr_(nullptr), allocated_(0) {}
+
+    // Reset workspace to uninitialized state:
+    // - Free allocated device memory (if any)
+    // - Clear internal pointer and size
+    // Equivalent to release_nz_workspace(device) in old version
+    void init() {
+        if (ptr_) {
+            aclrtFree(ptr_);
+            ptr_ = nullptr;
+            allocated_ = 0;
         }
     }
 
-    void relloc_nz_workspace(int device, size_t new_size) {
-        void* &workspace = g_nz_workspace_map[device];
-        size_t &allocated = g_nz_workspace_allocated_map[device];
-
-        if (new_size > allocated) {
-            if (workspace) {
-                aclrtFree(workspace);
-                workspace = nullptr;
-            }
-            ACL_CHECK(aclrtMalloc(&workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
-            allocated = new_size;
+    // Allocate or reallocate the workspace buffer:
+    // - If requested size > currently allocated size:
+    //   * Free the old buffer (if any)
+    //   * Allocate a new buffer with requested size on device
+    // - If requested size <= currently allocated size:
+    //   * Do nothing (reuse existing buffer)
+    // Equivalent to relloc_nz_workspace(device, new_size) in old version
+    void realloc(size_t new_size) {
+        if (new_size > allocated_) {
+            init();
+            ACL_CHECK(aclrtMalloc(&ptr_, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            allocated_ = new_size;
         }
     }
 
-    void* get_nz_workspace(int device) {
-        auto it = g_nz_workspace_map.find(device);
-        return (it != g_nz_workspace_map.end()) ? it->second : nullptr;
+    // Return raw device pointer (may be nullptr if not allocated)
+    // Equivalent to get_nz_workspace(device) in old version
+    void* get() const { return ptr_; }
+
+private:
+    void* ptr_;  // Pointer to allocated device buffer
+    size_t allocated_;  // Size of currently allocated buffer (bytes)
+};
+
+// Global array of NzWorkspace, one per device
+// g_nz_workspaces[device] corresponds to workspace of given device
+static std::array<NzWorkspace, GGML_CANN_MAX_DEVICES> g_nz_workspaces;
+
+// Accessor for workspace of a given device
+// - Throws std::out_of_range if device index is invalid
+// - Caller can then use .init(), .realloc(), .get()
+inline NzWorkspace& get_workspace(int device) {
+    if (device < 0 || device >= static_cast<int>(g_nz_workspaces.size())) {
+        throw std::out_of_range("device id out of range");
     }
+    return g_nz_workspaces[device];
+}
 
-} // namespace
 
 /**
  * @brief Convert tensor weights to NZ format using Ascend CANN API.
@@ -1176,9 +1197,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device)
     ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
     // Avoid frequent malloc/free of the workspace.
-    relloc_nz_workspace(device, workspaceSize);
-    
-    void* g_nz_workspace = get_nz_workspace(device);
+    get_workspace(device).realloc(workspaceSize);
+
+    void* g_nz_workspace = get_workspace(device).get();
 
     ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
@@ -2259,7 +2280,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ggml_cann_set_device(cann_ctx->device);
-    release_nz_workspace(cann_ctx->device);
+    get_workspace(cann_ctx->device).init();
 
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;

From 600ef99b2fe469d180f6979ed8c5939d05f0ad7b Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Thu, 4 Sep 2025 04:24:08 +0000
Subject: [PATCH 3/4] CANN: Rename NzWorkspace to ggml_cann_nz_workspace and init() to clear()

Signed-off-by: noemotiovon <757486878@qq.com>
---
 ggml/src/ggml-cann/ggml-cann.cpp | 107 ++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 4b43e379dcbc4..b24c33e85767e 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1116,56 +1116,77 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
-class NzWorkspace {
-public:
-    // Constructor: initialize with no allocated buffer
-    NzWorkspace() : ptr_(nullptr), allocated_(0) {}
-
-    // Reset workspace to uninitialized state:
-    // - Free allocated device memory (if any)
-    // - Clear internal pointer and size
-    // Equivalent to release_nz_workspace(device) in old version
-    void init() {
-        if (ptr_) {
-            aclrtFree(ptr_);
-            ptr_ = nullptr;
-            allocated_ = 0;
+/**
+ * @brief Workspace for caching NZ buffers per device.
+ *
+ * This struct manages a device buffer used in NZ computations. It supports
+ * allocation, reallocation, and clearing of cached memory. The struct is
+ * designed to be used with a global array, one per device.
+ */
+struct ggml_cann_nz_workspace {
+    void*  ptr;       // Pointer to allocated device buffer
+    size_t allocated; // Size of currently allocated buffer in bytes
+
+    /**
+     * @brief Constructor. Initializes the workspace with no allocated memory.
+     */
+    ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
+
+    /**
+     * @brief Free cached memory and reset the workspace.
+     *
+     * If a buffer has been allocated, this function releases it using
+     * aclrtFree and resets internal state.
+     */
+    void clear() {
+        if (ptr) {
+            aclrtFree(ptr);
+            ptr = nullptr;
+            allocated = 0;
         }
     }
 
-    // Allocate or reallocate the workspace buffer:
-    // - If requested size > currently allocated size:
-    //   * Free the old buffer (if any)
-    //   * Allocate a new buffer with requested size on device
-    // - If requested size <= currently allocated size:
-    //   * Do nothing (reuse existing buffer)
-    // Equivalent to relloc_nz_workspace(device, new_size) in old version
+    /**
+     * @brief Allocate or reallocate the workspace buffer.
+     *
+     * If the requested size is larger than the currently allocated size,
+     * the old buffer will be freed and a new buffer of the requested size
+     * will be allocated on the device.
+     *
+     * @param new_size Size in bytes to allocate for the workspace.
+     */
     void realloc(size_t new_size) {
-        if (new_size > allocated_) {
-            init();
-            ACL_CHECK(aclrtMalloc(&ptr_, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
-            allocated_ = new_size;
+        if (new_size > allocated) {
+            clear();
+            ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            allocated = new_size;
         }
     }
 
-    // Return raw device pointer (may be nullptr if not allocated)
-    // Equivalent to get_nz_workspace(device) in old version
-    void* get() const { return ptr_; }
-
-private:
-    void* ptr_;  // Pointer to allocated device buffer
-    size_t allocated_;  // Size of currently allocated buffer (bytes)
+    /**
+     * @brief Get the device buffer pointer.
+     *
+     * @return Pointer to the allocated buffer, or nullptr if not allocated.
+     */
+    void* get() const { return ptr; }
 };
 
-// Global array of NzWorkspace, one per device
-// g_nz_workspaces[device] corresponds to workspace of given device
-static std::array<NzWorkspace, GGML_CANN_MAX_DEVICES> g_nz_workspaces;
+/**
+ * @brief Global array of NZ workspaces, one per device.
+ */
+static std::array<ggml_cann_nz_workspace, GGML_CANN_MAX_DEVICES> g_nz_workspaces;
 
-// Accessor for workspace of a given device
-// - Throws std::out_of_range if device index is invalid
-// - Caller can then use .init(), .realloc(), .get()
-inline NzWorkspace& get_workspace(int device) {
+/**
+ * @brief Get the NZ workspace for a specific device.
+ *
+ * This function returns a reference to the workspace corresponding to the
+ * given device index.
+ *
+ * @param device Device index (0-based). Must be less than GGML_CANN_MAX_DEVICES.
+ * @return Reference to the device's NZ workspace.
+ * @throws std::out_of_range if device index is invalid.
+ */
+inline ggml_cann_nz_workspace& get_nz_workspace(int device) {
     if (device < 0 || device >= static_cast<int>(g_nz_workspaces.size())) {
         throw std::out_of_range("device id out of range");
     }
@@ -1197,9 +1218,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device)
     ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
     // Avoid frequent malloc/free of the workspace.
-    get_workspace(device).realloc(workspaceSize);
+    get_nz_workspace(device).realloc(workspaceSize);
 
-    void* g_nz_workspace = get_workspace(device).get();
+    void* g_nz_workspace = get_nz_workspace(device).get();
 
     ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
@@ -2280,7 +2301,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ggml_cann_set_device(cann_ctx->device);
-    get_workspace(cann_ctx->device).init();
+    get_nz_workspace(cann_ctx->device).clear();
 
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;

From 11d6aa7a94fba438ef6477d0604c6c951301729f Mon Sep 17 00:00:00 2001
From: noemotiovon <757486878@qq.com>
Date: Thu, 4 Sep 2025 07:06:09 +0000
Subject: [PATCH 4/4] CANN: Check aclrtFree result and drop get_nz_workspace accessor

Signed-off-by: noemotiovon <757486878@qq.com>
---
 ggml/src/ggml-cann/ggml-cann.cpp | 28 +++++-----------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index b24c33e85767e..fd82fd27d3e0f 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1140,7 +1140,7 @@ struct ggml_cann_nz_workspace {
      */
     void clear() {
         if (ptr) {
-            aclrtFree(ptr);
+            ACL_CHECK(aclrtFree(ptr));
             ptr = nullptr;
             allocated = 0;
         }
@@ -1174,25 +1174,7 @@ struct ggml_cann_nz_workspace {
 /**
  * @brief Global array of NZ workspaces, one per device.
  */
-static std::array<ggml_cann_nz_workspace, GGML_CANN_MAX_DEVICES> g_nz_workspaces;
-
-/**
- * @brief Get the NZ workspace for a specific device.
- *
- * This function returns a reference to the workspace corresponding to the
- * given device index.
- *
- * @param device Device index (0-based). Must be less than GGML_CANN_MAX_DEVICES.
- * @return Reference to the device's NZ workspace.
- * @throws std::out_of_range if device index is invalid.
- */
-inline ggml_cann_nz_workspace& get_nz_workspace(int device) {
-    if (device < 0 || device >= static_cast<int>(g_nz_workspaces.size())) {
-        throw std::out_of_range("device id out of range");
-    }
-    return g_nz_workspaces[device];
-}
-
+static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
 
 /**
  * @brief Convert tensor weights to NZ format using Ascend CANN API.
@@ -1218,9 +1200,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device)
     ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
     // Avoid frequent malloc/free of the workspace.
-    get_nz_workspace(device).realloc(workspaceSize);
+    g_nz_workspaces[device].realloc(workspaceSize);
 
-    void* g_nz_workspace = get_nz_workspace(device).get();
+    void* g_nz_workspace = g_nz_workspaces[device].get();
 
     ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
@@ -2301,7 +2283,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ggml_cann_set_device(cann_ctx->device);
-    get_nz_workspace(cann_ctx->device).clear();
+    g_nz_workspaces[cann_ctx->device].clear();
 
 #ifdef USE_ACL_GRAPH
     bool use_cann_graph = true;