@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
11961196 // Why aclrtSynchronizeDevice?
11971197
11981198 // Only check env once.
1199- static bool weight_to_nz = parse_bool (get_env (" GGML_CANN_WEIGHT_NZ" ).value_or (" " ));
1199+ static bool weight_to_nz = parse_bool (get_env (" GGML_CANN_WEIGHT_NZ" ).value_or (" on " ));
12001200 if (!need_transform (tensor->type )) {
12011201 ACL_CHECK (aclrtMemcpy ((char *)tensor->data + offset, size, data, size,
12021202 ACL_MEMCPY_HOST_TO_DEVICE));
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
12791279 ACL_MEMCPY_DEVICE_TO_DEVICE));
12801280 return true ;
12811281 } else {
1282+ #ifdef ASCEND_310P
1283+ // TODO: Support 310p P2P copy
1284+ return false ;
1285+ #endif
12821286 // Different device but can access by peer.
12831287 int32_t canAccessPeer = 0 ;
12841288 ACL_CHECK (aclrtDeviceCanAccessPeer (&canAccessPeer, src_ctx->device ,
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
14391443 int64_t ne0 = tensor->ne [0 ];
14401444
14411445 // Only check env once.
1442- static bool weight_to_nz = parse_bool (get_env (" GGML_CANN_WEIGHT_NZ" ).value_or (" " ));
1446+ static bool weight_to_nz = parse_bool (get_env (" GGML_CANN_WEIGHT_NZ" ).value_or (" on " ));
14431447
14441448 // last line must bigger than 32, because every single op deal at
14451449 // least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
20002004 GGML_ASSERT (ggml_backend_is_cann (backend_src) ||
20012005 ggml_backend_is_cann (backend_dst));
20022006
2007+ GGML_ASSERT (!is_matmul_weight ((const ggml_tensor*)src));
2008+
20032009 if (!ggml_backend_buffer_is_cann (src->buffer ) ||
20042010 !ggml_backend_buffer_is_cann (dst->buffer )) {
20052011 return false ;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
20202026 return true ;
20212027 }
20222028 if (backend_src != backend_dst) {
2029+ #ifdef ASCEND_310P
2030+ // TODO: Support 310p P2P copy
2031+ return false ;
2032+ #endif
20232033 ggml_backend_cann_buffer_context* buf_ctx_src =
20242034 (ggml_backend_cann_buffer_context*)buf_src->context ;
20252035 ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
20362046 }
20372047
20382048 // need open both directions for memcpyasync between devices.
2039- ggml_cann_set_device (cann_ctx_dst->device );
20402049 ACL_CHECK (aclrtDeviceEnablePeerAccess (cann_ctx_src->device , 0 ));
20412050 ggml_cann_set_device (cann_ctx_src->device );
20422051 ACL_CHECK (aclrtDeviceEnablePeerAccess (cann_ctx_dst->device , 0 ));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
20472056 ACL_MEMCPY_DEVICE_TO_DEVICE,
20482057 cann_ctx_src->stream ()));
20492058
2050- // TODO: workaround for Event didn`t work here.
2051- aclrtSynchronizeStream (cann_ctx_src->stream ());
2059+ // record event on src stream after the copy
2060+ if (!cann_ctx_src->copy_event ) {
2061+ ACL_CHECK (aclrtCreateEventWithFlag (&cann_ctx_src->copy_event , ACL_EVENT_SYNC));
2062+ }
2063+ ACL_CHECK (aclrtRecordEvent (cann_ctx_src->copy_event , cann_ctx_src->stream ()));
2064+
2065+ // wait on dst stream for the copy to complete
2066+ ggml_cann_set_device (cann_ctx_dst->device );
2067+ ACL_CHECK (aclrtStreamWaitEvent (cann_ctx_dst->stream (), cann_ctx_src->copy_event ));
20522068 } else {
20532069 // src and dst are on the same backend
20542070 ACL_CHECK (aclrtMemcpyAsync (dst->data , copy_size, src->data , copy_size,
0 commit comments