#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS

+#include <functional>
#include <aclnnop/aclnn_abs.h>
#include <aclnnop/aclnn_neg.h>
#include <aclnnop/aclnn_exp.h>
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 */
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);

+/**
+ * @brief A generic wrapper for ACL resources with custom deleter support.
+ */
+using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+
+/**
+ * @brief Trait structure used to define how to destroy a given ACL resource type.
+ *
+ * @tparam T ACL resource type.
+ */
+template<typename T>
+struct acl_resource_traits;
+
+/**
+ * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
+ */
+template<>
+struct acl_resource_traits<aclTensor> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
+ */
+template<>
+struct acl_resource_traits<aclIntArray> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
+ */
+template<>
+struct acl_resource_traits<aclScalar> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
+ */
+template<>
+struct acl_resource_traits<aclTensorList> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
+    }
+};
+
+/**
+ * @brief Creates a generic ACL resource wrapper with proper destruction logic.
+ *
+ * @tparam T ACL resource type.
+ * @param ptr Raw pointer to ACL resource.
+ * @return any_acl_resource Smart pointer that handles destruction.
+ */
+template<typename T>
+any_acl_resource make_acl_resource(T* ptr) {
+    return any_acl_resource(
+        static_cast<void*>(ptr),
+        [](void* p) {
+            acl_resource_traits<T>::destroy(p);
+        }
+    );
+}
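
For illustration, a minimal usage sketch (not part of the patch itself): acl_src is assumed to come from ggml_cann_create_tensor as elsewhere in this backend, and src is a placeholder ggml_tensor pointer.

    // Wrap a raw aclTensor*; the deleter resolved through acl_resource_traits<aclTensor>
    // calls aclDestroyTensor once the wrapper goes out of scope.
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    any_acl_resource guard = make_acl_resource(acl_src);
    // ... use acl_src with aclnn calls ...
    // guard's destructor releases the tensor automatically.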
+
+/**
+ * @brief Registers multiple ACL resources into a vector for lifetime management.
+ *
+ * @tparam Args Variadic list of ACL resource types.
+ * @param vec Target vector to hold ACL resources.
+ * @param args Raw pointers to ACL resources.
+ */
+template<typename... Args>
+void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+    (vec.emplace_back(make_acl_resource(args)), ...);
+}
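
A sketch of how several resources could be collected for joint lifetime management; the tensor pointers are placeholders.

    std::vector<any_acl_resource> resources;
    // The fold expression wraps each raw pointer in turn and appends it.
    register_acl_resources(resources, acl_src0, acl_src1, acl_dst);
    // Destroying (or clearing) the vector destroys every wrapped resource.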
+
+/**
+ * @brief Task class that wraps the execution of an aclnn function call.
+ */
+class aclnn_task : public cann_task {
+    public:
+        aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
+                   uint64_t workspace_size, aclOpExecutor * executor,
+                   aclrtStream stream) :
+            aclnn_func_(aclnn_func),
+            workspace_addr_(workspace_addr),
+            workspace_size_(workspace_size),
+            executor_(executor),
+            stream_(stream) {}
+        virtual void run_task() override {
+            ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
+        }
+    private:
+        aclnn_func_t    aclnn_func_;
+        void *          workspace_addr_;
+        uint64_t        workspace_size_;
+        aclOpExecutor * executor_;
+        aclrtStream     stream_;
+};
+
+/**
+ * @brief Task class that releases ACL resources after usage.
+ */
+class release_resource_task : public cann_task {
+public:
+    release_resource_task(std::vector<any_acl_resource>&& resources) {
+        resource_ = std::move(resources);
+    }
+
+    virtual void run_task() override {
+        resource_.clear();
+    }
+private:
+    std::vector<any_acl_resource> resource_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory copy operations.
+ */
+class async_memcpy_task : public cann_task {
+public:
+    async_memcpy_task(void* dst, const void* src, size_t size,
+                      aclrtMemcpyKind kind, aclrtStream stream)
+        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
+    }
+private:
+    void* dst_;
+    const void* src_;
+    size_t size_;
+    aclrtMemcpyKind kind_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory set operations.
+ */
+class async_memset_task : public cann_task {
+    public:
+        async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
+            : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
+
+        virtual void run_task() override {
+            ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
+        }
+    private:
+        void* buffer_;
+        size_t size_;
+        int32_t value_;
+        aclrtStream stream_;
+};
+
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param CTX The CANN backend context providing the memory pool, stream and task queue.
+ * @param OP_NAME aclnn operator name.
+ * @param args Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
+    do {                                                                                     \
+        uint64_t        workspaceSize = 0;                                                   \
+        aclOpExecutor * executor;                                                            \
+        void *          workspaceAddr = nullptr;                                             \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* The workspace should be allocated in the main thread to keep the malloc order when using vmm. */ \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        if (CTX.async_mode) {                                                                \
+            auto task =                                                                      \
+                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,   \
+                    executor, CTX.stream());                                                 \
+            CTX.task_queue.submit_task(std::move(task));                                     \
+        } else {                                                                             \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
+        }                                                                                    \
+    } while (0)
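
As an illustration of how this macro is meant to be invoked (using the Abs operator, whose header is already included above; acl_src and acl_dst are placeholder aclTensor pointers):

    // Expands to aclnnAbsGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)
    // followed by aclnnAbs(workspaceAddr, workspaceSize, executor, stream), either
    // queued on CTX.task_queue (async mode) or executed directly on CTX.stream().
    GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst);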
+
+/**
+ * @brief Registers and releases multiple ACL resources, optionally deferring the release
+ *        using a task.
+ *
+ * @tparam Args Types of the ACL resources.
+ * @param ctx Backend context which manages task submission and async mode.
+ * @param args Pointers to ACL resources to be released.
+ */
+template <typename... Args>
+void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
+    std::vector<any_acl_resource> resources;
+    register_acl_resources(resources, std::forward<Args>(args)...);
+    if (ctx.async_mode) {
+        auto task = std::make_unique<release_resource_task>(std::move(resources));
+        ctx.task_queue.submit_task(std::move(task));
+    }
+}
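
A typical call pattern, mirroring the updated ggml_cann_binary_op and ggml_cann_unary_op bodies later in this diff (dst is a placeholder ggml_tensor pointer):

    aclTensor* acl_src = ggml_cann_create_tensor(dst->src[0]);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst);
    // In async mode the wrappers are queued behind the aclnn task and destroyed
    // there; otherwise the local vector goes out of scope and releases them here.
    ggml_cann_release_resources(ctx, acl_src, acl_dst);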
+
+/**
+ * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
+ *
+ * @param ctx Backend context containing stream and async configuration.
+ * @param dst Destination memory address.
+ * @param src Source memory address.
+ * @param len Size of memory to copy (in bytes).
+ * @param kind Type of memory copy (host-to-device, device-to-host, etc.).
+ */
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
+                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
+    }
+}
+
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
+                                   const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx->async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
+        ctx->task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
+    }
+}
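
For example, a host-to-device upload could look like the following sketch; ACL_MEMCPY_HOST_TO_DEVICE is the standard aclrtMemcpyKind enumerator, and the pointer and size names are placeholders.

    // Either submits an async_memcpy_task or calls aclrtMemcpyAsync directly;
    // in both cases the copy is ordered on ctx.stream().
    ggml_cann_async_memcpy(ctx, device_ptr, host_ptr, n_bytes,
                           ACL_MEMCPY_HOST_TO_DEVICE);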
+
+/**
+ * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
+ *
+ * @param ctx Backend context containing stream and async configuration.
+ * @param buffer Memory buffer to be set.
+ * @param size Size of the memory buffer (in bytes).
+ * @param value Value to set in the buffer.
+ */
+inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
+                                   size_t size, int value) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
+    }
+}
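
For example, zero-initializing a device buffer (buffer pointer and size are placeholders):

    // Fills n_bytes of device_buffer with the value 0 on the context's stream,
    // via an async_memset_task in async mode or aclrtMemsetAsync otherwise.
    ggml_cann_async_memset(ctx, device_buffer, n_bytes, 0);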
+
/**
 * @brief Applies an element-wise operation to two input tensors using the CANN
 * backend.
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
    binary_op(ctx, acl_src0, acl_src1, acl_dst);

-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
}

-/**
- * @brief Launches an asynchronous task using the memory allocator.
- *
- * This macro submit an asynchronous task on the specified stream.
- * The task uses memory allocated by the allocator. It is guaranteed
- * that the memory will not be accessed by other tasks until this task
- * completes, due to the sequential execution order within the same stream.
- *
- * @param OP_NAME aclnn operator name.
- * @param args Additional arguments required by the task.
- *
- * @note
- * Memory from the allocator will be "freed" immediately and can be
- * reallocated to other pointers. However, it won't be accessed by any
- * other task before this asynchronous task ends, because all tasks in the
- * same stream are executed in queue order.
- */
-#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                                \
-    do {                                                                                     \
-        uint64_t        workspaceSize = 0;                                                   \
-        aclOpExecutor * executor;                                                            \
-        void *          workspaceAddr = nullptr;                                             \
-                                                                                             \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
-                                                                                             \
-        if (workspaceSize > 0) {                                                             \
-            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);             \
-            workspaceAddr = workspace_allocator.get();                                       \
-        }                                                                                    \
-        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
-    } while (0)

/**
 * @brief Applies a unary operation to an input tensor using the CANN backend.
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    unary_op(ctx, acl_src, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}

/**
@@ -832,22 +1062,22 @@ void ggml_cann_unary_op(
 *
 * Internally, the lambda will call:
 * @code
- * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * @endcode
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_unary_op
 * @see GGML_CANN_CALL_ACLNN_OP
 */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
-    do {                                                         \
-        auto lambda = [](ggml_backend_cann_context& ctx,         \
-            aclTensor* acl_src,                                  \
-            aclTensor* acl_dst) {                                \
-            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
-        };                                                       \
-        ggml_cann_unary_op(lambda, ctx, dst);                    \
-    }                                                            \
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
+    do {                                                              \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+            aclTensor* acl_src,                                       \
+            aclTensor* acl_dst) {                                     \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
+        };                                                            \
+        ggml_cann_unary_op(lambda, ctx, dst);                         \
+    }                                                                 \
    while (0)
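
As a usage sketch, an op implementation built on this macro might look like the following; the wrapper function name is hypothetical, and ctx and dst must be in scope because the macro body refers to them directly.

    void ggml_cann_abs(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        // Expands to a lambda calling GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst),
        // handed to ggml_cann_unary_op together with the enclosing ctx and dst.
        GGML_CANN_CALL_UNARY_OP(Abs);
    }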
#endif  // CANN_ACLNN_OPS