 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS
 
+#include <functional>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief A generic wrapper for ACL resources with custom deleter support.
+ */
+using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+
+/**
+ * @brief Trait structure used to define how to destroy a given ACL resource type.
+ *
+ * @tparam T ACL resource type.
+ */
+template<typename T>
+struct acl_resource_traits;
+
+/**
+ * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
+ */
+template<>
+struct acl_resource_traits<aclTensor> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
+ */
+template<>
+struct acl_resource_traits<aclIntArray> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
+ */
+template<>
+struct acl_resource_traits<aclScalar> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
+ */
+template<>
+struct acl_resource_traits<aclTensorList> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
+    }
+};
+
+/**
+ * @brief Creates a generic ACL resource wrapper with proper destruction logic.
+ *
+ * @tparam T ACL resource type.
+ * @param ptr Raw pointer to ACL resource.
+ * @return any_acl_resource Smart pointer that handles destruction.
+ */
+template<typename T>
+any_acl_resource make_acl_resource(T* ptr) {
+    return any_acl_resource(
+        static_cast<void*>(ptr),
+        [](void* p) {
+            acl_resource_traits<T>::destroy(p);
+        }
+    );
+}
+
+/**
+ * @brief Registers multiple ACL resources into a vector for lifetime management.
+ *
+ * @tparam Args Variadic list of ACL resource types.
+ * @param vec Target vector to hold ACL resources.
+ * @param args Raw pointers to ACL resources.
+ */
+template<typename... Args>
+void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+    (vec.emplace_back(make_acl_resource(args)), ...);
+}
+
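As a hedged illustration of how these low-level helpers fit together: the op body and tensor wiring below are hypothetical, and real ops in this header normally go through `ggml_cann_release_resources()` further down rather than holding the vector themselves.

```cpp
// Hypothetical sketch: wrap two aclTensor handles so they are destroyed
// automatically via acl_resource_traits<aclTensor>::destroy().
static void example_manual_lifetime(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclTensor* acl_src = ggml_cann_create_tensor(dst->src[0]);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    std::vector<any_acl_resource> resources;
    register_acl_resources(resources, acl_src, acl_dst);  // takes ownership

    // ... launch kernels that consume acl_src / acl_dst ...

    // The handles are released when `resources` goes out of scope, or the
    // vector can be moved into a release_resource_task for deferred cleanup.
}
```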
+/**
+ * @brief Task class that wraps the execution of an aclnn function call.
+ */
+class aclnn_task : public cann_task {
+  public:
+    aclnn_task(aclnn_func_t aclnn_func, void* workspace_addr,
+               uint64_t workspace_size, aclOpExecutor* executor,
+               aclrtStream stream) :
+        aclnn_func_(aclnn_func),
+        workspace_addr_(workspace_addr),
+        workspace_size_(workspace_size),
+        executor_(executor),
+        stream_(stream) {}
+    virtual void run_task() override {
+        ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
+    }
+  private:
+    aclnn_func_t aclnn_func_;
+    void* workspace_addr_;
+    uint64_t workspace_size_;
+    aclOpExecutor* executor_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class that releases ACL resources after usage.
+ */
+class release_resource_task : public cann_task {
+  public:
+    release_resource_task(std::vector<any_acl_resource>&& resources) {
+        resource_ = std::move(resources);
+    }
+
+    virtual void run_task() override {
+        resource_.clear();
+    }
+  private:
+    std::vector<any_acl_resource> resource_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory copy operations.
+ */
+class async_memcpy_task : public cann_task {
+  public:
+    async_memcpy_task(void* dst, const void* src, size_t size,
+                      aclrtMemcpyKind kind, aclrtStream stream)
+        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
+    }
+  private:
+    void* dst_;
+    const void* src_;
+    size_t size_;
+    aclrtMemcpyKind kind_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory set operations.
+ */
+class async_memset_task : public cann_task {
+  public:
+    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
+        : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
+    }
+  private:
+    void* buffer_;
+    size_t size_;
+    int32_t value_;
+    aclrtStream stream_;
+};
+
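For orientation, a hedged sketch of how the `cann_task` interface could be extended with a custom deferred operation. `record_event_task` is not part of this header; it only assumes the `run_task()` interface and the `task_queue.submit_task()` call used elsewhere in this file, plus the standard ACL runtime call `aclrtRecordEvent`.

```cpp
// Hypothetical sketch: a user-defined task that records an event on the stream,
// following the same pattern as the task classes above.
class record_event_task : public cann_task {
  public:
    record_event_task(aclrtEvent event, aclrtStream stream)
        : event_(event), stream_(stream) {}

    virtual void run_task() override {
        ACL_CHECK(aclrtRecordEvent(event_, stream_));
    }
  private:
    aclrtEvent event_;
    aclrtStream stream_;
};

// Submission mirrors the async branches below:
//   ctx.task_queue.submit_task(std::make_unique<record_event_task>(evt, ctx.stream()));
```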
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param CTX     Backend CANN context providing the memory pool, stream, and task queue.
+ * @param OP_NAME aclnn operator name.
+ * @param args    Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
+    do {                                                                                     \
+        uint64_t workspaceSize = 0;                                                          \
+        aclOpExecutor* executor;                                                             \
+        void* workspaceAddr = nullptr;                                                       \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* The workspace should be allocated in the main thread to keep the malloc order when using vmm. */ \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        if (CTX.async_mode) {                                                                \
+            auto task =                                                                      \
+                std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr, workspaceSize,   \
+                                             executor, CTX.stream());                       \
+            CTX.task_queue.submit_task(std::move(task));                                     \
+        } else {                                                                             \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
+        }                                                                                    \
+    } while (0)
+
+/**
+ * @brief Registers and releases multiple ACL resources, optionally deferring the release
+ *        using a task.
+ *
+ * In async mode the release is submitted as a task to the context's task queue;
+ * otherwise the resources are destroyed immediately when the local vector goes
+ * out of scope at the end of this function.
+ *
+ * @tparam Args Types of the ACL resources.
+ * @param ctx  Backend context which manages task submission and async mode.
+ * @param args Pointers to ACL resources to be released.
+ */
+template<typename... Args>
+void ggml_cann_release_resources(ggml_backend_cann_context& ctx, Args&&... args) {
+    std::vector<any_acl_resource> resources;
+    register_acl_resources(resources, std::forward<Args>(args)...);
+    if (ctx.async_mode) {
+        auto task = std::make_unique<release_resource_task>(std::move(resources));
+        ctx.task_queue.submit_task(std::move(task));
+    }
+}
+
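Taken together, a hedged sketch of the calling pattern that the ops below follow (compare `ggml_cann_binary_op` and `ggml_cann_unary_op` further down). The `Cos` operator and the helper name `example_cann_cos` are illustrative assumptions, not part of this patch.

```cpp
// Hypothetical op body: create ACL views, launch the kernel, then hand the
// handles to ggml_cann_release_resources() so destruction can be deferred
// in async mode.
static void example_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

    // Expands to the GetWorkspaceSize / execute pair, running either
    // synchronously or through the task queue depending on ctx.async_mode.
    GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);

    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
```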
+/**
+ * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
+ *
+ * @param ctx  Backend context containing stream and async configuration.
+ * @param dst  Destination memory address.
+ * @param src  Source memory address.
+ * @param len  Size of memory to copy (in bytes).
+ * @param kind Type of memory copy (host-to-device, device-to-host, etc.).
+ */
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context& ctx, void* dst,
+                                   const void* src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void*>(src), len, kind, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
+    }
+}
+
+/**
+ * @brief Overload of ggml_cann_async_memcpy() that takes the backend context by pointer.
+ */
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context* ctx, void* dst,
+                                   const void* src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx->async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void*>(src), len, kind, ctx->stream());
+        ctx->task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
+    }
+}
+
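A minimal, hypothetical usage sketch, assuming `ggml_nbytes()` for the byte count and the standard `ACL_MEMCPY_DEVICE_TO_DEVICE` enumerator of `aclrtMemcpyKind`:

```cpp
// Hypothetical sketch: copy the contents of src into dst on the device.
// In async mode this is queued as an async_memcpy_task; otherwise it issues
// aclrtMemcpyAsync directly on the context's stream.
static void example_copy_tensor(ggml_backend_cann_context& ctx,
                                ggml_tensor* dst, const ggml_tensor* src) {
    ggml_cann_async_memcpy(ctx, dst->data, src->data, ggml_nbytes(src),
                           ACL_MEMCPY_DEVICE_TO_DEVICE);
}
```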
+/**
+ * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
+ *
+ * @param ctx    Backend context containing stream and async configuration.
+ * @param buffer Memory buffer to be set.
+ * @param size   Size of the memory buffer (in bytes).
+ * @param value  Value to set in the buffer.
+ */
+inline void ggml_cann_async_memset(ggml_backend_cann_context& ctx, void* buffer,
+                                   size_t size, int value) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
+    }
+}
+
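Similarly, a hypothetical one-liner that zero-fills a destination buffer before accumulation; the helper name is illustrative only.

```cpp
// Hypothetical sketch: either enqueues an async_memset_task or calls
// aclrtMemsetAsync directly, depending on ctx.async_mode.
static void example_zero_dst(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_cann_async_memset(ctx, dst->data, ggml_nbytes(dst), 0);
}
```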
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
     binary_op(ctx, acl_src0, acl_src1, acl_dst);
 
-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
 }
 
-/**
- * @brief Launches an asynchronous task using the memory allocator.
- *
- * This macro submit an asynchronous task on the specified stream.
- * The task uses memory allocated by the allocator. It is guaranteed
- * that the memory will not be accessed by other tasks until this task
- * completes, due to the sequential execution order within the same stream.
- *
- * @param OP_NAME aclnn operator name.
- * @param args Additional arguments required by the task.
- *
- * @note
- * Memory from the allocator will be "freed" immediately and can be
- * reallocated to other pointers. However, it won't be accessed by any
- * other task before this asynchronous task ends, because all tasks in the
- * same stream are executed in queue order.
- */
-#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                                \
-    do {                                                                                     \
-        uint64_t workspaceSize = 0;                                                          \
-        aclOpExecutor* executor;                                                             \
-        void* workspaceAddr = nullptr;                                                       \
-                                                                                             \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
-                                                                                             \
-        if (workspaceSize > 0) {                                                             \
-            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);             \
-            workspaceAddr = workspace_allocator.get();                                       \
-        }                                                                                    \
-        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
-    } while (0)
 
 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
 
 /**
@@ -832,22 +1062,22 @@ void ggml_cann_unary_op(
  *
  * Internally, the lambda will call:
  * @code
- * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
  * @endcode
  *
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
  *
  * @see ggml_cann_unary_op
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
-    do {                                                         \
-        auto lambda = [](ggml_backend_cann_context& ctx,         \
-                         aclTensor* acl_src,                     \
-                         aclTensor* acl_dst) {                   \
-            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
-        };                                                       \
-        ggml_cann_unary_op(lambda, ctx, dst);                    \
-    }                                                            \
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
+    do {                                                              \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+                         aclTensor* acl_src,                          \
+                         aclTensor* acl_dst) {                        \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
+        };                                                            \
+        ggml_cann_unary_op(lambda, ctx, dst);                         \
+    }                                                                 \
     while (0)
 #endif  // CANN_ACLNN_OPS