
Commit e199a87

Merge pull request #60 from menloresearch/update-dev-from-master-2025-04-18-00-08

Sync master with upstream release b5149

2 parents: e9c6088 + 2f74c35

File tree: 15 files changed, +733 / -388 lines

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 164 additions & 276 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cann/aclnn_ops.h

Lines changed: 277 additions & 47 deletions
@@ -23,6 +23,7 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS
 
+#include <functional>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
 #include <aclnnop/aclnn_exp.h>
@@ -713,6 +714,270 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief A generic wrapper for ACL resources with custom deleter support.
+ */
+using any_acl_resource = std::unique_ptr<void, std::function<void(void*)>>;
+
+/**
+ * @brief Trait structure used to define how to destroy a given ACL resource type.
+ *
+ * @tparam T ACL resource type.
+ */
+template<typename T>
+struct acl_resource_traits;
+
+/**
+ * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource.
+ */
+template<>
+struct acl_resource_traits<aclTensor> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensor(static_cast<aclTensor*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource.
+ */
+template<>
+struct acl_resource_traits<aclIntArray> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyIntArray(static_cast<aclIntArray*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource.
+ */
+template<>
+struct acl_resource_traits<aclScalar> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyScalar(static_cast<aclScalar*>(p)));
+    }
+};
+
+/**
+ * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource.
+ */
+template<>
+struct acl_resource_traits<aclTensorList> {
+    static void destroy(void* p) {
+        ACL_CHECK(aclDestroyTensorList(static_cast<aclTensorList*>(p)));
+    }
+};
+
+/**
+ * @brief Creates a generic ACL resource wrapper with proper destruction logic.
+ *
+ * @tparam T ACL resource type.
+ * @param ptr Raw pointer to ACL resource.
+ * @return any_acl_resource Smart pointer that handles destruction.
+ */
+template<typename T>
+any_acl_resource make_acl_resource(T* ptr) {
+    return any_acl_resource(
+        static_cast<void*>(ptr),
+        [](void* p) {
+            acl_resource_traits<T>::destroy(p);
+        }
+    );
+}
+
+/**
+ * @brief Registers multiple ACL resources into a vector for lifetime management.
+ *
+ * @tparam Args Variadic list of ACL resource types.
+ * @param vec Target vector to hold ACL resources.
+ * @param args Raw pointers to ACL resources.
+ */
+template<typename... Args>
+void register_acl_resources(std::vector<any_acl_resource>& vec, Args*... args) {
+    (vec.emplace_back(make_acl_resource(args)), ...);
+}
+
+/**
+ * @brief Task class that wraps the execution of an aclnn function call.
+ */
+class aclnn_task : public cann_task {
+public:
+    aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr,
+               uint64_t workspace_size, aclOpExecutor * executor,
+               aclrtStream stream) :
+        aclnn_func_(aclnn_func),
+        workspace_addr_(workspace_addr),
+        workspace_size_(workspace_size),
+        executor_(executor),
+        stream_(stream) {}
+    virtual void run_task() override {
+        ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_));
+    }
+private:
+    aclnn_func_t aclnn_func_;
+    void * workspace_addr_;
+    uint64_t workspace_size_;
+    aclOpExecutor * executor_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class that releases ACL resources after usage.
+ */
+class release_resource_task : public cann_task {
+public:
+    release_resource_task(std::vector<any_acl_resource>&& resources) {
+        resource_ = std::move(resources);
+    }
+
+    virtual void run_task() override {
+        resource_.clear();
+    }
+private:
+    std::vector<any_acl_resource> resource_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory copy operations.
+ */
+class async_memcpy_task : public cann_task {
+public:
+    async_memcpy_task(void* dst, const void* src, size_t size,
+                      aclrtMemcpyKind kind, aclrtStream stream)
+        : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_));
+    }
+private:
+    void* dst_;
+    const void* src_;
+    size_t size_;
+    aclrtMemcpyKind kind_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Task class for performing asynchronous memory set operations.
+ */
+class async_memset_task : public cann_task {
+public:
+    async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream)
+        : buffer_(buffer), size_(size), value_(value), stream_(stream) {}
+
+    virtual void run_task() override {
+        ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_));
+    }
+private:
+    void* buffer_;
+    size_t size_;
+    int32_t value_;
+    aclrtStream stream_;
+};
+
+/**
+ * @brief Launches an asynchronous task using the memory allocator.
+ *
+ * This macro submits an asynchronous task on the specified stream.
+ * The task uses memory allocated by the allocator. It is guaranteed
+ * that the memory will not be accessed by other tasks until this task
+ * completes, due to the sequential execution order within the same stream.
+ *
+ * @param OP_NAME aclnn operator name.
+ * @param args Additional arguments required by the task.
+ *
+ * @note
+ * Memory from the allocator will be "freed" immediately and can be
+ * reallocated to other pointers. However, it won't be accessed by any
+ * other task before this asynchronous task ends, because all tasks in the
+ * same stream are executed in queue order.
+ */
+
+#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...)                                           \
+    do {                                                                                     \
+        uint64_t workspaceSize = 0;                                                          \
+        aclOpExecutor * executor;                                                            \
+        void * workspaceAddr = nullptr;                                                      \
+        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
+        /* workspace should be allocated in the main thread to keep malloc order when using vmm. */ \
+        if (workspaceSize > 0) {                                                             \
+            ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize);             \
+            workspaceAddr = workspace_allocator.get();                                       \
+        }                                                                                    \
+        if (CTX.async_mode) {                                                                \
+            auto task = std::make_unique<aclnn_task>(aclnn##OP_NAME, workspaceAddr,          \
+                                                     workspaceSize, executor, CTX.stream()); \
+            CTX.task_queue.submit_task(std::move(task));                                     \
+        } else {                                                                             \
+            ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \
+        }                                                                                    \
+    } while (0)
+
+/**
+ * @brief Registers and releases multiple ACL resources, optionally deferring the release
+ *        using a task.
+ *
+ * @tparam Args Types of the ACL resources.
+ * @param ctx Backend context which manages task submission and async mode.
+ * @param args Pointers to ACL resources to be released.
+ */
+template <typename... Args>
+void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) {
+    std::vector<any_acl_resource> resources;
+    register_acl_resources(resources, std::forward<Args>(args)...);
+    if (ctx.async_mode) {
+        auto task = std::make_unique<release_resource_task>(std::move(resources));
+        ctx.task_queue.submit_task(std::move(task));
+    }
+}
+
+/**
+ * @brief Performs an asynchronous memory copy operation, optionally deferred via task submission.
+ *
+ * @param ctx Backend context containing stream and async configuration.
+ * @param dst Destination memory address.
+ * @param src Source memory address.
+ * @param len Size of memory to copy (in bytes).
+ * @param kind Type of memory copy (host-to-device, device-to-host, etc).
+ */
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst,
+        const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx.stream()));
+    }
+}
+
+inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst,
+        const void * src, size_t len, aclrtMemcpyKind kind) {
+    if (ctx->async_mode) {
+        auto task = std::make_unique<async_memcpy_task>(dst, const_cast<void *>(src), len, kind, ctx->stream());
+        ctx->task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemcpyAsync(dst, len, src, len, kind, ctx->stream()));
+    }
+}
+
+/**
+ * @brief Performs an asynchronous memory set operation, optionally deferred via task submission.
+ *
+ * @param ctx Backend context containing stream and async configuration.
+ * @param buffer Memory buffer to be set.
+ * @param size Size of the memory buffer (in bytes).
+ * @param value Value to set in the buffer.
+ */
+inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer,
+        size_t size, int value) {
+    if (ctx.async_mode) {
+        auto task = std::make_unique<async_memset_task>(buffer, size, value, ctx.stream());
+        ctx.task_queue.submit_task(std::move(task));
+    } else {
+        ACL_CHECK(aclrtMemsetAsync(buffer, size, value, size, ctx.stream()));
+    }
+}
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
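
Taken together, the additions above replace per-call ACL_CHECK(aclDestroy*) cleanup with RAII wrappers whose destruction can be queued behind the kernel on ctx.task_queue, and they fold workspace allocation and async dispatch into GGML_CANN_CALL_ACLNN_OP itself. A minimal call-site sketch of how the pieces compose; example_cann_abs is hypothetical and aclnnAbs is only an illustrative operator, not something this commit adds:

    // Hypothetical op body following the usual CANN conventions
    // (dst->src[0] as the input, ggml_cann_create_tensor as used elsewhere in this file).
    static void example_cann_abs(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ggml_tensor* src = dst->src[0];
        aclTensor* acl_src = ggml_cann_create_tensor(src);
        aclTensor* acl_dst = ggml_cann_create_tensor(dst);

        // Expands to aclnnAbsGetWorkspaceSize(...) plus aclnnAbs(...), either queued
        // as an aclnn_task (when ctx.async_mode is set) or run inline on ctx.stream().
        GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst);

        // In async mode the aclDestroyTensor calls are deferred behind the queued
        // kernel via release_resource_task; otherwise the wrappers are destroyed as
        // soon as the helper's local vector goes out of scope.
        ggml_cann_release_resources(ctx, acl_src, acl_dst);
    }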
@@ -742,42 +1007,9 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
     binary_op(ctx, acl_src0, acl_src1, acl_dst);
 
-    ACL_CHECK(aclDestroyTensor(acl_src0));
-    ACL_CHECK(aclDestroyTensor(acl_src1));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst);
 }
 
-/**
- * @brief Launches an asynchronous task using the memory allocator.
- *
- * This macro submit an asynchronous task on the specified stream.
- * The task uses memory allocated by the allocator. It is guaranteed
- * that the memory will not be accessed by other tasks until this task
- * completes, due to the sequential execution order within the same stream.
- *
- * @param OP_NAME aclnn operator name.
- * @param args Additional arguments required by the task.
- *
- * @note
- * Memory from the allocator will be "freed" immediately and can be
- * reallocated to other pointers. However, it won't be accessed by any
- * other task before this asynchronous task ends, because all tasks in the
- * same stream are executed in queue order.
- */
-#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...)                                                \
-    do {                                                                                     \
-        uint64_t workspaceSize = 0;                                                          \
-        aclOpExecutor * executor;                                                            \
-        void * workspaceAddr = nullptr;                                                      \
-                                                                                             \
-        ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
-                                                                                             \
-        if (workspaceSize > 0) {                                                             \
-            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);             \
-            workspaceAddr = workspace_allocator.get();                                       \
-        }                                                                                    \
-        ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream()));     \
-    } while (0)
 
 /**
  * @brief Applies a unary operation to an input tensor using the CANN backend.
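
With workspace handling and async dispatch folded into the relocated macro, call sites such as ggml_cann_binary_op above shrink to a single ggml_cann_release_resources call. The same context-driven pattern applies to the new ggml_cann_async_memcpy and ggml_cann_async_memset helpers; a hypothetical usage sketch (stage_inputs and the buffer names are illustrative, not part of this commit):

    // Copies a host buffer to the device and zeroes an accumulator, letting the
    // context choose between queued tasks (async_mode) and direct aclrt calls.
    // host_src is only enqueued here, so the caller must keep it valid until the
    // copy has actually executed on ctx.stream().
    static void stage_inputs(ggml_backend_cann_context& ctx,
                             void* dev_dst, const void* host_src, size_t len,
                             void* dev_accum, size_t accum_bytes) {
        ggml_cann_async_memcpy(ctx, dev_dst, host_src, len, ACL_MEMCPY_HOST_TO_DEVICE);
        ggml_cann_async_memset(ctx, dev_accum, accum_bytes, 0);
    }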
@@ -799,9 +1031,7 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
-
-    ACL_CHECK(aclDestroyTensor(acl_src));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
 
 /**
@@ -832,22 +1062,22 @@ void ggml_cann_unary_op(
  *
  * Internally, the lambda will call:
  * @code
- * GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
+ * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
  * @endcode
  *
  * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
  *
  * @see ggml_cann_unary_op
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
-    do {                                                         \
-        auto lambda = [](ggml_backend_cann_context& ctx,         \
-            aclTensor* acl_src,                                  \
-            aclTensor* acl_dst) {                                \
-            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
-        };                                                       \
-        ggml_cann_unary_op(lambda, ctx, dst);                    \
-    }                                                            \
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                              \
+    do {                                                              \
+        auto lambda = [](ggml_backend_cann_context& ctx,              \
+            aclTensor* acl_src,                                       \
+            aclTensor* acl_dst) {                                     \
+            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);  \
+        };                                                            \
+        ggml_cann_unary_op(lambda, ctx, dst);                         \
+    }                                                                 \
     while (0)
 #endif // CANN_ACLNN_OPS
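
For reference, a call-site sketch of the updated unary path; example_cann_cos and example_cann_leaky_relu are hypothetical, and Cos/LeakyRelu are simply aclnn operators with the right shapes for the two patterns (the real dispatch lives in the backend's op table, not in this header):

    // Simple case: a one-input, one-output operator goes through the macro, which
    // expects `ctx` and `dst` to be in scope and expands to the lambda shown above.
    static void example_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        GGML_CANN_CALL_UNARY_OP(Cos);
    }

    // When an operator needs extra arguments, a hand-written lambda can be passed to
    // ggml_cann_unary_op directly and release its own extra resources.
    static void example_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        ggml_cann_unary_op(
            [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
                float slope = 0.01f;  // illustrative constant
                aclScalar* acl_slope = aclCreateScalar(&slope, aclDataType::ACL_FLOAT);
                GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_slope, acl_dst);
                ggml_cann_release_resources(ctx, acl_slope);
            },
            ctx, dst);
    }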
