diff --git a/include/infinicore/context/context.hpp b/include/infinicore/context/context.hpp index 1612db830..eb46dc622 100644 --- a/include/infinicore/context/context.hpp +++ b/include/infinicore/context/context.hpp @@ -3,6 +3,8 @@ #include "../device.hpp" #include "../memory.hpp" +#include "../graph/graph.hpp" + #include #include @@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event); float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); +// Graph recording APIs +bool isGraphRecording(); +void startGraphRecording(); +void addGraphOperator(std::shared_ptr op); +std::shared_ptr stopGraphRecording(); + } // namespace context } // namespace infinicore diff --git a/include/infinicore/graph/graph.hpp b/include/infinicore/graph/graph.hpp new file mode 100644 index 000000000..c63b3272d --- /dev/null +++ b/include/infinicore/graph/graph.hpp @@ -0,0 +1,92 @@ +#pragma once + +#include +#include + +#include "../tensor.hpp" + +namespace infinicore::graph { +// Forward declarations +class GraphManager; + +class GraphTensor : public Tensor { +public: + GraphTensor(const Tensor &); +}; + +class GraphOperator { + +public: + void run() const; + ~GraphOperator(); + +protected: + using run_schema = void (*)(void *); + using cleanup_schema = void (*)(void **); + void *planned_meta_; + run_schema runner_; + cleanup_schema deleter_; +}; + +class Graph { +public: + Graph() = default; + ~Graph() = default; + + void run() const; + +protected: + void add_operator(std::shared_ptr op); + + std::vector> op_list_; + + friend class GraphManager; +}; +} // namespace infinicore::graph + +#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \ + class __OP_NAME__ : public graph::GraphOperator { \ + public: \ + using schema = void (*)(__VA_ARGS__); \ + using plan_schema = void *(*)(__VA_ARGS__); \ + static common::OpDispatcher &plan_dispatcher(); \ + static common::OpDispatcher &run_dispatcher(); \ + static common::OpDispatcher &cleanup_dispatcher(); \ + __OP_NAME__(__VA_ARGS__); \ + static void execute(__VA_ARGS__); \ + }; + +#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \ + common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \ + return dispatcher_; \ + } \ + common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \ + return dispatcher_; \ + } \ + common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \ + return dispatcher_; \ + } + +#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \ + planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \ + runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \ + deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__); + +#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) 
\ + auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \ + if (context::isGraphRecording()) { \ + context::addGraphOperator(op); \ + } else { \ + op->run(); \ + } + +#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \ + static bool registered = []() { \ + __OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \ + __OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \ + __OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \ + return true; \ + }(); diff --git a/include/infinicore/ops/gemm.hpp b/include/infinicore/ops/gemm.hpp index 6562f087d..481d47cf6 100644 --- a/include/infinicore/ops/gemm.hpp +++ b/include/infinicore/ops/gemm.hpp @@ -1,16 +1,12 @@ #pragma once #include "../device.hpp" +#include "../graph/graph.hpp" #include "common/op.hpp" namespace infinicore::op { -class Gemm { -public: - using schema = void (*)(Tensor, Tensor, Tensor, float, float); - static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta); - static common::OpDispatcher &dispatcher(); -}; +INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float); Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f); void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta); diff --git a/include/infinicore/ops/paged_caching.hpp b/include/infinicore/ops/paged_caching.hpp index 4cf8d4f02..e357cda38 100644 --- a/include/infinicore/ops/paged_caching.hpp +++ b/include/infinicore/ops/paged_caching.hpp @@ -8,10 +8,10 @@ namespace infinicore::op { class PagedCaching { public: using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor); - static void execute(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping); + static void execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping); static common::OpDispatcher &dispatcher(); }; -void paged_caching_(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping); +void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping); } // namespace infinicore::op diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp index 58a8f59e7..e9f210186 100644 --- a/include/infinicore/tensor.hpp +++ b/include/infinicore/tensor.hpp @@ -133,6 +133,8 @@ class TensorImpl : public std::enable_shared_from_this { void debug() const; + Tensor to_blob() const; + /// /// Data Transfer APIs /// @@ -294,7 +296,7 @@ class TensorImpl : public std::enable_shared_from_this { friend class Tensor; -private: +protected: TensorMetaData meta_; TensorData data_; }; diff --git a/include/infiniop/ops/paged_caching.h b/include/infiniop/ops/paged_caching.h index cf162ba38..33764138d 100644 --- a/include/infiniop/ops/paged_caching.h +++ b/include/infiniop/ops/paged_caching.h @@ -14,20 +14,20 @@ typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t; * * @param handle The handle to the InfiniOP library context. * @param desc_ptr A pointer to store the created descriptor. - * @param k_desc Descriptor for the source key tensor. - * @param v_desc Descriptor for the source value tensor. * @param k_cache_desc Descriptor for the key cache pool tensor. * @param v_cache_desc Descriptor for the value cache pool tensor. + * @param k_desc Descriptor for the source key tensor. + * @param v_desc Descriptor for the source value tensor. * @param slot_mapping_desc Descriptor for the slot mapping tensor. * @return infiniStatus_t Status code of the operation. 
*/ __C __export infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, infiniopPagedCachingDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc); /** @@ -46,10 +46,10 @@ __C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize( * @param desc The Paged Caching descriptor. * @param workspace Pointer to the workspace memory. * @param workspace_size The size of the workspace. - * @param k Pointer to the source key tensor data. - * @param v Pointer to the source value tensor data. * @param k_cache Pointer to the key cache pool data. * @param v_cache Pointer to the value cache pool data. + * @param k Pointer to the source key tensor data. + * @param v Pointer to the source value tensor data. * @param slot_mapping Pointer to the slot mapping data. * @param stream The CUDA stream for the operation. Can be NULL. * @return infiniStatus_t Status code of the operation. @@ -58,10 +58,10 @@ __C __export infiniStatus_t infiniopPagedCaching( infiniopPagedCachingDescriptor_t desc, void *workspace, size_t workspace_size, - const void *k, - const void *v, void *k_cache, void *v_cache, + const void *k, + const void *v, const void *slot_mapping, void *stream); diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index b7288f3ac..c6b01d5aa 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -8,7 +8,10 @@ get_device, get_device_count, get_stream, + is_graph_recording, set_device, + start_graph_recording, + stop_graph_recording, sync_device, sync_stream, ) @@ -81,6 +84,9 @@ "set_device", "sync_device", "sync_stream", + "is_graph_recording", + "start_graph_recording", + "stop_graph_recording", # Data Types. "bfloat16", "bool", diff --git a/python/infinicore/context.py b/python/infinicore/context.py index d74a839f2..1181738c5 100644 --- a/python/infinicore/context.py +++ b/python/infinicore/context.py @@ -1,4 +1,5 @@ import infinicore.device +from infinicore.graph import Graph from infinicore.lib import _infinicore @@ -49,3 +50,24 @@ def get_stream(): stream: The current stream object """ return _infinicore.get_stream() + + +def is_graph_recording(): + """Check if the current graph is recording. + + Returns: + bool: True if the current graph is recording, False otherwise + """ + return _infinicore.is_graph_recording() + + +def start_graph_recording(device=None): + """Start recording the current graph.""" + if device is not None: + set_device(device) + _infinicore.start_graph_recording() + + +def stop_graph_recording(): + """Stop recording the current graph.""" + return Graph(_infinicore.stop_graph_recording()) diff --git a/python/infinicore/graph.py b/python/infinicore/graph.py new file mode 100644 index 000000000..7d7feb970 --- /dev/null +++ b/python/infinicore/graph.py @@ -0,0 +1,18 @@ +from infinicore.lib import _infinicore + + +class Graph: + """ + Python wrapper around a InfiniCore Graph instance. 
+ """ + + def __init__(self, graph: _infinicore.Graph): + if not isinstance(graph, _infinicore.Graph): + raise TypeError("Expected _infinicore.Graph") + self._graph = graph + + def run(self): + return self._graph.run() + + def __repr__(self): + return f"" diff --git a/python/infinicore/ops/paged_caching.py b/python/infinicore/ops/paged_caching.py index 59459eba1..e3b8d63fb 100644 --- a/python/infinicore/ops/paged_caching.py +++ b/python/infinicore/ops/paged_caching.py @@ -3,18 +3,18 @@ def paged_caching( - k: Tensor, - v: Tensor, k_cache: Tensor, v_cache: Tensor, + k: Tensor, + v: Tensor, slot_mapping: Tensor, ): Tensor( _infinicore.paged_caching_( - k._underlying, - v._underlying, k_cache._underlying, v_cache._underlying, + k._underlying, + v._underlying, slot_mapping._underlying, ) ) diff --git a/src/infinicore/context/allocators/device_pinned_allocator.cc b/src/infinicore/context/allocators/device_pinned_allocator.cc index b9e8ea217..36ee599ff 100644 --- a/src/infinicore/context/allocators/device_pinned_allocator.cc +++ b/src/infinicore/context/allocators/device_pinned_allocator.cc @@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() { } std::byte *DevicePinnedHostAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } void *ptr; INFINICORE_CHECK_ERROR(infinirtMallocHost(&ptr, size)); return (std::byte *)ptr; } void DevicePinnedHostAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } if (owner_ == context::getDevice()) { INFINICORE_CHECK_ERROR(infinirtFreeHost(ptr)); gc(); diff --git a/src/infinicore/context/allocators/host_allocator.cc b/src/infinicore/context/allocators/host_allocator.cc index a06ca8284..41a55b453 100644 --- a/src/infinicore/context/allocators/host_allocator.cc +++ b/src/infinicore/context/allocators/host_allocator.cc @@ -4,10 +4,16 @@ namespace infinicore { std::byte *HostAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } return (std::byte *)std::malloc(size); } void HostAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } std::free(ptr); } diff --git a/src/infinicore/context/allocators/pinnable_block_allocator.cc b/src/infinicore/context/allocators/pinnable_block_allocator.cc index ad81367a3..f41800d7c 100644 --- a/src/infinicore/context/allocators/pinnable_block_allocator.cc +++ b/src/infinicore/context/allocators/pinnable_block_allocator.cc @@ -1,5 +1,7 @@ #include "pinnable_block_allocator.hpp" +#include "../context_impl.hpp" + #include "../../utils.hpp" #include @@ -35,6 +37,9 @@ PinnableBlockAllocator::PinnableBlockAllocator(Device device) // ------------------- allocate ------------------- std::byte *PinnableBlockAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } std::lock_guard lock(mutex_); // Align size to 256 bytes for GPU @@ -92,7 +97,7 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) { // ------------------- deallocate ------------------- void PinnableBlockAllocator::deallocate(std::byte *ptr) { - if (!ptr) { + if (ptr == nullptr) { return; } diff --git a/src/infinicore/context/allocators/pinnable_block_allocator.hpp b/src/infinicore/context/allocators/pinnable_block_allocator.hpp index 65aac7784..8911d2a6d 100644 --- a/src/infinicore/context/allocators/pinnable_block_allocator.hpp +++ b/src/infinicore/context/allocators/pinnable_block_allocator.hpp @@ -2,8 +2,6 @@ #include "memory_allocator.hpp" -#include "../context_impl.hpp" - #include #include #include @@ -25,7 +23,7 @@ class PinnableBlockAllocator 
: public MemoryAllocator { }; public: - explicit PinnableBlockAllocator(Device device); + PinnableBlockAllocator(Device device); ~PinnableBlockAllocator(); std::byte *allocate(size_t size) override; diff --git a/src/infinicore/context/allocators/stream_ordered_allocator.cc b/src/infinicore/context/allocators/stream_ordered_allocator.cc index 5e5148973..3bd0aaab1 100644 --- a/src/infinicore/context/allocators/stream_ordered_allocator.cc +++ b/src/infinicore/context/allocators/stream_ordered_allocator.cc @@ -8,12 +8,18 @@ namespace infinicore { StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {} std::byte *StreamOrderedAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } void *ptr = nullptr; INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream())); return (std::byte *)ptr; } void StreamOrderedAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream())); } } // namespace infinicore diff --git a/src/infinicore/context/context_impl.cc b/src/infinicore/context/context_impl.cc index c5ed7acd1..5faf3bfe7 100644 --- a/src/infinicore/context/context_impl.cc +++ b/src/infinicore/context/context_impl.cc @@ -39,6 +39,10 @@ void ContextImpl::setDevice(Device device) { return; } + if (getCurrentRuntime()->isGraphRecording()) { + spdlog::warn("Switching device runtime during graph recording may break the graph!"); + } + if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) { // Lazy initialization of runtime if never set before. runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr(new Runtime(device)); @@ -178,6 +182,21 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event); } +bool isGraphRecording() { + return ContextImpl::singleton().getCurrentRuntime()->isGraphRecording(); +} + +void startGraphRecording() { + ContextImpl::singleton().getCurrentRuntime()->startGraphRecording(); +} + +void addGraphOperator(std::shared_ptr op) { + ContextImpl::singleton().getCurrentRuntime()->addGraphOperator(op); +} + +std::shared_ptr stopGraphRecording() { + return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording(); +} } // namespace context } // namespace infinicore diff --git a/src/infinicore/context/runtime/runtime.cc b/src/infinicore/context/runtime/runtime.cc index 85ab094df..a6dd7eb7e 100644 --- a/src/infinicore/context/runtime/runtime.cc +++ b/src/infinicore/context/runtime/runtime.cc @@ -8,12 +8,12 @@ #include "../allocators/stream_ordered_allocator.hpp" namespace infinicore { -Runtime::Runtime(Device device) : device_(device) { +Runtime::Runtime(Device device) : device_(device), graph_manager_(std::make_unique()) { activate(); INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_)); INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_)); if (device_.getType() == Device::Type::CPU) { - device_memory_allocator_ = std::make_unique(); + device_memory_allocator_ = std::make_unique(device); } else { device_memory_allocator_ = std::make_unique(device); pinned_host_memory_allocator_ = std::make_unique(device); @@ -145,6 +145,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event)); } +bool Runtime::isGraphRecording() const { + return graph_manager_->is_recording(); +} + +void 
Runtime::startGraphRecording() { + device_memory_allocator_->set_pin_mode(true); + return graph_manager_->start_recording(); +} + +void Runtime::addGraphOperator(std::shared_ptr op) { + return graph_manager_->add_operator(op); +} + +std::shared_ptr Runtime::stopGraphRecording() { + auto graph = graph_manager_->stop_recording(); + device_memory_allocator_->set_pin_mode(false); + return graph; +} + std::string Runtime::toString() const { return fmt::format("Runtime({})", device_.toString()); } diff --git a/src/infinicore/context/runtime/runtime.hpp b/src/infinicore/context/runtime/runtime.hpp index c731b7804..58d8bd424 100644 --- a/src/infinicore/context/runtime/runtime.hpp +++ b/src/infinicore/context/runtime/runtime.hpp @@ -1,8 +1,11 @@ #pragma once -#include "../allocators/memory_allocator.hpp" +#include "../allocators/pinnable_block_allocator.hpp" + #include "infinicore/context/context.hpp" +#include "../../graph/graph_manager.hpp" + #include #include @@ -13,8 +16,9 @@ class Runtime { Device device_; infinirtStream_t stream_; infiniopHandle_t infiniop_handle_; - std::unique_ptr device_memory_allocator_; + std::unique_ptr device_memory_allocator_; std::unique_ptr pinned_host_memory_allocator_; + std::unique_ptr graph_manager_; protected: Runtime(Device device); @@ -48,6 +52,12 @@ class Runtime { float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); + // Graph + bool isGraphRecording() const; + void startGraphRecording(); + void addGraphOperator(std::shared_ptr op); + std::shared_ptr stopGraphRecording(); + std::string toString() const; friend class ContextImpl; diff --git a/src/infinicore/graph/graph.cc b/src/infinicore/graph/graph.cc new file mode 100644 index 000000000..86944af36 --- /dev/null +++ b/src/infinicore/graph/graph.cc @@ -0,0 +1,67 @@ +#include "graph_manager.hpp" + +#include "../utils.hpp" + +namespace infinicore::graph { + +/* ========================= + * GraphTensor + * ========================= */ + +GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob()) { +} + +/* ========================= + * GraphOperator + * ========================= */ + +void GraphOperator::run() const { + runner_(planned_meta_); +} + +GraphOperator::~GraphOperator() { + if (deleter_) { + deleter_(&planned_meta_); + } +} + +/* ========================= + * Graph + * ========================= */ + +void Graph::run() const { + for (auto &op : op_list_) { + op->run(); + } +} + +void Graph::add_operator(std::shared_ptr op) { + op_list_.push_back(op); +} + +/* ========================= + * GraphManager + * ========================= */ + +bool GraphManager::is_recording() const { + return recording_; +} + +void GraphManager::start_recording() { + recording_ = true; + graph_ = std::make_shared(); +} + +void GraphManager::add_operator(std::shared_ptr op) { + INFINICORE_ASSERT(recording_); + + graph_->add_operator(op); +} + +std::shared_ptr GraphManager::stop_recording() { + + recording_ = false; + return std::exchange(graph_, nullptr); +} + +} // namespace infinicore::graph diff --git a/src/infinicore/graph/graph_manager.hpp b/src/infinicore/graph/graph_manager.hpp new file mode 100644 index 000000000..9cc040366 --- /dev/null +++ b/src/infinicore/graph/graph_manager.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "infinicore/graph/graph.hpp" + +#include +#include + +namespace infinicore::graph { + +class GraphManager { +public: + GraphManager() = default; + ~GraphManager() = default; + + bool is_recording() 
const; + void start_recording(); + void add_operator(std::shared_ptr op); + std::shared_ptr stop_recording(); + +private: + std::shared_ptr graph_; + bool recording_ = false; +}; + +} // namespace infinicore::graph diff --git a/src/infinicore/ops/gemm/gemm.cc b/src/infinicore/ops/gemm/gemm.cc index ecaaa4960..e2b3924f7 100644 --- a/src/infinicore/ops/gemm/gemm.cc +++ b/src/infinicore/ops/gemm/gemm.cc @@ -3,16 +3,15 @@ #include "../../utils.hpp" namespace infinicore::op { +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm); -common::OpDispatcher &Gemm::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; +Gemm::Gemm(Tensor c, Tensor a, Tensor b, float alpha, float beta) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta); +} void Gemm::execute(Tensor c, Tensor a, Tensor b, float alpha, float beta) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); - infinicore::context::setDevice(c->device()); - dispatcher().lookup(c->device().getType())(c, a, b, alpha, beta); + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta); } Tensor gemm(Tensor a, Tensor b, float alpha, float beta) { diff --git a/src/infinicore/ops/gemm/gemm_infiniop.cc b/src/infinicore/ops/gemm/gemm_infiniop.cc index 3577be190..670fdbc2a 100644 --- a/src/infinicore/ops/gemm/gemm_infiniop.cc +++ b/src/infinicore/ops/gemm/gemm_infiniop.cc @@ -1,50 +1,49 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" +#include "../infiniop_impl.hpp" #include "infinicore/ops/gemm.hpp" -#include namespace infinicore::op::gemm_impl::infiniop { -thread_local common::OpCache caches( - 100, // capacity - [](infiniopGemmDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyGemmDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor c, Tensor a, Tensor b, float alpha, float beta) { - size_t seed = hash_combine(c, b, a, alpha, beta); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopGemmDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor( - context::getInfiniopHandle(device), &desc, - c->desc(), a->desc(), b->desc())); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetGemmWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, c, a, b; + float alpha, beta; +}; + +void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) { + size_t seed = hash_combine(c, a, b); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Gemm, + seed, c->desc(), a->desc(), b->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor); + + auto planned = new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(c), + graph::GraphTensor(a), + graph::GraphTensor(b), + alpha, beta}; + + return planned; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); INFINICORE_CHECK_ERROR(infiniopGemm( - desc, workspace->data(), workspace_size, - c->data(), a->data(), b->data(), alpha, beta, context::getStream())); + planned->descriptor->desc, planned->workspace->data(), 
planned->workspace->numel(), + planned->c->data(), planned->a->data(), planned->b->data(), planned->alpha, planned->beta, context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; } -static bool registered = []() { - Gemm::dispatcher().registerAll(&calculate, false); - return true; -}(); +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup); } // namespace infinicore::op::gemm_impl::infiniop diff --git a/src/infinicore/ops/infiniop_impl.hpp b/src/infinicore/ops/infiniop_impl.hpp new file mode 100644 index 000000000..2bf38c8c6 --- /dev/null +++ b/src/infinicore/ops/infiniop_impl.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \ + struct __DESC_TYPE__ { \ + infiniop##__OP_NAME__##Descriptor_t desc; \ + Descriptor(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \ + ~Descriptor() { \ + if (desc != nullptr) { \ + infiniopDestroy##__OP_NAME__##Descriptor(desc); \ + desc = nullptr; \ + } \ + } \ + }; \ + \ + thread_local common::OpCache> \ + caches( \ + __SIZE__, \ + [](std::shared_ptr<__DESC_TYPE__> &desc) { \ + desc = nullptr; \ + }); + +#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \ + std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \ + { \ + auto device__ = context::getDevice(); \ + auto &cache__ = caches.getCache(device__); \ + __DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \ + if (!__DESC_NAME__) { \ + __DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \ + INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \ + context::getInfiniopHandle(device__), \ + &__DESC_NAME__->desc, \ + __VA_ARGS__)); \ + cache__.put(__HASH_KEY__, __DESC_NAME__); \ + } \ + } + +#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \ + Tensor __TENSOR_NAME__; \ + { \ + auto device__ = context::getDevice(); \ + size_t workspace_size = 0; \ + INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \ + __TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \ + } diff --git a/src/infinicore/ops/linear/linear.cc b/src/infinicore/ops/linear/linear.cc index cd766195e..097e4b48b 100644 --- a/src/infinicore/ops/linear/linear.cc +++ b/src/infinicore/ops/linear/linear.cc @@ -1,6 +1,6 @@ #include "infinicore/ops/linear.hpp" -#include "infinicore/ops/add.hpp" -#include "infinicore/ops/matmul.hpp" +#include "infinicore/ops/gemm.hpp" +#include "infinicore/ops/rearrange.hpp" namespace infinicore::op { @@ -42,16 +42,18 @@ void linear_(Tensor out, // linear transformation Tensor out_view = out->view({N, out_features}); - matmul_(out_view, - input->view({N, in_features}), - weight->permute({1, 0})); - // Add bias + float alpha = 1.0f; + float beta = 0.0f; if (bias.has_value()) { - add_(out_view, - out_view, - bias.value()->as_strided({N, out_features}, {0, 1})); + rearrange_(out_view, + bias.value()->as_strided({N, out_features}, {0, 1})); + beta = 1.0f; } + + gemm_(out_view, + input->view({N, in_features}), + weight->permute({1, 0}), alpha, beta); } } // namespace infinicore::op diff --git a/src/infinicore/ops/paged_caching/paged_caching.cc b/src/infinicore/ops/paged_caching/paged_caching.cc index b1fe104a9..cc14bf236 100644 --- 
a/src/infinicore/ops/paged_caching/paged_caching.cc +++ b/src/infinicore/ops/paged_caching/paged_caching.cc @@ -9,14 +9,14 @@ common::OpDispatcher &PagedCaching::dispatcher() { return dispatcher_; }; -void PagedCaching::execute(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k, v, k_cache, v_cache, slot_mapping); - infinicore::context::setDevice(k->device()); - dispatcher().lookup(k->device().getType())(k, v, k_cache, v_cache, slot_mapping); +void PagedCaching::execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping); + infinicore::context::setDevice(k_cache->device()); + dispatcher().lookup(k_cache->device().getType())(k_cache, v_cache, k, v, slot_mapping); } -void paged_caching_(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - PagedCaching::execute(k, v, k_cache, v_cache, slot_mapping); +void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping); } } // namespace infinicore::op diff --git a/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc b/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc index f312ef817..7dcaf47a0 100644 --- a/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc +++ b/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc @@ -15,8 +15,8 @@ thread_local common::OpCache caches( } }); -void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - size_t seed = hash_combine(k, v, k_cache, v_cache, slot_mapping); +void calculate(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + size_t seed = hash_combine(k_cache, v_cache, k, v, slot_mapping); auto device = context::getDevice(); auto &cache = caches.getCache(device); @@ -27,7 +27,7 @@ void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_m if (!desc_opt) { INFINICORE_CHECK_ERROR(infiniopCreatePagedCachingDescriptor( context::getInfiniopHandle(device), &desc, - k->desc(), v->desc(), k_cache->desc(), v_cache->desc(), slot_mapping->desc())); + k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc())); cache.put(seed, desc); } else { desc = *desc_opt; @@ -39,7 +39,7 @@ void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_m INFINICORE_CHECK_ERROR(infiniopPagedCaching( desc, workspace->data(), workspace_size, - k->data(), v->data(), k_cache->data(), v_cache->data(), slot_mapping->data(), context::getStream())); + k_cache->data(), v_cache->data(), k->data(), v->data(), slot_mapping->data(), context::getStream())); } static bool registered = []() { diff --git a/src/infinicore/pybind11/context.hpp b/src/infinicore/pybind11/context.hpp index 657e30877..9c24a322e 100644 --- a/src/infinicore/pybind11/context.hpp +++ b/src/infinicore/pybind11/context.hpp @@ -24,6 +24,11 @@ inline void bind(py::module &m) { // Synchronization m.def("sync_stream", &syncStream, "Synchronize the current stream"); m.def("sync_device", &syncDevice, "Synchronize the current device"); + + // Graph + m.def("is_graph_recording", &isGraphRecording, "Check if graph recording is turned on"); + m.def("start_graph_recording", &startGraphRecording, "Start graph recording"); + m.def("stop_graph_recording", &stopGraphRecording, "Stop graph recording and return the graph"); } } // namespace infinicore::context diff --git 
a/src/infinicore/pybind11/graph.hpp b/src/infinicore/pybind11/graph.hpp new file mode 100644 index 000000000..d45c9b32c --- /dev/null +++ b/src/infinicore/pybind11/graph.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#include "infinicore.hpp" + +namespace py = pybind11; + +namespace infinicore::graph { +inline void bind(py::module_ &m) { + py::class_>(m, "Graph") + .def(py::init<>()) // allow construction + .def("run", &infinicore::graph::Graph::run); +} +} // namespace infinicore::graph diff --git a/src/infinicore/pybind11/infinicore.cc b/src/infinicore/pybind11/infinicore.cc index 152adefd9..937a281aa 100644 --- a/src/infinicore/pybind11/infinicore.cc +++ b/src/infinicore/pybind11/infinicore.cc @@ -6,6 +6,7 @@ #include "device.hpp" #include "device_event.hpp" #include "dtype.hpp" +#include "graph.hpp" #include "ops.hpp" #include "tensor.hpp" @@ -18,6 +19,7 @@ PYBIND11_MODULE(_infinicore, m) { dtype::bind(m); ops::bind(m); tensor::bind(m); + graph::bind(m); } } // namespace infinicore diff --git a/src/infinicore/pybind11/ops/paged_caching.hpp b/src/infinicore/pybind11/ops/paged_caching.hpp index 87c130f5d..4320b4eef 100644 --- a/src/infinicore/pybind11/ops/paged_caching.hpp +++ b/src/infinicore/pybind11/ops/paged_caching.hpp @@ -11,10 +11,10 @@ namespace infinicore::ops { inline void bind_paged_caching(py::module &m) { m.def("paged_caching_", &op::paged_caching_, - py::arg("k"), - py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), + py::arg("k"), + py::arg("v"), py::arg("slot_mapping"), R"doc(Paged caching of key and value tensors.)doc"); } diff --git a/src/infinicore/tensor/tensor.cc b/src/infinicore/tensor/tensor.cc index 7c2ab5428..2acc6dec8 100644 --- a/src/infinicore/tensor/tensor.cc +++ b/src/infinicore/tensor/tensor.cc @@ -275,4 +275,12 @@ std::shared_ptr TensorImpl::strided_from_blob( return t; } +Tensor TensorImpl::to_blob() const { + auto t = std::shared_ptr(new TensorImpl(shape(), strides(), dtype())); + t->data_.offset = this->data_.offset; + t->data_.memory = std::make_shared(this->data_.memory->data(), this->data_.memory->size(), this->data_.memory->device(), nullptr); + + return Tensor{t}; +} + } // namespace infinicore diff --git a/src/infinicore/utils.hpp b/src/infinicore/utils.hpp index 11b70455f..2a5d3e20d 100644 --- a/src/infinicore/utils.hpp +++ b/src/infinicore/utils.hpp @@ -47,3 +47,14 @@ inline struct SpdlogInitializer { } \ } \ } while (0) + +#define INFINICORE_ASSERT(CONDITION__) \ + do { \ + if (!(CONDITION__)) { \ + SPDLOG_ERROR( \ + "Assertion `{}` failed from {} at {}:{}", \ + #CONDITION__, __func__, __FILE__, __LINE__); \ + throw std::runtime_error( \ + std::string("Assertion `") + #CONDITION__ + "` failed from " + __func__ + " at " + __FILE__ + ":" + std::to_string(__LINE__)); \ + } \ + } while (0) diff --git a/src/infiniop/ops/paged_attention/info.h b/src/infiniop/ops/paged_attention/info.h index 17b8bf65b..216bb2360 100644 --- a/src/infiniop/ops/paged_attention/info.h +++ b/src/infiniop/ops/paged_attention/info.h @@ -67,11 +67,9 @@ class PagedAttentionInfo { size_t num_heads = q_shape[1]; size_t head_size = q_shape[2]; - if (head_size != 128) { - // 输出具体的错误原因和当前的参数值 - std::cerr << "[Error] Now only supports head_size = 128, but got " + if (head_size != 16 && head_size != 32 && head_size != 64 && head_size != 128 && head_size != 256) { + std::cerr << "[Error] Now only supports head_size = 16/32/64/128/256, but got " << head_size << "." 
<< std::endl; - // 建议返回 SHAPE 相关的错误码 return INFINI_STATUS_BAD_TENSOR_SHAPE; } diff --git a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu index 9f86133e2..d544fd34a 100644 --- a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu +++ b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu @@ -98,37 +98,49 @@ infiniStatus_t Descriptor::calculate( const void *block_tables, const void *seq_lens, const void *alibi_slopes, void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; + +#define LAUNCH_HEADSIZE_BLOCKSIZE(__H_SIZE, __B_SIZE) \ + launchKernel<__H_SIZE, __B_SIZE>( \ + out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, \ + _info.num_heads, _info.num_seqs, \ + _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, \ + _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, \ + stream); + +#define SWITCH_HEAD_SIZE(__B_SIZE) \ + switch (_info.head_size) { \ + case 16: \ + LAUNCH_HEADSIZE_BLOCKSIZE(16, __B_SIZE) \ + break; \ + case 32: \ + LAUNCH_HEADSIZE_BLOCKSIZE(32, __B_SIZE) \ + break; \ + case 64: \ + LAUNCH_HEADSIZE_BLOCKSIZE(64, __B_SIZE) \ + break; \ + case 128: \ + LAUNCH_HEADSIZE_BLOCKSIZE(128, __B_SIZE) \ + break; \ + case 256: \ + LAUNCH_HEADSIZE_BLOCKSIZE(256, __B_SIZE) \ + break; \ + default: \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_1024>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_512>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_512) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_4096>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_4096) } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } +#undef LAUNCH_HEADSIZE_BLOCKSIZE +#undef SWITCH_HEAD_SIZE + return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc index f41adb2cb..1d7d4fee3 100644 --- a/src/infiniop/ops/paged_attention/operator.cc +++ b/src/infiniop/ops/paged_attention/operator.cc @@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_attention_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_attention_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include 
"metax/paged_attention_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( infiniopHandle_t handle, @@ -34,11 +34,12 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CREATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( @@ -54,11 +55,12 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // GET(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopPagedAttention( @@ -78,11 +80,12 @@ __C infiniStatus_t infiniopPagedAttention( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CALCULATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( @@ -97,9 +100,10 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // DESTROY(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/info.h b/src/infiniop/ops/paged_caching/info.h index 5b8a9234d..87f6b5181 100644 --- a/src/infiniop/ops/paged_caching/info.h +++ b/src/infiniop/ops/paged_caching/info.h @@ -28,10 +28,10 @@ class PagedCachingInfo { ptrdiff_t v_cache_block_stride; static utils::Result create( - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { auto dtype = k_desc->dtype(); diff --git a/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu b/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu index b0344999e..9b87b3f7f 100644 --- a/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu +++ b/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu @@ -31,13 +31,13 @@ Descriptor::~Descriptor() { infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { - auto info = PagedCachingInfo::create(k_desc, v_desc, k_cache_desc, v_cache_desc, slot_mapping_desc); + auto info = PagedCachingInfo::create(k_cache_desc, v_cache_desc, k_desc, v_desc, slot_mapping_desc); CHECK_RESULT(info); // Create and return the Descriptor instance. 
@@ -121,8 +121,8 @@ infiniStatus_t launchKernel(const PagedCachingInfo &info, // Execution method implementation infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, - const void *k, const void *v, void *k_cache, void *v_cache, + const void *k, const void *v, const void *slot_mapping, void *stream_) const { diff --git a/src/infiniop/ops/paged_caching/operator.cc b/src/infiniop/ops/paged_caching/operator.cc index a69b0e07e..3bfd92280 100644 --- a/src/infiniop/ops/paged_caching/operator.cc +++ b/src/infiniop/ops/paged_caching/operator.cc @@ -5,17 +5,17 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_caching_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_caching_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_caching_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, infiniopPagedCachingDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { #define CREATE(CASE, NAMESPACE) \ @@ -23,17 +23,18 @@ __C infiniStatus_t infiniopCreatePagedCachingDescriptor( return op::paged_caching::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast(desc_ptr), \ - k_desc, v_desc, k_cache_desc, v_cache_desc, slot_mapping_desc); + k_cache_desc, v_cache_desc, k_desc, v_desc, slot_mapping_desc); switch (handle->device) { #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CREATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( @@ -49,35 +50,37 @@ __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // GET(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopPagedCaching( infiniopPagedCachingDescriptor_t desc, void *workspace, size_t workspace_size, - const void *k, const void *v, void *k_cache, void *v_cache, + const void *k, const void *v, const void *slot_mapping, void *stream) { #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, k, v, k_cache, v_cache, slot_mapping, stream); + workspace, workspace_size, k_cache, v_cache, k, v, slot_mapping, stream); switch (desc->device_type) { #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CALCULATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( @@ -92,9 +95,10 @@ __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef 
ENABLE_METAX_API + // DESTROY(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/paged_caching.h b/src/infiniop/ops/paged_caching/paged_caching.h index 473ddb5fd..cbd7a2701 100644 --- a/src/infiniop/ops/paged_caching/paged_caching.h +++ b/src/infiniop/ops/paged_caching/paged_caching.h @@ -32,16 +32,16 @@ static infiniStatus_t create( \ infiniopHandle_t handle, \ Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t k_desc, \ - infiniopTensorDescriptor_t v_desc, \ infiniopTensorDescriptor_t k_cache_desc, \ infiniopTensorDescriptor_t v_cache_desc, \ + infiniopTensorDescriptor_t k_desc, \ + infiniopTensorDescriptor_t v_desc, \ infiniopTensorDescriptor_t slot_mapping_desc); \ \ infiniStatus_t calculate( \ void *workspace, size_t workspace_size, \ - const void *k, const void *v, \ void *k_cache, void *v_cache, \ + const void *k, const void *v, \ const void *slot_mapping, \ void *stream) const; \ }; \ diff --git a/test/infinicore/framework/tensor.py b/test/infinicore/framework/tensor.py index 10a4655ab..317135d65 100644 --- a/test/infinicore/framework/tensor.py +++ b/test/infinicore/framework/tensor.py @@ -60,7 +60,12 @@ def _create_contiguous_tensor(shape, torch_dtype, torch_device_str, mode, **kwar # Handle real floating-point types if mode == TensorInitializer.RANDOM: - return torch.rand(shape, dtype=torch_dtype, device=torch_device_str) + scale = kwargs.get("scale", 1.0) + bias = kwargs.get("bias", 0.0) + return ( + torch.rand(shape, dtype=torch_dtype, device=torch_device_str) * scale + + bias + ) elif mode == TensorInitializer.ZEROS: return torch.zeros(shape, dtype=torch_dtype, device=torch_device_str) elif mode == TensorInitializer.ONES: diff --git a/test/infinicore/graph/graph.py b/test/infinicore/graph/graph.py new file mode 100644 index 000000000..2f8927110 --- /dev/null +++ b/test/infinicore/graph/graph.py @@ -0,0 +1,85 @@ +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +import infinicore +from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner + +# Test cases format: (in_shape, proj_w_shape) +_TEST_CASES_DATA = [ + ((32, 4096), (4096, 4096)), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 0, "rtol": 1e-2}, + infinicore.float32: {"atol": 1e-4, "rtol": 1e-3}, + infinicore.bfloat16: {"atol": 0, "rtol": 5e-2}, +} +_TENSOR_DTYPES = [infinicore.float16, infinicore.float32, infinicore.bfloat16] + + +def parse_test_cases(): + cases = [] + for in_shape, proj_w_shape in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP[dtype] + in_spec = TensorSpec.from_tensor(in_shape, dtype=dtype) + proj_w_spec = TensorSpec.from_tensor(proj_w_shape, dtype=dtype) + temp_spec = TensorSpec.from_tensor(in_shape, dtype=dtype) + + # Out-of-place + cases.append( + TestCase( + inputs=[in_spec, proj_w_spec, temp_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="Graph", + ) + ) + + return cases + + +class OpTest(BaseOperatorTest): + """Test Operator Graph""" + + def __init__(self): + super().__init__("Graph") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + a = args[0] + b = args[1] + + return torch.matmul(a, b) + + def infinicore_operator(self, *args, **kwargs): + """Record graph and run""" + a = args[0] + b = args[1] + temp_a = args[2] + + 
infinicore.start_graph_recording() + c = infinicore.matmul(temp_a, b) + op_graph = infinicore.stop_graph_recording() + + temp_a.copy_(a) + op_graph.run() + + return c + + +def main(): + """Main entry point""" + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/paged_attention.py b/test/infinicore/ops/paged_attention.py index cc9f2977f..06515633d 100644 --- a/test/infinicore/ops/paged_attention.py +++ b/test/infinicore/ops/paged_attention.py @@ -25,6 +25,7 @@ (4, 40, 40, 128, 16, 1024, False), (6, 40, 40, 128, 16, 1024, False), (3, 8, 8, 128, 16, 1024, False), + (3, 8, 8, 64, 16, 1024, False), (8, 64, 8, 128, 16, 2048, False), ] @@ -68,8 +69,6 @@ def parse_test_cases(): 0, num_seqs * max_blocks_per_seq, dtype=torch.int64 ).view(num_seqs, max_blocks_per_seq) - print("block_tables.shape", block_tables.shape, block_tables) - q_shape = (num_seqs, num_heads, head_size) k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size) v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size) diff --git a/test/infinicore/ops/paged_caching.py b/test/infinicore/ops/paged_caching.py index 4098aa06c..65e59eaae 100644 --- a/test/infinicore/ops/paged_caching.py +++ b/test/infinicore/ops/paged_caching.py @@ -28,9 +28,9 @@ # Tolerance configuration _TOLERANCE_MAP = { - infinicore.float16: {"atol": 0, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-4, "rtol": 1e-3}, - infinicore.bfloat16: {"atol": 0, "rtol": 5e-2}, + infinicore.float16: {"atol": 0, "rtol": 1e-5}, + infinicore.float32: {"atol": 0, "rtol": 1e-5}, + infinicore.bfloat16: {"atol": 0, "rtol": 1e-5}, } # Data types to test @@ -40,15 +40,15 @@ # ============================================================================== # Reference Implementation # ============================================================================== -def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping): +def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping): """ Reference implementation for paged_caching operator. Args: - key (torch.Tensor): Keys, shape [ntok, nkvh, dh] - value (torch.Tensor): Values, shape [ntok, nkvh, dh] key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh] value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh] + key (torch.Tensor): Keys, shape [ntok, nkvh, dh] + value (torch.Tensor): Values, shape [ntok, nkvh, dh] slot_mapping (torch.Tensor): Slot mapping, shape [ntok] """ ntok = key.shape[0] @@ -56,8 +56,8 @@ def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping # This reference implementation operates on a cloned cache to avoid modifying the original input tensor, # mimicking the behavior where the custom operator writes to its output tensor. 
- k_cache_ref = key_cache_pool.clone() - v_cache_ref = value_cache_pool.clone() + k_cache_ref = key_cache_pool + v_cache_ref = value_cache_pool for i in range(ntok): slot = slot_mapping[i].item() @@ -98,9 +98,9 @@ def parse_test_cases(): current_slot += length.item() # Ensure we don't exceed the total number of slots in the cache - assert ( - current_slot <= num_blocks * block_size - ), "Not enough blocks in the cache pool for this test case" + assert current_slot <= num_blocks * block_size, ( + "Not enough blocks in the cache pool for this test case" + ) slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64) @@ -119,8 +119,12 @@ def parse_test_cases(): # Create typed tensor specs k_spec = TensorSpec.from_tensor(k_shape, None, dtype) v_spec = TensorSpec.from_tensor(v_shape, None, dtype) - k_cache_spec = TensorSpec.from_tensor(k_cache_shape, None, dtype) - v_cache_spec = TensorSpec.from_tensor(v_cache_shape, None, dtype) + k_cache_spec = TensorSpec.from_tensor( + k_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS + ) + v_cache_spec = TensorSpec.from_tensor( + v_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS + ) slot_mapping_spec = TensorSpec.from_tensor( slot_mapping_shape, init_mode=TensorInitializer.MANUAL, @@ -132,10 +136,10 @@ def parse_test_cases(): test_cases.append( TestCase( inputs=[ - k_spec, - v_spec, k_cache_spec, v_cache_spec, + k_spec, + v_spec, slot_mapping_spec, ], kwargs=None, diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 283bdb1cd..618be2b05 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1066,10 +1066,10 @@ def paged_caching_(lib): lib.infiniopCreatePagedCachingDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), - infiniopTensorDescriptor_t, # k_desc - infiniopTensorDescriptor_t, # v_desc infiniopTensorDescriptor_t, # k_cache_desc infiniopTensorDescriptor_t, # v_cache_desc + infiniopTensorDescriptor_t, # k_desc + infiniopTensorDescriptor_t, # v_desc infiniopTensorDescriptor_t, # slot_mapping_desc ] @@ -1086,10 +1086,10 @@ def paged_caching_(lib): infiniopOperatorDescriptor_t, c_void_p, # workspace c_size_t, # workspace_size - c_void_p, # k - c_void_p, # v c_void_p, # k_cache c_void_p, # v_cache + c_void_p, # k + c_void_p, # v c_void_p, # slot_mapping c_void_p, # stream ] diff --git a/test/infiniop/paged_attention.py b/test/infiniop/paged_attention.py index da44c8e62..882e9cfee 100644 --- a/test/infiniop/paged_attention.py +++ b/test/infiniop/paged_attention.py @@ -95,6 +95,7 @@ def ref_single_query_cached_kv_attention( (4, 40, 40, 128, 16, 1024, False), (6, 40, 40, 128, 16, 1024, False), (3, 8, 8, 128, 16, 1024, False), + (3, 8, 8, 64, 16, 1024, False), (8, 64, 8, 128, 16, 2048, False), ] diff --git a/test/infiniop/paged_caching.py b/test/infiniop/paged_caching.py index 947859ec9..012067465 100644 --- a/test/infiniop/paged_caching.py +++ b/test/infiniop/paged_caching.py @@ -22,15 +22,15 @@ # ============================================================================== # Reference Implementation # ============================================================================== -def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping): +def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping): """ Reference implementation for paged_caching operator. 
Args: - key (torch.Tensor): Keys, shape [ntok, nkvh, dh] - value (torch.Tensor): Values, shape [ntok, nkvh, dh] key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh] value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh] + key (torch.Tensor): Keys, shape [ntok, nkvh, dh] + value (torch.Tensor): Values, shape [ntok, nkvh, dh] slot_mapping (torch.Tensor): Slot mapping, shape [ntok] """ ntok = key.shape[0] @@ -71,9 +71,9 @@ def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping # Tolerance map for different data types _TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, - InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2}, - InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F16: {"atol": 0, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 0, "rtol": 1e-5}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-5}, } # Global flags for controlling test behavior @@ -123,9 +123,9 @@ def test( current_slot += length.item() # Ensure we don't exceed the total number of slots in the cache - assert ( - current_slot <= num_blocks * block_size - ), "Not enough blocks in the cache pool for this test case" + assert current_slot <= num_blocks * block_size, ( + "Not enough blocks in the cache pool for this test case" + ) slot_mapping_torch = torch.tensor(slot_mapping_list, dtype=torch.int64) @@ -144,10 +144,10 @@ def test( # Run reference implementation k_cache_ref, v_cache_ref = ref_paged_caching( - k.torch_tensor(), - v.torch_tensor(), k_cache_pool.torch_tensor(), v_cache_pool.torch_tensor(), + k.torch_tensor(), + v.torch_tensor(), slot_mapping.torch_tensor(), ) @@ -160,10 +160,10 @@ def test( LIBINFINIOP.infiniopCreatePagedCachingDescriptor( handle, ctypes.byref(descriptor), - k.descriptor, - v.descriptor, k_cache_pool.descriptor, v_cache_pool.descriptor, + k.descriptor, + v.descriptor, slot_mapping.descriptor, ) ) @@ -191,10 +191,10 @@ def lib_paged_caching(): descriptor, workspace.data(), workspace_size.value, - k.data(), - v.data(), k_cache_pool.data(), v_cache_pool.data(), + k.data(), + v.data(), slot_mapping.data(), None, ) diff --git a/test/infiniop/paged_caching_prefill.py b/test/infiniop/paged_caching_prefill.py index 1fa9957fc..f39cd2afc 100644 --- a/test/infiniop/paged_caching_prefill.py +++ b/test/infiniop/paged_caching_prefill.py @@ -80,7 +80,7 @@ def allocate_slots(self, request_id, num_new_tokens): return torch.tensor(slots, dtype=torch.int32) -def ref_paged_caching(k_new, v_new, k_pool, v_pool, slots, block_size): +def ref_paged_caching(k_pool, v_pool, k_new, v_new, slots, block_size): """Reference implementation for incremental caching.""" for i in range(k_new.shape[0]): slot = slots[i].item() @@ -152,10 +152,10 @@ def test( def torch_caching(): nonlocal k_pool_ref, v_pool_ref return ref_paged_caching( - k_in.torch_tensor(), - v_in.torch_tensor(), k_pool_ref, v_pool_ref, + k_in.torch_tensor(), + v_in.torch_tensor(), slots_torch, block_size, ) @@ -168,10 +168,10 @@ def torch_caching(): LIBINFINIOP.infiniopCreatePagedCachingDescriptor( handle, ctypes.byref(descriptor), - k_in.descriptor, - v_in.descriptor, k_cache_pool.descriptor, v_cache_pool.descriptor, + k_in.descriptor, + v_in.descriptor, slot_mapping.descriptor, ) ) @@ -190,10 +190,10 @@ def lib_caching(): descriptor, workspace.data(), workspace_size.value, - k_in.data(), - v_in.data(), k_cache_pool.data(), v_cache_pool.data(), + k_in.data(), + v_in.data(), slot_mapping.data(), None, ) diff --git a/xmake.lua b/xmake.lua index 
0a5b2d473..d5a4ba7f7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -268,6 +268,9 @@ target("infinirt") add_deps("infinirt-hygon") end set_languages("cxx17") + if not is_plat("windows") then + add_cxflags("-fPIC") + end set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) add_files("src/infinirt/*.cc") add_installfiles("include/infinirt.h", {prefixdir = "include"}) @@ -390,6 +393,7 @@ target("infinicore_cpp_api") add_files("src/infinicore/context/*.cc") add_files("src/infinicore/context/*/*.cc") add_files("src/infinicore/tensor/*.cc") + add_files("src/infinicore/graph/*.cc") add_files("src/infinicore/nn/*.cc") add_files("src/infinicore/ops/*/*.cc") add_files("src/utils/*.cc") @@ -418,6 +422,8 @@ target("_infinicore") add_packages("pybind11") set_languages("cxx17") + add_deps("infinicore_cpp_api") + set_kind("shared") local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") add_includedirs(INFINI_ROOT.."/include", { public = true }) @@ -425,14 +431,7 @@ target("_infinicore") add_linkdirs(INFINI_ROOT.."/lib") add_links("infiniop", "infinirt", "infiniccl") - add_files("src/infinicore/*.cc") - add_files("src/infinicore/context/*.cc") - add_files("src/infinicore/context/*/*.cc") - add_files("src/infinicore/tensor/*.cc") - add_files("src/infinicore/nn/*.cc") - add_files("src/infinicore/ops/*/*.cc") add_files("src/infinicore/pybind11/**.cc") - add_files("src/utils/*.cc") set_installdir("python/infinicore") target_end()
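
---

Usage note (not part of the diff): a minimal sketch of the graph recording API this change adds, following the pattern in `test/infinicore/graph/graph.py`. The tensors `a`, `b`, and `new_input` are placeholders — how they are constructed is not shown here and is outside the scope of this diff; `infinicore.matmul`, `Tensor.copy_`, `start_graph_recording`, `stop_graph_recording`, `is_graph_recording`, and `Graph.run` are the calls exercised or exported by the diff itself.

```python
import infinicore

# `a` and `b` are assumed to be existing infinicore tensors on the current
# device (creation helpers omitted; see the test framework for one way to
# build them from torch tensors).

infinicore.start_graph_recording()          # subsequent graph-enabled ops are recorded
c = infinicore.matmul(a, b)                 # planned and recorded, not executed immediately
graph = infinicore.stop_graph_recording()   # returns an infinicore.Graph wrapper

assert not infinicore.is_graph_recording()

# Replay: the recorded operators hold references to the same (pinned) tensor
# storage, so refresh the inputs in place and then run the captured graph.
a.copy_(new_input)
graph.run()                                 # result lands in `c`
```

The replay relies on the pin-mode behavior added in `Runtime::startGraphRecording` / `stopGraphRecording`: tensors allocated while recording stay at stable addresses, so rewriting their contents with `copy_` and calling `graph.run()` re-executes the planned operators against the new data.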