Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void releaseAllResourcesImpl() {
called = true;
releaseAllGenerator();
releaseAllDeviceMem();
releaseAllEvent();
event_pool_clear();
devproxy::finalizeVendor();
}

Expand Down
29 changes: 29 additions & 0 deletions dipu/torch_dipu/csrc_dipu/base/utility.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是否应该放到 utils/ 目录下

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我也在犹豫放哪里,有没有啥规范或者惯例,还有就是文件名字有没有更好的

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

附议

#include <array>
#include <utility>

namespace dipu {

// Create an array of functor. Each functor should provide a 'value()' function
// to return a reference of a static variable. Thus the value could be lazily
// initialized.
template <typename T, std::size_t N>
struct make_static_function_array {
template <typename U = std::make_index_sequence<N>>
struct slot {};
template <std::size_t... I>
struct slot<std::index_sequence<I...>> {
// deduction friendly
using type = std::decay_t<decltype(T::template value<0>)>;
// the actual array
auto inline static constexpr value =
std::array<type, sizeof...(I)>{T::template value<I>...};
};
};

template <typename T, std::size_t N>
using static_function_array =
typename make_static_function_array<T, N>::template slot<>;

} // namespace dipu
127 changes: 53 additions & 74 deletions dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp
Original file line number Diff line number Diff line change
@@ -1,102 +1,81 @@
#include "DIPUEventPool.h"

#include <array>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>

namespace dipu {
#include "csrc_dipu/base/utility.h"
#include "csrc_dipu/runtime/device/deviceapis.h"

template <typename T>
class EventPool final {
protected:
std::deque<T> event_pool_;
unsigned int allocate_num_ = 0;
namespace {

std::function<void(T&)> allocator_;
std::function<void(T&)> deleter_;
using mutex_t = std::recursive_mutex;
mutex_t event_mutex_;
class EventPool {
std::deque<dipu::deviceEvent_t> pool;
std::recursive_mutex mutex;

public:
EventPool(const std::function<void(T&)>& allocator,
const std::function<void(T&)>& deleter)
: allocator_(allocator), deleter_(deleter) {}

EventPool(const EventPool&) = delete;
EventPool(EventPool&&) = delete;
EventPool& operator=(const EventPool&) = delete;
EventPool& operator=(EventPool&&) = delete;
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

有锁作为成员,EventPool 自动变得无法拷贝和移动,所以我都删掉了


~EventPool() = default;

void release() {
std::lock_guard<mutex_t> _(event_mutex_);
for (auto& event : event_pool_) {
deleter_(event);
allocate_num_--;
}
event_pool_.clear();
}

void get(T& event) {
bool need_allocator = false;
void acquire(dipu::deviceEvent_t& event) {
{
std::lock_guard<mutex_t> _(event_mutex_);
if (event_pool_.empty()) {
need_allocator = true;
} else {
event = event_pool_.back();
event_pool_.pop_back();
std::scoped_lock _(mutex);
if (!pool.empty()) {
event = pool.back();
pool.pop_back();
return;
}
}
if (need_allocator) {
allocator_(event);
}

dipu::devapis::createEvent(&event);
}

void release(dipu::deviceEvent_t& event) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

新的名字确实比原来的规范

std::scoped_lock _(mutex);
pool.emplace_back(event);
}

void restore(T& event) {
std::lock_guard<mutex_t> _(event_mutex_);
event_pool_.emplace_back(event);
void clear() { // should it called inside destructor?
std::scoped_lock _(mutex);
for (auto& event : pool) {
dipu::devapis::destroyEvent(event);
}
pool.clear();
}
};

EventPool<deviceEvent_t>* getEventPool() {
const int index = devproxy::current_device();
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我是觉得 event pool 不应该调用 devproxy 命名空间的方法,所以把它移到外面了

// GlobalEventPool for different cards , construct when really needed
#define dispatch_event_pool(device_id) \
if (index == (device_id)) { \
static EventPool<deviceEvent_t> gDIPUEventPool( \
[](deviceEvent_t& event) { devapis::createEvent(&event); }, \
[](deviceEvent_t& event) { devapis::destroyEvent(event); }); \
return &gDIPUEventPool; \
struct EventPoolHolder {
template <std::size_t I>
inline static auto& value() {
auto static instance = EventPool();
return instance;
}
};

auto constexpr max_card_number = 16;
using EventPoolHolderArray =
dipu::static_function_array<EventPoolHolder, max_card_number>;

dispatch_event_pool(0);
dispatch_event_pool(1);
dispatch_event_pool(2);
dispatch_event_pool(3);
dispatch_event_pool(4);
dispatch_event_pool(5);
dispatch_event_pool(6);
dispatch_event_pool(7);
dispatch_event_pool(8);
dispatch_event_pool(9);
dispatch_event_pool(10);
dispatch_event_pool(11);
dispatch_event_pool(12);
dispatch_event_pool(13);
dispatch_event_pool(14);
dispatch_event_pool(15);
TORCH_CHECK(false, "support up to 16 cards");
auto event_pool(int index) -> EventPool& {
TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards");
return EventPoolHolderArray::value[index]();
}

void getEventFromPool(deviceEvent_t& event) { getEventPool()->get(event); }
} // namespace

namespace dipu {

void event_pool_acquire(int index, deviceEvent_t& event) {
event_pool(index).acquire(event);
}

void restoreEventToPool(deviceEvent_t& event) {
getEventPool()->restore(event);
void event_pool_release(int index, deviceEvent_t& event) {
event_pool(index).release(event);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

参数加了 index, 上下层的职责划分也比原来明确些。

}

void releaseAllEvent() { getEventPool()->release(); }
void event_pool_clear() {
for (auto& pool : EventPoolHolderArray::value) {
Copy link
Copy Markdown
Collaborator Author

@wiryls wiryls Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个改法欠佳(导致全部 pool 都被初始化),但没想到更好同时又不麻烦的方法了

pool().clear();
}
}

} // namespace dipu
8 changes: 3 additions & 5 deletions dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@

namespace dipu {

void getEventFromPool(deviceEvent_t& event);

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

其实原来的驼峰也没哈问题,真没必要都改成下划线,纯个人习惯问题。。。

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

确实,这里是习惯了

void restoreEventToPool(deviceEvent_t& event);

void releaseAllEvent();
void event_pool_acquire(int index, deviceEvent_t& event);
void event_pool_release(int index, deviceEvent_t& event);
void event_pool_clear();

} // namespace dipu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <c10/core/Device.h>
#include <c10/util/flat_hash_map.h>

#include "csrc_dipu/base/utility.h"
#include "csrc_dipu/runtime/core/DIPUEvent.h"

#include "DIPUAsyncResourcePool.h"
Expand Down Expand Up @@ -213,48 +214,32 @@ struct RawAllocator<at::DeviceType::CPU> {
using type = DIPURawHostAllocator;
};

template <typename AllocatorImpl, class AsyncMemPoolImpl, int device_id>
c10::Allocator* get_allocator_impl(c10::Allocator* raw_allocator) {
// Construct when really needed
// async_mem_pool is used when cache_allocator being destructed so it should
// be destructed after cache_allocator
static AsyncMemPoolImpl async_mem_pool;
static AllocatorImpl cache_allocator;
static int n = [&]() {
cache_allocator.set_raw_allocator(raw_allocator);
cache_allocator.set_async_mem_pool(&async_mem_pool);
return 0;
}();
return &cache_allocator;
}

template <class AllocatorImpl, class AsyncMemPoolImpl>
c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) {
#define DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(id) \
if (device_id == (id)) { \
return get_allocator_impl<AllocatorImpl, AsyncMemPoolImpl, id>( \
raw_allocator); \
template <typename AllocatorImpl, class AsyncMemPoolImpl>
struct AllocatorHolder {
template <std::size_t I>
inline static c10::Allocator* value(c10::Allocator* raw_allocator) {
// Construct when really needed
// async_mem_pool is used when cache_allocator being destructed so it should
// be destructed after cache_allocator
static AsyncMemPoolImpl async_mem_pool;
static AllocatorImpl cache_allocator;
static int n = [&]() {
cache_allocator.set_raw_allocator(raw_allocator);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个 precondition 好强啊,没有任何警示吗

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这块没想到怎么改合适……索性直接复制之前的实现

cache_allocator.set_async_mem_pool(&async_mem_pool);
return 0;
}();
return &cache_allocator;
}
};

DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(0);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(1);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(2);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(3);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(4);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(5);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(6);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(7);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(8);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(9);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(10);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(11);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(12);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(13);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(14);
DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(15);
TORCH_CHECK(false, "support up to 16 cards");
template <class AllocatorImpl, class AsyncMemPoolImpl>
c10::Allocator* get_allocator(int index, c10::Allocator* raw_allocator) {
auto constexpr max_card_number = 16;
TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards");
return dipu::static_function_array<
AllocatorHolder<AllocatorImpl, AsyncMemPoolImpl>,
max_card_number>::value[index](raw_allocator);
}
#undef DIPU_ALLOCATOR_DISPATCH_DEVICE_ID

#define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, \
algorithm, priority) \
Expand Down
10 changes: 8 additions & 2 deletions dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,15 @@ bool isStreamEmpty(deviceStream_t stream) {
// device event related
// =====================

void createEvent(deviceEvent_t* event) { return getEventFromPool(*event); }
void createEvent(deviceEvent_t* event) {
auto index = current_device();
return event_pool_acquire(index, *event);
}

void destroyEvent(deviceEvent_t event) { return restoreEventToPool(event); }
void destroyEvent(deviceEvent_t event) {
auto index = current_device();
return event_pool_release(index, event);
}

void waitEvent(deviceEvent_t event) { return devapis::waitEvent(event); }

Expand Down