Skip to content

Commit 38ae191

Browse files
committed
refactor: remove deps of userdma lib
1 parent a021171 commit 38ae191

File tree

3 files changed

+269
-18
lines changed

3 files changed

+269
-18
lines changed

ggml/src/ggml-qnn/npu/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,14 +188,12 @@ else()
188188
file(GLOB common_srcs "${CMAKE_CURRENT_LIST_DIR}/common/*.cpp")
189189
file(GLOB device_srcs "${CMAKE_CURRENT_LIST_DIR}/device/*.cpp")
190190
file(GLOB device_op_srcs "${CMAKE_CURRENT_LIST_DIR}/device/op/*.cpp")
191-
file(GLOB dma_srcs "${HEXAGON_SDK_ROOT}/addons/compute/libs/userdma/utils_lib/src/*.c")
192191
set(skel_srcs "${CMAKE_CURRENT_BINARY_DIR}/npu_device_skel.c")
193192
add_library(hexagon_npu_skel_OBJS OBJECT
194193
${common_srcs}
195194
${device_srcs}
196195
${device_op_srcs}
197196
${skel_srcs}
198-
${dma_srcs}
199197
)
200198

201199
if(CMAKE_BUILD_TYPE MATCHES "Debug|Dbg")
@@ -243,8 +241,6 @@ else()
243241
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
244242

245243
# TODO: find a better way to include these
246-
${HEXAGON_SDK_ROOT}/addons/compute/libs/userdma/utils_lib/api/
247-
${HEXAGON_SDK_ROOT}/addons/compute/libs/userdma/utils_lib/inc/
248244
${CMAKE_CURRENT_LIST_DIR}/device/
249245
${CMAKE_CURRENT_LIST_DIR}/device/op/
250246
)

ggml/src/ggml-qnn/npu/device/dma_transfer.cpp

Lines changed: 260 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,228 @@
11
#include "dma_transfer.hpp"
22

3-
#include <dma_desc.h>
43
#include <qurt.h>
54

65
#include <array>
76
#include <cstdlib>
87

8+
namespace {
9+
10+
// From addons/compute/libs/userdma/utils_lib/
11+
12+
#define DM0_STATUS_MASK 0x00000003
13+
#define DM0_STATUS_SHIFT 0
14+
#define DM0_STATUS_IDLE 0
15+
#define DM0_STATUS_RUN 1
16+
#define DM0_STATUS_ERROR 2
17+
18+
#define DM0_DESC_ADDR_MASK 0xFFFFFFF0
19+
#define DM0_DESC_ADDR_SHIFT 4
20+
21+
#define DMA_COMPLETE 1
22+
#define DMA_INCOMPLETE 0
23+
24+
#define DMA_SUCCESS 0
25+
#define DMA_FAIL -1
26+
27+
#define DMA_DESC_TYPE_1D 0
28+
#define DMA_DESC_TYPE_2D 1
29+
30+
#define DESC_NEXT_MASK 0xFFFFFFFF
31+
#define DESC_NEXT_SHIFT 0
32+
33+
#define DESC_DSTATE_MASK 0x80000000
34+
#define DESC_DSTATE_SHIFT 31
35+
#define DESC_DSTATE_INCOMPLETE 0
36+
#define DESC_DSTATE_COMPLETE 1
37+
38+
#define DESC_ORDER_MASK 0x40000000
39+
#define DESC_ORDER_SHIFT 30
40+
#define DESC_ORDER_NOORDER 0
41+
#define DESC_ORDER_ORDER 1
42+
43+
#define DESC_BYPASSSRC_MASK 0x20000000
44+
#define DESC_BYPASSSRC_SHIFT 29
45+
#define DESC_BYPASSDST_MASK 0x10000000
46+
#define DESC_BYPASSDST_SHIFT 28
47+
#define DESC_BYPASS_OFF 0
48+
#define DESC_BYPASS_ON 1
49+
50+
#define DESC_DESCTYPE_MASK 0x03000000
51+
#define DESC_DESCTYPE_SHIFT 24
52+
#define DESC_DESCTYPE_1D 0
53+
#define DESC_DESCTYPE_2D 1
54+
55+
#define DESC_LENGTH_MASK 0x00FFFFFF
56+
#define DESC_LENGTH_SHIFT 0
57+
#define DESC_SRC_MASK 0xFFFFFFFF
58+
#define DESC_SRC_SHIFT 0
59+
#define DESC_DST_MASK 0xFFFFFFFF
60+
#define DESC_DST_SHIFT 0
61+
62+
#define DESC_CACHEALLOC_MASK 0x03000000
63+
#define DESC_CACHEALLOC_SHIFT 24
64+
#define DESC_CACHEALLOC_NONE 0
65+
#define DESC_CACHEALLOC_WRITEONLY 1
66+
#define DESC_CACHEALLOC_READONLY 2
67+
#define DESC_CACHEALLOC_READWRITE 3
68+
69+
#define DESC_ROIWIDTH_MASK 0x0000FFFF
70+
#define DESC_ROIWIDTH_SHIFT 0
71+
#define DESC_ROIHEIGHT_MASK 0xFFFF0000
72+
#define DESC_ROIHEIGHT_SHIFT 16
73+
74+
#define DESC_SRCSTRIDE_MASK 0x0000FFFF
75+
#define DESC_SRCSTRIDE_SHIFT 0
76+
#define DESC_DSTSTRIDE_MASK 0xFFFF0000
77+
#define DESC_DSTSTRIDE_SHIFT 16
78+
79+
#define DESC_SRCWIDTHOFFSET_MASK 0x0000FFFF
80+
#define DESC_SRCWIDTHOFFSET_SHIFT 0
81+
#define DESC_DSTWIDTHOFFSET_MASK 0xFFFF0000
82+
#define DESC_DSTWIDTHOFFSET_SHIFT 16
83+
84+
/**************************/
85+
/* 1D (linear) descriptor */
86+
/**************************/
87+
typedef struct _dma_desc_1d_t {
88+
uint32_t next;
89+
uint32_t dstate_order_bypass_desctype_length;
90+
uint32_t src;
91+
uint32_t dst;
92+
} dma_desc_1d_t;
93+
94+
static_assert(sizeof(dma_desc_1d_t) == hexagon::dma::kDmaDescSize1D, "kDmaDescSize1D size incorrect");
95+
96+
/***********************/
97+
/* 2D (box) descriptor */
98+
/***********************/
99+
typedef struct _dma_desc_2d_t {
100+
uint32_t next;
101+
uint32_t dstate_order_bypass_desctype_length;
102+
uint32_t src;
103+
uint32_t dst;
104+
uint32_t allocation;
105+
uint32_t roiheight_roiwidth;
106+
uint32_t dststride_srcstride;
107+
uint32_t dstwidthoffset_srcwidthoffset;
108+
} dma_desc_2d_t;
109+
110+
static_assert(sizeof(dma_desc_2d_t) == hexagon::dma::kDmaDescSize2D, "kDmaDescSize2D size incorrect");
111+
112+
inline void dmstart(void * next) {
113+
asm volatile(" release(%0):at" : : "r"(next));
114+
asm volatile(" dmstart(%0)" : : "r"(next));
115+
}
116+
117+
inline void dmlink(void * cur, void * next) {
118+
asm volatile(" release(%0):at" : : "r"(next));
119+
asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next));
120+
}
121+
122+
inline unsigned int dmpoll(void) {
123+
unsigned int ret = 0;
124+
asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory");
125+
return ret;
126+
}
127+
128+
inline unsigned int dmwait(void) {
129+
unsigned int ret = 0;
130+
asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory");
131+
return ret;
132+
}
133+
134+
inline void dma_desc_set_next(void * d, uint32_t v) {
135+
(((dma_desc_1d_t *) d)->next) &= ~DESC_NEXT_MASK;
136+
(((dma_desc_1d_t *) d)->next) |= ((v << DESC_NEXT_SHIFT) & DESC_NEXT_MASK);
137+
}
138+
139+
inline uint32_t dma_desc_get_dstate(void * d) {
140+
return (((((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) & DESC_DSTATE_MASK) >> DESC_DSTATE_SHIFT);
141+
}
142+
143+
inline void dma_desc_set_dstate(void * d, uint32_t v) {
144+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_DSTATE_MASK;
145+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_DSTATE_SHIFT) & DESC_DSTATE_MASK);
146+
}
147+
148+
inline void dma_desc_set_desctype(void * d, uint32_t v) {
149+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_DESCTYPE_MASK;
150+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_DESCTYPE_SHIFT) & DESC_DESCTYPE_MASK);
151+
}
152+
153+
inline void dma_desc_set_order(void * d, uint32_t v) {
154+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_ORDER_MASK;
155+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_ORDER_SHIFT) & DESC_ORDER_MASK);
156+
}
157+
158+
inline void dma_desc_set_bypasssrc(void * d, uint32_t v) {
159+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_BYPASSSRC_MASK;
160+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_BYPASSSRC_SHIFT) & DESC_BYPASSSRC_MASK);
161+
}
162+
163+
inline void dma_desc_set_bypassdst(void * d, uint32_t v) {
164+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_BYPASSDST_MASK;
165+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_BYPASSDST_SHIFT) & DESC_BYPASSDST_MASK);
166+
}
167+
168+
inline void dma_desc_set_length(void * d, uint32_t v) {
169+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) &= ~DESC_LENGTH_MASK;
170+
(((dma_desc_1d_t *) d)->dstate_order_bypass_desctype_length) |= ((v << DESC_LENGTH_SHIFT) & DESC_LENGTH_MASK);
171+
}
172+
173+
inline uint32_t dma_desc_get_src(void * d) {
174+
return (((((dma_desc_1d_t *) d)->src) & DESC_SRC_MASK) >> DESC_SRC_SHIFT);
175+
}
176+
177+
inline void dma_desc_set_src(void * d, uint32_t v) {
178+
(((dma_desc_1d_t *) d)->src) &= ~DESC_SRC_MASK;
179+
(((dma_desc_1d_t *) d)->src) |= ((v << DESC_SRC_SHIFT) & DESC_SRC_MASK);
180+
}
181+
182+
inline void dma_desc_set_dst(void * d, uint32_t v) {
183+
(((dma_desc_1d_t *) d)->dst) &= ~DESC_DST_MASK;
184+
(((dma_desc_1d_t *) d)->dst) |= ((v << DESC_DST_SHIFT) & DESC_DST_MASK);
185+
}
186+
187+
inline void dma_desc_set_roiwidth(void * d, uint32_t v) {
188+
(((dma_desc_2d_t *) d)->roiheight_roiwidth) &= ~DESC_ROIWIDTH_MASK;
189+
(((dma_desc_2d_t *) d)->roiheight_roiwidth) |= ((v << DESC_ROIWIDTH_SHIFT) & DESC_ROIWIDTH_MASK);
190+
}
191+
192+
inline void dma_desc_set_roiheight(void * d, uint32_t v) {
193+
(((dma_desc_2d_t *) d)->roiheight_roiwidth) &= ~DESC_ROIHEIGHT_MASK;
194+
(((dma_desc_2d_t *) d)->roiheight_roiwidth) |= ((v << DESC_ROIHEIGHT_SHIFT) & DESC_ROIHEIGHT_MASK);
195+
}
196+
197+
inline void dma_desc_set_srcstride(void * d, uint32_t v) {
198+
(((dma_desc_2d_t *) d)->dststride_srcstride) &= ~DESC_SRCSTRIDE_MASK;
199+
(((dma_desc_2d_t *) d)->dststride_srcstride) |= ((v << DESC_SRCSTRIDE_SHIFT) & DESC_SRCSTRIDE_MASK);
200+
}
201+
202+
inline void dma_desc_set_dststride(void * d, uint32_t v) {
203+
(((dma_desc_2d_t *) d)->dststride_srcstride) &= ~DESC_DSTSTRIDE_MASK;
204+
(((dma_desc_2d_t *) d)->dststride_srcstride) |= ((v << DESC_DSTSTRIDE_SHIFT) & DESC_DSTSTRIDE_MASK);
205+
}
206+
207+
inline void dma_desc_set_srcwidthoffset(void * d, uint32_t v) {
208+
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) &= ~DESC_SRCWIDTHOFFSET_MASK;
209+
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) |=
210+
((v << DESC_SRCWIDTHOFFSET_SHIFT) & DESC_SRCWIDTHOFFSET_MASK);
211+
}
212+
213+
inline void dma_desc_set_dstwidthoffset(void * d, uint32_t v) {
214+
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) &= ~DESC_DSTWIDTHOFFSET_MASK;
215+
(((dma_desc_2d_t *) d)->dstwidthoffset_srcwidthoffset) |=
216+
((v << DESC_DSTWIDTHOFFSET_SHIFT) & DESC_DSTWIDTHOFFSET_MASK);
217+
}
218+
219+
inline void dma_desc_set_cachealloc(void * d, uint32_t v) {
220+
(((dma_desc_2d_t *) d)->allocation) &= ~DESC_CACHEALLOC_MASK;
221+
(((dma_desc_2d_t *) d)->allocation) |= ((v << DESC_CACHEALLOC_SHIFT) & DESC_CACHEALLOC_MASK);
222+
}
223+
224+
} // namespace
225+
9226
namespace hexagon::dma {
10227

11228
dma_transfer::dma_transfer() {
@@ -162,23 +379,58 @@ bool dma_transfer::submit2d(const uint8_t * src,
162379
}
163380

164381
void dma_transfer::wait() {
165-
auto ret = dma_wait_for_idle();
166-
if (ret != DMA_SUCCESS) {
167-
DEVICE_LOG_ERROR("dma_transfer: failed to wait for DMA idle: %d\n", ret);
382+
uint32_t dm0_status = dmwait() & DM0_STATUS_MASK;
383+
if (dm0_status != DM0_STATUS_IDLE) {
384+
DEVICE_LOG_ERROR("dma_transfer: failed to wait for DMA idle, dm0_status: %d\n", (int) dm0_status);
168385
}
169386
}
170387

171388
bool dma_transfer::is_desc_done(uint8_t * desc) {
172-
return !dma_desc_get_src(desc) || dma_desc_is_done(desc) == DMA_COMPLETE;
389+
if (!dma_desc_get_src(desc)) {
390+
return true;
391+
}
392+
393+
if (dma_desc_get_dstate(desc) == DESC_DSTATE_COMPLETE) {
394+
return true;
395+
}
396+
397+
dmpoll();
398+
return false;
173399
}
174400

175-
bool dma_transfer::submit_impl(void ** desc_batch, int batch_len) {
401+
bool dma_transfer::submit_impl(void ** desc_batch, size_t batch_len) {
176402
_dma_desc_mutex.lock();
177-
const bool succ = dma_desc_submit(desc_batch, batch_len) == DMA_SUCCESS;
403+
for (size_t i = 0; i < batch_len - 1; i++) {
404+
dma_desc_set_next(desc_batch[i], (uint32_t) desc_batch[i + 1]);
405+
}
406+
407+
dma_desc_set_next(desc_batch[batch_len - 1], (uint32_t) nullptr);
408+
uint32_t dm0_status = dmpoll() & DM0_STATUS_MASK;
409+
if (dm0_status == DM0_STATUS_IDLE) {
410+
dmstart(desc_batch[0]);
411+
} else if (dm0_status == DM0_STATUS_RUN) {
412+
if (_dma_last_desc == nullptr) {
413+
_dma_desc_mutex.unlock();
414+
DEVICE_LOG_ERROR("dma_transfer: last descriptor not found for linking. Submission failed\n");
415+
return false;
416+
} else {
417+
dmlink(_dma_last_desc, desc_batch[0]);
418+
}
419+
} else {
420+
_dma_desc_mutex.unlock();
421+
DEVICE_LOG_ERROR("dma_transfer: DMA not idle or running. Submission failed\n");
422+
return false;
423+
}
424+
425+
dmpoll();
426+
427+
_dma_last_desc = (void *) desc_batch[batch_len - 1];
428+
178429
_dma_desc_mutex.unlock();
179-
return succ;
430+
return true;
180431
}
181432

182433
qurt_mutex dma_transfer::_dma_desc_mutex;
434+
void * dma_transfer::_dma_last_desc = nullptr;
183435

184436
} // namespace hexagon::dma

ggml/src/ggml-qnn/npu/device/dma_transfer.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
#include "util.hpp"
44

5-
#include <dma_utils.h>
6-
75
namespace hexagon::dma {
86

7+
constexpr const size_t kDmaDescSize1D = 16;
8+
constexpr const size_t kDmaDescSize2D = 32;
9+
910
class dma_transfer {
1011
public:
1112
dma_transfer();
@@ -32,12 +33,14 @@ class dma_transfer {
3233
private:
3334
static bool is_desc_done(uint8_t * desc); // TODO: should we use void * here?
3435
static qurt_mutex _dma_desc_mutex;
36+
static void * _dma_last_desc;
3537

36-
bool submit_impl(void ** desc_batch, int batch_len);
38+
// TODO: can we avoid the void ** here?
39+
bool submit_impl(void ** desc_batch, size_t batch_len);
3740

38-
alignas(DMA_DESC_SIZE_1D) uint8_t _dma_1d_desc0[DMA_DESC_SIZE_1D] = {};
39-
alignas(DMA_DESC_SIZE_1D) uint8_t _dma_1d_desc1[DMA_DESC_SIZE_1D] = {};
40-
alignas(DMA_DESC_SIZE_2D) uint8_t _dma_2d_desc0[DMA_DESC_SIZE_2D] = {};
41+
alignas(kDmaDescSize1D) uint8_t _dma_1d_desc0[kDmaDescSize1D] = {};
42+
alignas(kDmaDescSize1D) uint8_t _dma_1d_desc1[kDmaDescSize1D] = {};
43+
alignas(kDmaDescSize2D) uint8_t _dma_2d_desc0[kDmaDescSize2D] = {};
4144

4245
DISABLE_COPY_AND_MOVE(dma_transfer);
4346
};

0 commit comments

Comments
 (0)