Commit 3ba78a5

virtgpu: allocate a shared page with the host
1 parent ffa659f commit 3ba78a5

File tree: 8 files changed, +628 −18 lines


ggml/src/ggml-remotingfrontend/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,8 @@ ggml_add_backend_library(ggml-remotingfrontend
     ggml-buffer-type.cpp
     ggml-host-buffer-type.cpp
     virtgpu.cpp
+    virtgpu-shm.cpp
+    virtgpu-utils.cpp
     ../../include/ggml-remoting-frontend.h
 )

ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp

Lines changed: 107 additions & 0 deletions

@@ -0,0 +1,107 @@
#include <assert.h>

#include "virtgpu-shm.h"

static uint32_t
virtgpu_ioctl_resource_create_blob(struct virtgpu *gpu,
                                   uint32_t blob_mem,
                                   uint32_t blob_flags,
                                   size_t blob_size,
                                   uint64_t blob_id,
                                   uint32_t *res_id)
{
#ifdef SIMULATE_BO_SIZE_FIX
   blob_size = align64(blob_size, 4096);
#endif

   struct drm_virtgpu_resource_create_blob args = {
      .blob_mem = blob_mem,
      .blob_flags = blob_flags,
      .bo_handle = 0,
      .res_handle = 0,
      .size = blob_size,
      .pad = 0,
      .cmd_size = 0,
      .cmd = 0,
      .blob_id = blob_id,
   };

   if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args))
      return 0;

   *res_id = args.res_handle;
   return args.bo_handle;
}

static void
virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle)
{
   struct drm_gem_close args = {
      .handle = gem_handle,
      .pad = 0,
   };

   const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args);
   assert(!ret);
}

static void *
virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size)
{
   struct drm_virtgpu_map args = {
      .offset = 0,
      .handle = gem_handle,
      .pad = 0,
   };
   printf("Say hello world\n");
   if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args))
      return NULL;

   void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd,
                    args.offset);
   if (ptr == MAP_FAILED)
      return NULL;

   return ptr;
}

void
virtgpu_shmem_destroy(struct virtgpu *gpu,
                      struct virtgpu_shmem *shmem)
{
   munmap(shmem->base.mmap_ptr, shmem->base.mmap_size);
   virtgpu_ioctl_gem_close(gpu, shmem->gem_handle);
}

struct vn_renderer_shmem *
virtgpu_shmem_create(struct virtgpu *gpu, size_t size)
{
   size = align64(size, 16384);

   uint32_t res_id;
   uint32_t gem_handle = virtgpu_ioctl_resource_create_blob(
      gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0,
      &res_id);
   if (!gem_handle)
      return NULL;

   void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size);
   if (!ptr) {
      virtgpu_ioctl_gem_close(gpu, gem_handle);
      return NULL;
   }
   if (gpu->shmem_array.elem_size == 0) {
      INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n");
      assert(false);
   }
   struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle);

   shmem->gem_handle = gem_handle;
   shmem->base.res_id = res_id;
   shmem->base.mmap_size = size;
   shmem->base.mmap_ptr = ptr;
   shmem->base.refcount.count = 1;
   shmem->base.gem_handle = gem_handle;

   return &shmem->base;
}
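
For orientation, a minimal usage sketch of the two new entry points follows. It assumes a struct virtgpu *gpu that has already been opened and whose shmem_array has been initialized elsewhere (that setup lives in virtgpu.cpp, which is not part of this diff); the sketch is illustrative only, not code from the commit.

/* Hypothetical usage sketch; assumes <string.h> and "virtgpu-shm.h" are included. */
static void shmem_smoke_test(struct virtgpu *gpu)
{
   /* Allocate one shared page (rounded up to 16 KiB by virtgpu_shmem_create). */
   struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, 4096);
   if (!shmem)
      return;   /* blob creation or mapping failed */

   /* The guest writes through mmap_ptr; the host reaches the same pages
    * through shmem->res_id. */
   memset(shmem->mmap_ptr, 0, shmem->mmap_size);

   /* base is the first member of struct virtgpu_shmem, so casting the
    * returned pointer back to the wrapper type is valid. */
   virtgpu_shmem_destroy(gpu, (struct virtgpu_shmem *)shmem);
}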
ggml/src/ggml-remotingfrontend/virtgpu-shm.h

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
#pragma once

#include <cassert>
#include <cstdint>
#include <cstddef>
#include <stdatomic.h>
#include <sys/mman.h>

#include "virtgpu.h"
#include "virtgpu-utils.h"

struct vn_refcount {
   int count; //atomic_int
};


struct vn_renderer_shmem {
   struct vn_refcount refcount;

   uint32_t res_id;
   size_t mmap_size; /* for internal use only (i.e., munmap) */
   void *mmap_ptr;

   struct list_head cache_head;
   int64_t cache_timestamp;

   uint32_t gem_handle;
};

struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size);
void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem);


struct virtgpu_shmem {
   struct vn_renderer_shmem base;
   uint32_t gem_handle;
};
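
virtgpu_shmem_create() asserts that gpu->shmem_array.elem_size is non-zero, so the sparse array must be set up before the first allocation. That initialization is not part of this diff (presumably it sits in the virtgpu setup path in virtgpu.cpp); the sketch below shows what it could look like, with the node size of 1024 being an assumption rather than a value taken from the commit.

/* Hypothetical one-time setup, not in this diff: elements are keyed by
 * GEM handle, one struct virtgpu_shmem per handle. */
util_sparse_array_init(&gpu->shmem_array,
                       sizeof(struct virtgpu_shmem), /* elem_size */
                       1024);                        /* node_size: assumed power of two */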
ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp

Lines changed: 186 additions & 0 deletions

@@ -0,0 +1,186 @@
#include "virtgpu-utils.h"
#include <malloc.h>
#include <cstring>
#include <stdlib.h>

#define NODE_ALLOC_ALIGN 64
#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1))
#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1)
#define NULL_NODE 0

#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align)
#define os_free_aligned(_ptr) free(_ptr)
#define p_atomic_cmpxchg(v, old, _new) \
   __sync_val_compare_and_swap((v), (old), (_new))

static inline uint64_t
util_logbase2_64(uint64_t n)
{
#if defined(HAVE___BUILTIN_CLZLL)
   return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
#else
   uint64_t pos = 0ull;
   if (n >= 1ull<<32) { n >>= 32; pos += 32; }
   if (n >= 1ull<<16) { n >>= 16; pos += 16; }
   if (n >= 1ull<< 8) { n >>=  8; pos +=  8; }
   if (n >= 1ull<< 4) { n >>=  4; pos +=  4; }
   if (n >= 1ull<< 2) { n >>=  2; pos +=  2; }
   if (n >= 1ull<< 1) { pos += 1; }
   return pos;
#endif
}

void
util_sparse_array_init(struct util_sparse_array *arr,
                       size_t elem_size, size_t node_size)
{
   memset(arr, 0, sizeof(*arr));
   arr->elem_size = elem_size;
   arr->node_size_log2 = util_logbase2_64(node_size);
   assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2));
}

static inline void *
os_malloc_aligned(size_t size, size_t alignment)
{
   void *ptr;
   alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
   if(posix_memalign(&ptr, alignment, size) != 0)
      return NULL;
   return ptr;
}

static inline void *
_util_sparse_array_node_data(uintptr_t handle)
{
   return (void *)(handle & NODE_PTR_MASK);
}

static inline unsigned
_util_sparse_array_node_level(uintptr_t handle)
{
   return handle & NODE_LEVEL_MASK;
}

static inline void
_util_sparse_array_node_finish(struct util_sparse_array *arr,
                               uintptr_t node)
{
   if (_util_sparse_array_node_level(node) > 0) {
      uintptr_t *children = (uintptr_t *) _util_sparse_array_node_data(node);
      size_t node_size = 1ull << arr->node_size_log2;
      for (size_t i = 0; i < node_size; i++) {
         if (children[i])
            _util_sparse_array_node_finish(arr, children[i]);
      }
   }

   os_free_aligned(_util_sparse_array_node_data(node));
}

static inline uintptr_t
_util_sparse_array_node(void *data, unsigned level)
{
   assert(data != NULL);
   assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0);
   assert((level & NODE_PTR_MASK) == 0);
   return (uintptr_t)data | level;
}

inline uintptr_t
_util_sparse_array_node_alloc(struct util_sparse_array *arr,
                              unsigned level)
{
   size_t size;
   if (level == 0) {
      size = arr->elem_size << arr->node_size_log2;
   } else {
      size = sizeof(uintptr_t) << arr->node_size_log2;
   }

   void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN);
   memset(data, 0, size);

   return _util_sparse_array_node(data, level);
}

static inline uintptr_t
_util_sparse_array_set_or_free_node(uintptr_t *node_ptr,
                                    uintptr_t cmp_node,
                                    uintptr_t node)
{
   uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node);

   if (prev_node != cmp_node) {
      /* We lost the race. Free this one and return the one that was already
       * allocated.
       */
      os_free_aligned(_util_sparse_array_node_data(node));
      return prev_node;
   } else {
      return node;
   }
}

void *
util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx)
{
   const unsigned node_size_log2 = arr->node_size_log2;
   uintptr_t root = p_atomic_read(&arr->root);
   if (unlikely(!root)) {
      unsigned root_level = 0;
      uint64_t idx_iter = idx >> node_size_log2;
      while (idx_iter) {
         idx_iter >>= node_size_log2;
         root_level++;
      }
      uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level);
      root = _util_sparse_array_set_or_free_node(&arr->root,
                                                 NULL_NODE, new_root);
   }

   while (1) {
      unsigned root_level = _util_sparse_array_node_level(root);
      uint64_t root_idx = idx >> (root_level * node_size_log2);
      if (likely(root_idx < (1ull << node_size_log2)))
         break;

      /* In this case, we have a root but its level is low enough that the
       * requested index is out-of-bounds.
       */
      uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1);

      uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root);
      new_root_children[0] = root;

      /* We only add one at a time instead of the whole tree because it's
       * easier to ensure correctness of both the tree building and the
       * clean-up path. Because we're only adding one node we never have to
       * worry about trying to free multiple things without freeing the old
       * things.
       */
      root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root);
   }

   void *node_data = _util_sparse_array_node_data(root);
   unsigned node_level = _util_sparse_array_node_level(root);
   while (node_level > 0) {
      uint64_t child_idx = (idx >> (node_level * node_size_log2)) &
                           ((1ull << node_size_log2) - 1);

      uintptr_t *children = (uintptr_t *) node_data;
      uintptr_t child = p_atomic_read(&children[child_idx]);

      if (unlikely(!child)) {
         child = _util_sparse_array_node_alloc(arr, node_level - 1);
         child = _util_sparse_array_set_or_free_node(&children[child_idx],
                                                     NULL_NODE, child);
      }

      node_data = _util_sparse_array_node_data(child);
      node_level = _util_sparse_array_node_level(child);
   }

   uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1);
   return (void *)((char *)node_data + (elem_idx * arr->elem_size));
}
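
util_sparse_array_get() never fails for an in-range 64-bit index: it grows the radix tree lazily, zero-fills every node it allocates, and returns a stable pointer to the element slot. A small self-contained sketch of those semantics follows; the element type, node size, and indices are made up for illustration and are not taken from the commit.

/* Illustrative only; assumes <assert.h> and "virtgpu-utils.h" are included. */
static void sparse_array_demo(void)
{
   struct util_sparse_array arr;
   util_sparse_array_init(&arr, sizeof(uint32_t), 64); /* 64 slots per node */

   /* Far-apart indices allocate only the nodes on their paths; new slots
    * start out zeroed. */
   uint32_t *a = (uint32_t *)util_sparse_array_get(&arr, 3);
   uint32_t *b = (uint32_t *)util_sparse_array_get(&arr, 1000000);
   *a = 42;
   *b = 7;

   /* Looking up the same index again returns the same slot. */
   assert(util_sparse_array_get(&arr, 3) == a);
}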
ggml/src/ggml-remotingfrontend/virtgpu-utils.h

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
#pragma once

#include <cstdint>
#include <cassert>
#include <cstddef>

#define unlikely(x) __builtin_expect(!!(x), 0)
#define likely(x) __builtin_expect(!!(x), 1)

/** Checks is a value is a power of two. Does not handle zero. */
#define IS_POT(v) (((v) & ((v) - 1)) == 0)

/** Checks is a value is a power of two. Zero handled. */
#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v))

/** Align a value to a power of two */
#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))

#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE)


static inline bool
util_is_power_of_two_nonzero64(uint64_t v)
{
   return IS_POT_NONZERO(v);
}

static inline uint64_t
align64(uint64_t value, uint64_t alignment)
{
   assert(util_is_power_of_two_nonzero64(alignment));
   return ALIGN_POT(value, alignment);
}

struct list_head
{
   struct list_head *prev;
   struct list_head *next;
};

struct util_sparse_array {
   size_t elem_size;
   unsigned node_size_log2;

   uintptr_t root;
};

void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx);
void util_sparse_array_init(struct util_sparse_array *arr,
                            size_t elem_size, size_t node_size);
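
align64(), built on ALIGN_POT(), rounds a value up to the next multiple of a power-of-two alignment; virtgpu_shmem_create() uses it with a 16 KiB granule and the SIMULATE_BO_SIZE_FIX path uses 4 KiB. A few worked values as a sketch (the wrapper function name is invented for illustration):

/* Round-up behaviour of align64(); already-aligned values are unchanged.
 * Assumes <assert.h> and this header are included. */
static void align64_examples(void)
{
   assert(align64(1, 4096)     == 4096);
   assert(align64(4096, 4096)  == 4096);
   assert(align64(5000, 16384) == 16384); /* the granule used by virtgpu_shmem_create() */
}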
