
Commit 3b87888

Merge branch 'ggml-org:master' into clearAllConversations
2 parents a2313c8 + bc091a4 commit 3b87888

4 files changed: +29 -19 lines changed

common/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -830,7 +830,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
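
The same `#if` extension is mirrored in examples/rpc/rpc-server.cpp below: AIX now takes the XDG-style branch. For readers skimming the hunk, a minimal sketch of the lookup order this function implements on those platforms (simplified, with assumed suffix handling; not the verbatim llama.cpp code):

```cpp
#include <cstdlib>
#include <string>

// Sketch of the resolution order used on Linux/FreeBSD (and, after this
// change, AIX). Suffix handling is simplified; see fs_get_cache_directory()
// in common/common.cpp for the real logic.
static std::string cache_dir_sketch() {
    if (const char * env = std::getenv("LLAMA_CACHE")) {
        return env;                                      // explicit override wins
    }
    if (const char * xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/llama.cpp";          // XDG base directory
    }
    if (const char * home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/llama.cpp";  // XDG default fallback
    }
    return ".";                                          // hypothetical last resort
}
```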

examples/llava/clip.cpp

Lines changed: 20 additions & 13 deletions
@@ -323,36 +323,43 @@ struct clip_ctx {
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    ggml_backend_ptr backend;
-    ggml_backend_ptr backend_cpu;
+    ggml_backend_t backend;
+    ggml_backend_t backend_cpu;
     ggml_backend_buffer_ptr buf;
 
     ggml_backend_sched_ptr sched;
 
     clip_image_size load_image_size;
 
     clip_ctx(clip_context_params & ctx_params) {
-        backend_cpu = ggml_backend_ptr(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
-        backend     = ggml_backend_ptr(ctx_params.use_gpu
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend     = ctx_params.use_gpu
                         ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
-                        : nullptr);
+                        : nullptr;
 
         if (backend) {
-            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend.get()));
-            backend_ptrs.push_back(backend.get());
-            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get()));
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
         } else {
-            backend = std::move(backend_cpu);
+            backend = backend_cpu;
             LOG_INF("%s: CLIP using CPU backend\n", __func__);
         }
 
-        backend_ptrs.push_back(backend_cpu.get());
-        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu.get()));
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
 
         sched.reset(
             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
         );
     }
+
+    ~clip_ctx() {
+        ggml_backend_free(backend);
+        if (backend != backend_cpu) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };
 
 static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
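
The motivation for dropping the `ggml_backend_ptr` smart-pointer wrapper here: in the CPU-only path `backend` aliases `backend_cpu`, so two owning wrappers around the same handle would free it twice on destruction. The new explicit destructor frees once and skips the alias. A minimal standalone sketch of that pattern (hypothetical `acquire`/`release` names, not the ggml API):

```cpp
// Hypothetical handle type standing in for ggml_backend_t.
struct device {};
using handle_t = device *;

static handle_t acquire()           { return new device{}; }
static void     release(handle_t h) { delete h; }

struct ctx {
    handle_t backend;
    handle_t backend_cpu;

    explicit ctx(bool use_gpu) {
        backend_cpu = acquire();
        // Fall back to the CPU handle when no GPU is available: the two
        // members then alias the same object.
        backend = use_gpu ? acquire() : backend_cpu;
    }

    ~ctx() {
        release(backend);
        if (backend != backend_cpu) { // guard: free the CPU handle only
            release(backend_cpu);     // when it is a distinct object
        }
    }
};
```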
@@ -1428,7 +1435,7 @@ struct clip_model_loader {
         }
 
         // alloc memory and offload data
-        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get());
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
         ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
         ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         for (auto & t : tensors_to_load) {
@@ -2610,7 +2617,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }
 
-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu.get(), n_threads);
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
     if (status != GGML_STATUS_SUCCESS) {

examples/rpc/rpc-server.cpp

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ static std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {

ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

Lines changed: 7 additions & 4 deletions
@@ -201,6 +201,11 @@ void main() {
     uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
     uint32_t k_stride = p.nb11;
     uint32_t v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
     // hint to the compiler that strides are aligned for the aligned variant of the shader
     if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
     {
@@ -209,6 +214,7 @@ void main() {
         k_stride &= ~7;
         v_stride &= ~7;
 #endif
+        m_stride &= ~7;
     }
     tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
     tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
@@ -261,10 +267,7 @@ void main() {
     if (p.mask != 0) {
         tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
         tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
-        // When using grouped query attention, all rows use the same mask.
-        if (p.gqa_ratio > 1) {
-            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1);
-        }
+        tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
 
         coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
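
Why `p.gqa_ratio >> 16` rather than a literal `0`: as the hunk's comment notes, the shifted value is always zero, but because it is computed from a runtime input the compiler cannot constant-fold it, so the later `m_stride &= ~7` survives and the aligned-stride detection still fires. A rough C++ illustration of the folding hazard (illustrative only, not the actual shader-compiler transform):

```cpp
#include <cstdint>

// With a literal zero, an optimizer may distribute the mask through the
// select:  ((g > 1 ? 0u : kv) & ~7u)  ->  (g > 1 ? 0u : (kv & ~7u)).
// The arithmetic result is identical, but a pass that looks for
// "result &= ~7" to prove the stride is 8-aligned no longer sees it.
uint32_t mask_stride(uint32_t gqa_ratio, uint32_t kv) {
    uint32_t opaque_zero = gqa_ratio >> 16;  // always 0 in practice,
                                             // but opaque to folding
    uint32_t m_stride = (gqa_ratio > 1) ? opaque_zero : kv;
    m_stride &= ~7u;                         // kept intact by the compiler
    return m_stride;
}
```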