diff --git a/common/common.cpp b/common/common.cpp
index f1842079435a7..94f545f815c27 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -830,7 +830,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index a55b3f3835184..49c90b7506e73 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -323,8 +323,8 @@ struct clip_ctx {
     std::vector<ggml_backend_t> backend_ptrs;
     std::vector<ggml_backend_buffer_type_t> backend_buft;
 
-    ggml_backend_ptr backend;
-    ggml_backend_ptr backend_cpu;
+    ggml_backend_t backend;
+    ggml_backend_t backend_cpu;
     ggml_backend_buffer_ptr buf;
 
     ggml_backend_sched_ptr sched;
@@ -332,27 +332,34 @@ struct clip_ctx {
     clip_image_size load_image_size;
 
     clip_ctx(clip_context_params & ctx_params) {
-        backend_cpu = ggml_backend_ptr(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
-        backend     = ggml_backend_ptr(ctx_params.use_gpu
-                    ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
-                    : nullptr);
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend     = ctx_params.use_gpu
+                        ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
+                        : nullptr;
 
         if (backend) {
-            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend.get()));
-            backend_ptrs.push_back(backend.get());
-            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get()));
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
         } else {
-            backend = std::move(backend_cpu);
+            backend = backend_cpu;
             LOG_INF("%s: CLIP using CPU backend\n", __func__);
         }
 
-        backend_ptrs.push_back(backend_cpu.get());
-        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu.get()));
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
 
         sched.reset(
             ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
         );
     }
+
+    ~clip_ctx() {
+        ggml_backend_free(backend);
+        if (backend != backend_cpu) {
+            ggml_backend_free(backend_cpu);
+        }
+    }
 };
 
 static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
@@ -1428,7 +1435,7 @@ struct clip_model_loader {
         }
 
         // alloc memory and offload data
-        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get());
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
         ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
         ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         for (auto & t : tensors_to_load) {
@@ -2610,7 +2617,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
         }
     }
 
-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu.get(), n_threads);
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
 
     auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
     if (status != GGML_STATUS_SUCCESS) {
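Note on the clip.cpp hunks above: `clip_ctx` previously held its backends in `ggml_backend_ptr` smart pointers, and the CPU-only path did `backend = std::move(backend_cpu)`, which left `backend_cpu` empty; the later `backend_cpu.get()` calls then handed a null backend to the scheduler and to `ggml_backend_cpu_set_n_threads`. The patch switches to plain `ggml_backend_t` handles, which may alias each other, and frees them in an explicit destructor that skips the second free when both handles name the same backend. A minimal standalone sketch of that ownership pattern, with a made-up `fake_backend` type standing in for the real ggml API:

```cpp
#include <cstdio>

// stand-in for ggml_backend_t (which is itself a pointer type)
struct fake_backend { const char * name; };

static fake_backend * backend_init(const char * name) { return new fake_backend{name}; }
static void           backend_free(fake_backend * b)  { delete b; }

struct ctx {
    fake_backend * backend;      // backend actually used for compute
    fake_backend * backend_cpu;  // always-initialized CPU backend

    explicit ctx(bool use_gpu) {
        backend_cpu = backend_init("CPU");
        backend     = use_gpu ? backend_init("GPU") : nullptr;
        if (!backend) {
            // aliasing: both handles now point at the same object, but unlike
            // a moved-from smart pointer, backend_cpu stays valid for later use
            backend = backend_cpu;
        }
    }

    ~ctx() {
        // free each distinct backend exactly once; the guard prevents a
        // double free in the CPU-only (aliased) case
        backend_free(backend);
        if (backend != backend_cpu) {
            backend_free(backend_cpu);
        }
    }
};

int main() {
    ctx cpu_only(false);
    ctx with_gpu(true);
    std::printf("cpu_only -> %s, with_gpu -> %s\n",
                cpu_only.backend->name, with_gpu.backend->name);
    return 0;
}
```

Raw handles fit the aliasing case better than `unique_ptr` here; a `shared_ptr` would also work but is heavier than the two-line destructor guard.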
diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp
index b61749c3f3a7b..f8f2cb90ea10d 100644
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -126,7 +126,7 @@ static std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
index a8f4bc41726c2..e1baa85f9e330 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -201,6 +201,11 @@ void main() {
     uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
     uint32_t k_stride = p.nb11;
     uint32_t v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
     // hint to the compiler that strides are aligned for the aligned variant of the shader
     if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
     {
@@ -209,6 +214,7 @@ void main() {
         k_stride &= ~7;
         v_stride &= ~7;
 #endif
+        m_stride &= ~7;
     }
     tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
     tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
@@ -261,10 +267,7 @@ void main() {
     if (p.mask != 0) {
         tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
         tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
-        // When using grouped query attention, all rows use the same mask.
-        if (p.gqa_ratio > 1) {
-            tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1);
-        }
+        tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
 
         coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
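Note on the shader hunks: the mask stride is now computed up front for all rows. With grouped query attention (`p.gqa_ratio > 1`) every row reads the same mask row, so the stride is zero, and zero is trivially a multiple of 8; writing it as the opaque `p.gqa_ratio >> 16` stops the compiler from folding the later `m_stride &= ~7` through the select and losing the alignment guarantee that the aligned shader variant relies on. A small host-side C++ sketch of the arithmetic (test values are made up, and this only mirrors the value-level behavior, not the GLSL compiler analysis):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// mirrors the shader's stride selection: 0 when all rows share one mask row
// (GQA), otherwise KV; the aligned variant rounds down to a multiple of 8
static uint32_t mask_stride(uint32_t gqa_ratio, uint32_t KV, bool aligned_variant) {
    uint32_t m_stride = (gqa_ratio > 1) ? (gqa_ratio >> 16) : KV;
    if (aligned_variant) {
        m_stride &= ~7u;
    }
    return m_stride;
}

int main() {
    for (uint32_t gqa_ratio : {1u, 2u, 4u, 8u, 64u}) {
        // the "roundabout zero": any realistic gqa_ratio is far below 2^16
        assert((gqa_ratio >> 16) == 0u);
        for (uint32_t KV : {8u, 256u, 260u, 4096u}) {
            if (gqa_ratio > 1) {
                // GQA case: stride stays 0 whether or not the ~7 mask is applied
                assert(mask_stride(gqa_ratio, KV, false) == 0u);
                assert(mask_stride(gqa_ratio, KV, true)  == 0u);
            } else {
                // non-GQA: clamped variant keeps KV, aligned variant rounds down
                assert(mask_stride(gqa_ratio, KV, false) == KV);
                assert(mask_stride(gqa_ratio, KV, true)  == (KV & ~7u));
            }
        }
    }
    std::printf("mask stride matches the behavior described in the diff comment\n");
    return 0;
}
```

Net effect of the diff: the old code only zeroed the stride inside a branch, leaving the `&` foldable per select arm; hoisting `m_stride` and masking it unconditionally lets the aligned shader variant treat the mask row stride as 8-aligned in both the GQA and non-GQA cases.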