
Commit 4e74031

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#   ci/run.sh
#   docs/backend/SYCL.md
#   docs/build.md
#   ggml/src/ggml-vulkan/CMakeLists.txt
#   ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
#   tests/test-chat-template.cpp

2 parents c48a4a7 + 74d4f5b

5 files changed (+100 −31 lines)

common/minja/minja.hpp
Lines changed: 12 additions & 8 deletions

@@ -2606,14 +2606,18 @@ inline std::shared_ptr<Context> Context::builtins() {
         auto & text = args.at("text");
         return text.is_null() ? text : Value(strip(text.get<std::string>()));
     }));
-    globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
-        auto text = args.at("text");
-        if (text.is_null()) return text;
-        std::string res;
-        auto str = text.get<std::string>();
-        std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
-        return Value(res);
-    }));
+    auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
+        return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
+            auto text = args.at("text");
+            if (text.is_null()) return text;
+            std::string res;
+            auto str = text.get<std::string>();
+            std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
+            return Value(res);
+        });
+    };
+    globals.set("lower", char_transform_function("lower", ::tolower));
+    globals.set("upper", char_transform_function("upper", ::toupper));
     globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
         args.expectArgs("default", {2, 3}, {0, 1});
         auto & value = args.args[0];
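The refactor folds the duplicated per-character transform into a small factory, so `upper` comes for free alongside `lower`. A minimal standalone sketch of the same higher-order pattern (the name `make_char_filter` is illustrative, not part of minja):

    #include <algorithm>
    #include <cctype>
    #include <functional>
    #include <iostream>
    #include <iterator>
    #include <string>

    // Build a string filter from a per-character transform, mirroring
    // char_transform_function in the diff above.
    static std::function<std::string(const std::string &)> make_char_filter(const std::function<char(char)> & fn) {
        return [=](const std::string & str) {
            std::string res;
            std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
            return res;
        };
    }

    int main() {
        auto lower = make_char_filter([](char c) { return (char) std::tolower((unsigned char) c); });
        auto upper = make_char_filter([](char c) { return (char) std::toupper((unsigned char) c); });
        std::cout << lower("Hello") << ' ' << upper("Hello") << '\n'; // prints: hello HELLO
    }

In templates this surfaces as the `lower` and `upper` filters, e.g. `{{ name | upper }}` in Jinja syntax.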

docs/backend/OPENCL.md
Lines changed: 7 additions & 3 deletions

@@ -145,8 +145,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
 * Clang 19
 * Ninja
 * Visual Studio 2022
+* Powershell 7

-Powershell is used for the following instructions.
+Visual Studio provides necessary headers and libraries although it is not directly used for building.
+Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
+
+Powershell 7 is used for the following commands.
+If an older version of Powershell is used, these commands may not work as they are.

 ### I. Setup Environment

@@ -196,10 +201,9 @@ ninja

 ## Known Issues

-- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
+- Currently OpenCL backend does not work on Adreno 6xx GPUs.

 ## TODO

-- Fix Qwen2.5 0.5B
 - Optimization for Q6_K
 - Support and optimization for Q4_K

ggml/src/ggml-cuda/fattn.cu
Lines changed: 1 addition & 1 deletion

@@ -299,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
     const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
     const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
-    const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128);
+    const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
     if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
         if (prec == GGML_PREC_DEFAULT) {
             ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
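With the `prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128` clause dropped, eligibility for the vector kernel now hinges only on the head size dividing evenly by twice the warp size. A quick standalone check of the relaxed predicate (a sketch assuming warp_size = 32, typical for NVIDIA GPUs; not part of the commit):

    #include <cstdio>

    int main() {
        const int warp_size = 32;
        for (int head_dim : {64, 80, 96, 128, 256}) {
            // mirrors: can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0
            const bool can_use_vector_kernel = head_dim % (2 * warp_size) == 0;
            std::printf("head_dim=%3d -> %s\n", head_dim, can_use_vector_kernel ? "eligible" : "not eligible");
        }
        return 0;
    }

In particular, head size 256 with FP32 precision was previously excluded and now qualifies.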

ggml/src/ggml-vulkan/ggml-vulkan.cpp
Lines changed: 78 additions & 15 deletions

@@ -33,6 +33,28 @@
 #include <future>
 #include <thread>

+#if defined(_MSC_VER)
+# define NOMINMAX 1
+# include <windows.h>
+# define YIELD() YieldProcessor()
+#elif defined(__clang__) || defined(__GNUC__)
+# if defined(__x86_64__) ||defined(__i386__)
+# include <immintrin.h>
+# define YIELD() _mm_pause()
+# elif defined(__arm__) || defined(__aarch64__)
+# if defined(__clang__)
+# include <arm_acle.h>
+# define YIELD() __yield()
+# else
+# define YIELD() asm volatile("yield")
+# endif
+# endif
+#endif
+
+#if !defined(YIELD)
+#define YIELD()
+#endif
+
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"

@@ -796,7 +818,8 @@ struct ggml_backend_vk_context {
     ggml_vk_garbage_collector gc;
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};

     vk_buffer buffer_pool[MAX_VK_BUFFERS];

@@ -887,6 +910,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx

 static void ggml_backend_vk_free(ggml_backend_t backend);

+// Wait for ctx->fence to be signaled.
+static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+    // during this wait.
+    if (ctx->almost_ready_fence_pending) {
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+        ctx->device->device.resetFences({ ctx->almost_ready_fence });
+        ctx->almost_ready_fence_pending = false;
+    }
+
+    // Spin (w/pause) waiting for the graph to finish executing.
+    vk::Result result;
+    while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+        if (result != vk::Result::eNotReady) {
+            fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+            exit(1);
+        }
+        for (uint32_t i = 0; i < 100; ++i) {
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+        }
+    }
+    ctx->device->device.resetFences({ ctx->fence });
+}
+
 // variables to track number of compiles in progress
 static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
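The helper makes the wait two-phase: a blocking `waitForFences` on `almost_ready_fence` lets the CPU sleep through the bulk of the graph, then a spin on `getFenceStatus`, relaxed by `YIELD()`, catches the final completion with minimal latency. A standalone sketch of the same idea in portable C++ (not the commit's code; the two Vulkan fences are modeled with a condition variable and an atomic flag):

    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <mutex>
    #include <thread>

    #if defined(__x86_64__) || defined(__i386__)
    #include <immintrin.h>
    #define YIELD() _mm_pause()
    #else
    #define YIELD() std::this_thread::yield() // portable fallback for the sketch
    #endif

    std::mutex m;
    std::condition_variable cv;
    bool almost_ready = false;     // plays the role of almost_ready_fence
    std::atomic<bool> done{false}; // plays the role of ctx->fence

    void wait_two_phase() {
        // Phase 1: sleep until most of the work has executed.
        {
            std::unique_lock<std::mutex> lock(m);
            cv.wait(lock, [] { return almost_ready; });
        }
        // Phase 2: spin (with pause) on the final flag for low wakeup latency.
        while (!done.load(std::memory_order_acquire)) {
            for (int i = 0; i < 100; ++i) {
                YIELD();
            }
        }
    }

    int main() {
        std::thread waiter(wait_two_phase);
        std::this_thread::sleep_for(std::chrono::milliseconds(80)); // "most" of the graph
        {
            std::lock_guard<std::mutex> lock(m);
            almost_ready = true;
        }
        cv.notify_one();
        std::this_thread::sleep_for(std::chrono::milliseconds(20)); // the final stretch
        done.store(true, std::memory_order_release);
        waiter.join();
        return 0;
    }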
@@ -3372,6 +3428,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->prealloc_size_split_k = 0;

     ctx->fence = ctx->device->device.createFence({});
+    ctx->almost_ready_fence = ctx->device->device.createFence({});

 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");

@@ -7976,11 +8033,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }

-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);

 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
     if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }

@@ -8352,7 +8409,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

     ctx->compute_ctx.reset();

-    bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+    bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
     if (!ok) {
         if (node->op == GGML_OP_UNARY) {
             std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;

@@ -8366,7 +8423,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     return true;
 }

-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
     ggml_backend_buffer * buf = nullptr;

     switch (tensor->op) {

@@ -8469,12 +8526,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         memcpy(cpy.dst, cpy.src, cpy.n);
     }

-    ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+        ggml_vk_submit(subctx, ctx->almost_ready_fence);
+        ctx->almost_ready_fence_pending = true;
+    } else {
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+    }

     if (use_fence) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-        ctx->device->device.resetFences({ ctx->fence });
+        ggml_vk_wait_for_fence(ctx);
     }
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_1(tensor);

@@ -8560,6 +8620,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.events.clear();

     ctx->device->device.destroyFence(ctx->fence);
+    ctx->device->device.destroyFence(ctx->almost_ready_fence);
 }

 static int ggml_vk_get_device_count() {

@@ -8906,8 +8967,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }

     ggml_vk_submit(transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_wait_for_fence(ctx);

     for (auto& cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);

@@ -8926,7 +8986,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }

@@ -8968,11 +9028,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }

+        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
         bool submit = (submitted_nodes >= nodes_per_submit) ||
                       (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                      (i == last_node);
+                      (i == last_node) ||
+                      (almost_ready && !ctx->almost_ready_fence_pending);

-        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);

         if (enqueued) {
             ++submitted_nodes;
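The new submit trigger arms the almost_ready fence once fewer than a fifth of the graph's nodes remain. A tiny check of that threshold arithmetic (a 100-node graph is assumed purely for illustration):

    #include <cstdio>

    int main() {
        const int n_nodes = 100;
        for (int i : {50, 80, 81, 99}) {
            // mirrors: almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5
            const bool almost_ready = (n_nodes - i) < n_nodes / 5;
            std::printf("node %2d of %d -> almost_ready=%d\n", i, n_nodes, almost_ready);
        }
        return 0;
    }

For 100 nodes the flag first becomes true at node index 81 (19 nodes remaining).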
@@ -8984,7 +9047,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 #endif
         }

-        if (submit) {
+        if (submit && enqueued) {
             first_node_in_batch = true;
             submitted_nodes = 0;
             mul_mat_bytes = 0;

src/llama-vocab.cpp
Lines changed: 2 additions & 4 deletions

@@ -2461,14 +2461,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
                 // find the first occurrence of a given special token in this fragment
                 // passing offset argument only limit the "search area" but match coordinates
                 // are still relative to the source full raw_text
-                auto match = raw_text.find(text, raw_text_base_offset);
+                // string_view begins at pos 0 for the same reason
+                auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

                 // no occurrences found, stop processing this fragment for a given special token
                 if (match == std::string::npos) break;

-                // check if match is within bounds of offset <-> length
-                if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
                 LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif
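Bounding the haystack with a `std::string_view` of length `raw_text_base_offset + raw_text_base_length` makes the removed post-hoc bounds check redundant: a token that would overrun the fragment end can no longer match at all. A small sketch of the equivalence (values are hypothetical, not from the commit):

    #include <cassert>
    #include <string>
    #include <string_view>

    int main() {
        const std::string raw_text = "abc<tok>def";
        const std::string text = "<tok>";  // the special token
        const size_t base_offset = 0;
        const size_t base_len = 5;         // fragment covers "abc<t" only

        // Old approach: search the whole string, then reject out-of-range matches.
        const auto match_old = raw_text.find(text, base_offset);
        const bool in_bounds = match_old != std::string::npos &&
                               match_old + text.length() <= base_offset + base_len;

        // New approach: bound the searchable region up front.
        const auto match_new = std::string_view(raw_text.data(), base_offset + base_len)
                                   .find(text, base_offset);

        // Both reject the match that would straddle the fragment end.
        assert(!in_bounds && match_new == std::string_view::npos);
        return 0;
    }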
