
Commit 034d0a8

Merge pull request #111 from menloresearch/update-dev-from-master-2025-06-01-00-11
Sync master with upstream release b5558
2 parents (1021f2f + 053b153), commit 034d0a8

33 files changed: 1627 additions & 758 deletions

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
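
Note: the bounds check is now expressed in terms of the enum instead of the hard-coded 0..3 range, which is what lets -1 (low) through. A minimal standalone sketch of the mapping; parse_prio is a hypothetical helper and the enumerator values are copied from the ggml.h hunk later in this commit:

    // Hypothetical helper: how the CLI integer maps onto ggml_sched_priority.
    // Enumerator values mirror the ggml.h hunk below (LOW = -1, the rest count up from 0).
    #include <stdexcept>

    enum ggml_sched_priority {
        GGML_SCHED_PRIO_LOW = -1,
        GGML_SCHED_PRIO_NORMAL,   //  0
        GGML_SCHED_PRIO_MEDIUM,   //  1
        GGML_SCHED_PRIO_HIGH,     //  2
        GGML_SCHED_PRIO_REALTIME, //  3
    };

    static ggml_sched_priority parse_prio(int prio) {
        if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
            throw std::invalid_argument("invalid value");
        }
        return (enum ggml_sched_priority) prio; // -1..3 -> LOW..REALTIME
    }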

common/chat-parser.cpp

Lines changed: 4 additions & 3 deletions
@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
         if (!rest.empty()) {
             handle_reasoning(rest, /* closed */ !is_partial());
         }
-        if (!syntax_.thinking_forced_open) {
-            throw common_chat_msg_partial_exception(end_think);
-        }
+        // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+        // if (!syntax_.thinking_forced_open) {
+        //     throw common_chat_msg_partial_exception(end_think);
+        // }
         return true;
     }
 }

common/common.cpp

Lines changed: 2 additions & 0 deletions
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
         case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    p =  5;  break;
         case GGML_SCHED_PRIO_NORMAL: p =  0;  break;
         case GGML_SCHED_PRIO_MEDIUM: p = -5;  break;
         case GGML_SCHED_PRIO_HIGH:   p = -10; break;
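
Note: this hunk only extends the priority table; the call that consumes p is outside the shown context. On the non-Windows path p is a nice value (positive means lower priority), so here is a hedged sketch of how such a value is typically applied, assuming setpriority(2) is the mechanism used:

    // Sketch only: applying a nice value such as p = 5 to the current process on POSIX.
    // The real call site is outside this hunk; setpriority(2) is assumed here.
    #include <sys/resource.h>
    #include <cstdio>

    static bool apply_nice(int p) {
        // which = PRIO_PROCESS, who = 0 selects the calling process
        if (setpriority(PRIO_PROCESS, 0, p) != 0) {
            std::perror("setpriority");
            return false;
        }
        return true;
    }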

docs/build.md

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ cmake --build build --config Release
     cmake --preset x64-windows-llvm-release
     cmake --build build-x64-windows-llvm-release
     ```
+- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
 
 ## BLAS Build
 

examples/parallel/parallel.cpp

Lines changed: 11 additions & 4 deletions
@@ -362,15 +362,17 @@ int main(int argc, char ** argv) {
         // process in chunks of params.n_batch
         int32_t n_batch = params.n_batch;
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        int32_t i_next = 0;
+
+        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
             // experiment: process in powers of 2
             //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
             //    n_batch /= 2;
             //    i -= n_batch;
             //    continue;
             //}
 
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
             llama_batch batch_view = {
                 n_tokens,
@@ -390,19 +392,24 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
 
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
-                i -= n_batch;
 
                 continue;
             }
 
             LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
+            // move the head of the batch forward with the number of tokens we just processed
+            i_next = i + n_tokens;
+
+            // on successful decode, restore the original batch size
+            n_batch = params.n_batch;
+
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                     continue;
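
Note: the loop now advances via i_next rather than a fixed i += n_batch. On a failed decode, n_batch is halved and the same window is retried (the old i -= n_batch back-step is gone), and on success the full batch size is restored. A simplified, self-contained sketch of that control flow; the decode callback stands in for building a batch view and calling llama_decode:

    // Simplified sketch of the retry-with-smaller-batch loop from this hunk.
    #include <algorithm>
    #include <cstdint>
    #include <functional>

    // decode(i, n_tokens) returns false when the chunk cannot be placed (e.g. no free KV cache slot)
    static void process_in_chunks(int32_t n_tokens_total, int32_t n_batch_max,
                                  const std::function<bool(int32_t, int32_t)> & decode) {
        int32_t n_batch = n_batch_max;
        int32_t i_next  = 0;

        for (int32_t i = 0; i < n_tokens_total; i = i_next) {
            const int32_t n_tokens = std::min(n_batch, n_tokens_total - i);

            if (!decode(i, n_tokens)) {
                if (n_batch == 1) {
                    return;        // cannot shrink further; the real code reports an error here
                }
                n_batch /= 2;      // retry the same window with a smaller batch
                continue;          // i is unchanged because i_next was not advanced
            }

            i_next  = i + n_tokens; // move the head forward by what was actually processed
            n_batch = n_batch_max;  // on success, restore the original batch size
        }
    }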

examples/passkey/passkey.cpp

Lines changed: 2 additions & 7 deletions
@@ -133,9 +133,8 @@ int main(int argc, char ** argv) {
             const int ib = i/n_batch - 1;
             const int bd = n_batch_grp*(n_grp - 1);
 
-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update (ctx);
+            llama_kv_self_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd);
+            llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
 
             n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
         }
@@ -169,8 +168,6 @@ int main(int argc, char ** argv) {
 
         llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-        //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
 
         n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
 
@@ -200,8 +197,6 @@ int main(int argc, char ** argv) {
 
         llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
         llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-        //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
 
         n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
     }

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
@@ -2181,6 +2181,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,
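
Note: pinning the new enumerator to -1 keeps every pre-existing value unchanged (NORMAL stays 0, MEDIUM 1, and so on), so integers that were already being stored or passed around as priorities keep their meaning. A small sketch of that invariant, assuming the full enum including GGML_SCHED_PRIO_REALTIME as used elsewhere in this commit:

    // Sketch: the old enumerators keep their numeric values because LOW is inserted at -1.
    #include "ggml.h"

    static_assert(GGML_SCHED_PRIO_LOW      == -1, "new enumerator");
    static_assert(GGML_SCHED_PRIO_NORMAL   ==  0, "unchanged");
    static_assert(GGML_SCHED_PRIO_MEDIUM   ==  1, "unchanged");
    static_assert(GGML_SCHED_PRIO_HIGH     ==  2, "unchanged");
    static_assert(GGML_SCHED_PRIO_REALTIME ==  3, "unchanged");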

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 23 additions & 0 deletions
@@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     // This is up to the applications.
     DWORD p = THREAD_PRIORITY_NORMAL;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = THREAD_PRIORITY_BELOW_NORMAL;  break;
         case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
         case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
         case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
         case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
     }
 
+    if (prio != GGML_SCHED_PRIO_LOW) {
+        // Tell Windows that this thread should not be throttled (needs its own CPU core).
+        // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+        // all our threads onto the first 4 cores which results in terrible performance with
+        // n_threads > 4
+#if _WIN32_WINNT >= 0x0602
+        THREAD_POWER_THROTTLING_STATE t;
+        ZeroMemory(&t, sizeof(t));
+        t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+        t.StateMask = 0;
+
+        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+            GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+            return false;
+        }
+#endif
+    }
+
     if (prio == GGML_SCHED_PRIO_NORMAL) {
         // Keep inherited policy/priority
         return true;
@@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        // TODO: there seems to be no way to set lower prio on Apple platforms
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      policy = SCHED_BATCH; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
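
Note: as in common.cpp, these hunks only extend the per-platform priority tables; the call that applies (policy, sched_priority) to the thread sits outside the shown context. A hedged sketch of how such a pair is typically applied to the calling thread, assuming pthread_setschedparam is the mechanism used on the POSIX paths:

    // Sketch only: applying a (policy, priority) pair such as (SCHED_BATCH, 0) or
    // (SCHED_FIFO, 40) to the current thread; the real call site is outside these hunks.
    #include <pthread.h>
    #include <sched.h>
    #include <cstdio>
    #include <cstring>

    static bool apply_thread_sched(int policy, int priority) {
        struct sched_param p;
        std::memset(&p, 0, sizeof(p));
        p.sched_priority = priority;

        const int err = pthread_setschedparam(pthread_self(), policy, &p);
        if (err != 0) {
            std::fprintf(stderr, "pthread_setschedparam failed: %s\n", std::strerror(err));
            return false;
        }
        return true;
    }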

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 0 deletions
@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
         int     nsm;              // number of streaming multiprocessors
         size_t  smpb;             // max. shared memory per block
         size_t  smpbo;            // max. shared memory per block (with opt-in)
+        bool    integrated;       // Device is integrated as opposed to discrete
         bool    vmm;              // virtual memory support
         size_t  vmm_granularity;  // granularity of virtual memory
         size_t  total_vram;

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 14 additions & 6 deletions
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
-        info.devices[id].nsm       = prop.multiProcessorCount;
-        info.devices[id].smpb      = prop.sharedMemPerBlock;
-        info.devices[id].warp_size = prop.warpSize;
+        info.devices[id].integrated = prop.integrated;
+        info.devices[id].nsm        = prop.multiProcessorCount;
+        info.devices[id].smpb       = prop.sharedMemPerBlock;
+        info.devices[id].warp_size  = prop.warpSize;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
 }
@@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    // flag used to determine whether it is an integrated_gpu
+    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 if (node->src[j] != nullptr) {
                     assert(node->src[j]->buffer);
                     assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
-                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
+                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                 }
             }
 #endif
@@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 }
 
 static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
+    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
 }
 
 static int64_t get_op_batch_size(const ggml_tensor * op) {
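
Note: the net effect of the new integrated flag shows up in the last hunk: CUDA pinned-host buffers become acceptable for a device when the GPU is integrated (it shares physical memory with the host), while discrete GPUs keep the old rule. A condensed restatement of that predicate with the individual checks reduced to booleans, purely for illustration:

    // Condensed restatement of ggml_backend_cuda_device_supports_buft; the boolean
    // parameters stand in for the checks named in the hunk above.
    static bool supports_buft_sketch(bool is_cuda, bool is_cuda_split, bool is_cuda_host,
                                     bool same_device, bool integrated) {
        // regular/split CUDA buffers must belong to this device; pinned-host buffers
        // are additionally accepted when the device is an integrated GPU
        return ((is_cuda || is_cuda_split) && same_device) || (integrated && is_cuda_host);
    }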
