Skip to content

Commit 4d6420f

Browse files
committed
Merge branch 'master' of https://github.com/VJHack/llama.cpp
2 parents c12f076 + faf67b3 commit 4d6420f

File tree

14 files changed

+344
-59
lines changed

14 files changed

+344
-59
lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1312,7 +1312,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
13121312
[](gpt_params & params, int value) {
13131313
params.n_parallel = value;
13141314
}
1315-
));
1315+
).set_env("LLAMA_ARG_N_PARALLEL"));
13161316
add_opt(llama_arg(
13171317
{"-ns", "--sequences"}, "N",
13181318
format("number of sequences to decode (default: %d)", params.n_sequences),

convert_hf_to_gguf.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4080,6 +4080,36 @@ def prepare_tensors(self):
40804080
super().prepare_tensors()
40814081

40824082

4083+
@Model.register("GraniteForCausalLM")
4084+
class GraniteModel(LlamaModel):
4085+
"""Conversion for IBM's GraniteForCausalLM"""
4086+
model_arch = gguf.MODEL_ARCH.GRANITE
4087+
4088+
def set_gguf_parameters(self):
4089+
"""Granite uses standard llama parameters with the following differences:
4090+
4091+
- No head_dim support
4092+
- New multiplier params:
4093+
- attention_scale
4094+
- embedding_scale
4095+
- residual_scale
4096+
- logits_scaling
4097+
"""
4098+
if head_dim := self.hparams.pop("head_dim", None):
4099+
logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
4100+
super().set_gguf_parameters()
4101+
# NOTE: Convert _multiplier params to _scale params for naming
4102+
# consistency
4103+
if attention_scale := self.hparams.get("attention_multiplier"):
4104+
self.gguf_writer.add_attention_scale(attention_scale)
4105+
if embedding_scale := self.hparams.get("embedding_multiplier"):
4106+
self.gguf_writer.add_embedding_scale(embedding_scale)
4107+
if residual_scale := self.hparams.get("residual_multiplier"):
4108+
self.gguf_writer.add_residual_scale(residual_scale)
4109+
if logits_scaling := self.hparams.get("logits_scaling"):
4110+
self.gguf_writer.add_logit_scale(logits_scaling)
4111+
4112+
40834113
###### CONVERSION LOGIC ######
40844114

40854115
# tree of lazy tensors

docs/backend/SYCL.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
636636

637637
It's the same for other projects, including the llama.cpp SYCL backend.
638638

639+
- If you encounter the issue `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
640+
641+
There is not enough device memory.
642+
643+
|Reason|Solution|
644+
|-|-|
645+
|The default context size is too large, which leads to higher memory usage.|Set `-c 8192` or a smaller value.|
646+
|The model is large and requires more memory than the device has.|Choose a smaller quantized model, like Q5 -> Q4;<br>Use more than one device to load the model.|
639647

640648
### **GitHub contribution**:
641649
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

examples/llama-bench/llama-bench.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
439439
}
440440
types.push_back(gt);
441441
}
442+
if (invalid_param) {
443+
break;
444+
}
442445
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
443446
} else if (arg == "-ctv" || arg == "--cache-type-v") {
444447
if (++i >= argc) {
@@ -455,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
455458
}
456459
types.push_back(gt);
457460
}
461+
if (invalid_param) {
462+
break;
463+
}
458464
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
459465
} else if (arg == "-t" || arg == "--threads") {
460466
if (++i >= argc) {
@@ -520,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
520526
}
521527
modes.push_back(mode);
522528
}
529+
if (invalid_param) {
530+
break;
531+
}
523532
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
524533
} else if (arg == "-mg" || arg == "--main-gpu") {
525534
if (++i >= argc) {

examples/server/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ The project is under active development, and we are [looking for feedback and co
8787
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
8888
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
8989
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
90-
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
90+
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
9191
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
9292
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
9393
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |

examples/sycl/run-llama2.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,17 @@ source /opt/intel/oneapi/setvars.sh
1111
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
1212

1313
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
14-
MODEL_FILE=llama-2-7b.Q4_0.gguf
14+
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
1515
NGL=33
16+
CONTEXT=8192
1617

1718
if [ $# -gt 0 ]; then
1819
GGML_SYCL_DEVICE=$1
1920
echo "use $GGML_SYCL_DEVICE as main GPU"
2021
#use single GPU only
21-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
22+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
2223

2324
else
2425
#use multiple GPUs with same max compute units
25-
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
26+
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
2627
fi

ggml/src/ggml.c

Lines changed: 74 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2013,10 +2013,11 @@ struct ggml_threadpool {
20132013
// these are atomic as an annotation for thread-sanitizer
20142014
atomic_bool stop; // Used for stopping the threadpool altogether
20152015
atomic_bool pause; // Used for pausing the threadpool or individual threads
2016+
atomic_bool abort; // Used for aborting processing of a graph
20162017

20172018
struct ggml_compute_state * workers; // per thread state
20182019
int n_threads_max; // number of threads in the pool
2019-
int n_threads_cur; // number of threads used in the current graph
2020+
atomic_int n_threads_cur; // number of threads used in the current graph
20202021

20212022
int32_t prio; // Scheduling priority
20222023
uint32_t poll; // Polling level (0 - no polling)
@@ -3178,41 +3179,36 @@ inline static void ggml_critical_section_start(void) {
31783179
}
31793180
}
31803181

3181-
#ifdef GGML_USE_OPENMP
3182-
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3183-
if (threadpool->n_threads_cur == 1) {
3182+
static void ggml_barrier(struct ggml_threadpool * tp) {
3183+
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
3184+
if (n_threads == 1) {
31843185
return;
31853186
}
31863187

3188+
#ifdef GGML_USE_OPENMP
31873189
#pragma omp barrier
3188-
}
31893190
#else
3190-
static void ggml_barrier(struct ggml_threadpool * threadpool) {
3191-
if (threadpool->n_threads_cur == 1) {
3192-
return;
3193-
}
3194-
3195-
atomic_int * n_barrier = &threadpool->n_barrier;
3196-
atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
3191+
int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
31973192

3198-
int n_threads = threadpool->n_threads_cur;
3199-
int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
3193+
// enter barrier (full seq-cst fence)
3194+
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
32003195

3201-
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
3196+
int last = 0;
3197+
if (n_barrier == (n_threads - 1)) {
32023198
// last thread
3203-
atomic_store(n_barrier, 0);
3204-
atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
3199+
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
3200+
last = 1;
32053201
} else {
32063202
// wait for other threads
3207-
while (true) {
3208-
if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
3209-
return;
3210-
}
3203+
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
32113204
ggml_thread_cpu_relax();
32123205
}
32133206
}
3214-
}
3207+
3208+
// exit barrier (full seq-cst fence)
3209+
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
32153210
#endif
3211+
}
32163212

32173213
// TODO: make this somehow automatically executed
32183214
// some sort of "sentry" mechanism
@@ -19933,64 +19929,84 @@ struct ggml_cplan ggml_graph_plan(
1993319929

1993419930
static thread_ret_t ggml_graph_compute_thread(void * data) {
1993519931
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
19932+
struct ggml_threadpool * tp = state->threadpool;
1993619933

19937-
const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
19938-
const struct ggml_cplan * cplan = state->threadpool->cplan;
19934+
const struct ggml_cgraph * cgraph = tp->cgraph;
19935+
const struct ggml_cplan * cplan = tp->cplan;
1993919936

1994019937
set_numa_thread_affinity(state->ith);
1994119938

1994219939
struct ggml_compute_params params = {
1994319940
/*.ith =*/ state->ith,
19944-
/*.nth =*/ state->threadpool->n_threads_cur,
19941+
/*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
1994519942
/*.wsize =*/ cplan->work_size,
1994619943
/*.wdata =*/ cplan->work_data,
19947-
/*.threadpool=*/ state->threadpool,
19944+
/*.threadpool=*/ tp,
1994819945
};
1994919946

19950-
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
19947+
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
1995119948
struct ggml_tensor * node = cgraph->nodes[node_n];
1995219949

1995319950
ggml_compute_forward(&params, node);
1995419951

19955-
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
19956-
state->threadpool->ec = GGML_STATUS_ABORTED;
19952+
if (state->ith == 0 && cplan->abort_callback &&
19953+
cplan->abort_callback(cplan->abort_callback_data)) {
19954+
tp->abort = true;
19955+
tp->ec = GGML_STATUS_ABORTED;
1995719956
}
1995819957

1995919958
ggml_barrier(state->threadpool);
19960-
19961-
if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
19962-
break;
19963-
}
1996419959
}
1996519960

1996619961
return 0;
1996719962
}
1996819963

1996919964
#ifndef GGML_USE_OPENMP
1997019965

19971-
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
19966+
// check if thread is active
19967+
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
19968+
struct ggml_threadpool * threadpool = state->threadpool;
19969+
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
19970+
return (state->ith < n_threads);
19971+
}
19972+
19973+
// check if thread is ready to proceed (exit from polling or sleeping)
19974+
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
1997219975
struct ggml_threadpool * threadpool = state->threadpool;
1997319976

1997419977
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
1997519978

1997619979
// check for new graph/work
1997719980
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
1997819981
if (new_graph != state->last_graph) {
19979-
state->pending = (state->ith < threadpool->n_threads_cur);
19982+
state->pending = ggml_graph_compute_thread_active(state);
1998019983
state->last_graph = new_graph;
1998119984
}
1998219985

1998319986
return state->pending;
1998419987
}
1998519988

19989+
// sync thread state after polling
19990+
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
19991+
struct ggml_threadpool * threadpool = state->threadpool;
19992+
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
19993+
// so instead we just use a dummy read-modify-write
19994+
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
19995+
}
19996+
1998619997
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
1998719998
struct ggml_threadpool * threadpool = state->threadpool;
1998819999

20000+
// Skip polling for unused threads
20001+
if (!ggml_graph_compute_thread_active(state)) {
20002+
return state->pending;
20003+
}
20004+
1998920005
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
1999020006
// Perhaps, we can adjust it dynamically based on load and things.
1999120007
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
1999220008

19993-
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
20009+
for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
1999420010
// No new work. Keep polling.
1999520011
ggml_thread_cpu_relax();
1999620012
}
@@ -20002,13 +20018,14 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
2000220018
struct ggml_threadpool * threadpool = state->threadpool;
2000320019

2000420020
if (ggml_graph_compute_poll_for_work(state)) {
20021+
ggml_graph_compute_thread_sync(state);
2000520022
return state->pending;
2000620023
}
2000720024

2000820025
ggml_mutex_lock_shared(&threadpool->mutex);
20009-
while (!ggml_graph_compute_ready(state)) {
20026+
while (!ggml_graph_compute_thread_ready(state)) {
2001020027
// No new work. Wait for the signal.
20011-
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
20028+
GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
2001220029
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
2001320030
}
2001420031
ggml_mutex_unlock_shared(&threadpool->mutex);
@@ -20055,13 +20072,20 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
2005520072
}
2005620073

2005720074
// Start processing new graph
20058-
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
20075+
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
2005920076
{
20060-
// always take the mutex here because the worker threads are doing hybrid poll/wait
20077+
// Always take the mutex here because the worker threads are doing hybrid poll/wait
2006120078

2006220079
ggml_mutex_lock(&threadpool->mutex);
2006320080

20064-
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
20081+
GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
20082+
20083+
// Update the number of active threads
20084+
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
20085+
20086+
// Indicate the graph is ready to be processed
20087+
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
20088+
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
2006520089

2006620090
if (threadpool->pause) {
2006720091
// Update main thread prio and affinity to match the threadpool settings
@@ -20120,6 +20144,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
2012020144
threadpool->current_chunk = 0;
2012120145
threadpool->stop = false;
2012220146
threadpool->pause = tpp->paused;
20147+
threadpool->abort = false;
2012320148
threadpool->workers = NULL;
2012420149
threadpool->n_threads_max = tpp->n_threads;
2012520150
threadpool->n_threads_cur = tpp->n_threads;
@@ -20195,15 +20220,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2019520220
// No worker threads should be accessing the parameters below at this stage
2019620221
threadpool->cgraph = cgraph;
2019720222
threadpool->cplan = cplan;
20198-
threadpool->n_threads_cur = n_threads;
2019920223
threadpool->current_chunk = 0;
20224+
threadpool->abort = false;
2020020225
threadpool->ec = GGML_STATUS_SUCCESS;
2020120226
}
2020220227

20203-
if (n_threads > threadpool->n_threads_max) {
20204-
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
20205-
}
20206-
2020720228
#ifdef GGML_USE_OPENMP
2020820229
if (n_threads > 1) {
2020920230
#pragma omp parallel num_threads(n_threads)
@@ -20212,7 +20233,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2021220233
{
2021320234
// update the number of threads from the actual number of threads that we got from OpenMP
2021420235
n_threads = omp_get_num_threads();
20215-
threadpool->n_threads_cur = n_threads;
20236+
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
2021620237
}
2021720238

2021820239
ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
@@ -20221,8 +20242,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
2022120242
ggml_graph_compute_thread(&threadpool->workers[0]);
2022220243
}
2022320244
#else
20245+
if (n_threads > threadpool->n_threads_max) {
20246+
GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
20247+
n_threads = threadpool->n_threads_max;
20248+
}
20249+
2022420250
// Kick all threads to start the new graph
20225-
ggml_graph_compute_kickoff(threadpool);
20251+
ggml_graph_compute_kickoff(threadpool, n_threads);
2022620252

2022720253
// This is a work thread too
2022820254
ggml_graph_compute_thread(&threadpool->workers[0]);

0 commit comments

Comments
 (0)