
Commit a314634
[ggml/llama] Align structures after scoped enums for 64-bit platforms
- llama_model_params: 72 -> 64 bytes
- ggml_cgraph: 80 -> 72 bytes
- hash_node: 32 -> 24 bytes
- ggml_threadpool: 160 -> 152 bytes
- best_tokenization: 24 -> 16 bytes
1 parent: 37592cc
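The saving comes from padding: on typical 64-bit (LP64) ABIs, enums, ints, and floats are 4 bytes while pointers and size_t are 8, and each member must be placed at an offset that is a multiple of its alignment (equal to its size for these scalar types). A 4-byte member sandwiched between 8-byte members therefore drags in 4 bytes of padding. A minimal sketch of the effect (not part of the commit; it mirrors the best_tokenization members changed below):

// build: cc -std=c11 padding_demo.c && ./a.out
// Assumes a typical LP64 ABI: int/float are 4 bytes, size_t is 8 bytes.
#include <stdio.h>
#include <stddef.h>

struct before {          // old member order
    int    token_id;     // offset 0, followed by 4 bytes of padding
    size_t input_offset; // offset 8
    float  score_sum;    // offset 16, followed by 4 bytes of tail padding
};                       // sizeof == 24

struct after {           // 8-byte member first, 4-byte members packed behind it
    size_t input_offset; // offset 0
    int    token_id;     // offset 8
    float  score_sum;    // offset 12, no padding needed
};                       // sizeof == 16

int main(void) {
    printf("before: %zu bytes, after: %zu bytes\n",
           sizeof(struct before), sizeof(struct after));
    return 0;
}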

7 files changed: +11 / -10 lines

ggml/src/ggml-alloc.c
Lines changed: 1 addition & 1 deletion

@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };
 
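Under the same LP64 assumptions the hash_node numbers work out directly. Old order: 4 + 4 + 4 for the three ints, 4 bytes of padding so the 8-byte size_t is aligned, 8 for offset, 1 for the bool, and 7 bytes of tail padding to round the struct up to its 8-byte alignment: 32 bytes. With offset first: 8 + 4 + 4 + 4 + 1, plus 3 bytes of tail padding: 24 bytes, matching the commit message.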

ggml/src/ggml-cpu/ggml-cpu.c
Lines changed: 2 additions & 2 deletions

@@ -451,6 +451,8 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
+    enum ggml_status ec;
+
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
@@ -462,8 +464,6 @@ struct ggml_threadpool {
 
     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
-
-    enum ggml_status ec;
 };
 
 // Per-thread state

ggml/src/ggml-impl.h
Lines changed: 2 additions & 2 deletions

@@ -295,14 +295,14 @@ struct ggml_cgraph {
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
 
+    enum ggml_cgraph_eval_order order;
+
     struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
     struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs; // tensors with constant data
 
     struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };
 
 // returns a slice of cgraph with nodes [i0, i1)

ggml/src/ggml.c
Lines changed: 2 additions & 2 deletions

@@ -5921,12 +5921,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.size =*/ size,
         /*.n_nodes =*/ 0,
         /*.n_leafs =*/ 0,
+        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.nodes =*/ nodes_ptr,
         /*.grads =*/ grads_ptr,
         /*.grad_accs =*/ grad_accs_ptr,
         /*.leafs =*/ leafs_ptr,
         /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5947,12 +5947,12 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.size =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
+        /*.order =*/ cgraph0->order,
         /*.nodes =*/ cgraph0->nodes + i0,
         /*.grads =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs =*/ NULL,
         /*.leafs =*/ NULL,
         /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order =*/ cgraph0->order,
     };
 
     return cgraph;

include/llama.h
Lines changed: 2 additions & 1 deletion

@@ -294,7 +294,6 @@ extern "C" {
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
@@ -313,6 +312,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible

src/llama-model.cpp
Lines changed: 1 addition & 1 deletion

@@ -13076,12 +13076,12 @@ llama_model_params llama_model_default_params() {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
-        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
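Note that the initializer list in llama_model_default_params is positional; the /*.field =*/ comments are documentation only. Positional aggregate initialization binds values to members in declaration order, so when split_mode moves in the struct declaration, its initializer must move to the matching slot, otherwise every value after n_gpu_layers would land in the wrong field. A reduced sketch (hypothetical struct, not from the code):

#include <stddef.h>
#include <stdint.h>

struct params { size_t offset; int32_t n_layers; }; // hypothetical

struct params p = {
    /*.offset   =*/ 16, // values bind by position: first value, first member
    /*.n_layers =*/ 99, // the compiler never reads the comment labels
};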

src/llama-vocab.cpp
Lines changed: 1 addition & 1 deletion

@@ -996,8 +996,8 @@ struct llm_tokenizer_ugm_session {
 
     // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
     };
 
