
Commit a314634
[ggml/llama] Align structures after scoped enums for 64-bit platforms
- llama_model_params: 72 -> 64 bytes
- ggml_cgraph: 80 -> 72 bytes
- hash_node: 32 -> 24 bytes
- ggml_threadpool: 160 -> 152 bytes
- best_tokenization: 24 -> 16 bytes
1 parent: 37592cc
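The saving comes from padding: on typical 64-bit (LP64) ABIs, enums, ints, and floats are 4 bytes while pointers and size_t are 8, and each member must be placed at an offset that is a multiple of its alignment (equal to its size for these scalar types). A 4-byte member sandwiched between 8-byte members therefore drags in 4 bytes of padding. A minimal sketch of the effect (not part of the commit; it mirrors the best_tokenization members changed below):

// build: cc -std=c11 padding_demo.c && ./a.out
// Assumes a typical LP64 ABI: int/float are 4 bytes, size_t is 8 bytes.
#include <stdio.h>
#include <stddef.h>

struct before {          // old member order
    int    token_id;     // offset 0, followed by 4 bytes of padding
    size_t input_offset; // offset 8
    float  score_sum;    // offset 16, followed by 4 bytes of tail padding
};                       // sizeof == 24

struct after {           // 8-byte member first, 4-byte members packed behind it
    size_t input_offset; // offset 0
    int    token_id;     // offset 8
    float  score_sum;    // offset 12, no padding needed
};                       // sizeof == 16

int main(void) {
    printf("before: %zu bytes, after: %zu bytes\n",
           sizeof(struct before), sizeof(struct after));
    return 0;
}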

7 files changed: +11 / -10 lines

ggml/src/ggml-alloc.c
Lines changed: 1 addition & 1 deletion

@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };
 
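Under the same LP64 assumptions the hash_node numbers work out directly. Old order: 4 + 4 + 4 for the three ints, 4 bytes of padding so the 8-byte size_t is aligned, 8 for offset, 1 for the bool, and 7 bytes of tail padding to round the struct up to its 8-byte alignment: 32 bytes. With offset first: 8 + 4 + 4 + 4 + 1, plus 3 bytes of tail padding: 24 bytes, matching the commit message.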

ggml/src/ggml-cpu/ggml-cpu.c
Lines changed: 2 additions & 2 deletions

@@ -451,6 +451,8 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
+    enum ggml_status ec;
+
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
@@ -462,8 +464,6 @@ struct ggml_threadpool {
 
     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
-
-    enum ggml_status ec;
 };
 
 // Per-thread state

ggml/src/ggml-impl.h
Lines changed: 2 additions & 2 deletions

@@ -295,14 +295,14 @@ struct ggml_cgraph {
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
 
+    enum ggml_cgraph_eval_order order;
+
     struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
     struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs; // tensors with constant data
 
     struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };
 
 // returns a slice of cgraph with nodes [i0, i1)

ggml/src/ggml.c
Lines changed: 2 additions & 2 deletions

@@ -5921,12 +5921,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.size =*/ size,
         /*.n_nodes =*/ 0,
         /*.n_leafs =*/ 0,
+        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.nodes =*/ nodes_ptr,
         /*.grads =*/ grads_ptr,
         /*.grad_accs =*/ grad_accs_ptr,
         /*.leafs =*/ leafs_ptr,
         /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5947,12 +5947,12 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.size =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
+        /*.order =*/ cgraph0->order,
         /*.nodes =*/ cgraph0->nodes + i0,
         /*.grads =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs =*/ NULL,
         /*.leafs =*/ NULL,
         /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order =*/ cgraph0->order,
     };
 
     return cgraph;

include/llama.h
Lines changed: 2 additions & 1 deletion

@@ -294,7 +294,6 @@ extern "C" {
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
@@ -313,6 +312,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible

src/llama-model.cpp
Lines changed: 1 addition & 1 deletion

@@ -13076,12 +13076,12 @@ llama_model_params llama_model_default_params() {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
-        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
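Note that the initializer list in llama_model_default_params is positional; the /*.field =*/ comments are documentation only. Positional aggregate initialization binds values to members in declaration order, so when split_mode moves in the struct declaration, its initializer must move to the matching slot, otherwise every value after n_gpu_layers would land in the wrong field. A reduced sketch (hypothetical struct, not from the code):

#include <stddef.h>
#include <stdint.h>

struct params { size_t offset; int32_t n_layers; }; // hypothetical

struct params p = {
    /*.offset   =*/ 16, // values bind by position: first value, first member
    /*.n_layers =*/ 99, // the compiler never reads the comment labels
};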

src/llama-vocab.cpp
Lines changed: 1 addition & 1 deletion

@@ -996,8 +996,8 @@ struct llm_tokenizer_ugm_session {
 
     // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
     };
 
