Commit a1610bd

[ggml/llama] Align structures after scoped enums for 64-bit platforms
- llama_model_params 72 bytes -> 64 bytes
- ggml_cgraph 80 bytes -> 72 bytes
- hash_node 32 bytes -> 24 bytes
- ggml_threadpool 160 bytes -> 152 bytes
- best_tokenization 24 bytes -> 16 bytes
1 parent 55071fb commit a1610bd
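For context (my gloss, not part of the commit message): on common 64-bit ABIs an enum occupies 4 bytes while pointers and size_t occupy 8, so an enum member stranded among 8-byte members buys up to 8 bytes of alignment padding; moving it next to other 4-byte members lets the two share a single 8-byte slot. A self-contained sketch of the effect, with made-up names:

#include <cstdio>

// hypothetical miniature of the reordering; sizes assume a typical LP64
// ABI (x86-64 / aarch64), where enums are 4 bytes and pointers are 8
enum eval_order { ORDER_LTR, ORDER_RTL };

struct before_reorder {
    int        n_nodes; // 4 bytes + 4 bytes of padding to align the pointer
    void *     nodes;   // 8 bytes
    eval_order order;   // 4 bytes + 4 bytes of tail padding
};                      // sizeof == 24

struct after_reorder {
    int        n_nodes; // 4 bytes
    eval_order order;   // 4 bytes -- fills what used to be padding
    void *     nodes;   // 8 bytes
};                      // sizeof == 16

int main() {
    // prints "before=24 after=16" on the ABIs above
    std::printf("before=%zu after=%zu\n", sizeof(before_reorder), sizeof(after_reorder));
}

Each hunk below makes the same kind of move: the enum member migrates to a spot where it shares an 8-byte slot with other small members instead of forcing padding.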

File tree

10 files changed: +19 -18 lines changed


ggml/src/ggml-alloc.c

Lines changed: 1 addition & 1 deletion
@@ -339,10 +339,10 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // graph allocator
 
 struct hash_node {
+    size_t offset; // offset within the buffer
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
 };
 

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 2 additions & 2 deletions
@@ -451,6 +451,8 @@ struct ggml_threadpool {
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
+    enum ggml_status ec;
+
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;  // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
@@ -462,8 +464,6 @@ struct ggml_threadpool {
 
     int32_t prio;  // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
-
-    enum ggml_status ec;
 };
 
 // Per-thread state

ggml/src/ggml-impl.h

Lines changed: 2 additions & 2 deletions
@@ -295,14 +295,14 @@ struct ggml_cgraph {
     int n_nodes; // number of nodes currently in use
     int n_leafs; // number of leafs currently in use
 
+    enum ggml_cgraph_eval_order order;
+
     struct ggml_tensor ** nodes;     // tensors with data that can change if the graph is evaluated
     struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
     struct ggml_tensor ** grad_accs; // accumulators for node gradients
     struct ggml_tensor ** leafs;     // tensors with constant data
 
     struct ggml_hash_set visited_hash_set;
-
-    enum ggml_cgraph_eval_order order;
 };
 
 // returns a slice of cgraph with nodes [i0, i1)

ggml/src/ggml-quants.c

Lines changed: 1 addition & 1 deletion
@@ -4996,7 +4996,7 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
 }
 
 bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
-    if (type < 0 || type >= GGML_TYPE_COUNT) {
+    if (type >= GGML_TYPE_COUNT) {
         fprintf(stderr, "%s: invalid type %d\n", __func__, type);
         return false;
     }
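An inference on this hunk (and the matching ones in ggml.c and gguf.cpp below), since the diff doesn't say: dropping the "type < 0" half of the range check only makes sense if the enum's underlying type is now unsigned, which turns that comparison into a tautology that compilers warn about. A sketch under that assumption, with a stand-in enum:

#include <cstdint>

// hypothetical stand-in for ggml_type, assuming an unsigned underlying type
enum ggml_type_sketch : uint32_t { TYPE_F32, TYPE_F16, TYPE_COUNT };

bool validate_type(ggml_type_sketch type) {
    // "type < 0" would always be false here (clang flags it with
    // -Wtautological-unsigned-enum-zero-compare), so only the upper
    // bound is worth checking
    return type < TYPE_COUNT;
}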

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ struct rpc_tensor {
 static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
 
 // RPC commands
-enum rpc_cmd {
+enum rpc_cmd : uint8_t {
     RPC_CMD_ALLOC_BUFFER = 0,
     RPC_CMD_GET_ALIGNMENT,
     RPC_CMD_GET_MAX_SIZE,
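A note from me rather than the commit: beyond packing, giving rpc_cmd a fixed uint8_t underlying type pins the enum to exactly one byte on every platform, which is what you want for a value serialized into RPC messages. A minimal sketch (illustrative names only):

#include <cstdint>

// a plain enum's size is implementation-defined (commonly 4 bytes);
// a fixed underlying type makes it the same on every platform
enum cmd_wire : uint8_t {
    CMD_ALLOC = 0,
    CMD_FREE,
};

static_assert(sizeof(cmd_wire) == 1, "stable one-byte wire encoding");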

ggml/src/ggml.c

Lines changed: 3 additions & 3 deletions
@@ -1565,7 +1565,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_tensor * view_src,
         size_t view_offs) {
 
-    GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
     GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
@@ -5921,12 +5921,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.size         =*/ size,
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
+        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
         /*.nodes        =*/ nodes_ptr,
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
     ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -5947,12 +5947,12 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.size             =*/ 0,
         /*.n_nodes          =*/ i1 - i0,
         /*.n_leafs          =*/ 0,
+        /*.order            =*/ cgraph0->order,
         /*.nodes            =*/ cgraph0->nodes + i0,
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
         /*.visited_hash_set =*/ { 0, NULL, NULL },
-        /*.order            =*/ cgraph0->order,
     };
 
     return cgraph;

ggml/src/gguf.cpp

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     ok = ok && gr.read(info.t.type);
 
     // check that tensor type is within defined range
-    if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
+    if (info.t.type >= GGML_TYPE_COUNT) {
         fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
             __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
         ok = false;

include/llama.h

Lines changed: 2 additions & 1 deletion
@@ -294,7 +294,6 @@ extern "C" {
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
@@ -313,6 +312,8 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
@@ -13076,12 +13076,12 @@ llama_model_params llama_model_default_params() {
         /*.devices                     =*/ nullptr,
         /*.tensor_buft_overrides       =*/ nullptr,
         /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,

src/llama-vocab.cpp

Lines changed: 5 additions & 5 deletions
@@ -826,9 +826,9 @@ struct llm_tokenizer_ugm_session {
         }
 
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, vocab.token_unk(), -FLT_MAX});
         // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
+        tokenization_results[0] = { 0, vocab.token_unk(), 0 };
 
         for (size_t input_offset = 0; input_offset < input_len;) {
             size_t prefix_offset = input_offset;
@@ -858,7 +858,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { input_offset, token_id, (float) challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -872,7 +872,7 @@ struct llm_tokenizer_ugm_session {
             prefix_offset = input_offset + n_utf8_code_units;
             struct best_tokenization & current_champ = tokenization_results[prefix_offset];
             if (challenger_score > current_champ.score_sum) {
-                struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                struct best_tokenization challenger = { input_offset, vocab.token_unk(), (float) challenger_score };
                 current_champ = challenger;
             }
         }
@@ -996,8 +996,8 @@ struct llm_tokenizer_ugm_session {
 
     // this structure stores the best tokenization so far at input_offset
     struct best_tokenization {
-        llama_token token_id;
         size_t input_offset;
+        llama_token token_id;
         float score_sum;
     };
 
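One more gloss of mine (the diff doesn't explain the churn above): brace initialization of best_tokenization is positional, so reordering its members forces every initializer list in the callers to be rewritten in the same order, which is why the three call sites change along with the struct. A minimal sketch, with illustrative values:

#include <cstddef>
#include <cstdint>

using llama_token = int32_t;

// the reordered struct from this diff: the 8-byte size_t leads, and the two
// 4-byte members pair into one 8-byte slot (24 -> 16 bytes on LP64)
struct best_tokenization {
    size_t      input_offset;
    llama_token token_id;
    float       score_sum;
};

// aggregate initialization is positional, so the value order must track
// the new member order; the comments make the binding explicit
best_tokenization t = { /*input_offset=*/0, /*token_id=*/0, /*score_sum=*/0.0f };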
