@@ -4368,8 +4368,6 @@ struct llama_model_loader {
     int n_created = 0;
     // For tensor parallelism
     int world_size = 1;
-    int rank = 0;
-    bool enable_tp = false;
 
     int64_t n_elements = 0;
     size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
         this->use_mmap = use_mmap;
         this->check_tensors = check_tensors;
         world_size = ggml_backend_get_world_size();
-        rank = ggml_backend_get_rank();
     }
 
     ~llama_model_loader() {
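After these first two hunks, `llama_model_loader` no longer carries tensor-parallel state of its own: `rank` and `enable_tp` are dropped from the struct, and only `world_size` remains cached. A minimal sketch of the pattern this implies, where a call site queries the backend on demand; both getters appear in this patch, but the helper itself is hypothetical:

```cpp
#include "ggml-backend.h" // assumed to declare the TP getters in this fork

// Hypothetical helper, not from the patch: query the backend directly
// instead of reading a cached llama_model_loader::rank.
static bool llama_tp_is_master() {
    return ggml_backend_get_rank() == 0; // rank 0 keeps unsplit tensors
}
```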
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
         if (flags == TENSOR_SPLIT_BY_ROW) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-        }
-        if (flags == TENSOR_SPLIT_BY_COLUMN) {
+        } else if (flags == TENSOR_SPLIT_BY_COLUMN) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-        }
-        if (flags == TENSOR_KEEPED_ON_MASTER) {
+        } else if (flags == TENSOR_KEEPED_ON_MASTER) {
             tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
+        } else {
+            tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
         }
 
         if (flags == TENSOR_DUPLICATED) {
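The rewritten chain makes the three flag checks mutually exclusive and gives every other flag combination an explicit default, `TENSOR_NO_CHANGE`, instead of leaving `split_mode` unset. To make the modes concrete, here is a standalone mock of what each one could mean for a 2-D weight; which axis `BY_ROW`/`BY_COLUMN` shard is an assumption on my part, not something this hunk states:

```cpp
#include <cstdint>

// Mock of the patch's enum; member names mirror the diff.
enum class tensor_parallel_mode {
    TENSOR_NO_CHANGE, TENSOR_SPLIT_BY_ROW,
    TENSOR_SPLIT_BY_COLUMN, TENSOR_KEEPED_ON_MASTER,
};

// Assumed shard shapes for a 2-D weight ne = {ne0, ne1} on world_size ranks.
static void local_shape(tensor_parallel_mode m, int64_t ne[2], int world_size, int rank) {
    switch (m) {
        case tensor_parallel_mode::TENSOR_SPLIT_BY_ROW:    ne[1] /= world_size; break;
        case tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN: ne[0] /= world_size; break;
        case tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER:
            if (rank != 0) ne[0] = ne[1] = 0;              // data lives on rank 0 only
            break;
        case tensor_parallel_mode::TENSOR_NO_CHANGE:       break; // fully replicated
    }
}
```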
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }
-
+    bool enable_tp = false;
     if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+        int world_size = ggml_backend_get_world_size();
         if (world_size > 1) {
             enable_tp = true;
             // need to change the size before load tensor
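Making `enable_tp` a local of `llm_load_tensors` (and querying `world_size` inside the branch) mirrors the struct cleanup above: the loader stays TP-agnostic and the decision is made once, at the point of use. One consequence worth noting, condensed from the hunk rather than quoted:

```cpp
// Condensed control flow; not a verbatim copy of the patch.
bool enable_tp = false;
if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
    int world_size = ggml_backend_get_world_size();
    if (world_size > 1) {
        enable_tp = true; // per-rank tensor sizes must be adjusted before loading
    }
    // With world_size == 1, tensor mode quietly degrades to a plain
    // single-device load, since enable_tp stays false.
}
```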
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);
 
                 // optional bias tensors
-                auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                 layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
                 layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
                 layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);
 
                 // optional MLP bias
-                auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+                auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
                 layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
                 layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
                 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);
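One asymmetry here deserves a note: `ffn_gate_b` and `ffn_up_b` share the split-by-column bias mode, but `ffn_down_b` is `TENSOR_KEEPED_ON_MASTER`. The diff does not explain this; the standard Megatron-style reasoning is that the down projection's per-rank partial outputs get summed across ranks, so its output bias must be contributed exactly once. A toy program (not llama.cpp code) showing what goes wrong otherwise:

```cpp
#include <cstdio>

// Two ranks each hold a shard of a row-parallel projection; their partial
// outputs are summed (the all-reduce). If every rank also adds the bias,
// the result contains it world_size times.
int main() {
    const int   world_size = 2;
    const float partial[2] = {1.5f, 2.5f}; // per-rank partial dot products
    const float bias       = 0.25f;

    float wrong = 0.0f, right = 0.0f;
    for (int r = 0; r < world_size; ++r) {
        wrong += partial[r] + bias;                   // bias added on every rank
        right += partial[r] + (r == 0 ? bias : 0.0f); // bias added on master only
    }
    std::printf("wrong = %.2f, right = %.2f\n", wrong, right); // 4.50 vs 4.25
    return 0;
}
```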