Skip to content

Commit 70ac64e

Browse files
author
firecoperana
committed
disable pipeline parallelism when tensor override present
1 parent c33f39d commit 70ac64e

File tree

4 files changed

+15
-3
lines changed

4 files changed

+15
-3
lines changed

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2653,7 +2653,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
26532653
auto mparams = llama_model_params_from_gpt_params(params);
26542654

26552655
llama_model * model = nullptr;
2656-
2656+
26572657
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
26582658
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
26592659
} else if (!params.model_url.empty()) {

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,6 +1237,10 @@ std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, in
12371237
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
12381238
}
12391239

1240+
// Record whether the caller supplied any tensor buffer-type overrides.
// The list counts as present only when the pointer itself is non-null AND
// the first entry has a pattern (presumably the array is terminated by a
// null-pattern sentinel — an empty list is treated as "no overrides").
void llama_model::set_tensor_overrides(const llama_model_params& params) {
    const auto * overrides = params.tensor_buft_overrides;
    tensor_overrides = overrides && overrides[0].pattern;
}
1243+
12401244
std::string llama_model_ftype_name(llama_ftype ftype) {
12411245
if (ftype & LLAMA_FTYPE_GUESSED) {
12421246
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";

src/llama-model.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,10 +305,18 @@ struct llama_model {
305305
// keep track of loaded lora adapters
306306
std::set<llama_lora_adapter *> lora_adapters;
307307

308+
bool tensor_overrides;
309+
308310
~llama_model();
309311

310312
// Not actually needed, but left in place for now
311313
size_t max_nodes() const { return 65536; }
314+
315+
bool has_tensor_overrides() const {
316+
return tensor_overrides;
317+
};
318+
319+
void set_tensor_overrides(const llama_model_params& params);
312320
};
313321

314322
struct llama_lora_weight {

src/llama.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3968,7 +3968,7 @@ struct llama_model * llama_load_model_from_file(
39683968
return true;
39693969
};
39703970
}
3971-
3971+
model->set_tensor_overrides(params);
39723972
// model->devices hold device indices that are used to offload
39733973
// use model->devices to determine offload device
39743974
// if no device is specified, all device are included
@@ -4478,7 +4478,7 @@ struct llama_context * llama_new_context_with_model(
44784478
llama_get_device_count(*model) > 1 &&
44794479
model->n_gpu_layers > (int)model->hparams.n_layer &&
44804480
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
4481-
params.offload_kqv;
4481+
params.offload_kqv && !model->has_tensor_overrides();
44824482
#ifndef GGML_USE_CUDA
44834483
// pipeline parallelism requires support for async compute and events
44844484
// currently this is only implemented in the CUDA backend

0 commit comments

Comments
 (0)