Commit 3304b44

more strict condition
1 parent b353038 commit 3304b44

3 files changed: +30 additions, -28 deletions

tools/server/server.cpp

Lines changed: 15 additions & 15 deletions
@@ -1988,12 +1988,12 @@ struct server_context {
 
         if (params_base.ctx_shift) {
             params_base.ctx_shift = false;
-            SRV_INF("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
+            SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
         }
 
         if (params_base.n_cache_reuse) {
             params_base.n_cache_reuse = 0;
-            SRV_INF("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
+            SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
         }
 
         if (!params_base.speculative.model.path.empty()) {

@@ -2417,6 +2417,15 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // if multimodal is enabled, send an error and return false
+    bool ensure_no_mtmd(const int id_task) {
+        if (mctx) {
+            send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
+            return false;
+        }
+        return true;
+    }
+
     void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 

@@ -2766,12 +2775,9 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_SAVE:
                 {
+                    if (!ensure_no_mtmd(task.id)) break;
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
-                    if (mctx) {
-                        send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
-                        break;
-                    }
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
                         break;

@@ -2807,10 +2813,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
-                    if (mctx) {
-                        send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
-                        break;
-                    }
+                    if (!ensure_no_mtmd(task.id)) break;
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {

@@ -2857,10 +2860,7 @@ struct server_context {
                 } break;
            case SERVER_TASK_TYPE_SLOT_ERASE:
                 {
-                    if (mctx) {
-                        send_error(task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
-                        break;
-                    }
+                    if (!ensure_no_mtmd(task.id)) break;
                     int id_slot = task.slot_action.slot_id;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {

@@ -3417,7 +3417,7 @@ struct server_context {
                 }
 
                 if (mctx) {
-                    // we should never reach this
+                    // we should never reach this, as speculative is automatically disabled if mmproj is loaded
                     GGML_ABORT("not supported by multimodal");
                 }
 
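Taken together, the server.cpp changes replace three copy-pasted `if (mctx) { send_error(...); break; }` blocks with a single `ensure_no_mtmd` guard, so each task handler can bail out with a one-liner while the error reporting stays in one place. A minimal, self-contained sketch of that guard-helper pattern follows; the `demo_context` struct, its `send_error` signature, and `handle_slot_save` are illustrative stand-ins for this example, not the actual llama.cpp server code.

// Minimal sketch of the guard-helper pattern introduced above.
// All types and names here are illustrative stand-ins.
#include <cstdio>

struct demo_context {
    void * mctx = nullptr; // non-null when a multimodal projector is loaded

    void send_error(int id_task, const char * msg) {
        std::fprintf(stderr, "task %d: %s\n", id_task, msg);
    }

    // if multimodal is enabled, send an error and return false
    bool ensure_no_mtmd(const int id_task) {
        if (mctx) {
            send_error(id_task, "This feature is not supported by multimodal");
            return false;
        }
        return true;
    }

    void handle_slot_save(int id_task) {
        if (!ensure_no_mtmd(id_task)) return; // one guard replaces the repeated checks
        std::printf("task %d: saving slot\n", id_task);
    }
};

int main() {
    demo_context ctx;
    ctx.handle_slot_save(1); // text-only: proceeds
    ctx.mctx = &ctx;         // pretend a multimodal context is loaded
    ctx.handle_slot_save(2); // rejected with an error
    return 0;
}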

tools/server/tests/utils.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 import wget
 
 
-DEFAULT_HTTP_TIMEOUT = 120
+DEFAULT_HTTP_TIMEOUT = 12
 
 if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ:
     DEFAULT_HTTP_TIMEOUT = 30

tools/server/utils.hpp

Lines changed: 14 additions & 12 deletions
@@ -1078,7 +1078,6 @@ struct server_tokens {
             auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
             const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
             llama_pos start_pos = tokens.size();
-            printf("start_pos = %d, n_pos = %d\n", start_pos, n_pos);
             for (int i = 0; i < n_pos; ++i) {
                 tokens.emplace_back(LLAMA_TOKEN_NULL);
             }

@@ -1095,10 +1094,24 @@ struct server_tokens {
         }
     }
 
+    // for compatibility with context shift and prompt truncation
     void insert(llama_tokens & inp_tokens) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
         tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
     }
 
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        return tokens;
+    }
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens[pos] = id;
+    }
+
     size_t size() const {
         return tokens.size();
     }

@@ -1129,17 +1142,6 @@ struct server_tokens {
         tokens.resize(n);
     }
 
-    // for compatibility with speculative decoding, ctx shift, slot save/load
-    const llama_tokens & get_text_tokens() const {
-        return tokens;
-    }
-
-    // for compatibility with speculative decoding
-    void set_token(llama_pos pos, llama_token id) {
-        // TODO: may need validation
-        tokens[pos] = id;
-    }
-
     std::string detokenize(const llama_context * ctx, bool special) const {
         llama_tokens text_tokens;
         text_tokens.reserve(tokens.size());
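The relocated `get_text_tokens` and `set_token` accessors are now gated behind `GGML_ASSERT(!has_mtmd)`: once multimodal chunks are mixed in, `tokens` holds `LLAMA_TOKEN_NULL` placeholders at image positions, so treating it as a flat text-token array would be incorrect. Below is a minimal sketch of this assert-guarded accessor pattern, using simplified stand-in types (`tokens_demo`, plain `int` tokens) rather than the real `server_tokens` struct.

// Minimal sketch of the assert-guarded accessors added above.
// Types are simplified stand-ins, not the llama.cpp implementation.
#include <cassert>
#include <cstddef>
#include <vector>

using token = int;

struct tokens_demo {
    // true when multimodal (mtmd) chunks may be present; in that case some
    // positions in `tokens` are null placeholders for image embeddings
    bool has_mtmd = false;
    std::vector<token> tokens;

    // text-only view of the sequence; meaningless in multimodal mode,
    // so it is gated behind an assertion
    const std::vector<token> & get_text_tokens() const {
        assert(!has_mtmd); // only allow this if mtmd is disabled
        return tokens;
    }

    // overwrite a token in place; likewise only valid for pure text
    void set_token(std::size_t pos, token id) {
        assert(!has_mtmd); // only allow this if mtmd is disabled
        tokens[pos] = id;
    }
};

int main() {
    tokens_demo t;
    t.tokens = {1, 2, 3};
    t.set_token(1, 42);         // fine: text-only mode
    (void) t.get_text_tokens(); // fine: text-only mode
    // t.has_mtmd = true;       // if set, the calls above would assert
    return 0;
}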
