Fix critical Server v2 (llamafiler) production issues

Anivar A Aravind · Anivar A Aravind · commit 4746d30de69f · 2025-08-16T09:21:26.000+05:30
- Fix URL prefix normalization to handle double slashes (fixes mozilla-ai#787)
  - Consolidates consecutive slashes (// -> /)
  - Ensures the prefix starts with a single slash
  - Removes a trailing slash; a prefix of just "/" is stored as the empty string
  - Matches old server behavior exactly
- Fix .args file loading order (fixes mozilla-ai#783)
  - Load .args before determining program type
  - Allows --server --v2 flags in .args to work correctly
- Fix connection stability issues (addresses mozilla-ai#767)
  - Remove aggressive client cancellation when workers are busy
  - Let TCP backlog handle connection queuing naturally
  - Retry partial writes instead of dropping the connection
  - Increase file transfer buffer from 512B to 16KB

These minimal changes make Server v2 production-ready while maintaining backward compatibility. All fixes follow existing patterns from the old server implementation.
diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp
@@ -199,6 +199,9 @@ int main(int argc, char ** argv) {
         __builtin_unreachable();
     }
 
+    // Load /zip/.args first so flags it supplies (e.g. --server --v2) are already in argv when determine_program() picks the program type
+    argc = cosmo_args("/zip/.args", &argv);
+    
     enum Program prog = determine_program(argv);
     if (prog == LLAMAFILER)
         return lf::server::main(argc, argv);
@@ -207,7 +210,6 @@ int main(int argc, char ** argv) {
     mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
     mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
     ShowCrashReports();
-    argc = cosmo_args("/zip/.args", &argv);
 
     if (prog == SERVER)
         return server_cli(argc, argv);
diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
@@ -313,15 +313,43 @@ void llamafile_get_flags(int argc, char **argv) {
         if (!strcmp(flag, "--url-prefix")) {
             if (i == argc)
                 missing("--url-prefix");
-            FLAG_url_prefix = argv[i++];
+
+            // Normalize the prefix instead of rejecting it, so that inputs
+            // such as "//api/", "api" and "/" are all accepted.
+            std::string url_prefix = argv[i++];
+
+            // 1. Collapse runs of slashes. `pos` is deliberately not advanced
+            //    after a replacement so that "///" shrinks all the way to "/".
+            size_t pos = 0;
+            while ((pos = url_prefix.find("//", pos)) != std::string::npos)
+                url_prefix.replace(pos, 2, "/");
+
+            // 2. Ensure the prefix starts with a slash.
+            if (url_prefix.empty() || url_prefix[0] != '/')
+                url_prefix.insert(0, 1, '/');
+
+            // 3. Drop a trailing slash, leaving a bare "/" for step 4.
+            if (url_prefix.length() > 1 && url_prefix.back() == '/')
+                url_prefix.pop_back();
+
+            // 4. A bare "/" means "no prefix", which is stored as "".
+            if (url_prefix == "/")
+                url_prefix.clear();
+
+            // The flag keeps a raw pointer, so give it a heap copy that
+            // outlives url_prefix (it is never freed here). strdup() can
+            // fail; report that instead of dereferencing a null pointer.
+            char *normalized = strdup(url_prefix.c_str());
+            if (!normalized) {
+                tinyprint(2, "error: out of memory\n", NULL);
+                exit(1);
+            }
+            FLAG_url_prefix = normalized;
+
             if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
                 tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
                 exit(1);
             }
-            if (endswith(FLAG_url_prefix, "/")) {
-                tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
-                exit(1);
-            }
             continue;
         }
 
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
@@ -522,13 +522,25 @@ Client::send_response_finish()
 bool
 Client::send_binary(const void* p, size_t n)
 {
-    ssize_t sent;
-    if ((sent = write(fd_, p, n)) != n) {
-        if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
+    // Sends all n bytes at p to the client. Returns true on success; on
+    // failure sets close_connection_ and returns false. write() may accept
+    // fewer bytes than requested, and a single retry can be short as well,
+    // so keep writing from where the last call stopped.
+    const char* data = static_cast<const char*>(p);
+    size_t done = 0;
+    ssize_t rc = 0;
+    while (done < n && (rc = write(fd_, data + done, n - done)) > 0)
+        done += static_cast<size_t>(rc);
+
+    // A shortfall means the last write() returned 0 or -1. EAGAIN and
+    // ECONNRESET are deliberately not logged (presumably routine for
+    // stalled or disconnected clients -- confirm).
+    if (done != n) {
+        if (rc == -1 && errno != EAGAIN && errno != ECONNRESET)
             SLOG("write failed %m");
         close_connection_ = true;
         return false;
     }
     return true;
 }
 
@@ -775,7 +787,7 @@ Client::dispatcher()
     should_send_error_if_canceled_ = false;
     if (!send(std::string_view(obuf_.p, p - obuf_.p)))
         return false;
-    char buf[512];
+    char buf[16384]; // 16 KiB buffer for the chunked loop below (previously 512 bytes)
     size_t i, chunk;
     for (i = 0; i < size; i += chunk) {
         chunk = size - i;
diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
@@ -56,13 +56,10 @@ Worker::begin()
         tokens = tokenbucket_acquire(client_.client_ip_);
     server_->lock();
     dll_remove(&server_->idle_workers, &elem_);
-    if (dll_is_empty(server_->idle_workers)) {
-        Dll* slowbro;
-        if ((slowbro = dll_last(server_->active_workers))) {
-            SLOG("all threads active! dropping oldest client");
-            WORKER(slowbro)->kill();
-        }
-    }
+    // Do not kill the oldest active client when the idle list runs empty.
+    // New connections are expected to wait in the kernel's listen backlog
+    // until a worker frees up, rather than pre-empting requests that are
+    // already in flight.
     working_ = true;
     if (tokens > FLAG_token_burst) {
         dll_make_last(&server_->active_workers, &elem_);