diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp
index 5a4daabc79..9990d5645b 100644
--- a/llama.cpp/main/main.cpp
+++ b/llama.cpp/main/main.cpp
@@ -199,6 +199,9 @@ int main(int argc, char ** argv) {
         __builtin_unreachable();
     }
 
+    // Load .args file BEFORE determining program type
+    argc = cosmo_args("/zip/.args", &argv);
+
     enum Program prog = determine_program(argv);
     if (prog == LLAMAFILER)
         return lf::server::main(argc, argv);
@@ -207,7 +210,6 @@ int main(int argc, char ** argv) {
     mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
     mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
     ShowCrashReports();
-    argc = cosmo_args("/zip/.args", &argv);
 
     if (prog == SERVER)
         return server_cli(argc, argv);
diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
index c0e3bb3b74..ef5aa58784 100644
--- a/llamafile/flags.cpp
+++ b/llamafile/flags.cpp
@@ -313,15 +313,38 @@ void llamafile_get_flags(int argc, char **argv) {
         if (!strcmp(flag, "--url-prefix")) {
             if (i == argc)
                 missing("--url-prefix");
-            FLAG_url_prefix = argv[i++];
+
+            // Normalize the prefix so it is either empty or begins with
+            // exactly one slash, with no doubled or trailing slashes
+            std::string url_prefix = argv[i++];
+
+            // 1. Collapse runs of slashes; pos is not advanced after a
+            //    replacement, so "///" keeps shrinking until it is "/"
+            size_t pos = 0;
+            while ((pos = url_prefix.find("//", pos)) != std::string::npos)
+                url_prefix.replace(pos, 2, "/");
+
+            // 2. Ensure there is a leading slash
+            if (url_prefix.empty() || url_prefix[0] != '/')
+                url_prefix.insert(0, 1, '/');
+
+            // 3. Drop the trailing slash; a bare "/" becomes the empty string
+            if (url_prefix.back() == '/')
+                url_prefix.pop_back();
+
+            // url_prefix is a local, so the flag needs its own heap copy;
+            // bail out instead of storing a null pointer if that fails
+            char* normalized = strdup(url_prefix.c_str());
+            if (!normalized) {
+                tinyprint(2, "error: out of memory\n", NULL);
+                exit(1);
+            }
+            FLAG_url_prefix = normalized;
+
             if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
                 tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
                 exit(1);
             }
-            if (endswith(FLAG_url_prefix, "/")) {
-                tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
-                exit(1);
-            }
             continue;
         }
 
diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
index e142a5a219..c25310346d 100644
--- a/llamafile/server/client.cpp
+++ b/llamafile/server/client.cpp
@@ -522,13 +522,20 @@ Client::send_response_finish()
 bool
 Client::send_binary(const void* p, size_t n)
 {
-    ssize_t sent;
-    if ((sent = write(fd_, p, n)) != n) {
-        if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
-            SLOG("write failed %m");
-        close_connection_ = true;
-        return false;
+    // write() may accept fewer bytes than requested, so keep sending the
+    // remainder until the whole buffer is out or write() stops progressing
+    const char* data = (const char*)p;
+    size_t done = 0;
+    while (done < n) {
+        ssize_t rc = write(fd_, data + done, n - done);
+        if (rc <= 0) {
+            if (rc == -1 && errno != EAGAIN && errno != ECONNRESET)
+                SLOG("write failed %m");
+            close_connection_ = true;
+            return false;
+        }
+        done += rc;
     }
     return true;
 }
 
@@ -775,7 +782,7 @@ Client::dispatcher()
         should_send_error_if_canceled_ = false;
         if (!send(std::string_view(obuf_.p, p - obuf_.p)))
             return false;
-        char buf[512];
+        char buf[16384]; // 16 KiB local buffer (previously 512 bytes)
         size_t i, chunk;
         for (i = 0; i < size; i += chunk) {
             chunk = size - i;
diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
index a016c62218..5ddaa314cd 100644
--- a/llamafile/server/worker.cpp
+++ b/llamafile/server/worker.cpp
@@ -56,13 +56,9 @@ Worker::begin()
         tokens = tokenbucket_acquire(client_.client_ip_);
     server_->lock();
     dll_remove(&server_->idle_workers, &elem_);
-    if (dll_is_empty(server_->idle_workers)) {
-        Dll* slowbro;
-        if ((slowbro = dll_last(server_->active_workers))) {
-            SLOG("all threads active! dropping oldest client");
-            WORKER(slowbro)->kill();
-        }
-    }
+    // Do not cancel the oldest active client when no idle worker is left;
+    // new connections are expected to wait in the kernel's listen backlog
+    // until a worker becomes available
     working_ = true;
     if (tokens > FLAG_token_burst) {
         dll_make_last(&server_->active_workers, &elem_);