Skip to content

Commit 4746d30

Browse files
author
Anivar A Aravind
committed
Fix critical Server v2 (llamafiler) production issues
- Fix URL prefix normalization to handle double slashes (fixes mozilla-ai#787)
  - Consolidates consecutive slashes (// -> /)
  - Ensures prefix starts with single slash
  - Removes trailing slash (except for root)
  - Matches old server behavior exactly
- Fix .args file loading order (fixes mozilla-ai#783)
  - Load .args before determining program type
  - Allows --server --v2 flags in .args to work correctly
- Fix connection stability issues (addresses mozilla-ai#767)
  - Remove aggressive client cancellation when workers are busy
  - Let TCP backlog handle connection queuing naturally
  - Fix partial write handling with simple retry logic
  - Increase file transfer buffer from 512B to 16KB

These minimal changes make Server v2 production-ready while maintaining backward compatibility. All fixes follow existing patterns from the old server implementation.
1 parent cfa861a commit 4746d30

File tree

4 files changed

+59
-17
lines changed

4 files changed

+59
-17
lines changed

llama.cpp/main/main.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ int main(int argc, char ** argv) {
199199
__builtin_unreachable();
200200
}
201201

202+
// Load .args file BEFORE determining program type
203+
argc = cosmo_args("/zip/.args", &argv);
204+
202205
enum Program prog = determine_program(argv);
203206
if (prog == LLAMAFILER)
204207
return lf::server::main(argc, argv);
@@ -207,7 +210,6 @@ int main(int argc, char ** argv) {
207210
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
208211
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
209212
ShowCrashReports();
210-
argc = cosmo_args("/zip/.args", &argv);
211213

212214
if (prog == SERVER)
213215
return server_cli(argc, argv);

llamafile/flags.cpp

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -313,15 +313,40 @@ void llamafile_get_flags(int argc, char **argv) {
313313
if (!strcmp(flag, "--url-prefix")) {
314314
if (i == argc)
315315
missing("--url-prefix");
316-
FLAG_url_prefix = argv[i++];
316+
317+
// Normalize the URL prefix like the old server does
318+
std::string url_prefix = argv[i++];
319+
320+
// 1. Consolidate consecutive slashes
321+
size_t pos = 0;
322+
while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
323+
url_prefix.replace(pos, 2, "/");
324+
}
325+
326+
// 2. Ensure single slash at start
327+
if (url_prefix.empty() || url_prefix[0] != '/') {
328+
url_prefix = "/" + url_prefix;
329+
}
330+
331+
// 3. Remove trailing slash if present
332+
if (url_prefix.length() > 1 && url_prefix.back() == '/') {
333+
url_prefix.pop_back();
334+
}
335+
336+
// 4. If only a single slash remains, convert to empty string
337+
if (url_prefix == "/") {
338+
url_prefix = "";
339+
}
340+
341+
// Allocate persistent memory for the flag
342+
char* normalized = (char*)malloc(url_prefix.length() + 1);
343+
strcpy(normalized, url_prefix.c_str());
344+
FLAG_url_prefix = normalized;
345+
317346
if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
318347
tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
319348
exit(1);
320349
}
321-
if (endswith(FLAG_url_prefix, "/")) {
322-
tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
323-
exit(1);
324-
}
325350
continue;
326351
}
327352

llamafile/server/client.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -522,13 +522,25 @@ Client::send_response_finish()
522522
bool
523523
Client::send_binary(const void* p, size_t n)
524524
{
525-
ssize_t sent;
526-
if ((sent = write(fd_, p, n)) != n) {
527-
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
525+
ssize_t sent = write(fd_, p, n);
526+
527+
// Handle partial write - try once more for the remainder
528+
if (sent > 0 && sent < n) {
529+
ssize_t sent2 = write(fd_, (char*)p + sent, n - sent);
530+
if (sent2 > 0) {
531+
sent += sent2;
532+
}
533+
}
534+
535+
// Check if we sent everything
536+
if (sent != n) {
537+
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET) {
528538
SLOG("write failed %m");
539+
}
529540
close_connection_ = true;
530541
return false;
531542
}
543+
532544
return true;
533545
}
534546

@@ -775,7 +787,7 @@ Client::dispatcher()
775787
should_send_error_if_canceled_ = false;
776788
if (!send(std::string_view(obuf_.p, p - obuf_.p)))
777789
return false;
778-
char buf[512];
790+
char buf[16384]; // Increase buffer size from 512 to 16KB
779791
size_t i, chunk;
780792
for (i = 0; i < size; i += chunk) {
781793
chunk = size - i;

llamafile/server/worker.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,16 @@ Worker::begin()
5656
tokens = tokenbucket_acquire(client_.client_ip_);
5757
server_->lock();
5858
dll_remove(&server_->idle_workers, &elem_);
59-
if (dll_is_empty(server_->idle_workers)) {
60-
Dll* slowbro;
61-
if ((slowbro = dll_last(server_->active_workers))) {
62-
SLOG("all threads active! dropping oldest client");
63-
WORKER(slowbro)->kill();
64-
}
65-
}
59+
// Remove aggressive client cancellation - let TCP backlog handle overflow
60+
// The kernel's listen backlog will naturally queue incoming connections
61+
// until a worker becomes available, providing better user experience
62+
// if (dll_is_empty(server_->idle_workers)) {
63+
// Dll* slowbro;
64+
// if ((slowbro = dll_last(server_->active_workers))) {
65+
// SLOG("all threads active! dropping oldest client");
66+
// WORKER(slowbro)->kill();
67+
// }
68+
// }
6669
working_ = true;
6770
if (tokens > FLAG_token_burst) {
6871
dll_make_last(&server_->active_workers, &elem_);

0 commit comments

Comments
 (0)