From 1a0960ce2e9d4d08b9d5356b75247f64e2c8c4c1 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Tue, 3 Jun 2025 12:09:45 +1000 Subject: [PATCH 1/3] Remove aggressive "cancel oldest client" logic in Worker::begin() Previously, when all worker threads were busy, the code would forcibly cancel the oldest active connection to make room for a new one. This approach: * Prematurely terminates in-flight requests, leading to broken or truncated responses. * Forces cleanup of file descriptors mid-stream, causing spurious "Illegal seek"/EBADF errors. * Circumvents the TCP backlog queuing provided by the OS, instead dropping live clients and degrading user experience. By deleting this block, we let the kernel's listen backlog naturally queue incoming connections until a worker becomes available. As a result: * Active requests are no longer killed arbitrarily under load. * File descriptors aren't closed unexpectedly, eliminating related "static asset pread" failures. * The server benefits from standard TCP connection handling without manual interference. This change improves reliability under high concurrency and avoids unintended side effects from thread cancellation. --- llamafile/server/worker.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp index a016c62218..84ce56e2ed 100644 --- a/llamafile/server/worker.cpp +++ b/llamafile/server/worker.cpp @@ -56,13 +56,6 @@ Worker::begin() tokens = tokenbucket_acquire(client_.client_ip_); server_->lock(); dll_remove(&server_->idle_workers, &elem_); - if (dll_is_empty(server_->idle_workers)) { - Dll* slowbro; - if ((slowbro = dll_last(server_->active_workers))) { - SLOG("all threads active! dropping oldest client"); - WORKER(slowbro)->kill(); - } - } working_ = true; if (tokens > FLAG_token_burst) { dll_make_last(&server_->active_workers, &elem_); From fe194e69dc4a0b755864ce950dce7bc7c87f5c20 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Tue, 3 Jun 2025 12:19:54 +1000 Subject: [PATCH 2/3] Fix partial write handling in send_binary() The current send_binary() implementation treats any write() that returns less than the requested byte count as a failure, immediately setting close_connection_ = true and returning false. This is incorrect behavior. Per POSIX, write() may legitimately return fewer bytes than requested when: - The socket send buffer is nearly full - Network congestion causes backpressure - Large write sizes exceed kernel buffers - System is under memory pressure These partial writes are normal, not error conditions. The current code incorrectly drops active connections during normal operation, especially under load when partial writes become more common. This commit replaces the single write() call with a proper write loop that: - Accumulates partial writes until all data is sent - Retries on EAGAIN/EWOULDBLOCK without closing the connection - Only treats actual errors (not partial writes) as failures - Maintains the existing error logging behavior This fix prevents spurious connection drops during large responses or when the network is congested, significantly improving reliability under production load. Fixes: Connection drops during static file serving Fixes: Increased failure rate under high load --- llamafile/server/client.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp index e142a5a219..ddf8dc70c2 100644 --- a/llamafile/server/client.cpp +++ b/llamafile/server/client.cpp @@ -520,14 +520,23 @@ Client::send_response_finish() // // unlike send() this won't fail if binary content is detected. bool -Client::send_binary(const void* p, size_t n) -{ - ssize_t sent; - if ((sent = write(fd_, p, n)) != n) { - if (sent == -1 && errno != EAGAIN && errno != ECONNRESET) - SLOG("write failed %m"); - close_connection_ = true; - return false; +Client::send_binary(const void* p, size_t n) { + const char* buf = (const char*)p; + size_t written = 0; + while (written < n) { + ssize_t sent = write(fd_, buf + written, n - written); + if (sent == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + // no data can be written right now; retry + continue; + } + if (errno != ECONNRESET) + SLOG("write failed %m"); + close_connection_ = true; + return false; + } + // sent ≥ 0 + written += sent; } return true; } From 8b325d96c70cf808394fa021ccc6afbc2987af83 Mon Sep 17 00:00:00 2001 From: Vlad Lasky Date: Tue, 3 Jun 2025 12:22:51 +1000 Subject: [PATCH 3/3] Increase file transfer buffer from 512 to 16384 bytes The 512-byte buffer size results in excessive system calls when serving files. For a 1MB file, this requires 2048 read/write operations. Using 16KB reduces system call overhead by 32x and better matches typical OS page sizes and network buffer defaults. This should improve throughput for static file serving while maintaining reasonable stack usage. --- llamafile/server/client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp index ddf8dc70c2..c0509ac0ff 100644 --- a/llamafile/server/client.cpp +++ b/llamafile/server/client.cpp @@ -784,7 +784,7 @@ Client::dispatcher() should_send_error_if_canceled_ = false; if (!send(std::string_view(obuf_.p, p - obuf_.p))) return false; - char buf[512]; + char buf[16384]; size_t i, chunk; for (i = 0; i < size; i += chunk) { chunk = size - i;