Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion llama.cpp/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,9 @@ int main(int argc, char ** argv) {
__builtin_unreachable();
}

// Load .args file BEFORE determining program type
argc = cosmo_args("/zip/.args", &argv);

enum Program prog = determine_program(argv);
if (prog == LLAMAFILER)
return lf::server::main(argc, argv);
Expand All @@ -207,7 +210,6 @@ int main(int argc, char ** argv) {
mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
ShowCrashReports();
argc = cosmo_args("/zip/.args", &argv);

if (prog == SERVER)
return server_cli(argc, argv);
Expand Down
35 changes: 30 additions & 5 deletions llamafile/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,15 +313,40 @@ void llamafile_get_flags(int argc, char **argv) {
if (!strcmp(flag, "--url-prefix")) {
if (i == argc)
missing("--url-prefix");
FLAG_url_prefix = argv[i++];

// Normalize the URL prefix like the old server does
std::string url_prefix = argv[i++];

// 1. Consolidate consecutive slashes
size_t pos = 0;
while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
url_prefix.replace(pos, 2, "/");
}

// 2. Ensure single slash at start
if (url_prefix.empty() || url_prefix[0] != '/') {
url_prefix = "/" + url_prefix;
}

// 3. Remove trailing slash if present
if (url_prefix.length() > 1 && url_prefix.back() == '/') {
url_prefix.pop_back();
}

// 4. If only a single slash remains, convert to empty string
if (url_prefix == "/") {
url_prefix = "";
}

// Allocate persistent memory for the flag
char* normalized = (char*)malloc(url_prefix.length() + 1);
strcpy(normalized, url_prefix.c_str());
FLAG_url_prefix = normalized;

if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
exit(1);
}
if (endswith(FLAG_url_prefix, "/")) {
tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
exit(1);
}
continue;
}

Expand Down
20 changes: 16 additions & 4 deletions llamafile/server/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,13 +522,25 @@ Client::send_response_finish()
// Writes the n bytes at p to fd_ and reports whether all of them went out.
//
// Returns true only when the full n bytes were written. On any shortfall
// it sets close_connection_ and returns false.
//
// NOTE(review): this span looks like an unmarked diff rendering. The three
// statements right after the opening brace appear to be the pre-change
// version: `sent` ends up declared twice and the first `if` brace is never
// closed. Confirm against the real client.cpp before relying on this text.
bool
Client::send_binary(const void* p, size_t n)
{
ssize_t sent;
if ((sent = write(fd_, p, n)) != n) {
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
ssize_t sent = write(fd_, p, n);

// A short write gets exactly one retry for the unsent tail. If the retry
// is also short, or fails, `sent` stays below n and the check below treats
// the whole call as failed.
if (sent > 0 && sent < n) {
ssize_t sent2 = write(fd_, (char*)p + sent, n - sent);
if (sent2 > 0) {
sent += sent2;
}
}

// Anything other than a complete write closes the connection.
if (sent != n) {
// Only a first write that returned -1 is logged, and EAGAIN and ECONNRESET
// are not logged. A failed retry leaves `sent` positive, so it is not
// logged here.
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET) {
SLOG("write failed %m");
}
close_connection_ = true;
return false;
}

return true;
}

Expand Down Expand Up @@ -775,7 +787,7 @@ Client::dispatcher()
should_send_error_if_canceled_ = false;
if (!send(std::string_view(obuf_.p, p - obuf_.p)))
return false;
char buf[512];
char buf[16384]; // Increase buffer size from 512 to 16KB
size_t i, chunk;
for (i = 0; i < size; i += chunk) {
chunk = size - i;
Expand Down
17 changes: 10 additions & 7 deletions llamafile/server/worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,16 @@ Worker::begin()
tokens = tokenbucket_acquire(client_.client_ip_);
server_->lock();
dll_remove(&server_->idle_workers, &elem_);
if (dll_is_empty(server_->idle_workers)) {
Dll* slowbro;
if ((slowbro = dll_last(server_->active_workers))) {
SLOG("all threads active! dropping oldest client");
WORKER(slowbro)->kill();
}
}
// Remove aggressive client cancellation - let TCP backlog handle overflow
// The kernel's listen backlog will naturally queue incoming connections
// until a worker becomes available, providing better user experience
// if (dll_is_empty(server_->idle_workers)) {
// Dll* slowbro;
// if ((slowbro = dll_last(server_->active_workers))) {
// SLOG("all threads active! dropping oldest client");
// WORKER(slowbro)->kill();
// }
// }
working_ = true;
if (tokens > FLAG_token_burst) {
dll_make_last(&server_->active_workers, &elem_);
Expand Down