
Commit be94c1f

Add OpenAI /v1/embeddings to new llamafiler server
This change also removes the embedding endpoints from llamafile --server, because recent upstream llama.cpp changes appear to have made --embedding mode and normal completion mode mutually exclusive: when embedding mode is enabled, logits are no longer generated.
1 parent 2232b0e commit be94c1f
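
For readers who want to try the new route, here is a minimal sketch of an OpenAI-style request body that the new /v1/embeddings handler accepts (the model name and prompt text are illustrative; the accepted keys come from the parsing code in llamafile/server/embedding.cpp below):

    {
      "model": "text-embedding-3-small",
      "input": "two roads diverged in a yellow wood"
    }

POST it to /v1/embeddings on a running llamafiler instance. The old llamafile --server /embedding and /v1/embeddings routes now answer 503 Service Unavailable instead, as shown in the server.cpp diff below.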

File tree: 7 files changed (+171, -31 lines)

llama.cpp/server/server.cpp

Lines changed: 48 additions & 9 deletions

@@ -2141,7 +2141,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" - distribute: spread execution evenly over all nodes\n");
 printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
 printf(" - numactl: use the CPU map provided my numactl\n");
-if (llama_supports_gpu_offload()) {
+// if (llama_supports_gpu_offload()) { // [jart] prevent init error
 printf(" -ngl N, --n-gpu-layers N\n");
 printf(" number of layers to store in VRAM\n");
 printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
@@ -2153,7 +2153,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
 printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
 printf(" or for intermediate results and KV (with split-mode = row)\n");
-}
+// } // [jart] prevent init error
 printf(" -m FNAME, --model FNAME\n");
 printf(" model path (default: %s)\n", params.model.c_str());
 printf(" -a ALIAS, --alias ALIAS\n");
@@ -2431,13 +2431,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 invalid_param = true;
 break;
 }
-if (llama_supports_gpu_offload()) {
+// if (llama_supports_gpu_offload()) { // [jart] prevent init error
 params.n_gpu_layers = std::stoi(argv[i]);
-} else {
-LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-"See main README.md for information on enabling GPU BLAS support",
-{{"n_gpu_layers", params.n_gpu_layers}});
-}
+// } else {
+// LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+// "See main README.md for information on enabling GPU BLAS support",
+// {{"n_gpu_layers", params.n_gpu_layers}});
+// }
 }
 else if (arg == "--split-mode" || arg == "-sm")
 {
@@ -2580,7 +2580,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 else if (arg == "--embedding")
 {
-params.embedding = true;
+fprintf(stderr, "error: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+exit(1);
 }
 else if (arg == "-cb" || arg == "--cont-batching")
 {
@@ -2694,12 +2697,14 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 FLAGS_READY = true;
 
+#if 0
 // [jart] setting `embeddings = true` on the clip model causes a
 // llama_get_logits_ith() fail crash due to how this param
 // due to `const bool has_logits = !cparams.embeddings;` from
 // llama.cpp interacting strangely with this parameter.
 if (params.mmproj.empty())
 params.embedding = true; // [jart] #243 always enable embedding mode
+#endif
 
 params.n_gpu_layers = llamafile_gpu_layers(params.n_gpu_layers);
 
@@ -3512,6 +3517,23 @@ int server_cli(int argc, char **argv)
 
 svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
 {
+
+// TODO(jart): something llama.cpp did upstream causes
+// logits to no longer be saved when we
+// enable embedding mode. let's use this as
+// an opportunity to nudge people into using
+// the newer better server, which is now
+// production worthy and recommended for
+// /embedding serving. it's compatible with
+// the existing http api.
+if (1) {
+fprintf(stderr, "warning: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+res.status = 503;
+return res.set_content("Service Unavailable", "text/plain; charset=utf-8");
+}
+
 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 const json body = json::parse(req.body);
 json prompt;
@@ -3548,6 +3570,23 @@ int server_cli(int argc, char **argv)
 
 svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
 {
+
+// TODO(jart): something llama.cpp did upstream causes
+// logits to no longer be saved when we
+// enable embedding mode. let's use this as
+// an opportunity to nudge people into using
+// the newer better server, which is now
+// production worthy and recommended for
+// /embedding serving. it's compatible with
+// the existing http api.
+if (1) {
+fprintf(stderr, "warning: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+res.status = 503;
+return res.set_content("Service Unavailable", "text/plain; charset=utf-8");
+}
+
 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 const json body = json::parse(req.body);

llamafile/flags.cpp

Lines changed: 19 additions & 0 deletions

@@ -84,6 +84,12 @@ static wontreturn void bad(const char *flag) {
 exit(1);
 }
 
+static wontreturn void nogpu(const char *flag) {
+tinyprint(2, program_invocation_name, ": ", flag, " was passed but ",
+program_invocation_short_name, " doesn't support GPU mode yet.\n", NULL);
+exit(1);
+}
+
 static wontreturn void missing(const char *flag) {
 tinyprint(2, program_invocation_name, ": ", flag, " missing argument\n", NULL);
 exit(1);
@@ -100,6 +106,7 @@ static wontreturn void unknown(const char *flag) {
 }
 
 void llamafile_get_flags(int argc, char **argv) {
+bool program_supports_gpu = FLAG_gpu != LLAMAFILE_GPU_DISABLE;
 FLAG_threads = cpu_get_num_math();
 for (int i = 1; i < argc;) {
 const char *flag = argv[i++];
@@ -320,16 +327,22 @@ void llamafile_get_flags(int argc, char **argv) {
 // gpu flags
 
 if (!strcmp(flag, "--tinyblas")) {
+if (!program_supports_gpu)
+nogpu("--tinyblas");
 FLAG_tinyblas = true;
 continue;
 }
 
 if (!strcmp(flag, "--nocompile")) {
+if (!program_supports_gpu)
+nogpu("--nocompile");
 FLAG_nocompile = true;
 continue;
 }
 
 if (!strcmp(flag, "--recompile")) {
+if (!program_supports_gpu)
+nogpu("--recompile");
 FLAG_recompile = true;
 continue;
 }
@@ -346,6 +359,8 @@ void llamafile_get_flags(int argc, char **argv) {
 if (!strcmp(flag, "-ngl") || //
 !strcmp(flag, "--gpu-layers") || //
 !strcmp(flag, "--n-gpu-layers")) {
+if (!program_supports_gpu)
+nogpu("--n-gpu-layers");
 if (i == argc)
 missing("--n-gpu-layers");
 FLAG_n_gpu_layers = atoi(argv[i++]);
@@ -355,13 +370,17 @@ void llamafile_get_flags(int argc, char **argv) {
 }
 
 if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) {
+if (!program_supports_gpu)
+nogpu("--main-gpu");
 if (i == argc)
 missing("--main-gpu");
 FLAG_main_gpu = atoi(argv[i++]);
 continue;
 }
 
 if (!strcmp(flag, "-sm") || !strcmp(flag, "--split-mode")) {
+if (!program_supports_gpu)
+nogpu("--split-mode");
 if (i == argc)
 missing("--split-mode");
 const char *value = argv[i];
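
Assuming llamafiler starts with FLAG_gpu set to LLAMAFILE_GPU_DISABLE (an assumption; the default is not shown in this diff), passing any of these GPU flags, for example -ngl 35, now fails during flag parsing with a message of the form "llamafiler: --n-gpu-layers was passed but llamafiler doesn't support GPU mode yet." This replaces the narrower check that llamafile/server/main.cpp previously performed only for the layer count, removed at the bottom of this commit.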

llamafile/gpu.c

Lines changed: 3 additions & 0 deletions

@@ -53,6 +53,9 @@ bool llamafile_has_gpu(void) {
 */
 int llamafile_gpu_layers(int n_gpu_layers) {
 
+if (FLAG_gpu == LLAMAFILE_GPU_DISABLE)
+return 0;
+
 // if user explicitly passed `--gpu KIND` but didn't specify `-ngl
 // LAYERS` then assume the user wants their model fully offloaded.
 if (n_gpu_layers < 0 && FLAG_gpu > 0)

llamafile/server/client.cpp

Lines changed: 2 additions & 0 deletions

@@ -462,6 +462,8 @@ Client::dispatcher()
 return tokenize();
 if (path() == "/embedding")
 return embedding();
+if (path() == "/v1/embeddings")
+return embedding();
 if (path() == "/completion")
 return completion();
 return send_error(404);

llamafile/server/doc/embedding.md

Lines changed: 8 additions & 1 deletion

@@ -14,7 +14,8 @@ classification, or content recommendation systems.
 
 ## Request URIs
 
-- `/embedding`
+- `/embedding` (llama.cpp compatible)
+- `/v1/embeddings` (OpenAI compatible)
 
 ## Request Methods
 
@@ -52,6 +53,12 @@ classification, or content recommendation systems.
 `tokens_provided`. The `/tokenize` endpoint may also be used to check
 beforehand how the model chops up strings and into how many pieces.
 
+- `input` (string) is an alias for `content`, which is provided for
+OpenAI API compatibility.
+
+- `prompt` (string) is an alias for `content`, which is provided for
+consistency with the `/tokenize` endpoint.
+
 - `add_special` (bool; default: true) may be specified to indicate if
 the tokenizer should insert special tokens automatically. What tokens
 get inserted, depends on the model architecture. For example,
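
A minimal request-body sketch to go with the parameters documented above (the text is illustrative; the keys follow the parameter list and the parsing code in llamafile/server/embedding.cpp below):

    {
      "content": "two roads diverged in a yellow wood",
      "add_special": true,
      "parse_special": false
    }

The same body may use `input` or `prompt` in place of `content`, and an optional `model` string is accepted as well; when the request is posted to `/v1/embeddings`, that model string is echoed back in the OpenAI-shaped response.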

llamafile/server/embedding.cpp

Lines changed: 91 additions & 15 deletions

@@ -37,6 +37,7 @@ struct EmbeddingParams
 bool parse_special;
 ctl::string_view prompt;
 ctl::string content;
+ctl::string model;
 };
 
 void
@@ -78,10 +79,29 @@ Client::get_embedding_params(EmbeddingParams* params)
 {
 params->add_special = atob(or_empty(param("add_special")), true);
 params->parse_special = atob(or_empty(param("parse_special")), false);
+
+// try obtaining prompt (or its aliases) from request-uri
 ctl::optional<ctl::string_view> prompt = param("content");
+if (!prompt.has_value()) {
+ctl::optional<ctl::string_view> prompt2 = param("prompt");
+if (prompt2.has_value()) {
+prompt = ctl::move(prompt2);
+} else {
+ctl::optional<ctl::string_view> prompt3 = param("input");
+if (prompt3.has_value()) {
+prompt = ctl::move(prompt3);
+}
+}
+}
+
 if (prompt.has_value()) {
+// [simple mode] if the prompt was supplied in the request-uri
+// then we don't bother looking for a json body.
 params->prompt = prompt.value();
 } else if (HasHeader(kHttpContentType)) {
+// [standard mode] if the prompt wasn't specified as a
+// request-uri parameter, then it must be in the
+// http message body.
 if (IsMimeType(HeaderData(kHttpContentType),
 HeaderLength(kHttpContentType),
 "text/plain")) {
@@ -94,14 +114,21 @@ Client::get_embedding_params(EmbeddingParams* params)
 return send_error(400, Json::StatusToString(json.first));
 if (!json.second.isObject())
 return send_error(400, "JSON body must be an object");
-if (!json.second["content"].isString())
-return send_error(400, "JSON missing \"content\" key");
-params->content = ctl::move(json.second["content"].getString());
+if (json.second["content"].isString())
+params->content = ctl::move(json.second["content"].getString());
+else if (json.second["prompt"].isString())
+params->content = ctl::move(json.second["prompt"].getString());
+else if (json.second["input"].isString())
+params->content = ctl::move(json.second["input"].getString());
+else
+return send_error(400, "JSON missing content/prompt/input key");
 params->prompt = params->content;
 if (json.second["add_special"].isBool())
 params->add_special = json.second["add_special"].getBool();
 if (json.second["parse_special"].isBool())
 params->parse_special = json.second["parse_special"].getBool();
+if (json.second["model"].isString())
+params->model = ctl::move(json.second["model"].getString());
 } else {
 return send_error(501, "Content Type Not Implemented");
 }
@@ -207,21 +234,68 @@ Client::embedding()
 embd, embeddings->data() + batch->seq_id[i][0] * n_embd, n_embd);
 }
 
+// determine how output json should look
+bool in_openai_mode = path() == "/v1/embeddings";
+
 // serialize tokens to json
 char* p = obuf.p;
 p = stpcpy(p, "{\n");
-p = stpcpy(p, " \"add_special\": ");
-p = encode_bool(p, params->add_special);
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"parse_special\": ");
-p = encode_bool(p, params->parse_special);
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"tokens_provided\": ");
-p = encode_json(p, toks->size());
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"tokens_used\": ");
-p = encode_json(p, count);
-p = stpcpy(p, ",\n");
+
+// Here's what an OpenAI /v1/embedding response looks like:
+//
+// {
+//   "object": "list",
+//   "data": [
+//     {
+//       "object": "embedding",
+//       "index": 0,
+//       "embedding": [
+//         -0.006929283495992422,
+//         -0.005336422007530928,
+//         ... (omitted for spacing)
+//         -4.547132266452536e-05,
+//         -0.024047505110502243
+//       ],
+//     }
+//   ],
+//   "model": "text-embedding-3-small",
+//   "usage": {
+//     "prompt_tokens": 5,
+//     "total_tokens": 5
+//   }
+// }
+//
+
+if (in_openai_mode) {
+p = stpcpy(p, " \"object\": \"list\",\n");
+p = stpcpy(p, " \"model\": ");
+p = encode_json(p, params->model);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"usage\": {\n");
+p = stpcpy(p, " \"prompt_tokens\": ");
+p = encode_json(p, count);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"total_tokens\": ");
+p = encode_json(p, toks->size());
+p = stpcpy(p, "\n },\n");
+p = stpcpy(p, " \"data\": [{\n");
+p = stpcpy(p, " \"object\": \"embedding\",\n");
+p = stpcpy(p, " \"index\": 0,\n");
+} else {
+p = stpcpy(p, " \"add_special\": ");
+p = encode_bool(p, params->add_special);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"parse_special\": ");
+p = encode_bool(p, params->parse_special);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"tokens_provided\": ");
+p = encode_json(p, toks->size());
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"tokens_used\": ");
+p = encode_json(p, count);
+p = stpcpy(p, ",\n");
+}
+
 p = stpcpy(p, " \"embedding\": [");
 for (size_t i = 0; i < embeddings->size(); ++i) {
 if (i) {
@@ -231,6 +305,8 @@ Client::embedding()
 p = encode_json(p, (*embeddings)[i]);
 }
 p = stpcpy(p, "]\n");
+if (in_openai_mode)
+p = stpcpy(p, " }]\n");
 p = stpcpy(p, "}\n");
 ctl::string_view content(obuf.p, p - obuf.p);
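
For comparison with the OpenAI-shaped response sketched in the code comment above, the pre-existing llama.cpp-compatible branch serializes a flat object; reconstructed from the else branch, with illustrative values:

    {
      "add_special": true,
      "parse_special": false,
      "tokens_provided": 5,
      "tokens_used": 5,
      "embedding": [-0.006929283495992422, ... (remaining values omitted)]
    }

Note how the OpenAI branch maps tokens_used to usage.prompt_tokens and tokens_provided to usage.total_tokens.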

llamafile/server/main.cpp

Lines changed: 0 additions & 6 deletions

@@ -58,12 +58,6 @@ main(int argc, char* argv[])
 LoadZipArgs(&argc, &argv);
 llamafile_get_flags(argc, argv);
 
-// bounce users wanting gpu support (not ready yet)
-if (FLAG_n_gpu_layers) {
-fprintf(stderr, "error: llamafiler doesn't support gpu yet\n");
-return 1;
-}
-
 // initialize subsystems
 time_init();
 tokenbucket_init();
