
Commit be94c1f

Add OpenAI /v1/embeddings to new llamafiler server
This change also removes the embedding endpoints from llamafile --server, because recent upstream llama.cpp changes appear to have made --embedding mode and normal completion mode mutually exclusive: when embedding mode is enabled, logits are no longer generated.
1 parent 2232b0e commit be94c1f
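
For readers who want to try the new route, here is a minimal sketch of an OpenAI-style request body that the new /v1/embeddings handler accepts (the model name and prompt text are illustrative; the accepted keys come from the parsing code in llamafile/server/embedding.cpp below):

    {
      "model": "text-embedding-3-small",
      "input": "two roads diverged in a yellow wood"
    }

POST it to /v1/embeddings on a running llamafiler instance. The old llamafile --server /embedding and /v1/embeddings routes now answer 503 Service Unavailable instead, as shown in the server.cpp diff below.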

File tree: 7 files changed (+171, -31 lines)

llama.cpp/server/server.cpp

Lines changed: 48 additions & 9 deletions

@@ -2141,7 +2141,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" - distribute: spread execution evenly over all nodes\n");
 printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
 printf(" - numactl: use the CPU map provided my numactl\n");
-if (llama_supports_gpu_offload()) {
+// if (llama_supports_gpu_offload()) { // [jart] prevent init error
 printf(" -ngl N, --n-gpu-layers N\n");
 printf(" number of layers to store in VRAM\n");
 printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
@@ -2153,7 +2153,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
 printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
 printf(" or for intermediate results and KV (with split-mode = row)\n");
-}
+// } // [jart] prevent init error
 printf(" -m FNAME, --model FNAME\n");
 printf(" model path (default: %s)\n", params.model.c_str());
 printf(" -a ALIAS, --alias ALIAS\n");
@@ -2431,13 +2431,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 invalid_param = true;
 break;
 }
-if (llama_supports_gpu_offload()) {
+// if (llama_supports_gpu_offload()) { // [jart] prevent init error
 params.n_gpu_layers = std::stoi(argv[i]);
-} else {
-LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
-"See main README.md for information on enabling GPU BLAS support",
-{{"n_gpu_layers", params.n_gpu_layers}});
-}
+// } else {
+// LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+// "See main README.md for information on enabling GPU BLAS support",
+// {{"n_gpu_layers", params.n_gpu_layers}});
+// }
 }
 else if (arg == "--split-mode" || arg == "-sm")
 {
@@ -2580,7 +2580,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 else if (arg == "--embedding")
 {
-params.embedding = true;
+fprintf(stderr, "error: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+exit(1);
 }
 else if (arg == "-cb" || arg == "--cont-batching")
 {
@@ -2694,12 +2697,14 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 FLAGS_READY = true;
 
+#if 0
 // [jart] setting `embeddings = true` on the clip model causes a
 // llama_get_logits_ith() fail crash due to how this param
 // due to `const bool has_logits = !cparams.embeddings;` from
 // llama.cpp interacting strangely with this parameter.
 if (params.mmproj.empty())
 params.embedding = true; // [jart] #243 always enable embedding mode
+#endif
 
 params.n_gpu_layers = llamafile_gpu_layers(params.n_gpu_layers);
 
@@ -3512,6 +3517,23 @@ int server_cli(int argc, char **argv)
 
 svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
 {
+
+// TODO(jart): something llama.cpp did upstream causes
+// logits to no longer be saved when we
+// enable embedding mode. let's use this as
+// an opportunity to nudge people into using
+// the newer better server, which is now
+// production worthy and recommended for
+// /embedding serving. it's compatible with
+// the existing http api.
+if (1) {
+fprintf(stderr, "warning: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+res.status = 503;
+return res.set_content("Service Unavailable", "text/plain; charset=utf-8");
+}
+
 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 const json body = json::parse(req.body);
 json prompt;
@@ -3548,6 +3570,23 @@ int server_cli(int argc, char **argv)
 
 svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
 {
+
+// TODO(jart): something llama.cpp did upstream causes
+// logits to no longer be saved when we
+// enable embedding mode. let's use this as
+// an opportunity to nudge people into using
+// the newer better server, which is now
+// production worthy and recommended for
+// /embedding serving. it's compatible with
+// the existing http api.
+if (1) {
+fprintf(stderr, "warning: the --embedding endpoint is no longer supported in the the standard llamafile --server. "
+"please use our new llamafiler command, which gives you a 4x faster embedding server. this is our new "
+"server for llamafile that, once feature complete, will replace this one entirely.\n");
+res.status = 503;
+return res.set_content("Service Unavailable", "text/plain; charset=utf-8");
+}
+
 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
 const json body = json::parse(req.body);

llamafile/flags.cpp

Lines changed: 19 additions & 0 deletions

@@ -84,6 +84,12 @@ static wontreturn void bad(const char *flag) {
 exit(1);
 }
 
+static wontreturn void nogpu(const char *flag) {
+tinyprint(2, program_invocation_name, ": ", flag, " was passed but ",
+program_invocation_short_name, " doesn't support GPU mode yet.\n", NULL);
+exit(1);
+}
+
 static wontreturn void missing(const char *flag) {
 tinyprint(2, program_invocation_name, ": ", flag, " missing argument\n", NULL);
 exit(1);
@@ -100,6 +106,7 @@ static wontreturn void unknown(const char *flag) {
 }
 
 void llamafile_get_flags(int argc, char **argv) {
+bool program_supports_gpu = FLAG_gpu != LLAMAFILE_GPU_DISABLE;
 FLAG_threads = cpu_get_num_math();
 for (int i = 1; i < argc;) {
 const char *flag = argv[i++];
@@ -320,16 +327,22 @@ void llamafile_get_flags(int argc, char **argv) {
 // gpu flags
 
 if (!strcmp(flag, "--tinyblas")) {
+if (!program_supports_gpu)
+nogpu("--tinyblas");
 FLAG_tinyblas = true;
 continue;
 }
 
 if (!strcmp(flag, "--nocompile")) {
+if (!program_supports_gpu)
+nogpu("--nocompile");
 FLAG_nocompile = true;
 continue;
 }
 
 if (!strcmp(flag, "--recompile")) {
+if (!program_supports_gpu)
+nogpu("--recompile");
 FLAG_recompile = true;
 continue;
 }
@@ -346,6 +359,8 @@ void llamafile_get_flags(int argc, char **argv) {
 if (!strcmp(flag, "-ngl") || //
 !strcmp(flag, "--gpu-layers") || //
 !strcmp(flag, "--n-gpu-layers")) {
+if (!program_supports_gpu)
+nogpu("--n-gpu-layers");
 if (i == argc)
 missing("--n-gpu-layers");
 FLAG_n_gpu_layers = atoi(argv[i++]);
@@ -355,13 +370,17 @@ void llamafile_get_flags(int argc, char **argv) {
 }
 
 if (!strcmp(flag, "-mg") || !strcmp(flag, "--main-gpu")) {
+if (!program_supports_gpu)
+nogpu("--main-gpu");
 if (i == argc)
 missing("--main-gpu");
 FLAG_main_gpu = atoi(argv[i++]);
 continue;
 }
 
 if (!strcmp(flag, "-sm") || !strcmp(flag, "--split-mode")) {
+if (!program_supports_gpu)
+nogpu("--split-mode");
 if (i == argc)
 missing("--split-mode");
 const char *value = argv[i];
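
Assuming llamafiler starts with FLAG_gpu set to LLAMAFILE_GPU_DISABLE (an assumption; the default is not shown in this diff), passing any of these GPU flags, for example -ngl 35, now fails during flag parsing with a message of the form "llamafiler: --n-gpu-layers was passed but llamafiler doesn't support GPU mode yet." This replaces the narrower check that llamafile/server/main.cpp previously performed only for the layer count, removed at the bottom of this commit.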

llamafile/gpu.c

Lines changed: 3 additions & 0 deletions

@@ -53,6 +53,9 @@ bool llamafile_has_gpu(void) {
 */
 int llamafile_gpu_layers(int n_gpu_layers) {
 
+if (FLAG_gpu == LLAMAFILE_GPU_DISABLE)
+return 0;
+
 // if user explicitly passed `--gpu KIND` but didn't specify `-ngl
 // LAYERS` then assume the user wants their model fully offloaded.
 if (n_gpu_layers < 0 && FLAG_gpu > 0)

llamafile/server/client.cpp

Lines changed: 2 additions & 0 deletions

@@ -462,6 +462,8 @@ Client::dispatcher()
 return tokenize();
 if (path() == "/embedding")
 return embedding();
+if (path() == "/v1/embeddings")
+return embedding();
 if (path() == "/completion")
 return completion();
 return send_error(404);

llamafile/server/doc/embedding.md

Lines changed: 8 additions & 1 deletion

@@ -14,7 +14,8 @@ classification, or content recommendation systems.
 
 ## Request URIs
 
-- `/embedding`
+- `/embedding` (llama.cpp compatible)
+- `/v1/embeddings` (OpenAI compatible)
 
 ## Request Methods
 
@@ -52,6 +53,12 @@ classification, or content recommendation systems.
 `tokens_provided`. The `/tokenize` endpoint may also be used to check
 beforehand how the model chops up strings and into how many pieces.
 
+- `input` (string) is an alias for `content`, which is provided for
+OpenAI API compatibility.
+
+- `prompt` (string) is an alias for `content`, which is provided for
+consistency with the `/tokenize` endpoint.
+
 - `add_special` (bool; default: true) may be specified to indicate if
 the tokenizer should insert special tokens automatically. What tokens
 get inserted, depends on the model architecture. For example,
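
A minimal request-body sketch to go with the parameters documented above (the text is illustrative; the keys follow the parameter list and the parsing code in llamafile/server/embedding.cpp below):

    {
      "content": "two roads diverged in a yellow wood",
      "add_special": true,
      "parse_special": false
    }

The same body may use `input` or `prompt` in place of `content`, and an optional `model` string is accepted as well; when the request is posted to `/v1/embeddings`, that model string is echoed back in the OpenAI-shaped response.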

llamafile/server/embedding.cpp

Lines changed: 91 additions & 15 deletions

@@ -37,6 +37,7 @@ struct EmbeddingParams
 bool parse_special;
 ctl::string_view prompt;
 ctl::string content;
+ctl::string model;
 };
 
 void
@@ -78,10 +79,29 @@ Client::get_embedding_params(EmbeddingParams* params)
 {
 params->add_special = atob(or_empty(param("add_special")), true);
 params->parse_special = atob(or_empty(param("parse_special")), false);
+
+// try obtaining prompt (or its aliases) from request-uri
 ctl::optional<ctl::string_view> prompt = param("content");
+if (!prompt.has_value()) {
+ctl::optional<ctl::string_view> prompt2 = param("prompt");
+if (prompt2.has_value()) {
+prompt = ctl::move(prompt2);
+} else {
+ctl::optional<ctl::string_view> prompt3 = param("input");
+if (prompt3.has_value()) {
+prompt = ctl::move(prompt3);
+}
+}
+}
+
 if (prompt.has_value()) {
+// [simple mode] if the prompt was supplied in the request-uri
+// then we don't bother looking for a json body.
 params->prompt = prompt.value();
 } else if (HasHeader(kHttpContentType)) {
+// [standard mode] if the prompt wasn't specified as a
+// request-uri parameter, then it must be in the
+// http message body.
 if (IsMimeType(HeaderData(kHttpContentType),
 HeaderLength(kHttpContentType),
 "text/plain")) {
@@ -94,14 +114,21 @@ Client::get_embedding_params(EmbeddingParams* params)
 return send_error(400, Json::StatusToString(json.first));
 if (!json.second.isObject())
 return send_error(400, "JSON body must be an object");
-if (!json.second["content"].isString())
-return send_error(400, "JSON missing \"content\" key");
-params->content = ctl::move(json.second["content"].getString());
+if (json.second["content"].isString())
+params->content = ctl::move(json.second["content"].getString());
+else if (json.second["prompt"].isString())
+params->content = ctl::move(json.second["prompt"].getString());
+else if (json.second["input"].isString())
+params->content = ctl::move(json.second["input"].getString());
+else
+return send_error(400, "JSON missing content/prompt/input key");
 params->prompt = params->content;
 if (json.second["add_special"].isBool())
 params->add_special = json.second["add_special"].getBool();
 if (json.second["parse_special"].isBool())
 params->parse_special = json.second["parse_special"].getBool();
+if (json.second["model"].isString())
+params->model = ctl::move(json.second["model"].getString());
 } else {
 return send_error(501, "Content Type Not Implemented");
 }
@@ -207,21 +234,68 @@ Client::embedding()
 embd, embeddings->data() + batch->seq_id[i][0] * n_embd, n_embd);
 }
 
+// determine how output json should look
+bool in_openai_mode = path() == "/v1/embeddings";
+
 // serialize tokens to json
 char* p = obuf.p;
 p = stpcpy(p, "{\n");
-p = stpcpy(p, " \"add_special\": ");
-p = encode_bool(p, params->add_special);
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"parse_special\": ");
-p = encode_bool(p, params->parse_special);
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"tokens_provided\": ");
-p = encode_json(p, toks->size());
-p = stpcpy(p, ",\n");
-p = stpcpy(p, " \"tokens_used\": ");
-p = encode_json(p, count);
-p = stpcpy(p, ",\n");
+
+// Here's what an OpenAI /v1/embedding response looks like:
+//
+// {
+//   "object": "list",
+//   "data": [
+//     {
+//       "object": "embedding",
+//       "index": 0,
+//       "embedding": [
+//         -0.006929283495992422,
+//         -0.005336422007530928,
+//         ... (omitted for spacing)
+//         -4.547132266452536e-05,
+//         -0.024047505110502243
+//       ],
+//     }
+//   ],
+//   "model": "text-embedding-3-small",
+//   "usage": {
+//     "prompt_tokens": 5,
+//     "total_tokens": 5
+//   }
+// }
+//
+
+if (in_openai_mode) {
+p = stpcpy(p, " \"object\": \"list\",\n");
+p = stpcpy(p, " \"model\": ");
+p = encode_json(p, params->model);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"usage\": {\n");
+p = stpcpy(p, " \"prompt_tokens\": ");
+p = encode_json(p, count);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"total_tokens\": ");
+p = encode_json(p, toks->size());
+p = stpcpy(p, "\n },\n");
+p = stpcpy(p, " \"data\": [{\n");
+p = stpcpy(p, " \"object\": \"embedding\",\n");
+p = stpcpy(p, " \"index\": 0,\n");
+} else {
+p = stpcpy(p, " \"add_special\": ");
+p = encode_bool(p, params->add_special);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"parse_special\": ");
+p = encode_bool(p, params->parse_special);
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"tokens_provided\": ");
+p = encode_json(p, toks->size());
+p = stpcpy(p, ",\n");
+p = stpcpy(p, " \"tokens_used\": ");
+p = encode_json(p, count);
+p = stpcpy(p, ",\n");
+}
+
 p = stpcpy(p, " \"embedding\": [");
 for (size_t i = 0; i < embeddings->size(); ++i) {
 if (i) {
@@ -231,6 +305,8 @@ Client::embedding()
 p = encode_json(p, (*embeddings)[i]);
 }
 p = stpcpy(p, "]\n");
+if (in_openai_mode)
+p = stpcpy(p, " }]\n");
 p = stpcpy(p, "}\n");
 ctl::string_view content(obuf.p, p - obuf.p);
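
For comparison with the OpenAI-shaped response sketched in the code comment above, the pre-existing llama.cpp-compatible branch serializes a flat object; reconstructed from the else branch, with illustrative values:

    {
      "add_special": true,
      "parse_special": false,
      "tokens_provided": 5,
      "tokens_used": 5,
      "embedding": [-0.006929283495992422, ... (remaining values omitted)]
    }

Note how the OpenAI branch maps tokens_used to usage.prompt_tokens and tokens_provided to usage.total_tokens.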

llamafile/server/main.cpp

Lines changed: 0 additions & 6 deletions

@@ -58,12 +58,6 @@ main(int argc, char* argv[])
 LoadZipArgs(&argc, &argv);
 llamafile_get_flags(argc, argv);
 
-// bounce users wanting gpu support (not ready yet)
-if (FLAG_n_gpu_layers) {
-fprintf(stderr, "error: llamafiler doesn't support gpu yet\n");
-return 1;
-}
-
 // initialize subsystems
 time_init();
 tokenbucket_init();
