|
31 | 31 | #include <unordered_map> |
32 | 32 | #include <unordered_set> |
33 | 33 |
|
| 34 | +// llama-server-one START |
| 35 | +#ifdef COSMOCC |
| 36 | +#include <cosmo.h> |
| 37 | +#endif |
| 38 | +// llama-server-one END |
| 39 | + |
34 | 40 | using json = nlohmann::ordered_json; |
35 | 41 |
|
36 | 42 | constexpr int HTTP_POLLING_SECONDS = 1; |
@@ -1596,13 +1602,15 @@ struct server_queue { |
1596 | 1602 | return 0; |
1597 | 1603 | } |
1598 | 1604 |
|
| 1605 | + // llama-server-one START - defer() renamed to defer_task() to work around a Cosmo STL build issue. |
1599 | 1606 | // Add a new task, but defer until one slot is available |
1600 | | - void defer(server_task && task) { |
| 1607 | + void defer_task(server_task task) { |
1601 | 1608 | std::unique_lock<std::mutex> lock(mutex_tasks); |
1602 | 1609 | QUE_DBG("defer task, id = %d\n", task.id); |
1603 | 1610 | queue_tasks_deferred.push_back(std::move(task)); |
1604 | 1611 | condition_tasks.notify_one(); |
1605 | 1612 | } |
| 1613 | + // llama-server-one END |
1606 | 1614 |
|
1607 | 1615 | // Get the next id for creating a new task |
1608 | 1616 | int get_new_id() { |
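The rename above leaves the deferred-queue behavior unchanged; only the signature moves from an rvalue-reference parameter to pass-by-value. A minimal, self-contained sketch of the pattern, with a simplified stand-in for server_task and an illustrative server_queue_sketch name (the real struct, logging macro, and surrounding server_queue live in server.cpp):

    // Simplified illustration of the deferred-task queue used above.
    #include <condition_variable>
    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <utility>

    struct server_task { int id = -1; };   // stand-in for the real task type

    struct server_queue_sketch {
        std::mutex              mutex_tasks;
        std::condition_variable condition_tasks;
        std::deque<server_task> queue_tasks_deferred;

        // By-value parameter: callers can pass an lvalue (copied) or an
        // std::move()'d rvalue; either way the task is moved into the deque.
        void defer_task(server_task task) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            std::printf("defer task, id = %d\n", task.id);
            queue_tasks_deferred.push_back(std::move(task));
            condition_tasks.notify_one();
        }
    };

    int main() {
        server_queue_sketch queue;
        server_task task;
        task.id = 7;
        queue.defer_task(task);  // copies; std::move(task) would avoid the copy
        return 0;
    }

Note that the call-site hunks that follow pass the task by value rather than moving it, so each deferral now copies the task; that appears to be the trade-off accepted to keep the Cosmo STL build working.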
@@ -2652,13 +2660,17 @@ struct server_context { |
2652 | 2660 | if (slot == nullptr) { |
2653 | 2661 | // if no slot is available, we defer this task for processing later |
2654 | 2662 | SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); |
2655 | | - queue_tasks.defer(std::move(task)); |
| 2663 | + // llama-server-one START |
| 2664 | + queue_tasks.defer_task(task); |
| 2665 | + // llama-server-one END |
2656 | 2666 | break; |
2657 | 2667 | } |
2658 | 2668 | if (slot->is_processing()) { |
2659 | 2669 | // if requested slot is unavailable, we defer this task for processing later |
2660 | 2670 | SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); |
2661 | | - queue_tasks.defer(std::move(task)); |
| 2671 | + // llama-server-one START |
| 2672 | + queue_tasks.defer_task(task); |
| 2673 | + // llama-server-one END |
2662 | 2674 | break; |
2663 | 2675 | } |
2664 | 2676 |
|
@@ -2741,7 +2753,9 @@ struct server_context { |
2741 | 2753 | if (slot->is_processing()) { |
2742 | 2754 | // if requested slot is unavailable, we defer this task for processing later |
2743 | 2755 | SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); |
2744 | | - queue_tasks.defer(std::move(task)); |
| 2756 | + // llama-server-one START |
| 2757 | + queue_tasks.defer_task(task); |
| 2758 | + // llama-server-one END |
2745 | 2759 | break; |
2746 | 2760 | } |
2747 | 2761 |
|
@@ -2777,7 +2791,9 @@ struct server_context { |
2777 | 2791 | if (slot->is_processing()) { |
2778 | 2792 | // if requested slot is unavailable, we defer this task for processing later |
2779 | 2793 | SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); |
2780 | | - queue_tasks.defer(std::move(task)); |
| 2794 | + // llama-server-one START |
| 2795 | + queue_tasks.defer_task(task); |
| 2796 | + // llama-server-one END |
2781 | 2797 | break; |
2782 | 2798 | } |
2783 | 2799 |
|
@@ -2820,7 +2836,9 @@ struct server_context { |
2820 | 2836 | if (slot->is_processing()) { |
2821 | 2837 | // if requested slot is unavailable, we defer this task for processing later |
2822 | 2838 | SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); |
2823 | | - queue_tasks.defer(std::move(task)); |
| 2839 | + // llama-server-one START |
| 2840 | + queue_tasks.defer_task(task); |
| 2841 | + // llama-server-one END |
2824 | 2842 | break; |
2825 | 2843 | } |
2826 | 2844 |
|
@@ -3402,15 +3420,51 @@ struct server_context { |
3402 | 3420 | } |
3403 | 3421 |
|
3404 | 3422 | json model_meta() const { |
| 3423 | + char general_architecture[64]; |
| 3424 | + char general_type[64]; |
| 3425 | + char general_name[64]; |
| 3426 | + char general_version[64]; |
| 3427 | + char general_finetune[64]; |
| 3428 | + char general_basename[64]; |
| 3429 | + char general_size_label[64]; |
| 3430 | + char general_license[64]; |
| 3431 | + |
| 3432 | + general_architecture[0] = 0; |
| 3433 | + general_type[0] = 0; |
| 3434 | + general_name[0] = 0; |
| 3435 | + general_version[0] = 0; |
| 3436 | + general_finetune[0] = 0; |
| 3437 | + general_basename[0] = 0; |
| 3438 | + general_size_label[0] = 0; |
| 3439 | + general_license[0] = 0; |
| 3440 | + |
| 3441 | + llama_model_meta_val_str(model, "general.architecture", general_architecture, sizeof(general_architecture)); |
| 3442 | + llama_model_meta_val_str(model, "general.type", general_type, sizeof(general_type)); |
| 3443 | + llama_model_meta_val_str(model, "general.name", general_name, sizeof(general_name)); |
| 3444 | + llama_model_meta_val_str(model, "general.version", general_version, sizeof(general_version)); |
| 3445 | + llama_model_meta_val_str(model, "general.finetune", general_finetune, sizeof(general_finetune)); |
| 3446 | + llama_model_meta_val_str(model, "general.basename", general_basename, sizeof(general_basename)); |
| 3447 | + llama_model_meta_val_str(model, "general.size_label", general_size_label, sizeof(general_size_label)); |
| 3448 | + llama_model_meta_val_str(model, "general.license", general_license, sizeof(general_license)); |
| 3449 | + |
3405 | 3450 | return json { |
3406 | | - {"vocab_type", llama_vocab_type (vocab)}, |
3407 | | - {"n_vocab", llama_vocab_n_tokens (vocab)}, |
3408 | | - {"n_ctx_train", llama_model_n_ctx_train(model)}, |
3409 | | - {"n_embd", llama_model_n_embd (model)}, |
3410 | | - {"n_params", llama_model_n_params (model)}, |
3411 | | - {"size", llama_model_size (model)}, |
| 3451 | + {"vocab_type", llama_vocab_type (vocab)}, |
| 3452 | + {"n_vocab", llama_vocab_n_tokens (vocab)}, |
| 3453 | + {"n_ctx_train", llama_model_n_ctx_train (model)}, |
| 3454 | + {"n_embd", llama_model_n_embd (model)}, |
| 3455 | + {"n_params", llama_model_n_params (model)}, |
| 3456 | + {"size", llama_model_size (model)}, |
| 3457 | + {"general.architecture", general_architecture }, |
| 3458 | + {"general.type", general_type }, |
| 3459 | + {"general.name", general_name }, |
| 3460 | + {"general.version", general_version }, |
| 3461 | + {"general.finetune", general_finetune }, |
| 3462 | + {"general.basename", general_basename }, |
| 3463 | + {"general.size_label", general_size_label }, |
| 3464 | + {"general.license", general_license }, |
3412 | 3465 | }; |
3413 | 3466 | } |
| 3467 | + // llama-server-one END |
3414 | 3468 | }; |
3415 | 3469 |
|
3416 | 3470 | static void log_server_request(const httplib::Request & req, const httplib::Response & res) { |
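The extra general.* fields added to model_meta() above all come from the GGUF key/value metadata, read through llama_model_meta_val_str(). A standalone sketch of reading a single key, assuming a llama_model * already loaded elsewhere; the helper name is illustrative:

    #include <cstdio>
    #include "llama.h"

    // Reads one GGUF metadata value into a fixed buffer, as model_meta() does.
    static void print_general_architecture(const llama_model * model) {
        char buf[64] = "";
        // Returns the value's length, or a negative value if the key is absent.
        const int32_t n = llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf));
        if (n >= 0) {
            std::printf("general.architecture = %s\n", buf);
        } else {
            std::printf("general.architecture not present in this GGUF\n");
        }
    }

Values longer than the 64-byte buffers should simply be truncated, which is fine for these short general.* strings.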
@@ -3442,6 +3496,40 @@ inline void signal_handler(int signal) { |
3442 | 3496 | } |
3443 | 3497 |
|
3444 | 3498 | int main(int argc, char ** argv) { |
| 3499 | + // llama-server-one START |
| 3500 | + // This implements an args file feature inspired by llamafile's. |
| 3501 | + #ifdef COSMOCC |
| 3502 | + // Keep the binary from showing up as "ape" in the process list. |
| 3503 | + pthread_setname_np(pthread_self(), "llama-server-one"); |
| 3504 | + |
| 3505 | + // Load args files if present. The two names differ to avoid confusion during packaging. |
| 3506 | + const std::string argsFilename = "llama-server-one-args"; |
| 3507 | + const std::string zipArgsFilename = "/zip/default-args"; |
| 3508 | + struct stat buffer; |
| 3509 | + |
| 3510 | + // At this point, argc, argv represent: |
| 3511 | + // command (User supplied args) |
| 3512 | + |
| 3513 | + if (stat (argsFilename.c_str(), &buffer) == 0) { |
| 3514 | + argc = cosmo_args(argsFilename.c_str(), &argv); |
| 3515 | + } |
| 3516 | + |
| 3517 | + // At this point, argc, argv represent: |
| 3518 | + // command (argsFilename args) (User supplied args) |
| 3519 | + |
| 3520 | + if (stat (zipArgsFilename.c_str(), &buffer) == 0) { |
| 3521 | + argc = cosmo_args(zipArgsFilename.c_str(), &argv); |
| 3522 | + } |
| 3523 | + |
| 3524 | + // At this point, argc, argv represent: |
| 3525 | + // command (zipArgsFilename args) (argsFilename args) (User supplied args) |
| 3526 | + |
| 3527 | + // Counterintuitive, but this is how cosmo_args() works: |
| 3528 | + // args from argsFilename override args from zipArgsFilename, and |
| 3529 | + // user-supplied args override both. |
| 3530 | + #endif |
| 3531 | + // llama-server-one END |
| 3532 | + |
3445 | 3533 | // own arguments required by this example |
3446 | 3534 | common_params params; |
3447 | 3535 |
|
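A concrete, hypothetical example of the layering described in the comments above: suppose the embedded /zip/default-args contains "-m model.gguf --ctx-size 4096", a local llama-server-one-args file contains "--ctx-size 8192", and the user runs "./llama-server-one --port 8081". After both cosmo_args() calls the effective command line is:

    ./llama-server-one -m model.gguf --ctx-size 4096 --ctx-size 8192 --port 8081

Assuming the usual last-one-wins handling of repeated scalar flags (each occurrence simply overwrites the previous value), the server ends up with the model from the zip defaults, the context size from the local args file, and the port from the user.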
@@ -4500,6 +4588,26 @@ int main(int argc, char ** argv) { |
4500 | 4588 | } |
4501 | 4589 | } |
4502 | 4590 |
|
| 4591 | + // llama-server-one START |
| 4592 | + svr->Get("/chat", [](const httplib::Request & req, httplib::Response & res) { |
| 4593 | + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { |
| 4594 | + res.set_content("Error: gzip is not supported by this browser", "text/plain"); |
| 4595 | + } else { |
| 4596 | + res.set_header("Content-Encoding", "gzip"); |
| 4597 | + // COEP and COOP headers, required by pyodide (python interpreter) |
| 4598 | + res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); |
| 4599 | + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); |
| 4600 | + res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); |
| 4601 | + } |
| 4602 | + return false; |
| 4603 | + }); |
| 4604 | + |
| 4605 | + svr->Get("/chat/", [](const httplib::Request &, httplib::Response & res) { |
| 4606 | + res.set_redirect("/chat"); |
| 4607 | + return false; |
| 4608 | + }); |
| 4609 | + // llama-server-one END |
| 4610 | + |
4503 | 4611 | // register API routes |
4504 | 4612 | svr->Get ("/health", handle_health); // public endpoint (no API key check) |
4505 | 4613 | svr->Get ("/metrics", handle_metrics); |
|
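The /chat route added above follows the same pattern the server already uses for the embedded Web UI: the response body is the gzip-compressed index.html baked into the binary, so the handler only sets Content-Encoding plus the COEP/COOP headers pyodide needs. A minimal, self-contained sketch of that pattern with cpp-httplib; the placeholder byte array and the host/port stand in for the real index_html_gz generated at build time and the server's bind settings:

    #include <string>
    #include "httplib.h"

    // Placeholder bytes; a real build embeds the gzip'd index.html here.
    static const unsigned char index_html_gz[] = { 0x1f, 0x8b };
    static const size_t index_html_gz_len = sizeof(index_html_gz);

    int main() {
        httplib::Server svr;

        svr.Get("/chat", [](const httplib::Request & req, httplib::Response & res) {
            if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
                res.set_content("Error: gzip is not supported by this browser", "text/plain");
                return;
            }
            // The body is already compressed, so say so explicitly.
            res.set_header("Content-Encoding", "gzip");
            // Cross-origin isolation headers required by pyodide.
            res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
            res.set_header("Cross-Origin-Opener-Policy", "same-origin");
            res.set_content(reinterpret_cast<const char *>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
        });

        svr.listen("127.0.0.1", 8080);
        return 0;
    }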