add request aggregation functionality

kalabYibeltal · kalabYibeltal · commit fb93f7053398 · 2024-12-04T11:45:40.000-05:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1279,6 +1279,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_parallel = value;
         }
     ).set_env("LLAMA_ARG_N_PARALLEL"));
+    add_opt(common_arg(
+        {"--aggregate", "-ag"},
+        string_format("apply request aggregation (default: %s)", params.aggregate ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.aggregate = true;
+        }
+    ).set_env("LLAMA_ARG_AGGREGATION"));
+    add_opt(common_arg(
+        {"-bs", "--buffer-size"}, "N",
+        string_format("buffer size if aggregation is enabled (default: %d)", params.buffer_size),
+        [](common_params & params, int value) {
+            params.buffer_size = value;
+        }
+    ).set_env("LLAMA_ARG_BUFFER_SIZE"));
+
+    add_opt(common_arg(
+        {"-bks", "--block-size"}, "N",
+        string_format("block size if aggregation is enabled and should be equal to or less than buffer_size (default: %d)", params.block_size),
+        [](common_params & params, int value) {
+            params.block_size = value;
+        }
+    ).set_env("LLAMA_ARG_BLOCK_SIZE"));
     add_opt(common_arg(
         {"-ns", "--sequences"}, "N",
         string_format("number of sequences to decode (default: %d)", params.n_sequences),
diff --git a/common/common.h b/common/common.h
@@ -191,6 +191,9 @@ struct common_params {
     float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
+    bool    aggregate             = false; // The aggregation feature essentially groups multiple requests over a specific time period before starting to process the prompts.
+    int32_t buffer_size           = 36;    // We would wait until there are buffer_size requests or 50 ms before starting to process the requests.
+    int32_t block_size            = 12;    // We group the requests in the buffer into blocks of block_size and process them as an array of prompts, similar to how /completions does.
 
     // offload params
     std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -170,6 +170,9 @@ The project is under active development, and we are [looking for feedback and co
 | `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
 | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
 | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
+| `-ag, --aggregate` | enable request aggregation (default: disabled) |
+| `-bs, --buffer-size N` | number of requests to buffer before processing starts when aggregation is enabled (default: 36) |
+| `-bks, --block-size N` | number of buffered requests processed together as one array of prompts when aggregation is enabled; must be less than or equal to the buffer size (default: 12) |
 
 
 Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp