Skip to content

Commit fb93f70

Browse files
committed
add request aggregation functionality
1 parent 59f4db1 commit fb93f70

File tree

4 files changed

+333
-28
lines changed

4 files changed

+333
-28
lines changed

common/arg.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12791279
params.n_parallel = value;
12801280
}
12811281
).set_env("LLAMA_ARG_N_PARALLEL"));
1282+
add_opt(common_arg(
1283+
{"--aggregate", "-ag"},
1284+
string_format("apply request aggregation (default: %s)", params.aggregate ? "enabled" : "disabled"),
1285+
[](common_params & params) {
1286+
params.aggregate = true;
1287+
}
1288+
).set_env("LLAMA_ARG_AGGREGATION"));
1289+
add_opt(common_arg(
1290+
{"-bs", "--buffer-size"}, "N",
1291+
string_format("buffer size if aggregation is enabled (default: %d)", params.buffer_size),
1292+
[](common_params & params, int value) {
1293+
params.buffer_size = value;
1294+
}
1295+
).set_env("LLAMA_ARG_BUFFER_SIZE"));
1296+
1297+
add_opt(common_arg(
1298+
{"-bks", "--block-size"}, "N",
1299+
string_format("block size if aggregation is enabled and should be equal to or less than buffer_size (default: %d)", params.block_size),
1300+
[](common_params & params, int value) {
1301+
params.block_size = value;
1302+
}
1303+
).set_env("LLAMA_ARG_BLOCK_SIZE"));
12821304
add_opt(common_arg(
12831305
{"-ns", "--sequences"}, "N",
12841306
string_format("number of sequences to decode (default: %d)", params.n_sequences),

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,9 @@ struct common_params {
191191
float yarn_beta_slow = 1.0f; // YaRN high correction dim
192192
int32_t yarn_orig_ctx = 0; // YaRN original context length
193193
float defrag_thold = 0.1f; // KV cache defragmentation threshold
194+
bool aggregate = false; // The aggregation feature essentially groups multiple requests over a specific time period before starting to process the prompts.
195+
int32_t buffer_size = 36; // We would wait until there are buffer_size requests or 50 ms before starting to process the requests.
196+
int32_t block_size = 12; // We group the requests in the buffer into blocks of block_size and process them as an array of prompts, similar to how /completions does.
194197

195198
// offload params
196199
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

examples/server/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,9 @@ The project is under active development, and we are [looking for feedback and co
170170
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
171171
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
172172
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
173+
| `-ag, --aggregate` | enable request aggregation (default: disabled) |
174+
| `-bs, --buffer-size N` | number of requests to collect in the buffer before processing when aggregation is enabled (default: 36) |
175+
| `-bks, --block-size N` | block size (number of buffered requests processed together as an array of prompts) when aggregation is enabled; must be less than or equal to the buffer size (default: 12) |
173176

174177

175178
Note: If both a command line argument and an environment variable are set for the same param, the argument will take precedence over the env var.

0 commit comments

Comments
 (0)