Skip to content

Commit af46dc2

Browse files
authored
Merge branch 'ggerganov:master' into k-shift2
2 parents 77afcd1 + 9e0ecfb commit af46dc2

File tree

10 files changed

+944
-941
lines changed

10 files changed

+944
-941
lines changed

examples/server/README.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
692692

693693
### GET `/slots`: Returns the current slots processing state
694694

695-
This endpoint can be disabled with `--no-slots`
695+
> [!WARNING]
696+
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
697+
698+
This endpoint is disabled by default and can be enabled with `--slots`
696699

697700
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
698701

@@ -709,6 +712,7 @@ Example:
709712
"grammar": "",
710713
"id": 0,
711714
"ignore_eos": false,
715+
"is_processing": false,
712716
"logit_bias": [],
713717
"min_p": 0.05000000074505806,
714718
"mirostat": 0,
@@ -741,7 +745,6 @@ Example:
741745
"temperature"
742746
],
743747
"seed": 42,
744-
"state": 1,
745748
"stop": [
746749
"\n"
747750
],
@@ -755,10 +758,6 @@ Example:
755758
]
756759
```
757760

758-
Possible values for `slot[i].state` are:
759-
- `0`: SLOT_STATE_IDLE
760-
- `1`: SLOT_STATE_PROCESSING
761-
762761
### GET `/metrics`: Prometheus compatible metrics exporter
763762

764763
This endpoint is only accessible if `--metrics` is set.

examples/server/server.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1568,11 +1568,11 @@ struct server_context {
15681568

15691569
for (server_slot & slot : slots) {
15701570
json slot_data = get_formated_generation(slot);
1571-
slot_data["id"] = slot.id;
1572-
slot_data["id_task"] = slot.id_task;
1573-
slot_data["state"] = slot.state;
1574-
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
1575-
slot_data["next_token"] = {
1571+
slot_data["id"] = slot.id;
1572+
slot_data["id_task"] = slot.id_task;
1573+
slot_data["is_processing"] = slot.is_processing();
1574+
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
1575+
slot_data["next_token"] = {
15761576
{"has_next_token", slot.has_next_token},
15771577
{"has_new_line", slot.has_new_line},
15781578
{"n_remain", slot.n_remaining},
@@ -1583,10 +1583,10 @@ struct server_context {
15831583
{"stopping_word", slot.stopping_word},
15841584
};
15851585

1586-
if (slot_data["state"] == SLOT_STATE_IDLE) {
1587-
n_idle_slots++;
1588-
} else {
1586+
if (slot.is_processing()) {
15891587
n_processing_slots++;
1588+
} else {
1589+
n_idle_slots++;
15901590
}
15911591

15921592
slots_data.push_back(slot_data);

examples/server/tests/features/steps/steps.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
260260
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
261261
match expected_slot_status_string:
262262
case 'idle':
263-
expected_slot_status = 0
263+
expected_slot_status = False
264264
case 'busy':
265-
expected_slot_status = 1
265+
expected_slot_status = True
266266
case _:
267267
assert False, "unknown status"
268268

269-
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
269+
expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
270270
for slot_id in range(context.n_slots)]
271271
await request_slots_status(context, expected_slots)
272272

@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
13541354
if status_code == 503 and status_code == expected_http_status_code:
13551355
return
13561356
if status_code == 200 and status_code == expected_http_status_code:
1357-
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
1358-
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
1357+
n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
1358+
n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
13591359
if ((slots_idle is None or slots_idle == n_slots_idle)
13601360
and (slots_processing is None or slots_processing == n_slots_processing)):
13611361
return

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1396,7 +1396,7 @@ if (EMSCRIPTEN)
13961396
endif()
13971397

13981398
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
1399-
target_include_directories(ggml PUBLIC ../include)
1399+
target_include_directories(ggml PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
14001400
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
14011401
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
14021402
target_compile_features (ggml PRIVATE c_std_11) # don't bump

ggml/src/ggml-cann.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
12271227

12281228
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
12291229
buffer->buft = buft;
1230-
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
12311230
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
12321231

12331232
return buffer;

ggml/src/ggml-cuda.cu

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
12971297
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
12981298
if (err != cudaErrorPeerAccessAlreadyEnabled) {
12991299
CUDA_CHECK(err);
1300+
} else {
1301+
// reset the error
1302+
cudaGetLastError();
13001303
}
13011304
} else {
13021305
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
13031306
if (err != cudaErrorPeerAccessNotEnabled) {
13041307
CUDA_CHECK(err);
1308+
} else {
1309+
// reset the error
1310+
cudaGetLastError();
13051311
}
13061312
}
13071313
}

ggml/src/ggml-metal.m

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,14 @@ @implementation GGMLMetalClass
450450
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
451451
return NULL;
452452
}
453+
454+
#if !__has_feature(objc_arc)
455+
[options release];
456+
#endif
453457
}
458+
#if GGML_METAL_EMBED_LIBRARY
459+
[src release];
460+
#endif // GGML_METAL_EMBED_LIBRARY
454461
}
455462
}
456463

0 commit comments

Comments
 (0)