Merge branch 'ggerganov:master' into k-shift2

MaggotHATE · web-flow · commit af46dc244501 · 2024-11-04T21:26:36.000+05:00
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
 
 ### GET `/slots`: Returns the current slots processing state
 
-This endpoint can be disabled with `--no-slots`
+> [!WARNING]
+> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
+
+This endpoint is disabled by default and can be enabled with `--slots`
 
 If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
 
@@ -709,6 +712,7 @@ Example:
         "grammar": "",
         "id": 0,
         "ignore_eos": false,
+        "is_processing": false,
         "logit_bias": [],
         "min_p": 0.05000000074505806,
         "mirostat": 0,
@@ -741,7 +745,6 @@ Example:
             "temperature"
         ],
         "seed": 42,
-        "state": 1,
         "stop": [
             "\n"
         ],
@@ -755,10 +758,6 @@ Example:
 ]
 ```
 
-Possible values for `slot[i].state` are:
-- `0`: SLOT_STATE_IDLE
-- `1`: SLOT_STATE_PROCESSING
-
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -1568,11 +1568,11 @@ struct server_context {
 
                     for (server_slot & slot : slots) {
                         json slot_data = get_formated_generation(slot);
-                        slot_data["id"]         = slot.id;
-                        slot_data["id_task"]    = slot.id_task;
-                        slot_data["state"]      = slot.state;
-                        slot_data["prompt"]     = common_detokenize(ctx, slot.prompt_tokens);
-                        slot_data["next_token"] = {
+                        slot_data["id"]            = slot.id;
+                        slot_data["id_task"]       = slot.id_task;
+                        slot_data["is_processing"] = slot.is_processing();
+                        slot_data["prompt"]        = common_detokenize(ctx, slot.prompt_tokens);
+                        slot_data["next_token"]    = {
                             {"has_next_token", slot.has_next_token},
                             {"has_new_line",   slot.has_new_line},
                             {"n_remain",       slot.n_remaining},
@@ -1583,10 +1583,10 @@ struct server_context {
                             {"stopping_word",  slot.stopping_word},
                         };
 
-                        if (slot_data["state"] == SLOT_STATE_IDLE) {
-                            n_idle_slots++;
-                        } else {
+                        if (slot.is_processing()) {
                             n_processing_slots++;
+                        } else {
+                            n_idle_slots++;
                         }
 
                         slots_data.push_back(slot_data);
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
@@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
     match expected_slot_status_string:
         case 'idle':
-            expected_slot_status = 0
+            expected_slot_status = False
         case 'busy':
-            expected_slot_status = 1
+            expected_slot_status = True
         case _:
             assert False, "unknown status"
 
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
+    expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
                       for slot_id in range(context.n_slots)]
     await request_slots_status(context, expected_slots)
 
@@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
                 if status_code == 503 and status_code == expected_http_status_code:
                     return
                 if status_code == 200 and status_code == expected_http_status_code:
-                    n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
-                    n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
+                    n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
+                    n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
                     if ((slots_idle is None or slots_idle == n_slots_idle)
                         and (slots_processing is None or slots_processing == n_slots_processing)):
                         return
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -1396,7 +1396,7 @@ if (EMSCRIPTEN)
 endif()
 
 target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC  ../include)
+target_include_directories(ggml PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
 target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
@@ -1227,7 +1227,6 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
 
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
     buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
 
     return buffer;
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                     cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                     if (err != cudaErrorPeerAccessAlreadyEnabled) {
                         CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        cudaGetLastError();
                     }
                 } else {
                     cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                     if (err != cudaErrorPeerAccessNotEnabled) {
                         CUDA_CHECK(err);
+                    } else {
+                        // reset the error
+                        cudaGetLastError();
                     }
                 }
             }
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
@@ -450,7 +450,14 @@ @implementation GGMLMetalClass
                     GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                     return NULL;
                 }
+
+#if !__has_feature(objc_arc)
+                [options release];
+#endif
             }
+#if GGML_METAL_EMBED_LIBRARY
+            [src release];
+#endif // GGML_METAL_EMBED_LIBRARY
         }
     }
 
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last

Original file line number	Diff line number	Diff line change
`@@ -1297,11 +1297,17 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {`
`1297`	`1297`	`cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);`
`1298`	`1298`	`if (err != cudaErrorPeerAccessAlreadyEnabled) {`
`1299`	`1299`	`CUDA_CHECK(err);`
	`1300`	`+ } else {`
	`1301`	`+ // reset the error`
	`1302`	`+ cudaGetLastError();`
`1300`	`1303`	`}`
`1301`	`1304`	`} else {`
`1302`	`1305`	`cudaError_t err = cudaDeviceDisablePeerAccess(id_other);`
`1303`	`1306`	`if (err != cudaErrorPeerAccessNotEnabled) {`
`1304`	`1307`	`CUDA_CHECK(err);`
	`1308`	`+ } else {`
	`1309`	`+ // reset the error`
	`1310`	`+ cudaGetLastError();`
`1305`	`1311`	`}`
`1306`	`1312`	`}`
`1307`	`1313`	`}`
Original file line number	Diff line number	Diff line change
`@@ -450,7 +450,14 @@ @implementation GGMLMetalClass`
`450`	`450`	`GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);`
`451`	`451`	`return NULL;`
`452`	`452`	`}`
	`453`	`+`
	`454`	`+#if !__has_feature(objc_arc)`
	`455`	`+ [options release];`
	`456`	`+#endif`
`453`	`457`	`}`
	`458`	`+#if GGML_METAL_EMBED_LIBRARY`
	`459`	`+ [src release];`
	`460`	`+#endif // GGML_METAL_EMBED_LIBRARY`
`454`	`461`	`}`
`455`	`462`	`}`
`456`	`463`