Skip to content
This repository was archived by the owner on Sep 28, 2025. It is now read-only.

Commit 46afb6f

Browse files
committed
Changed:
- `llama.cpp` revision `6152129d05870cb38162c422c6ba80434e021e9f` Fixed: - Fixed build process, json patches. - Reverted server code to previous version due to bug.
1 parent 63ca880 commit 46afb6f

14 files changed

+1434
-239
lines changed

BUILD.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ poetry run clean
1818
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch x86_64 .
1919

2020
# aarch64
21-
docker run --rm --privileged linuxkit/binfmt:v0.8
21+
docker run --rm --privileged linuxkit/binfmt:v1.0.0
2222
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch aarch64 .
2323

2424
# pyodide, pyscript, wasm (NOTE: cannot be published to PyPI)

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
# CHANGELOG
22

3+
## v0.4.17
4+
5+
Changed:
6+
- `llama.cpp` revision `6152129d05870cb38162c422c6ba80434e021e9f`
7+
8+
Fixed:
9+
- Fixed build process, json patches.
10+
- Reverted server code to the previous version due to a bug.
11+
312
## v0.4.16
413

14+
Added:
15+
- Dynamically load/unload models while executing prompts in parallel.
16+
517
Changed:
618
- `llama.cpp` revision `adc5dd92e8aea98f5e7ac84f6e1bc15de35130b5`
719

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ NOTE: Currently supported operating system is **Linux** (`manylinux_2_28` and `m
2020

2121
## News
2222

23-
- **Jan 14 2025, v0.4.14+**: Modular llama.cpp build using `cmake` build system. Deprecated `make` build system.
23+
- **Jan 15 2025, v0.4.15**: Dynamically load/unload models while executing prompts in parallel.
24+
- **Jan 14 2025, v0.4.14**: Modular llama.cpp build using `cmake` build system. Deprecated `make` build system.
2425
- **Jan 1 2025, v0.3.1**: OpenAI compatible API, **text** and **vision** models. Added support for **Qwen2-VL** models. Hot-swap of models on demand in server/API.
2526
- **Dec 9 2024, v0.2.0**: Low-level and high-level APIs: llama, llava, clip and ggml API.
2627
- **Nov 27 2024, v0.1.22**: Support for Multimodal models such as **llava** and **minicpmv**.

examples/demo_openai_load_models.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88

99
client = OpenAI(
10-
base_url = 'http://localhost:11434/v1',
10+
# base_url = 'http://localhost:11434/v1',
11+
base_url = 'http://openai.tangledlabs.com/v1',
1112
api_key='llama-cpp-cffi',
1213
)
1314

@@ -49,7 +50,7 @@ def demo_text_chat_completions_stream():
4950
# llama-cpp-cffi
5051
extra_body=dict( # type: ignore
5152
n_ctx=4 * 1024,
52-
gpu_layers=99,
53+
gpu_layers=5,
5354
predict=512,
5455
),
5556
)
@@ -98,7 +99,7 @@ def func(model):
9899
# llama-cpp-cffi
99100
extra_body=dict( # type: ignore
100101
n_ctx=4 * 1024,
101-
gpu_layers=99,
102+
gpu_layers=5,
102103
predict=512,
103104
),
104105
)

json_hpp_7.patch

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
--- llama.cpp-master/common/json.hpp 2025-01-13 19:24:51.610146960 +0100
2+
+++ llama.cpp/common/json.hpp 2025-01-22 18:47:26.842856380 +0100
3+
@@ -18,6 +18,8 @@
4+
#ifndef INCLUDE_NLOHMANN_JSON_HPP_
5+
#define INCLUDE_NLOHMANN_JSON_HPP_
6+
7+
+#ifdef __cplusplus
8+
+
9+
#include <algorithm> // all_of, find, for_each
10+
#include <cstddef> // nullptr_t, ptrdiff_t, size_t
11+
#include <functional> // hash, less
12+
@@ -24761,6 +24763,6 @@
13+
#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
14+
#undef JSON_HEDLEY_FALL_THROUGH
15+
16+
-
17+
+#endif // __cplusplus
18+
19+
#endif // INCLUDE_NLOHMANN_JSON_HPP_

json_schema_to_grammar_cpp_7.patch

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
--- llama.cpp-master/common/json-schema-to-grammar.cpp 2025-01-22 18:26:47.628379203 +0100
2+
+++ llama.cpp/common/json-schema-to-grammar.cpp 2025-01-22 18:33:43.484435269 +0100
3+
@@ -13,6 +13,18 @@
4+
5+
using json = nlohmann::ordered_json;
6+
7+
+char * llama_json_schema_to_grammar(const char * c_value) {
8+
+ std::string value(c_value);
9+
+ std::string grammar = json_schema_to_grammar(json::parse(value));
10+
+
11+
+ // Allocate memory for the result string, including space for the null terminator
12+
+ char* result = new char[grammar.length() + 1];
13+
+ std::strcpy(result, grammar.c_str());
14+
+
15+
+ // The caller is now responsible for deleting this memory
16+
+ return result;
17+
+}
18+
+
19+
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
20+
auto has_max = max_items != std::numeric_limits<int>::max();
21+

json_schema_to_grammar_h_7.patch

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
--- llama.cpp-master/common/json-schema-to-grammar.h 2025-01-22 18:26:47.628379203 +0100
2+
+++ llama.cpp/common/json-schema-to-grammar.h 2025-01-22 18:52:19.832711144 +0100
3+
@@ -5,6 +5,27 @@
4+
#define JSON_ASSERT GGML_ASSERT
5+
#include "json.hpp"
6+
7+
+#ifdef LLAMA_SHARED
8+
+# if defined(_WIN32) && !defined(__MINGW32__)
9+
+# ifdef LLAMA_BUILD
10+
+# define LLAMA_API __declspec(dllexport)
11+
+# else
12+
+# define LLAMA_API __declspec(dllimport)
13+
+# endif
14+
+# else
15+
+# define LLAMA_API __attribute__ ((visibility ("default")))
16+
+# endif
17+
+#else
18+
+# define LLAMA_API
19+
+#endif
20+
+
21+
+#ifdef __cplusplus
22+
+extern "C" {
23+
+#endif
24+
+LLAMA_API char * llama_json_schema_to_grammar(const char * c_value);
25+
+#ifdef __cplusplus
26+
+}
27+
+
28+
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema);
29+
30+
struct llama_grammar_builder {
31+
@@ -14,3 +35,5 @@
32+
};
33+
34+
std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
35+
+
36+
+#endif

0 commit comments

Comments
 (0)