
Commit 9864e0d

Merge branch 'master' into xsn/server_more_tests
2 parents 879c5eb + 266b851


15 files changed: +523 / -81 lines


.github/workflows/build.yml

Lines changed: 4 additions & 0 deletions
@@ -904,6 +904,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install Cuda Toolkit 11.7
         if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install
         id: depends

AUTHORS

Lines changed: 185 additions & 1 deletion
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 6 additions & 4 deletions
@@ -1370,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2104,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
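
As context for the new wording: `llama_supports_gpu_offload()` is the runtime check the parser consults before honoring `--gpu-layers`, and the reworded warnings hedge ("one possible reason…") because the check can also fail when no usable device is found at runtime, not only when GPU support was compiled out. A minimal sketch of calling the same check outside the parser (illustrative only; assumes a program that links against `llama.h`):

```cpp
// Minimal sketch: ask llama.cpp whether GPU offload is usable, the same
// check the argument parser performs before honoring --gpu-layers.
#include <cstdio>

#include "llama.h"

int main() {
    if (!llama_supports_gpu_offload()) {
        // No usable GPU backend: either compiled without GPU support,
        // or no device was found at runtime.
        fprintf(stderr, "warning: no usable GPU found, layers will stay on the CPU\n");
    } else {
        fprintf(stdout, "GPU offload is available\n");
    }
    return 0;
}
```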

docs/android.md

Lines changed: 2 additions & 2 deletions
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
 
 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```
 
-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
 
 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

examples/llava/clip.cpp

Lines changed: 11 additions & 4 deletions
@@ -40,10 +40,17 @@
 #include <cinttypes>
 #include <limits>
 
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
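
The pattern introduced here is a compile-time logging switch: defining `LLAVA_LOG_OFF` turns the `LOG_*` macros into no-ops so the llava/clip code can be built silently. A self-contained sketch of the same pattern (standalone example, not part of the commit):

```cpp
// Build with -DLLAVA_LOG_OFF to compile the logging calls away entirely;
// without it, the macros fall back to plain fprintf, as in clip.cpp.
#include <cstdio>

#if defined(LLAVA_LOG_OFF)
#    define LOG_INF(...)
#    define LOG_ERR(...)
#else
#    define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#    define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#endif

int main() {
    LOG_INF("loading image encoder: %s\n", "example.gguf"); // no-op when LLAVA_LOG_OFF is defined
    LOG_ERR("example error code: %d\n", 42);
    return 0;
}
```

Compiled normally, both messages print; compiled with `-DLLAVA_LOG_OFF`, the calls expand to nothing and their arguments are never evaluated.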

examples/llava/llava.cpp

Lines changed: 19 additions & 9 deletions
@@ -11,13 +11,17 @@
 #include <limits>
 #include <vector>
 
-#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
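
The second hunk replaces the `die`/`die_fmt` macros, which called `exit(1)` from inside the helper, with logging plus cleanup and a `false` return, so the caller decides how to recover. A standalone sketch of that pattern, using a hypothetical `read_all_bytes` helper shaped like `load_file_to_bytes` (illustrative, not the actual implementation):

```cpp
// Illustrative sketch: on error, log, release what was acquired, and return
// false to the caller instead of calling exit(1) from inside the helper.
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static bool read_all_bytes(const char * path, unsigned char ** bytes_out, long * size_out) {
    FILE * file = fopen(path, "rb");
    if (!file) {
        fprintf(stderr, "error: could not open %s\n", path);
        return false;
    }
    fseek(file, 0, SEEK_END);
    long file_size = ftell(file);
    fseek(file, 0, SEEK_SET);

    unsigned char * buffer = (unsigned char *) malloc(file_size);
    if (!buffer) {
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, (size_t) file_size, file);
    if (ferror(file) || ret != (size_t) file_size) {
        fprintf(stderr, "error: read failed: %s\n", strerror(errno));
        free(buffer);   // clean up everything acquired so far...
        fclose(file);
        return false;   // ...and let the caller decide what to do
    }
    fclose(file);
    *bytes_out = buffer;
    *size_out  = file_size;
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    unsigned char * data = nullptr;
    long size = 0;
    if (!read_all_bytes(argv[1], &data, &size)) {
        return 1; // the caller, not the helper, chooses to abort
    }
    printf("read %ld bytes from %s\n", size, argv[1]);
    free(data);
    return 0;
}
```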

examples/server/tests/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3

examples/server/tests/utils.py

Lines changed: 2 additions & 17 deletions
@@ -8,7 +8,6 @@
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -170,26 +169,12 @@ def start(self, timeout_seconds: int = 10) -> None:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)
 
-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
 
         # wait for server to start

examples/simple/README.md

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...
