
Commit 9864e0d

Merge branch 'master' into xsn/server_more_tests
2 parents 879c5eb + 266b851


15 files changed: +523 / -81 lines


.github/workflows/build.yml

Lines changed: 4 additions & 0 deletions
@@ -904,6 +904,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install Cuda Toolkit 11.7
         if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: Install
         id: depends

AUTHORS

Lines changed: 185 additions & 1 deletion
Large diffs are not rendered by default.

common/arg.cpp

Lines changed: 6 additions & 4 deletions
@@ -1370,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2104,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
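
As context for the new wording: `llama_supports_gpu_offload()` is the runtime check the parser consults before honoring `--gpu-layers`, and the reworded warnings hedge ("one possible reason…") because the check can also fail when no usable device is found at runtime, not only when GPU support was compiled out. A minimal sketch of calling the same check outside the parser (illustrative only; assumes a program that links against `llama.h`):

```cpp
// Minimal sketch: ask llama.cpp whether GPU offload is usable, the same
// check the argument parser performs before honoring --gpu-layers.
#include <cstdio>

#include "llama.h"

int main() {
    if (!llama_supports_gpu_offload()) {
        // No usable GPU backend: either compiled without GPU support,
        // or no device was found at runtime.
        fprintf(stderr, "warning: no usable GPU found, layers will stay on the CPU\n");
    } else {
        fprintf(stdout, "GPU offload is available\n");
    }
    return 0;
}
```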

docs/android.md

Lines changed: 2 additions & 2 deletions
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
 Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
 
 ```
-$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
+$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
 ```
 
-Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
+Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
 
 To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

examples/llava/clip.cpp

Lines changed: 11 additions & 4 deletions
@@ -40,10 +40,17 @@
 #include <cinttypes>
 #include <limits>
 
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 //#define CLIP_DEBUG_FUNCTIONS
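
The pattern introduced here is a compile-time logging switch: defining `LLAVA_LOG_OFF` turns the `LOG_*` macros into no-ops so the llava/clip code can be built silently. A self-contained sketch of the same pattern (standalone example, not part of the commit):

```cpp
// Build with -DLLAVA_LOG_OFF to compile the logging calls away entirely;
// without it, the macros fall back to plain fprintf, as in clip.cpp.
#include <cstdio>

#if defined(LLAVA_LOG_OFF)
#    define LOG_INF(...)
#    define LOG_ERR(...)
#else
#    define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#    define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#endif

int main() {
    LOG_INF("loading image encoder: %s\n", "example.gguf"); // no-op when LLAVA_LOG_OFF is defined
    LOG_ERR("example error code: %d\n", 42);
    return 0;
}
```

Compiled normally, both messages print; compiled with `-DLLAVA_LOG_OFF`, the calls expand to nothing and their arguments are never evaluated.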

examples/llava/llava.cpp

Lines changed: 19 additions & 9 deletions
@@ -11,13 +11,17 @@
 #include <limits>
 #include <vector>
 
-#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#if defined(LLAVA_LOG_OFF)
+# define LOG_INF(...)
+# define LOG_WRN(...)
+# define LOG_ERR(...)
+# define LOG_DBG(...)
+#else // defined(LLAVA_LOG_OFF)
+# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#endif // defined(LLAVA_LOG_OFF)
 
 // RGB uint8 image
 struct clip_image_u8 {
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     errno = 0;
     size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
     if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
+        LOG_ERR("read error: %s", strerror(errno));
+        free(buffer);
+        fclose(file);
+        return false;
     }
     if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
+        LOG_ERR("unexpectedly reached end of file");
+        free(buffer);
+        fclose(file);
+        return false;
     }
     fclose(file); // Close the file
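
The second hunk replaces the `die`/`die_fmt` macros, which called `exit(1)` from inside the helper, with logging plus cleanup and a `false` return, so the caller decides how to recover. A standalone sketch of that pattern, using a hypothetical `read_all_bytes` helper shaped like `load_file_to_bytes` (illustrative, not the actual implementation):

```cpp
// Illustrative sketch: on error, log, release what was acquired, and return
// false to the caller instead of calling exit(1) from inside the helper.
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static bool read_all_bytes(const char * path, unsigned char ** bytes_out, long * size_out) {
    FILE * file = fopen(path, "rb");
    if (!file) {
        fprintf(stderr, "error: could not open %s\n", path);
        return false;
    }
    fseek(file, 0, SEEK_END);
    long file_size = ftell(file);
    fseek(file, 0, SEEK_SET);

    unsigned char * buffer = (unsigned char *) malloc(file_size);
    if (!buffer) {
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, (size_t) file_size, file);
    if (ferror(file) || ret != (size_t) file_size) {
        fprintf(stderr, "error: read failed: %s\n", strerror(errno));
        free(buffer);   // clean up everything acquired so far...
        fclose(file);
        return false;   // ...and let the caller decide what to do
    }
    fclose(file);
    *bytes_out = buffer;
    *size_out  = file_size;
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    unsigned char * data = nullptr;
    long size = 0;
    if (!read_all_bytes(argv[1], &data, &size)) {
        return 1; // the caller, not the helper, chooses to abort
    }
    printf("read %ld bytes from %s\n", size, argv[1]);
    free(data);
    return 0;
}
```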

examples/server/tests/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
 pytest~=8.3.3
 huggingface_hub~=0.23.2
 numpy~=1.26.4
-openai~=1.30.3
+openai~=1.55.3
 prometheus-client~=0.20.0
 requests~=2.32.3

examples/server/tests/utils.py

Lines changed: 2 additions & 17 deletions
@@ -8,7 +8,6 @@
 import re
 import json
 import sys
-import threading
 import requests
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -170,26 +169,12 @@ def start(self, timeout_seconds: int = 10) -> None:
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=sys.stdout,
+            stderr=sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"},
         )
         server_instances.add(self)
 
-        def server_log(in_stream, out_stream):
-            for line in iter(in_stream.readline, b""):
-                print(line.decode("utf-8"), end="", file=out_stream)
-
-        thread_stdout = threading.Thread(
-            target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
-        )
-        thread_stdout.start()
-
-        thread_stderr = threading.Thread(
-            target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
-        )
-        thread_stderr.start()
-
         print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")
 
         # wait for server to start

examples/simple/README.md

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
 
 ```bash
-./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
 
 ...
