
Commit 3c8a053

Merge branch 'ggerganov:master' into server-update-JSON-response
2 parents: 220cf7f + a76c56f

24 files changed: +9556, -164 lines

.github/workflows/build.yml

Lines changed: 25 additions & 1 deletion

@@ -662,6 +662,8 @@ jobs:
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -703,6 +705,28 @@ jobs:
         run: |
          choco install ninja
 
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -732,7 +756,7 @@ jobs:
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
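For context: the headers and ICD loader installed by the new step are what the `-DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON` configuration compiles and links against. As a rough sanity check (not part of this commit, and assuming the installed `CL/cl.h` and OpenCL library are on the include and link path), a program built against them could enumerate the available platform like this:

```cpp
// Hypothetical standalone check, not from this commit: verifies that the
// OpenCL headers and ICD loader installed above are usable at runtime.
#include <CL/cl.h>
#include <cstdio>

int main() {
    cl_uint n_platforms = 0;
    // The ICD loader dispatches to whichever OpenCL driver (e.g. the Adreno
    // driver on Snapdragon devices) is registered on the system.
    if (clGetPlatformIDs(0, nullptr, &n_platforms) != CL_SUCCESS || n_platforms == 0) {
        std::printf("no OpenCL platforms found\n");
        return 1;
    }
    cl_platform_id platform = nullptr;
    clGetPlatformIDs(1, &platform, nullptr);

    char name[256] = {0};
    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(name), name, nullptr);
    std::printf("OpenCL platform: %s\n", name);
    return 0;
}
```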

README.md

Lines changed: 14 additions & 0 deletions

@@ -433,6 +433,20 @@ To learn more about model quantization, [read this documentation](examples/quant
 
     </details>
 
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
+
+    ```bash
+    llama-run granite-code
+    ```
+
+    </details>
+
+[^3]: [https://github.com/containers/ramalama](RamaLama)
 
 ## [`llama-simple`](examples/simple)

common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
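The switch from `add_definitions` to `target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)` scopes the define to the `common` target and to anything that links it, instead of applying it to every target in the build tree. A minimal sketch of code that keys off the define; the `fetch_model` function below is hypothetical and not from this commit:

```cpp
// Hypothetical sketch: any target that links the `common` library now sees
// LLAMA_USE_CURL (PUBLIC usage requirement) and can gate its download path on it.
#include <cstdio>
#include <string>

static bool fetch_model(const std::string & url, const std::string & path) {
#ifdef LLAMA_USE_CURL
    // A real implementation would download `url` to `path` with libcurl here.
    std::printf("downloading %s -> %s via libcurl\n", url.c_str(), path.c_str());
    return true;
#else
    std::printf("built without libcurl; cannot download %s\n", url.c_str());
    return false;
#endif
}

int main() {
    return fetch_model("https://example.com/model.gguf", "model.gguf") ? 0 : 1;
}
```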

common/common.cpp

Lines changed: 0 additions & 6 deletions

@@ -1076,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;

common/common.h

Lines changed: 8 additions & 3 deletions

@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
 
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
@@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
     return parts;
 }
 
+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
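The `starts_with` helper removed from `common.cpp` above reappears here as `string_starts_with`, so any code that includes `common.h` (for example `llama-run`, which now links `common`) can reuse it. A small usage sketch; the `main` below is illustrative and not part of the commit:

```cpp
#include <cstdio>
#include <string>

// Same helper as added to common/common.h in this commit.
static bool string_starts_with(const std::string & str,
                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
}

int main() {
    const std::string model = "ollama://granite-code";
    // rfind(prefix, 0) can only match at position 0, so this is a prefix
    // check rather than a full substring search.
    std::printf("%d\n", string_starts_with(model, "ollama://")); // 1
    std::printf("%d\n", string_starts_with(model, "hf://"));     // 0
}
```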

examples/run/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 set(TARGET llama-run)
 add_executable(${TARGET} run.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
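Linking `common` in addition to `llama` is what gives `llama-run` access to the shared helpers added above, such as `string_starts_with`; and because `LLAMA_USE_CURL` is now a PUBLIC compile definition on `common`, `llama-run` inherits it automatically in curl-enabled builds.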

examples/run/README.md

Lines changed: 41 additions & 1 deletion

@@ -3,5 +3,45 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
 
 ```bash
-./llama-run Meta-Llama-3.1-8B-Instruct.gguf
+llama-run granite-code
+```
+
+```bash
+llama-run -h
+Description:
+  Runs a llm
+
+Usage:
+  llama-run [options] model [prompt]
+
+Options:
+  -c, --context-size <value>
+      Context size (default: 2048)
+  -n, --ngl <value>
+      Number of GPU layers (default: 0)
+  -h, --help
+      Show help message
+
+Commands:
+  model
+      Model is a string with an optional prefix of
+      huggingface:// (hf://), ollama://, https:// or file://.
+      If no protocol is specified and a file exists in the specified
+      path, file:// is assumed, otherwise if a file does not exist in
+      the specified path, ollama:// is assumed. Models that are being
+      pulled are downloaded with .partial extension while being
+      downloaded and then renamed as the file without the .partial
+      extension when complete.
+
+Examples:
+  llama-run llama3
+  llama-run ollama://granite-code
+  llama-run ollama://smollm:135m
+  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
+  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
+  llama-run https://example.com/some-file1.gguf
+  llama-run some-file2.gguf
+  llama-run file://some-file3.gguf
+  llama-run --ngl 99 some-file4.gguf
+  llama-run --ngl 99 some-file5.gguf Hello World
 ...
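The resolution rule described under `Commands:` (an explicit `huggingface://`/`hf://`, `ollama://`, `https://` or `file://` prefix is honored; otherwise `file://` is assumed when the path exists locally, else `ollama://`) can be sketched as follows. This is only an illustration of the documented behaviour, not the actual `run.cpp` implementation, and `resolve_model_protocol` is a hypothetical name:

```cpp
#include <filesystem>
#include <iostream>
#include <string>

// Illustrative sketch of the documented model-resolution rule (hypothetical name).
static std::string resolve_model_protocol(const std::string & model) {
    static const std::string known_prefixes[] = {
        "huggingface://", "hf://", "ollama://", "https://", "file://",
    };
    for (const std::string & prefix : known_prefixes) {
        if (model.rfind(prefix, 0) == 0) {   // same prefix check as string_starts_with
            return model;                    // explicit protocol: use as-is
        }
    }
    // No protocol given: a local file wins, otherwise fall back to the Ollama registry.
    if (std::filesystem::exists(model)) {
        return "file://" + model;
    }
    return "ollama://" + model;
}

int main() {
    std::cout << resolve_model_protocol("some-file2.gguf") << '\n';  // file://... if it exists locally
    std::cout << resolve_model_protocol("granite-code")    << '\n';  // ollama://granite-code
    std::cout << resolve_model_protocol("https://example.com/some-file1.gguf") << '\n';
}
```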
