Merged

Changes from all commits (19 commits)
3ad5451
Add some minimal optimizations for CDNA (#10498)
IMbackK Nov 27, 2024
9f91251
common : fix duplicated file name with hf_repo and hf_file (#10550)
ngxson Nov 27, 2024
b742013
CANN: ROPE operator optimization (#10540)
noemotiovon Nov 28, 2024
605fa66
CANN: Fix SOC_TYPE compile bug (#10519)
leo-pony Nov 28, 2024
c6bc739
CANN: Update cann.md to display correctly in CLion (#10538)
HRXWEB Nov 28, 2024
2025fa6
kompute : improve backend to pass test_backend_ops (#10542)
slp Nov 28, 2024
c202cef
ggml-cpu: support IQ4_NL_4_4 by runtime repack (#10541)
FanShupei Nov 28, 2024
eea986f
cmake : fix ARM feature detection (#10543)
ggerganov Nov 28, 2024
76b27d2
ggml : fix row condition for i8mm kernels (#10561)
ggerganov Nov 28, 2024
e90688e
ci : fix tag name in cuda and hip releases (#10566)
slaren Nov 28, 2024
7281cf1
docs: fix outdated usage of llama-simple (#10565)
rand-fly Nov 28, 2024
8907193
common: fix warning message when no GPU found (#10564)
JohannesGaessler Nov 28, 2024
6c59567
server : (tests) don't use thread for capturing stdout/stderr, bump o…
ngxson Nov 28, 2024
4c0a95b
llama : add missing model types
ggerganov Nov 28, 2024
dc22344
ggml : remove redundant copyright notice + update authors
ggerganov Nov 28, 2024
678d799
llava: return false instead of exit (#10546)
tinglou Nov 29, 2024
f095a64
vulkan: get the first command buffer submitted sooner (#10499)
jeffbolznv Nov 29, 2024
938f608
CANN: RoPE operator optimization (#10563)
noemotiovon Nov 29, 2024
266b851
sycl : Reroute permuted mul_mats through oneMKL (#10408)
Alcpz Nov 29, 2024
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
@@ -904,6 +904,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
@@ -1139,6 +1141,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install
id: depends
186 changes: 185 additions & 1 deletion AUTHORS

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions common/arg.cpp
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename);
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
@@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
@@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
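To illustrate the new cache-file naming, here is a minimal standalone sketch; `replace_all` stands in for the project's `string_replace_all`, and the repo/file names are only examples:

```cpp
#include <iostream>
#include <string>

// stand-in for common's string_replace_all
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string hf_repo = "ggml-org/models";                              // example repo
    std::string hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"; // example file in a subdir

    // prefix the file name with the repo so two repos that ship the same
    // file name (or the same name in different subdirs) no longer collide
    std::string filename = hf_repo + "_" + hf_file;
    replace_all(filename, "/", "_"); // the cached file name must not contain slashes

    // fs_get_cache_file() would prepend the llama.cpp cache directory here
    std::cout << filename << "\n";
    // prints: ggml-org_models_jina-reranker-v1-tiny-en_ggml-model-f16.gguf
}
```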
54 changes: 27 additions & 27 deletions common/common.cpp
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = nullptr;

if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
}

struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}

if (!common_download_file(model_url, path_model, hf_token)) {
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}

@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}

@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}

if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
}
}

return llama_load_model_from_file(path_model, params);
return llama_load_model_from_file(local_path.c_str(), params);
}

struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
model_url += remote_path;

return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
return common_load_model_from_url(model_url, local_path, hf_token, params);
}

#else

struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}

struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
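As a side note on the split-model handling above, a rough sketch of how the `llama_split_prefix` / `llama_split_path` helpers fit together; the shard name and count are made up, and this assumes the usual `-%05d-of-%05d.gguf` naming convention:

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // illustrative first shard of a hypothetical 8-way split model
    const char * first_shard = "models/example-q4_0-00001-of-00008.gguf";
    const int    n_split     = 8;

    // recover the common prefix from the first shard's name
    char prefix[512];
    if (!llama_split_prefix(prefix, sizeof(prefix), first_shard, 0, n_split)) {
        fprintf(stderr, "unexpected split file name: %s\n", first_shard);
        return 1;
    }

    // rebuild the names of the remaining shards, as the downloader does
    char path[512];
    for (int idx = 1; idx < n_split; ++idx) {
        llama_split_path(path, sizeof(path), prefix, idx, n_split);
        printf("would also fetch: %s\n", path);
    }
    return 0;
}
```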
13 changes: 11 additions & 2 deletions common/common.h
@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
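For callers, the new prototypes read roughly like this; the repo, file names, and paths below are placeholders, not values from this PR:

```cpp
#include <string>
#include "common.h"
#include "llama.h"

// Sketch of the post-change call shape: plain std::string arguments,
// with an empty string where a caller previously passed NULL.
static llama_model * load_example_model() {
    llama_model_params mparams = llama_model_default_params();

    const std::string repo       = "some-org/some-model-GGUF";      // placeholder
    const std::string remote     = "some-model-Q4_K_M.gguf";        // placeholder
    const std::string local_path = "models/some-model-Q4_K_M.gguf"; // placeholder
    const std::string hf_token   = "";                              // optional

    return common_load_model_from_hf(repo, remote, local_path, hf_token, mparams);
}
```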
4 changes: 2 additions & 2 deletions docs/android.md
@@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:

```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```

Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

4 changes: 4 additions & 0 deletions docs/backend/CANN.md
@@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

## News

- 2024.11
- Support F16 and F32 data type model for Ascend 310P NPU.
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
@@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU

**Verified devices**

| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |

*Notes:*

15 changes: 11 additions & 4 deletions examples/llava/clip.cpp
@@ -40,10 +40,17 @@
#include <cinttypes>
#include <limits>

#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

//#define CLIP_DEBUG_FUNCTIONS

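The same gating pattern in isolation, for reference; whether the build system exposes a dedicated option for it is not shown here, but defining `LLAVA_LOG_OFF` on the compiler command line is enough to silence these macros:

```cpp
#include <cstdio>

// Compile with e.g. `g++ -DLLAVA_LOG_OFF demo.cpp` and the log macros
// expand to nothing; without the define they print as before.
#if defined(LLAVA_LOG_OFF)
#  define LOG_INF(...)
#  define LOG_ERR(...)
#else
#  define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#  define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#endif

int main() {
    LOG_INF("loaded projector: %s\n", "mmproj-example.gguf"); // illustrative file name
    LOG_ERR("this line is also compiled out under LLAVA_LOG_OFF\n");
    return 0;
}
```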
28 changes: 19 additions & 9 deletions examples/llava/llava.cpp
@@ -11,13 +11,17 @@
#include <limits>
#include <vector>

#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

// RGB uint8 image
struct clip_image_u8 {
@@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) {
die_fmt("read error: %s", strerror(errno));
LOG_ERR("read error: %s", strerror(errno));
free(buffer);
fclose(file);
return false;
}
if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file");
LOG_ERR("unexpectedly reached end of file");
free(buffer);
fclose(file);
return false;
}
fclose(file); // Close the file

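A condensed sketch of the new failure path in `load_file_to_bytes`-style helpers: report the error, release the buffer and file handle, and return `false` so the caller can recover, where the old `die()`/`die_fmt()` macros terminated the whole process. The helper and file name below are illustrative, not the actual llava code:

```cpp
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static bool read_all_bytes(const char * path, unsigned char ** bytes_out, long * size_out) {
    FILE * file = fopen(path, "rb");
    if (!file) {
        fprintf(stderr, "failed to open %s\n", path);
        return false;
    }
    fseek(file, 0, SEEK_END);
    long file_size = ftell(file);
    fseek(file, 0, SEEK_SET);
    if (file_size < 0) { fclose(file); return false; }

    unsigned char * buffer = (unsigned char *) malloc(file_size);
    if (!buffer) { fclose(file); return false; }

    errno = 0;
    size_t ret = fread(buffer, 1, file_size, file);
    if (ferror(file) || ret != (size_t) file_size) {
        // previously: die()/die_fmt() would call exit(1) right here
        fprintf(stderr, "read error: %s\n", strerror(errno));
        free(buffer);
        fclose(file);
        return false;
    }
    fclose(file);
    *bytes_out = buffer;
    *size_out  = file_size;
    return true;
}

int main() {
    unsigned char * data = nullptr;
    long n = 0;
    if (!read_all_bytes("mmproj-example.gguf", &data, &n)) { // illustrative path
        fprintf(stderr, "caller can fall back or report failure instead of aborting\n");
        return 1;
    }
    free(data);
    return 0;
}
```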
2 changes: 1 addition & 1 deletion examples/server/tests/requirements.txt
@@ -2,6 +2,6 @@ aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub~=0.23.2
numpy~=1.26.4
openai~=1.30.3
openai~=1.55.3
prometheus-client~=0.20.0
requests~=2.32.3
20 changes: 2 additions & 18 deletions examples/server/tests/utils.py
@@ -8,7 +8,6 @@
import re
import json
import sys
import threading
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -161,26 +160,12 @@ def start(self, timeout_seconds: int = 10) -> None:
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdout=sys.stdout,
stderr=sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"},
)
server_instances.add(self)

def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b""):
print(line.decode("utf-8"), end="", file=out_stream)

thread_stdout = threading.Thread(
target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
)
thread_stdout.start()

thread_stderr = threading.Thread(
target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
)
thread_stderr.start()

print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")

# wait for server to start
@@ -319,7 +304,6 @@ def jina_reranker_tiny() -> ServerProcess:
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
server.model_alias = "jina-reranker"
server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
server.n_ctx = 512
server.n_batch = 512
server.n_slots = 1
2 changes: 1 addition & 1 deletion examples/simple/README.md
@@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.

```bash
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"

...

1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
@@ -91,6 +91,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
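A quick way to see what the feature detection reports, using only functions declared in this header; the new `ggml_cpu_has_dotprod()` covers the ARM dot-product extension (SDOT/UDOT), presumably in support of the repacked formats added in this sync:

```cpp
#include <cstdio>
#include "ggml-cpu.h"

int main() {
    // each function returns a non-zero value when the feature is available to ggml
    printf("NEON:        %d\n", ggml_cpu_has_neon());
    printf("DOTPROD:     %d\n", ggml_cpu_has_dotprod());
    printf("MATMUL_INT8: %d\n", ggml_cpu_has_matmul_int8());
    printf("SVE:         %d\n", ggml_cpu_has_sve());
    return 0;
}
```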
3 changes: 3 additions & 0 deletions ggml/include/ggml.h
@@ -389,6 +389,9 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
};

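The new entry is visible straight from the public enum; a tiny check follows (the `ggml_type_name` call assumes the type-traits table has a name registered for the new type, which is expected but not shown in this diff):

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // IQ4_NL_4_4 takes id 36; the 4_8 and 8_8 variants are reserved but commented out
    printf("GGML_TYPE_IQ4_NL_4_4 = %d\n", (int) GGML_TYPE_IQ4_NL_4_4);
    printf("GGML_TYPE_COUNT      = %d\n", (int) GGML_TYPE_COUNT);
    printf("name                 = %s\n", ggml_type_name(GGML_TYPE_IQ4_NL_4_4));
    return 0;
}
```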
7 changes: 4 additions & 3 deletions ggml/src/ggml-cann/CMakeLists.txt
@@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
else()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif()

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)

if (CANN_INSTALL_DIR)
# Only Support Linux.