
Commit 69c97bb

Merge branch 'ggerganov:master' into master
2 parents: 42abdd0 + bd35cb0


50 files changed: +712 / -343 lines

.github/workflows/build.yml

Lines changed: 8 additions & 8 deletions

@@ -375,7 +375,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Dependencies
         id: depends
@@ -401,7 +401,7 @@ jobs:
     continue-on-error: true

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

       - name: add oneAPI to apt
         shell: bash
@@ -442,7 +442,7 @@ jobs:
     continue-on-error: true

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

       - name: add oneAPI to apt
         shell: bash
@@ -546,7 +546,7 @@ jobs:
     steps:
      - name: Clone
        id: checkout
-       uses: actions/checkout@v1
+       uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -576,7 +576,7 @@ jobs:
     steps:
      - name: Clone
        id: checkout
-       uses: actions/checkout@v1
+       uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -610,7 +610,7 @@ jobs:
     steps:
      - name: Clone
        id: checkout
-       uses: actions/checkout@v1
+       uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -969,14 +969,14 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Install
         id: depends
         run: |
           $ErrorActionPreference = "Stop"
           write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
           write-host "Installing AMD HIP SDK"
           Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
           write-host "Completed AMD HIP SDK installation"

.github/workflows/server.yml

Lines changed: 1 addition & 0 deletions

@@ -173,6 +173,7 @@ jobs:
       if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
       run: |
         cd examples/server/tests
+        $env:PYTHONIOENCODING = ":replace"
         behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

     - name: Slow tests

CMakeLists.txt

Lines changed: 7 additions & 1 deletion

@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
+set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)

Makefile

Lines changed: 7 additions & 3 deletions

@@ -434,7 +434,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 # Use all CPU extensions that are available:
@@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 MK_CXXFLAGS += -mlasx
 endif

-else
+ifneq ($(filter riscv64%,$(UNAME_M)),)
+MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+else # RISC-V CROSS COMPILATION
 MK_CFLAGS += -march=rv64gcv -mabi=lp64d
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -1454,7 +1459,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	./llama-gen-docs

 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \

README.md

Lines changed: 1 addition & 0 deletions

@@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

common/arg.cpp

Lines changed: 15 additions & 22 deletions

@@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
     std::string arg;
     const std::string arg_prefix = "--";
     gpt_params & params = ctx_arg.params;
-    gpt_sampler_params & sparams = params.sparams;

     std::unordered_map<std::string, llama_arg *> arg_to_options;
     for (auto & opt : ctx_arg.options) {
@@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
         params.kv_overrides.back().key[0] = 0;
     }

-    if (sparams.seed == LLAMA_DEFAULT_SEED) {
-        sparams.seed = time(NULL);
-    }
-
     return true;
 }

@@ -823,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.special = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
         format(
@@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_sparam());
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed),
+        format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
         [](gpt_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
@@ -1422,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 params.split_mode = LLAMA_SPLIT_MODE_NONE;
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            }
-            else if (arg_next == "row") {
+            } else if (arg_next == "row") {
 #ifdef GGML_USE_SYCL
                 fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
                 exit(1);
 #endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
-            }
-            else {
+            } else {
                 throw std::invalid_argument("invalid value");
             }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
@@ -1455,24 +1448,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
                 if (i < split_arg.size()) {
-                   params.tensor_split[i] = std::stof(split_arg[i]);
+                    params.tensor_split[i] = std::stof(split_arg[i]);
                 } else {
-                   params.tensor_split[i] = 0.0f;
+                    params.tensor_split[i] = 0.0f;
                 }
             }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
         [](gpt_params & params, int value) {
             params.main_gpu = value;
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
+            }
         }
     ));
     add_opt(llama_arg(
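
Aside (not part of the diff): the hunks above replace compile-time #ifndef GGML_USE_CUDA_SYCL_VULKAN guards with a runtime query of the public API. A minimal sketch of that pattern, assuming only llama_supports_gpu_offload() from llama.h; the helper name warn_if_no_gpu_offload is invented here for illustration and is not in the commit.

// Sketch only: mirrors the runtime check the commit switches to.
#include <cstdio>

#include "llama.h"

// Hypothetical helper: emit the same style of warning for any GPU-related
// CLI setting when the current build cannot offload work to a GPU backend.
static void warn_if_no_gpu_offload(const char * setting) {
    if (!llama_supports_gpu_offload()) {
        fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting %s has no effect.\n", setting);
    }
}

The upside of the runtime check is that the warning no longer depends on a hand-maintained list of backend defines; any build that reports GPU offload support stays silent automatically.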

common/common.cpp

Lines changed: 33 additions & 16 deletions

@@ -56,14 +56,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUDA_SYCL
-#endif
-
-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUDA_SYCL_VULKAN
-#endif
-
 #if defined(LLAMA_USE_CURL)
 #ifdef __linux__
 #include <linux/limits.h>
@@ -949,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p

 #ifdef LLAMA_USE_CURL

+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+
 static bool starts_with(const std::string & str, const std::string & prefix) {
     // While we wait for C++20's std::string::starts_with...
     return str.rfind(prefix, 0) == 0;
 }

+static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+    return false;
+}
+
 static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {

     // Initialize libcurl
@@ -1057,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
     curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

-    CURLcode res = curl_easy_perform(curl.get());
-    if (res != CURLE_OK) {
-        fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    if (!was_perform_successful) {
         return false;
     }

@@ -1134,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     };

     // start the download
-    fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-        llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-    auto res = curl_easy_perform(curl.get());
-    if (res != CURLE_OK) {
-        fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
+    fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+        llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+    if (!was_perform_successful) {
         return false;
     }

@@ -1812,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
     fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
     fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
     fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
     fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");

common/sampling.cpp

Lines changed: 4 additions & 0 deletions

@@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }

+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {

common/sampling.h

Lines changed: 2 additions & 0 deletions

@@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers

 // access the internal list of current candidate tokens
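
Aside (illustrative only): gpt_sampler_get_seed forwards to llama_sampler_get_seed, so callers can read back the seed the sampler chain actually uses, presumably so it can be reported even when it was picked at random. A minimal caller sketch, assuming a gpt_sampler constructed elsewhere; report_seed is a hypothetical function, not part of the commit.

// Sketch: report the effective sampling seed via the new accessor.
#include <cstdint>
#include <cstdio>

#include "sampling.h" // common/sampling.h

void report_seed(const struct gpt_sampler * gsmpl) {
    const uint32_t seed = gpt_sampler_get_seed(gsmpl); // accessor added in this commit
    std::printf("sampling seed: %u\n", seed);
}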

convert_hf_to_gguf.py

Lines changed: 7 additions & 0 deletions

@@ -302,6 +302,8 @@ def prepare_tensors(self):
                     gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                     gguf.MODEL_TENSOR.TIME_MIX_W1,
                     gguf.MODEL_TENSOR.TIME_MIX_W2,
+                    gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                    gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                 )
             )
             or not new_name.endswith(".weight")
@@ -624,6 +626,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"

         if res is None:
             logger.warning("\n")
@@ -2769,6 +2774,8 @@ def set_vocab(self):
         self.gguf_writer.add_tokenizer_model("rwkv")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
