
Commit b9f0aad

Merge branch 'ggerganov:master' into sparkle_master_fix
2 parents: 809393c + 8d59d91

File tree: 17 files changed (+159, -43 lines)

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1237,7 +1237,7 @@ jobs:

       - name: Create release
         id: create_release
-        uses: anzz1/action-create-release@v1
+        uses: ggml-org/action-create-release@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
```

.github/workflows/docker.yml

Lines changed: 1 addition & 2 deletions

```diff
@@ -97,10 +97,9 @@ jobs:
       GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
       GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
       - name: Free Disk Space (Ubuntu)
         if: ${{ matrix.config.free_disk_space == true }}
-        uses: jlumbroso/[email protected]
+        uses: ggml-org/[email protected]
         with:
           # this might remove tools that are actually needed,
           # if set to "true" but frees about 6 GB
```

convert_lora_to_gguf.py

Lines changed: 31 additions & 3 deletions

```diff
@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name


@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -290,6 +297,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:

     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         lparams: dict[str, Any] = json.load(f)

     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")
@@ -371,11 +382,16 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                is_lora_a = ".lora_A.weight" in name
-                is_lora_b = ".lora_B.weight" in name
+                # note: mergekit-extract-lora also adds token embeddings to the adapter
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
+                    # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                    if "_layernorm" in name or ".norm" in name:
+                        yield (base_name, tensor)
+                        continue
                     logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                     if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                         logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
@@ -407,9 +423,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 if name == "lm_head.weight" and len(dest) == 0:
                     raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
+                    # mergekit-extract-lora add these layernorm to the adapter
+                    if "_norm" in dest_name:
+                        assert dest_data.dim() == 1
+                        yield (dest_name, dest_data)
+                        continue
+
+                    # otherwise, we must get the lora_A and lora_B tensors
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()

+                    # note: mergekit-extract-lora flip and transpose A and B
+                    # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                    if "token_embd.weight" in dest_name:
+                        lora_a = lora_a.T
+
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
```
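Together with the new `--base-model-id` flag, these changes let adapters produced by mergekit-extract-lora be converted even when the base model only exists on the Hugging Face hub. For reference, a minimal sketch of the name mapping the updated `get_base_tensor_name()` performs (the function body restates the diff above; the input tensor names are hypothetical examples, not taken from the commit):

```python
# Standalone restatement of get_base_tensor_name() from the diff above;
# the example tensor names below are hypothetical, for illustration only.
def get_base_tensor_name(lora_tensor_name: str) -> str:
    base_name = lora_tensor_name.replace("base_model.model.", "")
    base_name = base_name.replace(".lora_A.weight", ".weight")
    base_name = base_name.replace(".lora_B.weight", ".weight")
    # mergekit-extract-lora stores token embeddings in the adapter
    base_name = base_name.replace(".lora_embedding_A", ".weight")
    base_name = base_name.replace(".lora_embedding_B", ".weight")
    return base_name

# A regular PEFT tensor and a mergekit-extract-lora embedding tensor
# both resolve to the same base-model weight name:
print(get_base_tensor_name("base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"))
# model.layers.0.self_attn.q_proj.weight
print(get_base_tensor_name("base_model.model.model.embed_tokens.lora_embedding_A"))
# model.embed_tokens.weight
```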

examples/run/run.cpp

Lines changed: 60 additions & 3 deletions

```diff
@@ -11,6 +11,8 @@
 #    include <curl/curl.h>
 #endif

+#include <signal.h>
+
 #include <climits>
 #include <cstdarg>
 #include <cstdio>
@@ -25,6 +27,13 @@
 #include "json.hpp"
 #include "llama-cpp.h"

+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+[[noreturn]] static void sigint_handler(int) {
+    printf("\n");
+    exit(0); // not ideal, but it's the only way to guarantee exit in all cases
+}
+#endif
+
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static std::string fmt(const char * fmt, ...) {
     va_list ap;
@@ -801,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str

 static int read_user_input(std::string & user) {
     std::getline(std::cin, user);
-    return user.empty(); // Should have data in happy path
+    if (std::cin.eof()) {
+        printf("\n");
+        return 1;
+    }
+
+    if (user == "/bye") {
+        return 1;
+    }
+
+    if (user.empty()) {
+        return 2;
+    }
+
+    return 0; // Should have data in happy path
 }

 // Function to generate a response based on the prompt
@@ -868,15 +890,34 @@ static bool is_stdout_a_terminal() {
 #endif
 }

-// Function to tokenize the prompt
+// Function to handle user input
+static int get_user_input(std::string & user_input, const std::string & user) {
+    while (true) {
+        const int ret = handle_user_input(user_input, user);
+        if (ret == 1) {
+            return 1;
+        }
+
+        if (ret == 2) {
+            continue;
+        }
+
+        break;
+    }
+
+    return 0;
+}
+
+// Main chat loop function
 static int chat_loop(LlamaData & llama_data, const std::string & user) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
     static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
-        while (handle_user_input(user_input, user)) {
+        if (get_user_input(user_input, user) == 1) {
+            return 0;
         }

         add_message("user", user.empty() ? user_input : user, llama_data);
@@ -917,7 +958,23 @@ static std::string read_pipe_data() {
     return result.str();
 }

+static void ctrl_c_handling() {
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = sigint_handler;
+    sigemptyset(&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined(_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+}
+
 int main(int argc, const char ** argv) {
+    ctrl_c_handling();
     Opt opt;
     const int ret = opt.init(argc, argv);
     if (ret == 2) {
```
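The refactor gives the input path a small return-code contract: 1 means end the session (EOF or "/bye"), 2 means re-prompt on empty input, and 0 means input was received, with `get_user_input()` looping until it sees 1 or 0. A minimal Python sketch of that same contract (illustration only, not part of the commit):

```python
# Illustration of the return-code contract used by the refactored input
# helpers in examples/run/run.cpp: 1 = end the session (EOF or "/bye"),
# 2 = re-prompt (empty line), 0 = input received.
import sys

def read_user_input() -> tuple[int, str]:
    line = sys.stdin.readline()
    if line == "":          # EOF: end the session
        print()
        return 1, ""
    user = line.rstrip("\n")
    if user == "/bye":      # explicit exit command: end the session
        return 1, ""
    if user == "":          # empty input: caller should re-prompt
        return 2, ""
    return 0, user          # happy path

def get_user_input() -> str | None:
    while True:
        ret, user = read_user_input()
        if ret == 1:
            return None     # chat loop exits cleanly
        if ret == 2:
            continue        # keep prompting until non-empty input
        return user
```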

ggml/src/ggml-cuda/concat.cu

Lines changed: 1 addition & 1 deletion

```diff
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
         uint64_t nb1,
         uint64_t nb2,
         uint64_t nb3){
-    static_assert(dim >= 0 && dim <= 3);
+    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
```

gguf-py/README.md

Lines changed: 4 additions & 4 deletions

```diff
@@ -15,13 +15,13 @@ pip install gguf

 [examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.

-[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
+[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.

-[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.

-[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
+[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.

-[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
+[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.

 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:
```
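Besides those command-line scripts, the same metadata can be read programmatically through the gguf package's `GGUFReader`. A minimal sketch, similar in spirit to what gguf_dump.py does (the "model.gguf" path is a placeholder, and only the commonly used `fields`/`tensors` attributes are shown):

```python
# Minimal sketch: list the metadata keys and tensors of a GGUF file with
# gguf.GGUFReader. "model.gguf" is a placeholder path, not a file from
# this repository.
from gguf import GGUFReader

reader = GGUFReader("model.gguf")
for name, field in reader.fields.items():
    print(name, field.types)           # key and its GGUF value type(s)
for tensor in reader.tensors:
    print(tensor.name, tensor.shape)   # tensor name and shape
```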
3 files renamed without changes.
