
Commit 69dd1e8

llama : quant (cont)

ggml-ci

1 parent e06d267

File tree

5 files changed: +37 -45 lines

src/llama-adapter.cpp

Lines changed: 16 additions & 1 deletion
@@ -149,7 +149,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }
 
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -317,3 +317,18 @@ void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
+
+struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
+    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+
+    try {
+        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        return adapter;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+        delete adapter;
+    }
+
+    return nullptr;
+}
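For context, the public entry point above wraps the now-static llama_lora_adapter_init_impl and returns nullptr when loading throws. A minimal caller sketch under that contract (file paths are hypothetical; the surrounding llama.h calls are the usual model lifecycle API):

// Hypothetical usage sketch: load a model, attach a LoRA adapter, and rely on
// the nullptr return shown above for error handling.
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical path
    if (model == nullptr) {
        return 1;
    }

    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf"); // hypothetical path
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n"); // details were already logged by the library
        llama_free_model(model);
        return 1;
    }

    // ... create a context and apply the adapter with llama_lora_adapter_set(ctx, adapter, 1.0f) ...

    llama_lora_adapter_free(adapter);
    llama_free_model(model);
    return 0;
}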

src/llama-adapter.h

Lines changed: 0 additions & 2 deletions
@@ -64,5 +64,3 @@ struct llama_lora_adapter {
 
     llama_lora_weight * get_weight(struct ggml_tensor * w);
 };
-
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter);

src/llama-quant.cpp

Lines changed: 19 additions & 1 deletion
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -892,6 +892,10 @@ void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     }
 }
 
+//
+// interface implementation
+//
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread =*/ 0,
@@ -909,3 +913,17 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
     return result;
 }
+
+uint32_t llama_model_quantize(
+        const char * fname_inp,
+        const char * fname_out,
+        const llama_model_quantize_params * params) {
+    try {
+        llama_model_quantize_internal(fname_inp, fname_out, params);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+        return 1;
+    }
+
+    return 0;
+}
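Since the public wrapper above returns 0 on success and 1 on a caught exception, callers can treat it as a plain status code. A minimal sketch (input/output file names are hypothetical; the ftype and nthread fields appear in the diff above):

// Hypothetical usage sketch: quantize an f16 GGUF file down to Q4_K_M.
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    params.nthread = 4;                         // worker threads for quantization

    // hypothetical input/output paths
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed\n"); // the cause was already logged by the library
        return 1;
    }
    return 0;
}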

src/llama-quant.h

Lines changed: 0 additions & 6 deletions
@@ -1,7 +1 @@
 #pragma once
-
-#include <string>
-
-struct llama_model_quantize_params;
-
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params);

src/llama.cpp

Lines changed: 2 additions & 35 deletions
@@ -41,17 +41,13 @@
 #endif
 
 //
-// helpers
+// tensor loading (TODO: add llama_tesor_loader?)
 //
 
 static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
-//
-// model loading and saving
-//
-
 // checks if the weight tensor can be used with the specified buffer type and device
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
@@ -11319,21 +11315,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     }
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
-
-    try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
-        return adapter;
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-
-        delete adapter;
-    }
-
-    return nullptr;
-}
-
 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
@@ -11585,6 +11566,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
+
         llama_free_model(model);
         return nullptr;
     }
@@ -11943,20 +11925,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-uint32_t llama_model_quantize(
-        const char * fname_inp,
-        const char * fname_out,
-        const llama_model_quantize_params * params) {
-    try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
-        return 1;
-    }
-
-    return 0;
-}
-
 //
 // kv cache
 //
@@ -12343,4 +12311,3 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_eval_us = ctx->n_eval = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
-
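Taken together, the removals here and the additions in src/llama-adapter.cpp and src/llama-quant.cpp follow one pattern: each module keeps a static, exception-throwing *_impl and exposes a public wrapper that catches std::exception at the API boundary and converts it to an error value. A generic sketch of that boundary, with hypothetical names:

// Illustrative sketch of the exception boundary used by both wrappers above.
// do_work_impl and my_api_do_work are hypothetical names, not part of the commit.
#include <cstdint>
#include <cstdio>
#include <stdexcept>

static void do_work_impl(const char * arg) {
    if (arg == nullptr) {
        throw std::runtime_error("missing argument"); // internal code is free to throw
    }
    // ... actual work ...
}

// public entry point: never lets an exception cross the API boundary
uint32_t my_api_do_work(const char * arg) {
    try {
        do_work_impl(arg);
    } catch (const std::exception & err) {
        fprintf(stderr, "%s: failed: %s\n", __func__, err.what());
        return 1; // non-zero signals failure, mirroring llama_model_quantize
    }
    return 0;
}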
