Skip to content

Commit 7992aa7

Browse files
bartowski1182, ggerganov and CISC
authored
tests : add unit test coverage for llama_tensor_get_type (#20112)
* Add unit test coverage for llama_tensor_get_type * Fix merge conflicts, add more schemas * clang formatter changes * Trailing whitespace * Update name * Start rebase * Updating files with upstream changes prior to rebase * Changes needed from rebase * Update attn_qkv schema, change throw behaviour * Fix merge conflicts * White space * Update with latest changes to state counters * Revert accidental personal CLAUDE.md changes * Change quotation mark * Reuse metadata.name since we have it * Move test-only stuff out of llama-quant.cpp * Hide the regex functionality back in llama-quant.cpp, use a unique pointer to a new struct 'compiled_tensor_type_patterns' which contains the patterns * cont : initial deslop guidelines * Cleanup based on review comments * Continue cleanup * Small cleanup * Manually set proper ordering of tensors, mostly applies to gemma * Formatting * Update tests/test-quant-type-selection.cpp Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Fix merge conflicts --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent a1cfb64 commit 7992aa7

20 files changed

+35301
-51
lines changed

src/llama-ext.h

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,56 @@
11
#pragma once
22

3-
#include "llama-context.h"
4-
#include "ggml.h"
5-
#include "stdint.h"
3+
#include "llama.h"
4+
5+
#include <cstdint>
66

77
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
88
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
99
struct llama_context * ctx,
1010
uint32_t n_tokens,
1111
uint32_t n_seqs,
1212
uint32_t n_outputs);
13+
14+
// Get the default ggml_type for a given ftype.
15+
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
16+
17+
// Quantization state.
18+
struct quantize_state_impl;
19+
20+
LLAMA_API quantize_state_impl * llama_quant_init(
21+
const llama_model * model,
22+
const llama_model_quantize_params * params);
23+
24+
LLAMA_API void llama_quant_free(quantize_state_impl * qs);
25+
26+
// Descriptor for constructing a mock model for quantization testing.
27+
// The values are copied into llama_model::hparams by
// llama_quant_model_from_metadata(); the per-layer quantities (n_head,
// n_head_kv, n_ff) are replicated uniformly across all n_layer layers.
struct llama_quant_model_desc {
28+
const char * architecture; // architecture name, parsed via llm_arch_from_string()
29+
uint32_t n_embd; // embedding dimension
30+
uint32_t n_ff; // feed-forward size (same for every layer)
31+
uint32_t n_layer; // number of transformer layers
32+
uint32_t n_head; // attention heads per layer
33+
uint32_t n_head_kv; // KV heads per layer (GQA when != n_head)
34+
uint32_t n_expert; // number of MoE experts (0 for dense models)
35+
uint32_t n_embd_head_k; // per-head K embedding size (stored as n_embd_head_k_full)
36+
uint32_t n_embd_head_v; // per-head V embedding size (stored as n_embd_head_v_full)
37+
};
38+
39+
// Create a mock model from a metadata descriptor (for testing).
40+
// The returned model must be freed with llama_model_free().
41+
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
42+
43+
// Returns true if this tensor should be quantized (based on name, dims, params).
44+
LLAMA_API bool llama_quant_tensor_allows_quantization(
45+
const quantize_state_impl * qs,
46+
const ggml_tensor * tensor);
47+
48+
// Compute quantization type assignments for a list of tensors.
49+
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
50+
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
51+
LLAMA_API void llama_quant_compute_types(
52+
quantize_state_impl * qs,
53+
llama_ftype ftype,
54+
ggml_tensor ** tensors,
55+
ggml_type * result_types,
56+
size_t n_tensors);

src/llama-quant.cpp

Lines changed: 125 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
#include "llama.h"
21
#include "llama-impl.h"
32
#include "llama-model.h"
43
#include "llama-model-loader.h"
4+
#include "llama-ext.h"
55

6+
#include <algorithm>
67
#include <cmath>
78
#include <cstring>
8-
#include <string>
99
#include <cinttypes>
1010
#include <fstream>
1111
#include <mutex>
@@ -197,6 +197,7 @@ struct quantize_state_impl {
197197

198198
// per-tensor metadata, computed in the preliminary loop and used in the main loop
199199
struct tensor_metadata {
200+
std::string name;
200201
ggml_type target_type;
201202
tensor_category category;
202203
std::string remapped_imatrix_name;
@@ -788,7 +789,7 @@ static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type ds
788789
// given a file type, get the default tensor type
789790
//
790791

791-
static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
792+
ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
792793
switch (ftype) {
793794
case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
794795
case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
@@ -827,16 +828,32 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
827828
case LLAMA_FTYPE_MOSTLY_IQ3_S:
828829
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
829830

830-
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
831+
default: return GGML_TYPE_COUNT;
831832
}
832833
}
833834

835+
836+
// Derive and cache the category of every tensor and initialize the counters
// in `qs` that the type-selection heuristics (e.g. use_more_bits) read.
static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
837+
for (auto & tm : metadata) {
838+
// categorize once from the tensor name and cache it for the later loops
tensor_category cat = tensor_get_category(tm.name);
839+
tm.category = cat;
840+
841+
// count the attention V tensors
if (category_is_attn_v(cat)) {
842+
++qs.n_attention_wv;
843+
}
844+
845+
// a dedicated output tensor means the embeddings are not tied
if (cat == tensor_category::OUTPUT) {
846+
qs.has_tied_embeddings = false;
847+
}
848+
}
849+
// the ffn counters default to one tensor per layer
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
850+
}
851+
834852
//
835853
// main quantization driver
836854
//
837855

838856
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
839-
ggml_type default_type;
840857
llama_ftype ftype = params->ftype;
841858

842859
int nthread = params->nthread;
@@ -845,7 +862,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
845862
nthread = std::thread::hardware_concurrency();
846863
}
847864

848-
default_type = llama_ftype_get_default_type(ftype);
865+
ggml_type default_type = llama_ftype_get_default_type(ftype);
866+
if (default_type == GGML_TYPE_COUNT) {
867+
throw std::runtime_error(format("invalid output file type %d\n", ftype));
868+
}
849869

850870
// mmap consistently increases speed on Linux, and also increases speed on Windows with
851871
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
@@ -964,6 +984,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
964984
});
965985
}
966986

987+
// compute tensor metadata once and cache it
988+
std::vector<tensor_metadata> metadata(tensors.size());
989+
for (size_t i = 0; i < tensors.size(); ++i) {
990+
metadata[i].name = ggml_get_name(tensors[i]->tensor);
991+
}
992+
993+
// initialize quantization state counters and metadata categories
994+
init_quantize_state_counters(qs, metadata);
995+
967996
int idx = 0;
968997
uint16_t n_split = 1;
969998

@@ -976,25 +1005,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
9761005
std::vector<gguf_context_ptr> ctx_outs(n_split);
9771006
ctx_outs[0] = std::move(ctx_out);
9781007

979-
// compute tensor metadata once and cache it
980-
std::vector<tensor_metadata> metadata(tensors.size());
981-
982-
// initialize quantization state before preliminary loop (counters for use_more_bits)
983-
{
984-
for (size_t i = 0; i < tensors.size(); ++i) {
985-
const auto cat = tensor_get_category(tensors[i]->tensor->name);
986-
if (category_is_attn_v(cat)) {
987-
++qs.n_attention_wv;
988-
}
989-
if (cat == tensor_category::OUTPUT) {
990-
qs.has_tied_embeddings = false;
991-
}
992-
metadata[i].category = cat; // save and re-use the category while we're at it
993-
}
994-
// these also need to be set to n_layer by default
995-
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
996-
}
997-
9981008
// flag for --dry-run
9991009
bool will_require_imatrix = false;
10001010

@@ -1005,7 +1015,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
10051015
for (size_t i = 0; i < tensors.size(); ++i) {
10061016
const auto * it = tensors[i];
10071017
const struct ggml_tensor * tensor = it->tensor;
1008-
const std::string name = ggml_get_name(tensor);
10091018

10101019
uint16_t i_split = params->keep_split ? it->idx : 0;
10111020
if (!ctx_outs[i_split]) {
@@ -1034,7 +1043,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
10341043
" - offending tensor: %s\n"
10351044
" - target type: %s\n"
10361045
"============================================================================\n\n",
1037-
name.c_str(), ggml_type_name(metadata[i].target_type));
1046+
metadata[i].name.c_str(), ggml_type_name(metadata[i].target_type));
10381047
throw std::runtime_error("this quantization requires an imatrix!");
10391048
}
10401049
}
@@ -1107,7 +1116,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
11071116
new_ofstream(weight.idx);
11081117
}
11091118

1110-
const std::string name = ggml_get_name(tensor);
11111119
const size_t tensor_size = ggml_nbytes(tensor);
11121120

11131121
if (!params->dry_run) {
@@ -1238,9 +1246,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
12381246
total_size_new += new_size;
12391247

12401248
// update the gguf meta data as we go
1241-
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
1242-
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
1243-
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
1249+
gguf_set_tensor_type(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_type);
1250+
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), metadata[i].name.c_str())) == new_size);
1251+
gguf_set_tensor_data(ctx_outs[cur_split].get(), metadata[i].name.c_str(), new_data);
12441252

12451253
// write tensor data + padding
12461254
fout.write((const char *) new_data, new_size);
@@ -1305,3 +1313,89 @@ uint32_t llama_model_quantize(
13051313

13061314
return 0;
13071315
}
1316+
1317+
//
1318+
// Helper functions for external tools exposed in llama-ext.h
1319+
//
1320+
1321+
// Allocate a fresh quantization state for `model` using `params`.
// The caller owns the result and must release it with llama_quant_free().
quantize_state_impl * llama_quant_init(
1322+
const llama_model * model,
1323+
const llama_model_quantize_params * params) {
1324+
return new quantize_state_impl(*model, params);
1325+
}
1326+
1327+
// Destroy a quantization state created by llama_quant_init().
// Passing nullptr is a no-op (delete on a null pointer is well-defined).
void llama_quant_free(quantize_state_impl * qs) {
1328+
delete qs;
1329+
}
1330+
1331+
// Construct a heap-allocated mock llama_model from `desc` for testing the
// quantization type-selection logic. Only the architecture and the hparams
// fields assigned below are populated. The caller must free the result with
// llama_model_free().
llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
1332+
struct llama_model_params mparams = llama_model_default_params();
1333+
auto * model = new llama_model(mparams);
1334+
1335+
model->arch = llm_arch_from_string(desc->architecture);
1336+
1337+
// infer llm_type: only LLM_TYPE_70B matters for quantization logic
1338+
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {
1339+
model->type = LLM_TYPE_70B;
1340+
}
1341+
1342+
model->hparams.n_embd = desc->n_embd;
1343+
model->hparams.n_embd_head_k_full = desc->n_embd_head_k;
1344+
model->hparams.n_embd_head_v_full = desc->n_embd_head_v;
1345+
model->hparams.n_layer = desc->n_layer;
1346+
model->hparams.n_expert = desc->n_expert;
1347+
1348+
// hparams keep per-layer arrays; replicate the uniform values for each layer
for (uint32_t i = 0; i < desc->n_layer; i++) {
1349+
model->hparams.n_head_arr[i] = desc->n_head;
1350+
model->hparams.n_head_kv_arr[i] = desc->n_head_kv;
1351+
model->hparams.n_ff_arr[i] = desc->n_ff;
1352+
}
1353+
1354+
return model;
1355+
}
1356+
1357+
// Thin wrapper exposing tensor_allows_quantization() to external tools:
// returns true if `tensor` is eligible for quantization under qs->params
// and the model architecture held by `qs`.
bool llama_quant_tensor_allows_quantization(
1358+
const quantize_state_impl * qs,
1359+
const ggml_tensor * tensor) {
1360+
return tensor_allows_quantization(qs->params, qs->model.arch, tensor);
1361+
}
1362+
1363+
// Compute quantization type assignments for a list of tensors.
//
// qs           - state from llama_quant_init(); its per-run counters are
//                reset here, so the same state can be reused across calls
// ftype        - target file type whose default tensor type is applied
// tensors      - n_tensors quantizable tensors (filter beforehand with
//                llama_quant_tensor_allows_quantization)
// result_types - caller-allocated array of n_tensors elements, filled with
//                the ggml_type assigned to each tensor
//
// Throws std::runtime_error for an ftype without a default tensor type,
// matching the behavior of llama_model_quantize_impl.
void llama_quant_compute_types(
        quantize_state_impl * qs,
        llama_ftype ftype,
        ggml_tensor ** tensors,
        ggml_type * result_types,
        size_t n_tensors) {
    // validate the ftype up front: llama_ftype_get_default_type() returns
    // GGML_TYPE_COUNT as a sentinel for unknown file types, and the main
    // quantization driver rejects it the same way
    const ggml_type default_type = llama_ftype_get_default_type(ftype);
    if (default_type == GGML_TYPE_COUNT) {
        throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }

    // reset per-computation state so repeated calls start from a clean slate
    qs->n_attention_wv = 0;
    qs->n_ffn_down     = 0;
    qs->n_ffn_gate     = 0;
    qs->n_ffn_up       = 0;
    qs->i_attention_wv = 0;
    qs->i_ffn_down     = 0;
    qs->i_ffn_gate     = 0;
    qs->i_ffn_up       = 0;
    qs->n_fallback     = 0;
    qs->has_imatrix    = false;
    qs->has_tied_embeddings = true;

    // build metadata from tensor names
    std::vector<tensor_metadata> metadata(n_tensors);
    for (size_t i = 0; i < n_tensors; i++) {
        metadata[i].name = ggml_get_name(tensors[i]);
    }

    // initialize counters and per-tensor categories
    init_quantize_state_counters(*qs, metadata);

    // use a local copy of params with the requested ftype
    llama_model_quantize_params local_params = *qs->params;
    local_params.ftype = ftype;

    // compute the type for each tensor
    for (size_t i = 0; i < n_tensors; i++) {
        result_types[i] = llama_tensor_get_type(*qs, &local_params, tensors[i], default_type, metadata[i]);
    }
}

tests/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
*
22
!*.*
3+
!snapshots/
34
*.o
45
ggml-common.h
56
**/*.swp

tests/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,12 @@ if (TARGET cpp-httplib)
274274
add_executable(test-gguf-model-data test-gguf-model-data.cpp)
275275
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
276276
llama_test(test-gguf-model-data LABEL "model")
277+
278+
# test-quant-type-selection requires gguf-model-data for remote model metadata
279+
llama_build_and_test(test-quant-type-selection.cpp LABEL "model")
280+
target_link_libraries(test-quant-type-selection PRIVATE gguf-model-data)
281+
target_compile_definitions(test-quant-type-selection PRIVATE
282+
SNAPSHOT_DIR="${CMAKE_CURRENT_SOURCE_DIR}/snapshots")
277283
endif()
278284
endif()
279285

0 commit comments

Comments (0)