
Commit e1ffb0a

SmartQuant from default.smartquant.json
1 parent dd373dd commit e1ffb0a

File tree

1 file changed: +130 -1 lines changed


src/llama-quant.cpp

Lines changed: 130 additions & 1 deletion
@@ -13,6 +13,33 @@
 #include <thread>
 #include <unordered_map>

+// SmartQuant helper headers
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+
+#define MAX_LINE_LENGTH 512
+#define MAX_KEY_LENGTH 256
+
+typedef struct {
+    char key[MAX_KEY_LENGTH];
+    int8_t value;
+} WeightEntry;
+
+typedef struct {
+    WeightEntry *entries;
+    size_t count;
+    size_t capacity;
+} WeightMap;
+
+void initWeightMap(WeightMap *map);
+int addWeightEntry(WeightMap *map, const char *key, int8_t value);
+int8_t getWeightValue(const WeightMap *map, const char *key, int *found);
+void freeWeightMap(WeightMap *map);
+
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -463,6 +490,51 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }

+// SmartQuant map handlers
+// Initializes the WeightMap
+void initWeightMap(WeightMap *map) {
+    map->entries = NULL;
+    map->count = 0;
+    map->capacity = 0;
+}
+
+// Adds a new entry to the WeightMap
+int addWeightEntry(WeightMap *map, const char *key, int8_t value) {
+    if (map->count >= map->capacity) {
+        size_t new_capacity = (map->capacity == 0) ? 16 : map->capacity * 2;
+        WeightEntry *new_entries = static_cast<WeightEntry*>(realloc(map->entries, new_capacity * sizeof(WeightEntry)));
+        if (new_entries == NULL) {
+            perror("realloc failed");
+            return -1; // Indicate failure
+        }
+        map->entries = new_entries;
+        map->capacity = new_capacity;
+    }
+    strncpy(map->entries[map->count].key, key, MAX_KEY_LENGTH - 1);
+    map->entries[map->count].key[MAX_KEY_LENGTH - 1] = '\0'; // Ensure null termination
+    map->entries[map->count].value = value;
+    map->count++;
+    return 0; // Indicate success
+}
+
+// Retrieves the int8_t value associated with a key
+int8_t getWeightValue(const WeightMap *map, const char *key, int *found) {
+    for (size_t i = 0; i < map->count; ++i) {
+        if (strcmp(map->entries[i].key, key) == 0) {
+            *found = 1;
+            return map->entries[i].value;
+        }
+    }
+    *found = 0;
+    return 0; // Default value if not found
+}
+
+// Frees the memory allocated for the WeightMap
+void freeWeightMap(WeightMap *map) {
+    free(map->entries);
+    initWeightMap(map); // Reset the map
+}
+
 static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
@@ -706,6 +778,52 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         ::zeros(fout, meta_size);
     };

+    // As workaround, read SmartQuant json here.
+    // Should be read where the imatrix data is read and configurable via parameter
+    FILE *fp;
+    char line[MAX_LINE_LENGTH];
+    WeightMap weight_map;
+
+    initWeightMap(&weight_map);
+
+    const char *filename = "default.smartquant.json";
+
+    fp = fopen(filename, "r");
+    if (fp == NULL) {
+        printf("Error opening SmartQuant JSON file\n");
+    } else {
+        while (fgets(line, sizeof(line), fp) != NULL) {
+            // Basic parsing logic (assuming the file format is consistent)
+            char *token = strtok(line, "{},:\"");
+            char key[MAX_KEY_LENGTH];
+            char value_str[16];
+
+            while (token != NULL) {
+                // The keys are usually the tokens at even positions after the first '{'
+                if (strstr(token, "blk.")) {
+                    strncpy(key, token, MAX_KEY_LENGTH - 1);
+                    key[MAX_KEY_LENGTH - 1] = '\0';
+
+                    token = strtok(NULL, "{},:\""); // Move to the value
+                    if (token != NULL) {
+                        // The value should be the next token
+                        strncpy(value_str, token, sizeof(value_str) - 1);
+                        value_str[sizeof(value_str) - 1] = '\0';
+                        int value = atoi(value_str);
+                        if (value >= INT8_MIN && value <= INT8_MAX) {
+                            addWeightEntry(&weight_map, key, (int8_t)value);
+                        } else {
+                            fprintf(stderr, "Warning: Value '%d' for key '%s' is out of int8_t range and will be skipped.\n", value, key);
+                        }
+                    }
+                }
+                token = strtok(NULL, "{},:\"");
+            }
+        }
+        fclose(fp);
+        printf("SmartQuant JSON has %ld entries\n", weight_map.count);
+    }
+
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
     for (const auto * it : tensors) {
@@ -815,6 +933,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             } else {
                 if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                     imatrix = it->second.data();
+
+                    // if SmartQuant json data is available for tensor->name then set new_type
+                    int found = 0;
+                    char *search_key = tensor->name;
+                    int8_t retrieved_value = getWeightValue(&weight_map, search_key, &found);
+                    if (found) {
+                        printf("SmartQuant .. ");
+                        new_type = static_cast<ggml_type>(retrieved_value);
+                    } else printf("SmartQuant Key '%s' not found.\n", search_key);
+
                 } else {
                     LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                             int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
@@ -896,7 +1024,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
     close_ofstream();
-
+    freeWeightMap(&weight_map); // free SmartQuant map
+
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
