|
13 | 13 | #include <thread> |
14 | 14 | #include <unordered_map> |
15 | 15 |
|
// SmartQuant helper headers
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>

// Maximum length (bytes, incl. NUL) of one line read from the SmartQuant JSON file.
#define MAX_LINE_LENGTH 512
// Maximum length (bytes, incl. NUL) of a tensor-name key stored in a WeightEntry.
#define MAX_KEY_LENGTH 256

// One (tensor-name -> quantization-type) pair parsed from the SmartQuant JSON.
typedef struct {
    char key[MAX_KEY_LENGTH]; // tensor name, always NUL-terminated by addWeightEntry
    int8_t value;             // numeric type code later cast to ggml_type -- TODO confirm valid range
} WeightEntry;

// Growable array of WeightEntry records; capacity doubles on growth.
typedef struct {
    WeightEntry *entries; // heap-allocated array (realloc-managed), NULL when empty
    size_t count;         // number of entries currently in use
    size_t capacity;      // number of allocated slots in `entries`
} WeightMap;

// Forward declarations for the SmartQuant map helpers defined later in this file.
void initWeightMap(WeightMap *map);
int addWeightEntry(WeightMap *map, const char *key, int8_t value);
int8_t getWeightValue(const WeightMap *map, const char *key, int *found);
void freeWeightMap(WeightMap *map);
| 41 | + |
| 42 | + |
16 | 43 | static void zeros(std::ofstream & file, size_t n) { |
17 | 44 | char zero = 0; |
18 | 45 | for (size_t i = 0; i < n; ++i) { |
@@ -463,6 +490,51 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * |
463 | 490 | return new_size; |
464 | 491 | } |
465 | 492 |
|
| 493 | +// SmartQuant map handlers |
| 494 | +// Initializes the WeightMap |
| 495 | +void initWeightMap(WeightMap *map) { |
| 496 | + map->entries = NULL; |
| 497 | + map->count = 0; |
| 498 | + map->capacity = 0; |
| 499 | +} |
| 500 | + |
| 501 | +// Adds a new entry to the WeightMap |
| 502 | +int addWeightEntry(WeightMap *map, const char *key, int8_t value) { |
| 503 | + if (map->count >= map->capacity) { |
| 504 | + size_t new_capacity = (map->capacity == 0) ? 16 : map->capacity * 2; |
| 505 | + WeightEntry *new_entries = static_cast<WeightEntry*>(realloc(map->entries, new_capacity * sizeof(WeightEntry))); |
| 506 | + if (new_entries == NULL) { |
| 507 | + perror("realloc failed"); |
| 508 | + return -1; // Indicate failure |
| 509 | + } |
| 510 | + map->entries = new_entries; |
| 511 | + map->capacity = new_capacity; |
| 512 | + } |
| 513 | + strncpy(map->entries[map->count].key, key, MAX_KEY_LENGTH - 1); |
| 514 | + map->entries[map->count].key[MAX_KEY_LENGTH - 1] = '\0'; // Ensure null termination |
| 515 | + map->entries[map->count].value = value; |
| 516 | + map->count++; |
| 517 | + return 0; // Indicate success |
| 518 | +} |
| 519 | + |
| 520 | +// Retrieves the int8_t value associated with a key |
| 521 | +int8_t getWeightValue(const WeightMap *map, const char *key, int *found) { |
| 522 | + for (size_t i = 0; i < map->count; ++i) { |
| 523 | + if (strcmp(map->entries[i].key, key) == 0) { |
| 524 | + *found = 1; |
| 525 | + return map->entries[i].value; |
| 526 | + } |
| 527 | + } |
| 528 | + *found = 0; |
| 529 | + return 0; // Default value if not found |
| 530 | +} |
| 531 | + |
| 532 | +// Frees the memory allocated for the WeightMap |
| 533 | +void freeWeightMap(WeightMap *map) { |
| 534 | + free(map->entries); |
| 535 | + initWeightMap(map); // Reset the map |
| 536 | +} |
| 537 | + |
466 | 538 | static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { |
467 | 539 | ggml_type default_type; |
468 | 540 | llama_ftype ftype = params->ftype; |
@@ -706,6 +778,52 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: |
706 | 778 | ::zeros(fout, meta_size); |
707 | 779 | }; |
708 | 780 |
|
| 781 | + // As workaround, read SmartQuant json here. |
| 782 | + // Should be read where the imatrix data is read and configurable via parameter |
| 783 | + FILE *fp; |
| 784 | + char line[MAX_LINE_LENGTH]; |
| 785 | + WeightMap weight_map; |
| 786 | + |
| 787 | + initWeightMap(&weight_map); |
| 788 | + |
| 789 | + const char *filename = "default.smartquant.json"; |
| 790 | + |
| 791 | + fp = fopen(filename, "r"); |
| 792 | + if (fp == NULL) { |
| 793 | + printf("Error opening SmartQuant JSON file\n"); |
| 794 | + } else { |
| 795 | + while (fgets(line, sizeof(line), fp) != NULL) { |
| 796 | + // Basic parsing logic (assuming the file format is consistent) |
| 797 | + char *token = strtok(line, "{},:\""); |
| 798 | + char key[MAX_KEY_LENGTH]; |
| 799 | + char value_str[16]; |
| 800 | + |
| 801 | + while (token != NULL) { |
| 802 | + // The keys are usually the tokens at even positions after the first '{' |
| 803 | + if (strstr(token, "blk.")) { |
| 804 | + strncpy(key, token, MAX_KEY_LENGTH - 1); |
| 805 | + key[MAX_KEY_LENGTH - 1] = '\0'; |
| 806 | + |
| 807 | + token = strtok(NULL, "{},:\""); // Move to the value |
| 808 | + if (token != NULL) { |
| 809 | + // The value should be the next token |
| 810 | + strncpy(value_str, token, sizeof(value_str) - 1); |
| 811 | + value_str[sizeof(value_str) - 1] = '\0'; |
| 812 | + int value = atoi(value_str); |
| 813 | + if (value >= INT8_MIN && value <= INT8_MAX) { |
| 814 | + addWeightEntry(&weight_map, key, (int8_t)value); |
| 815 | + } else { |
| 816 | + fprintf(stderr, "Warning: Value '%d' for key '%s' is out of int8_t range and will be skipped.\n", value, key); |
| 817 | + } |
| 818 | + } |
| 819 | + } |
| 820 | + token = strtok(NULL, "{},:\""); |
| 821 | + } |
| 822 | + } |
| 823 | + fclose(fp); |
| 824 | + printf("SmartQuant JSON has %ld entries\n", weight_map.count); |
| 825 | + } |
| 826 | + |
709 | 827 | const auto tn = LLM_TN(model.arch); |
710 | 828 | new_ofstream(0); |
711 | 829 | for (const auto * it : tensors) { |
@@ -815,6 +933,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: |
815 | 933 | } else { |
816 | 934 | if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { |
817 | 935 | imatrix = it->second.data(); |
| 936 | + |
| 937 | + // if SmartQuant json data is available for tensor->name then set new_type |
| 938 | + int found = 0; |
| 939 | + char *search_key = tensor->name; |
| 940 | + int8_t retrieved_value = getWeightValue(&weight_map, search_key, &found); |
| 941 | + if (found) { |
| 942 | + printf("SmartQuant .. "); |
| 943 | + new_type = static_cast<ggml_type>(retrieved_value); |
| 944 | + } else printf("SmartQuant Key '%s' not found.\n", search_key); |
| 945 | + |
818 | 946 | } else { |
819 | 947 | LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, |
820 | 948 | int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); |
@@ -896,7 +1024,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: |
896 | 1024 | zeros(fout, GGML_PAD(new_size, align) - new_size); |
897 | 1025 | } |
898 | 1026 | close_ofstream(); |
899 | | - |
| 1027 | + freeWeightMap(&weight_map); // free SmartQuant map |
| 1028 | + |
900 | 1029 | LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); |
901 | 1030 | LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); |
902 | 1031 |
|
|
0 commit comments