Commit 16ff4b3

Merge branch 'concedo_experimental' into croco_nex_0
2 parents: 4e5aa48 + d3d7dae

40 files changed: +2800 -1823 lines

CMakeLists.txt (2 additions, 0 deletions)

@@ -793,6 +793,8 @@ else ()
     ggml/src/ggml-cpu/ggml-cpu-quants.c
     ggml/src/ggml-cpu/ggml-cpu-quants.h
     ggml/src/ggml-backend-reg.cpp
+    ggml/include/gguf.h
+    ggml/src/gguf.cpp
     ${GGML_SOURCES_CUDA})
 target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump

Makefile (7 additions, 9 deletions)

@@ -96,10 +96,10 @@ endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =

-OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm.o common.o sampling.o
-OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_noavx2.o common.o sampling.o
-OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_noavx1.o common.o sampling.o
-OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o sgemm_failsafe.o common.o sampling.o
+OBJS_FULL += ggml-alloc.o ggml-cpu-traits.o ggml-quants.o ggml-cpu-quants.o ggml-cpu-aarch64.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm.o common.o sampling.o
+OBJS_SIMPLE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx2.o ggml-cpu-quants_noavx2.o ggml-cpu-aarch64_noavx2.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx2.o common.o sampling.o
+OBJS_SIMPLER += ggml-alloc.o ggml-cpu-traits.o ggml-quants_noavx1.o ggml-cpu-quants_noavx1.o ggml-cpu-aarch64_noavx1.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_noavx1.o common.o sampling.o
+OBJS_FAILSAFE += ggml-alloc.o ggml-cpu-traits.o ggml-quants_failsafe.o ggml-cpu-quants_failsafe.o ggml-cpu-aarch64_failsafe.o unicode.o unicode-data.o ggml-threading.o ggml-cpu-cpp.o gguf.o sgemm_failsafe.o common.o sampling.o

 # OS specific
 ifeq ($(UNAME_S),Linux)

@@ -587,6 +587,8 @@ ggml-threading.o: ggml/src/ggml-threading.cpp ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 ggml-cpu-cpp.o: ggml/src/ggml-cpu/ggml-cpu.cpp ggml/include/ggml.h ggml/src/ggml-common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+gguf.o: ggml/src/gguf.cpp ggml/include/gguf.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 #these have special gpu defines
 ggml-backend_default.o: ggml/src/ggml-backend.cpp ggml/src/ggml-backend-impl.h ggml/include/ggml.h ggml/include/ggml-backend.h

@@ -705,7 +707,7 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@

 clean:
-	rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix vulkan-shaders-gen gguf-split gguf-split.exe vulkan-shaders-gen.exe imatrix.exe gguf.exe main.exe sdmain.exe whispermain.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
+	rm -vf *.o main sdmain whispermain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt vulkan-shaders-gen gguf-split gguf-split.exe vulkan-shaders-gen.exe main.exe sdmain.exe whispermain.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
 	rm -vrf ggml/src/ggml-cuda/*.o
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o

@@ -722,10 +724,6 @@ sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-imatrix: examples/imatrix/imatrix.cpp build-info.h ggml.o ggml-cpu.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-gguf: examples/gguf/gguf.cpp build-info.h ggml.o ggml-cpu.o llama.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf-split: examples/gguf-split/gguf-split.cpp ggml.o ggml-cpu.o llama.o build-info.h llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

common/arg.cpp (13 additions, 4 deletions)

@@ -23,6 +23,11 @@ common_arg & common_arg::set_examples(std::initializer_list<enum llama_example>
     return *this;
 }

+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+    this->excludes = std::move(excludes);
+    return *this;
+}
+
 common_arg & common_arg::set_env(const char * env) {
     help = help + "\n(env: " + env + ")";
     this->env = env;

@@ -38,6 +43,10 @@ bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }

+bool common_arg::is_exclude(enum llama_example ex) {
+    return excludes.find(ex) != excludes.end();
+}
+
 bool common_arg::get_value_from_env(std::string & output) {
     if (env == nullptr) return false;
     char * value = std::getenv(env);

@@ -422,7 +431,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
  */
     auto add_opt = [&](common_arg arg) {
-        if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) {
+        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
             ctx_arg.options.push_back(std::move(arg));
         }
     };

@@ -651,7 +660,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

@@ -675,7 +684,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.prompt.pop_back();
             }
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",

@@ -702,7 +711,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.prompt = ss.str();
             fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
         }
-    ));
+    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
         string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),

common/arg.h (3 additions, 0 deletions)

@@ -12,6 +12,7 @@

 struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value

@@ -53,9 +54,11 @@ struct common_arg {
     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
+    bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output);
     bool has_value_from_env();
     std::string to_string();
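
The new excludes set is a blacklist layered on top of the existing examples whitelist: add_opt() now registers an option for the current example only when the option lists that example (or LLAMA_EXAMPLE_COMMON) and does not exclude it. A minimal, self-contained sketch of the filter, using a stand-in enum and struct rather than the real common_arg:

// Hedged sketch of the include/exclude filter introduced in common/arg.{h,cpp}.
// The enum values and struct below are stand-ins, not the real llama.cpp types.
#include <cstdio>
#include <set>

enum llama_example { LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER };

struct arg_stub {
    std::set<llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<llama_example> excludes = {};

    bool in_example(llama_example ex) const { return examples.count(ex) > 0; }
    bool is_exclude(llama_example ex) const { return excludes.count(ex) > 0; }
};

int main() {
    arg_stub prompt_arg;                            // a "common" option like -p/--prompt
    prompt_arg.excludes = {LLAMA_EXAMPLE_SERVER};   // hidden from the server example

    for (llama_example ex : {LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}) {
        // same condition as the patched add_opt lambda in common/arg.cpp
        const bool visible = (prompt_arg.in_example(ex) || prompt_arg.in_example(LLAMA_EXAMPLE_COMMON))
                             && !prompt_arg.is_exclude(ex);
        printf("example %d: visible = %d\n", (int) ex, (int) visible);
    }
    return 0;
}

This is the mechanism the .set_excludes({LLAMA_EXAMPLE_SERVER}) calls on the prompt-related options in common/arg.cpp rely on: those options remain available to every other example but are no longer offered by the server.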

common/common.cpp (3 additions, 0 deletions)

@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 #include "build-info.h"

convert_lora_to_gguf.py (31 additions, 3 deletions)

@@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
     base_name = lora_tensor_name.replace("base_model.model.", "")
     base_name = base_name.replace(".lora_A.weight", ".weight")
     base_name = base_name.replace(".lora_B.weight", ".weight")
+    # models produced by mergekit-extract-lora have token embeddings in the adapter
+    base_name = base_name.replace(".lora_embedding_A", ".weight")
+    base_name = base_name.replace(".lora_embedding_B", ".weight")
     return base_name


@@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace:
         "--base", type=Path,
         help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
     )
+    parser.add_argument(
+        "--base-model-id", type=str,
+        help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
+    )
     parser.add_argument(
         "lora_path", type=Path,
         help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",

@@ -290,6 +297,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:

     dir_base_model: Path | None = args.base
     dir_lora: Path = args.lora_path
+    base_model_id: str | None = args.base_model_id
     lora_config = dir_lora / "adapter_config.json"
     input_model = dir_lora / "adapter_model.safetensors"

@@ -313,7 +321,10 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         lparams: dict[str, Any] = json.load(f)

     # load base model
-    if dir_base_model is None:
+    if base_model_id is not None:
+        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
+        hparams = load_hparams_from_hf(base_model_id)
+    elif dir_base_model is None:
         if "base_model_name_or_path" in lparams:
             model_id = lparams["base_model_name_or_path"]
             logger.info(f"Loading base model from Hugging Face: {model_id}")

@@ -371,11 +382,16 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                 if self.lazy:
                     tensor = LazyTorchTensor.from_eager(tensor)
                 base_name = get_base_tensor_name(name)
-                is_lora_a = ".lora_A.weight" in name
-                is_lora_b = ".lora_B.weight" in name
+                # note: mergekit-extract-lora also adds token embeddings to the adapter
+                is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
+                is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
                 if not is_lora_a and not is_lora_b:
                     if ".base_layer.weight" in name:
                         continue
+                    # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
+                    if "_layernorm" in name or ".norm" in name:
+                        yield (base_name, tensor)
+                        continue
                     logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                     if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                         logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")

@@ -407,9 +423,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if name == "lm_head.weight" and len(dest) == 0:
                 raise ValueError("lm_head is present in adapter, but is ignored in base model")
             for dest_name, dest_data in dest:
+                # mergekit-extract-lora add these layernorm to the adapter
+                if "_norm" in dest_name:
+                    assert dest_data.dim() == 1
+                    yield (dest_name, dest_data)
+                    continue
+
+                # otherwise, we must get the lora_A and lora_B tensors
                 assert isinstance(dest_data, LoraTorchTensor)
                 lora_a, lora_b = dest_data.get_lora_A_B()

+                # note: mergekit-extract-lora flip and transpose A and B
+                # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
+                if "token_embd.weight" in dest_name:
+                    lora_a = lora_a.T
+
                 yield (dest_name + ".lora_a", lora_a)
                 yield (dest_name + ".lora_b", lora_b)


examples/gguf-split/gguf-split.cpp (8 additions, 6 deletions)

@@ -1,17 +1,19 @@
+#include "ggml.h"
+#include "gguf.h"
 #include "llama.h"
 #include "common.h"
 #include "build-info.h"

 #include <algorithm>
+#include <cinttypes>
+#include <climits>
+#include <cstdio>
 #include <cstdlib>
+#include <stdexcept>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <climits>
-
-#include <cstdio>
-#include <cstring>
-#include <stdexcept>

 #if defined(_WIN32)
     #include <windows.h>

@@ -297,7 +299,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
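
The printf fix follows from the refactored gguf API, where gguf_get_n_tensors() returns an int64_t; printing it with %d is a format/type mismatch on platforms where int is 32 bits, hence the new <cinttypes> include and the PRIi64 macro. A tiny illustration of the pattern (the count value here is made up):

// Illustrative only: printing an int64_t portably, as the patched gguf-split.cpp does.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_tensors = 291;  // e.g. the value returned by gguf_get_n_tensors(ctx)
    printf("n_tensors = %" PRIi64 "\n", n_tensors);
    return 0;
}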

examples/gguf/gguf.cpp (10 additions, 6 deletions)

@@ -1,10 +1,9 @@
 #include "ggml.h"
+#include "gguf.h"

 #include <cstdio>
-#include <cinttypes>
 #include <string>
 #include <sstream>
-#include <fstream>
 #include <vector>

 #undef MIN

@@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) {

         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }

@@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);

-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }

@@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {

             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n",
+                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);

             // print first 10 elements
             const float * data = (const float *) cur->data;

@@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
                 const float * data = (const float *) cur->data;
                 for (int j = 0; j < ggml_nelements(cur); ++j) {
                     if (data[j] != 100 + i) {
-                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
                         gguf_free(ctx);
                         return false;
                     }

@@ -245,6 +247,8 @@ int main(int argc, char ** argv) {
         check_data = false;
     }

+    srand(123456);
+
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);

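gguf_get_tensor_size() used above comes from the gguf API that is now exposed through the public header ggml/include/gguf.h. A hedged, self-contained sketch of a reader that mirrors gguf_ex_read_0() follows; the gguf_* calls match the API as used in the example above, while the function name and error handling are illustrative:

// Minimal sketch: list tensor names, sizes and offsets of a GGUF file.
#include "ggml.h"
#include "gguf.h"

#include <cinttypes>
#include <cstdio>

static bool dump_tensor_list(const char * fname) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", fname);
        return false;
    }

    const int64_t n_tensors = gguf_get_n_tensors(ctx);
    for (int64_t i = 0; i < n_tensors; ++i) {
        const char * name   = gguf_get_tensor_name  (ctx, i);
        const size_t size   = gguf_get_tensor_size  (ctx, i);
        const size_t offset = gguf_get_tensor_offset(ctx, i);
        printf("tensor[%" PRIi64 "]: name = %s, size = %zu, offset = %zu\n", i, name, size, offset);
    }

    gguf_free(ctx);
    return true;
}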

examples/llava/clip.cpp (4 additions, 2 deletions)

@@ -7,6 +7,7 @@
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"

 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"

@@ -265,7 +266,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
         {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
             int arr_n = gguf_get_arr_n(ctx_gguf, i);
-            const void * data = gguf_get_arr_data(ctx_gguf, i);
+            const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
             std::stringstream ss;
             ss << "[";
             for (int j = 0; j < arr_n; j++) {

@@ -2844,7 +2845,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         total_size_org += orig_size;
         total_size_new += new_size;
         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
         for (size_t j = 0; j < pad; ++j) {
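
The last hunk reflects the refactored gguf API: gguf_set_tensor_data() no longer takes an explicit size, because the size is derived from the tensor type and shape already registered in the gguf context, and the quantizer now asserts that gguf_get_tensor_size() agrees with new_size before writing the bytes out. A self-contained sketch of that contract, assuming the post-refactor signatures (the tensor name and dimensions are illustrative):

// Hedged sketch: the gguf context knows each tensor's size from its metadata,
// so gguf_set_tensor_data() only needs the data pointer.
#include "ggml.h"
#include "gguf.h"

#include <vector>

int main() {
    // small ggml context that only holds tensor metadata (no data allocation)
    ggml_init_params ip = { /*mem_size =*/ 1024 * 1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ true };
    ggml_context * ctx = ggml_init(ip);

    ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(t, "demo.weight");

    gguf_context * ctx_out = gguf_init_empty();
    gguf_add_tensor(ctx_out, t);

    std::vector<float> data(16, 1.0f);
    // the size recorded in the gguf context matches the ggml tensor's byte size
    GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, "demo.weight")) == ggml_nbytes(t));
    gguf_set_tensor_data(ctx_out, "demo.weight", data.data());  // no size argument anymore

    gguf_free(ctx_out);
    ggml_free(ctx);
    return 0;
}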

ggml/include/ggml-cpp.h (1 addition, 0 deletions)

@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 #include <memory>

 // Smart pointers for ggml types
