Skip to content

Commit a75c5c4

Browse files
committed
model is loadable
1 parent cd806a7 commit a75c5c4

File tree

9 files changed

+360
-28
lines changed

9 files changed

+360
-28
lines changed

Makefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,7 @@ src/llama.o: \
11201120
src/llama-vocab.h \
11211121
src/llama-grammar.h \
11221122
src/llama-sampling.h \
1123+
src/llama-vision.h \
11231124
src/unicode.h \
11241125
include/llama.h \
11251126
ggml/include/ggml-cuda.h \
@@ -1152,6 +1153,17 @@ src/llama-sampling.o: \
11521153
include/llama.h
11531154
$(CXX) $(CXXFLAGS) -c $< -o $@
11541155

1156+
# Object rule for the vision model sources; prerequisite list mirrors the
# other src/*.o rules (public llama.h plus the ggml backend headers).
src/llama-vision.o: \
	src/llama-vision.cpp \
	src/llama-vision.h \
	include/llama.h \
	ggml/include/ggml-cuda.h \
	ggml/include/ggml-metal.h \
	ggml/include/ggml.h \
	ggml/include/ggml-alloc.h \
	ggml/include/ggml-backend.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
1166+
11551167
$(LIB_LLAMA): \
11561168
$(OBJ_LLAMA) \
11571169
$(LIB_GGML)

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,9 @@ def set_gguf_parameters(self):
15831583
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
15841584
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
15851585
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
1586+
# TODO: should not hardcode these, but they are currently missing from config.json
1587+
self.gguf_writer.add_vision_clip_max_position_embeddings(577)
1588+
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
15861589

15871590
@staticmethod
15881591
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class Clip:
195195
PROJECTION_TYPE = "vision.clip.projection_type"
196196
PROJECTION_DIM = "vision.clip.projection_dim"
197197
USE_GELU = "vision.clip.use_gelu"
198+
MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
198199
HEAD_COUNT = "vision.clip.attention.head_count"
199200
LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
200201

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,9 @@ def add_vision_clip_feed_forward_length(self, value: int) -> None:
841841
def add_vision_clip_head_count(self, value: int) -> None:
842842
self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
843843

844+
def add_vision_clip_max_position_embeddings(self, value: int) -> None:
    """Write the CLIP encoder's maximum position-embedding count as the
    uint32 KV entry ``vision.clip.max_position_embeddings``."""
    self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
846+
844847
def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
845848
self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
846849

include/llama.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,20 @@ extern "C" {
224224

225225
typedef bool (*llama_progress_callback)(float progress, void * user_data);
226226

227+
// A single RGB image.
// `data` must point to exactly 3*nx*ny bytes of pixel data.
typedef struct llama_img {
    uint32_t nx;          // image width in pixels
    uint32_t ny;          // image height in pixels
    unsigned char * data; // RGB pixel buffer of size 3*nx*ny
} llama_img;

// Batch of input images for the vision API (consumed by llama_vision_encode).
typedef struct llama_img_batch {
    int32_t     n_imgs; // number of entries in `imgs`
    llama_img * imgs;   // array of `n_imgs` images
} llama_img_batch;
240+
227241
// Input data for llama_decode
228242
// A llama_batch object can contain input about one or many sequences
229243
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
@@ -875,6 +889,16 @@ extern "C" {
875889
// shape: [n_embd] (1-dimensional)
876890
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
877891

892+
//
893+
// Vision
894+
//
895+
896+
// encode image into embeddings
897+
LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
898+
899+
// get output embeddings, to be put into language batch
900+
LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
901+
878902
//
879903
// Vocab
880904
//

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ add_library(llama
1717
llama-vocab.cpp
1818
llama-grammar.cpp
1919
llama-sampling.cpp
20+
llama-vision.cpp
2021
unicode.h
2122
unicode.cpp
2223
unicode-data.cpp

src/llama-vision.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#include "llama.h"
2+
3+
#include "llama-vision.h"
4+
5+

src/llama-vision.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#pragma once
2+
3+
#include "ggml.h"
4+
5+
#include <vector>
6+
7+
// Vision model architectures recognized by the loader.
enum vision_arch {
    VISION_ARCH_LLAVA,
    VISION_ARCH_UNKNOWN,
};

// Strategy for merging patch embeddings from multiple image crops.
enum mm_patch_merge {
    MM_PATCH_MERGE_FLAT,
    MM_PATCH_MERGE_SPATIAL_UNPAD,
};

// Hyperparameters of the CLIP vision encoder, populated from GGUF metadata.
// All fields are zero-initialized so a key that is missing from the model
// file yields a well-defined value instead of indeterminate garbage.
struct clip_hparams {
    vision_arch arch = VISION_ARCH_UNKNOWN;

    uint32_t image_size     = 0;
    uint32_t patch_size     = 0;
    uint32_t hidden_size    = 0;
    uint32_t n_intermediate = 0;
    uint32_t projection_dim = 0;
    uint32_t n_head         = 0;
    uint32_t n_layer        = 0;
    uint32_t max_pos_embd   = 0; // vision.clip.max_position_embeddings

    // layer-norm epsilon (vision.clip.attention.layer_norm_epsilon)
    float eps = 0.0f;

    mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;

    // NOTE(review): grid pinpoints presumably follow the LLaVA config layout
    // (flattened (w, h) pairs) — confirm against the converter before relying
    // on the element count.
    int32_t image_grid_pinpoints[32] = {0};
    int32_t image_crop_resolution    = 0;
};
36+
37+
// Weight tensors of one transformer layer of the CLIP vision encoder.
// Every pointer is nullptr-initialized (matching the mm_* members of
// clip_vision_model) so a tensor that was never loaded is detectable.
struct clip_layer {
    // attention
    struct ggml_tensor * k_w = nullptr;
    struct ggml_tensor * k_b = nullptr;
    struct ggml_tensor * q_w = nullptr;
    struct ggml_tensor * q_b = nullptr;
    struct ggml_tensor * v_w = nullptr;
    struct ggml_tensor * v_b = nullptr;

    struct ggml_tensor * output_w = nullptr;
    struct ggml_tensor * output_b = nullptr;

    // layernorm 1
    struct ggml_tensor * norm_in_w = nullptr;
    struct ggml_tensor * norm_in_b = nullptr;

    // ff
    struct ggml_tensor * ffn_up_w = nullptr;
    struct ggml_tensor * ffn_up_b = nullptr;

    struct ggml_tensor * ffn_down_w = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr;

    // layernorm 2
    struct ggml_tensor * norm_out_w = nullptr;
    struct ggml_tensor * norm_out_b = nullptr;
};
64+
65+
struct clip_vision_model {
66+
struct clip_hparams hparams;
67+
68+
// embeddings
69+
struct ggml_tensor * class_embedding;
70+
struct ggml_tensor * patch_embeddings;
71+
struct ggml_tensor * patch_bias;
72+
struct ggml_tensor * position_embeddings;
73+
74+
struct ggml_tensor * pre_norm_w;
75+
struct ggml_tensor * pre_norm_b;
76+
77+
std::vector<clip_layer> layers;
78+
79+
struct ggml_tensor * post_norm_w;
80+
struct ggml_tensor * post_norm_b;
81+
82+
struct ggml_tensor * projection;
83+
84+
// LLaVA projection
85+
struct ggml_tensor * mm_a_w = NULL;
86+
struct ggml_tensor * mm_a_b = NULL;
87+
struct ggml_tensor * mm_b_w = NULL;
88+
struct ggml_tensor * mm_b_b = NULL;
89+
90+
struct ggml_tensor * image_newline = NULL;
91+
};

0 commit comments

Comments
 (0)