Skip to content

Commit a75c5c4

Browse files
committed
model is loadable
1 parent cd806a7 commit a75c5c4

File tree

9 files changed

+360
-28
lines changed

9 files changed

+360
-28
lines changed

Makefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,7 @@ src/llama.o: \
11201120
src/llama-vocab.h \
11211121
src/llama-grammar.h \
11221122
src/llama-sampling.h \
1123+
src/llama-vision.h \
11231124
src/unicode.h \
11241125
include/llama.h \
11251126
ggml/include/ggml-cuda.h \
@@ -1152,6 +1153,17 @@ src/llama-sampling.o: \
11521153
include/llama.h
11531154
$(CXX) $(CXXFLAGS) -c $< -o $@
11541155

1156+
# Object rule for the vision model sources; prerequisite list mirrors the
# other src/*.o rules (public llama.h plus the ggml backend headers).
src/llama-vision.o: \
	src/llama-vision.cpp \
	src/llama-vision.h \
	include/llama.h \
	ggml/include/ggml-cuda.h \
	ggml/include/ggml-metal.h \
	ggml/include/ggml.h \
	ggml/include/ggml-alloc.h \
	ggml/include/ggml-backend.h
	$(CXX) $(CXXFLAGS) -c $< -o $@
1166+
11551167
$(LIB_LLAMA): \
11561168
$(OBJ_LLAMA) \
11571169
$(LIB_GGML)

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,9 @@ def set_gguf_parameters(self):
15831583
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
15841584
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
15851585
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
1586+
# TODO: should not hardcode these, but they are currently missing from config.json
1587+
self.gguf_writer.add_vision_clip_max_position_embeddings(577)
1588+
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
15861589

15871590
@staticmethod
15881591
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class Clip:
195195
PROJECTION_TYPE = "vision.clip.projection_type"
196196
PROJECTION_DIM = "vision.clip.projection_dim"
197197
USE_GELU = "vision.clip.use_gelu"
198+
MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
198199
HEAD_COUNT = "vision.clip.attention.head_count"
199200
LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
200201

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,9 @@ def add_vision_clip_feed_forward_length(self, value: int) -> None:
841841
def add_vision_clip_head_count(self, value: int) -> None:
842842
self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
843843

844+
def add_vision_clip_max_position_embeddings(self, value: int) -> None:
    """Write the CLIP encoder's maximum position-embedding count as the
    uint32 KV entry ``vision.clip.max_position_embeddings``."""
    self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
846+
844847
def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
845848
self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
846849

include/llama.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,20 @@ extern "C" {
224224

225225
typedef bool (*llama_progress_callback)(float progress, void * user_data);
226226

227+
// A single RGB image.
// `data` must point to exactly 3*nx*ny bytes of pixel data.
typedef struct llama_img {
    uint32_t nx;          // image width in pixels
    uint32_t ny;          // image height in pixels
    unsigned char * data; // RGB pixel buffer of size 3*nx*ny
} llama_img;

// Batch of input images for the vision API (consumed by llama_vision_encode).
typedef struct llama_img_batch {
    int32_t     n_imgs; // number of entries in `imgs`
    llama_img * imgs;   // array of `n_imgs` images
} llama_img_batch;
240+
227241
// Input data for llama_decode
228242
// A llama_batch object can contain input about one or many sequences
229243
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
@@ -875,6 +889,16 @@ extern "C" {
875889
// shape: [n_embd] (1-dimensional)
876890
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
877891

892+
//
893+
// Vision
894+
//
895+
896+
// encode image into embeddings
897+
LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);
898+
899+
// get output embeddings, to be put into language batch
900+
LLAMA_API float * llama_vision_get_embeddings(struct llama_context * ctx, int32_t idx);
901+
878902
//
879903
// Vocab
880904
//

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ add_library(llama
1717
llama-vocab.cpp
1818
llama-grammar.cpp
1919
llama-sampling.cpp
20+
llama-vision.cpp
2021
unicode.h
2122
unicode.cpp
2223
unicode-data.cpp

src/llama-vision.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#include "llama.h"
2+
3+
#include "llama-vision.h"
4+
5+

src/llama-vision.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#pragma once
2+
3+
#include "ggml.h"
4+
5+
#include <vector>
6+
7+
// Vision model architectures recognized by the loader.
enum vision_arch {
    VISION_ARCH_LLAVA,
    VISION_ARCH_UNKNOWN,
};

// Strategy for merging patch embeddings from multiple image crops.
enum mm_patch_merge {
    MM_PATCH_MERGE_FLAT,
    MM_PATCH_MERGE_SPATIAL_UNPAD,
};

// Hyperparameters of the CLIP vision encoder, populated from GGUF metadata.
// All fields are zero-initialized so a key that is missing from the model
// file yields a well-defined value instead of indeterminate garbage.
struct clip_hparams {
    vision_arch arch = VISION_ARCH_UNKNOWN;

    uint32_t image_size     = 0;
    uint32_t patch_size     = 0;
    uint32_t hidden_size    = 0;
    uint32_t n_intermediate = 0;
    uint32_t projection_dim = 0;
    uint32_t n_head         = 0;
    uint32_t n_layer        = 0;
    uint32_t max_pos_embd   = 0; // vision.clip.max_position_embeddings

    // layer-norm epsilon (vision.clip.attention.layer_norm_epsilon)
    float eps = 0.0f;

    mm_patch_merge mm_patch_merge_type = MM_PATCH_MERGE_FLAT;

    // NOTE(review): grid pinpoints presumably follow the LLaVA config layout
    // (flattened (w, h) pairs) — confirm against the converter before relying
    // on the element count.
    int32_t image_grid_pinpoints[32] = {0};
    int32_t image_crop_resolution    = 0;
};
36+
37+
// Weight tensors of one transformer layer of the CLIP vision encoder.
// Every pointer is nullptr-initialized (matching the mm_* members of
// clip_vision_model) so a tensor that was never loaded is detectable.
struct clip_layer {
    // attention
    struct ggml_tensor * k_w = nullptr;
    struct ggml_tensor * k_b = nullptr;
    struct ggml_tensor * q_w = nullptr;
    struct ggml_tensor * q_b = nullptr;
    struct ggml_tensor * v_w = nullptr;
    struct ggml_tensor * v_b = nullptr;

    struct ggml_tensor * output_w = nullptr;
    struct ggml_tensor * output_b = nullptr;

    // layernorm 1
    struct ggml_tensor * norm_in_w = nullptr;
    struct ggml_tensor * norm_in_b = nullptr;

    // ff
    struct ggml_tensor * ffn_up_w = nullptr;
    struct ggml_tensor * ffn_up_b = nullptr;

    struct ggml_tensor * ffn_down_w = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr;

    // layernorm 2
    struct ggml_tensor * norm_out_w = nullptr;
    struct ggml_tensor * norm_out_b = nullptr;
};
64+
65+
struct clip_vision_model {
66+
struct clip_hparams hparams;
67+
68+
// embeddings
69+
struct ggml_tensor * class_embedding;
70+
struct ggml_tensor * patch_embeddings;
71+
struct ggml_tensor * patch_bias;
72+
struct ggml_tensor * position_embeddings;
73+
74+
struct ggml_tensor * pre_norm_w;
75+
struct ggml_tensor * pre_norm_b;
76+
77+
std::vector<clip_layer> layers;
78+
79+
struct ggml_tensor * post_norm_w;
80+
struct ggml_tensor * post_norm_b;
81+
82+
struct ggml_tensor * projection;
83+
84+
// LLaVA projection
85+
struct ggml_tensor * mm_a_w = NULL;
86+
struct ggml_tensor * mm_a_b = NULL;
87+
struct ggml_tensor * mm_b_w = NULL;
88+
struct ggml_tensor * mm_b_b = NULL;
89+
90+
struct ggml_tensor * image_newline = NULL;
91+
};

0 commit comments

Comments
 (0)