rename everywhere

ngxson · ngxson · commit ad38e8732964 · 2025-01-21T15:53:39.000+01:00
diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
     int n_prompt = 0;
 
     // process image
-    llama_vision_patches * img_patches = nullptr;
+    llama_vision_tokens * img_tokens = nullptr;
     {
         const char * img_path = params.image[0].c_str();
         if (params.image[0].empty()) {
@@ -131,12 +131,12 @@ int main(int argc, char ** argv) {
         }
         llama_vision_bitmap * img = load_image_from_file(img_path);
         LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny);
-        img_patches = llama_vision_patches_init(ctx, img);
-        if (!img_patches) {
-            LOG_ERR("failed to create image patches\n");
+        img_tokens = llama_vision_tokenize(ctx, img);
+        if (!img_tokens) {
+            LOG_ERR("failed to create image tokens\n");
             return 1;
         }
-        if (llama_vision_encode(ctx, img_patches)) {
+        if (llama_vision_encode(ctx, img_tokens)) {
             LOG_ERR("failed to encode image\n");
             return 1;
         }
diff --git a/include/llama.h b/include/llama.h
@@ -229,7 +229,9 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    struct llama_vision_patches;
+    // Structure represents the basic input unit of vision model
+    // This can be a processed image or slices of images under the hood
+    struct llama_vision_tokens;
 
     // represent an RGB image
     // size of data must be equal to 3*nx*ny
@@ -1286,12 +1288,15 @@ extern "C" {
     LLAMA_API struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny);
     LLAMA_API void llama_vision_bitmap_free(struct llama_vision_bitmap * bmp);
 
-    // Create patches from the RGB bitmap
-    LLAMA_API struct llama_vision_patches * llama_vision_patches_init(struct llama_context * ctx, llama_vision_bitmap * bmp);
-    LLAMA_API void llama_vision_patches_free(struct llama_vision_patches * p);
+    // Create image tokens from the RGB bitmap
+    LLAMA_API struct llama_vision_tokens * llama_vision_tokenize(struct llama_context * ctx, llama_vision_bitmap * bmp);
+    LLAMA_API void llama_vision_tokens_free(struct llama_vision_tokens * img_tokens);
+
+    // User must reserve N number of tokens in tokenized text prompt for each image
+    // LLAMA_API int32_t llama_vision_get_n_tokens(const struct llama_vision_tokens * img_tokens);
 
     // Encode patches into embeddings
-    LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_patches * p);
+    LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, struct llama_vision_tokens * img_tokens);
     LLAMA_API struct ggml_tensor * llama_vision_get_output_tensor(struct llama_context * ctx);
 
     //
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -110,7 +110,7 @@ struct llama_context {
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
 
     // vision
-    clip_context vctx;
+    llama_vision_context vctx;
 };
 
 // TODO: make these methods of llama_context
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -1268,8 +1268,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         {
             std::string name;
             ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
-            vparams.proj_type = clip_projector_type_from_name(name);
-            if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
+            vparams.proj_type = vision_projector_type_from_name(name);
+            if (vparams.proj_type == VISION_PROJECTOR_TYPE_UNKNOWN) {
                 throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
             }
         }
@@ -3514,7 +3514,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error("unknown vision architecture");
         }
 
-        if (clip_n_mmproj_embd(clip) != hparams.n_embd) {
+        if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) {
             std::runtime_error("model has vision, but n_mmproj_embd != n_embd");
         }
     }
diff --git a/src/llama-model.h b/src/llama-model.h
@@ -365,7 +365,7 @@ struct llama_model {
 
     // vision
     bool has_vision = false;
-    clip_vision_model clip;
+    llama_vision_model clip;
 
 private:
     struct impl;
diff --git a/src/llama-vision.cpp b/src/llama-vision.cpp
diff --git a/src/llama-vision.h b/src/llama-vision.h

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {`
`122`	`122`	`int n_prompt = 0;`
`123`	`123`
`124`	`124`	`// process image`
`125`		`- llama_vision_patches * img_patches = nullptr;`
	`125`	`+ llama_vision_tokens * img_tokens = nullptr;`
`126`	`126`	`{`
`127`	`127`	`const char * img_path = params.image[0].c_str();`
`128`	`128`	`if (params.image[0].empty()) {`
`@@ -131,12 +131,12 @@ int main(int argc, char ** argv) {`
`131`	`131`	`}`
`132`	`132`	`llama_vision_bitmap * img = load_image_from_file(img_path);`
`133`	`133`	`LOG_INF("loaded image %s, size = %d x %d\n", img_path, img->nx, img->ny);`
`134`		`- img_patches = llama_vision_patches_init(ctx, img);`
`135`		`- if (!img_patches) {`
`136`		`- LOG_ERR("failed to create image patches\n");`
	`134`	`+ img_tokens = llama_vision_tokenize(ctx, img);`
	`135`	`+ if (!img_tokens) {`
	`136`	`+ LOG_ERR("failed to create image tokens\n");`
`137`	`137`	`return 1;`
`138`	`138`	`}`
`139`		`- if (llama_vision_encode(ctx, img_patches)) {`
	`139`	`+ if (llama_vision_encode(ctx, img_tokens)) {`
`140`	`140`	`LOG_ERR("failed to encode image\n");`
`141`	`141`	`return 1;`
`142`	`142`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1268,8 +1268,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {`
`1268`	`1268`	`{`
`1269`	`1269`	`std::string name;`
`1270`	`1270`	`ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);`
`1271`		`- vparams.proj_type = clip_projector_type_from_name(name);`
`1272`		`- if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {`
	`1271`	`+ vparams.proj_type = vision_projector_type_from_name(name);`
	`1272`	`+ if (vparams.proj_type == VISION_PROJECTOR_TYPE_UNKNOWN) {`
`1273`	`1273`	`throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));`
`1274`	`1274`	`}`
`1275`	`1275`	`}`
`@@ -3514,7 +3514,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {`
`3514`	`3514`	`throw std::runtime_error("unknown vision architecture");`
`3515`	`3515`	`}`
`3516`	`3516`
`3517`		`- if (clip_n_mmproj_embd(clip) != hparams.n_embd) {`
	`3517`	`+ if (llama_vision_n_mmproj_embd(clip) != hparams.n_embd) {`
`3518`	`3518`	`std::runtime_error("model has vision, but n_mmproj_embd != n_embd");`
`3519`	`3519`	`}`
`3520`	`3520`	`}`