ggml-org
diff --git a/‎Makefile‎
Lines changed: 8 additions & 0 deletions b/‎Makefile‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎common/vision.cpp‎
Lines changed: 37 additions & 0 deletions b/‎common/vision.cpp‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎common/vision.h‎
Lines changed: 8 additions & 0 deletions b/‎common/vision.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎examples/simple/simple.cpp‎
Lines changed: 14 additions & 0 deletions b/‎examples/simple/simple.cpp‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎include/llama.h‎
Lines changed: 6 additions & 2 deletions b/‎include/llama.h‎
Lines changed: 6 additions & 2 deletions
@@ -926,6 +926,7 @@ OBJ_LLAMA = \
 	src/llama-vocab.o \
 	src/llama-grammar.o \
 	src/llama-sampling.o \
+	src/llama-vision.o \
 	src/unicode.o \
 	src/unicode-data.o
 
@@ -937,6 +938,7 @@ OBJ_COMMON = \
 	common/ngram-cache.o \
 	common/sampling.o \
 	common/train.o \
+	common/vision.o \
 	common/build-info.o \
 	common/json-schema-to-grammar.o
 
@@ -1221,6 +1223,12 @@ common/ngram-cache.o: \
 	common/ngram-cache.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+# image loading helper for the common library; stb_image.h is listed as a
+# prerequisite because vision.cpp includes it
+common/vision.o: \
+	common/vision.cpp \
+	common/vision.h \
+	common/stb_image.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 $(LIB_COMMON): \
 	$(OBJ_COMMON) \
 	$(LIB_LLAMA) \
 
@@ -0,0 +1,37 @@
+#include "vision.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#include <cstring>
+#include <fstream>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+// Read an image file from disk and decode it into a newly allocated llama_img.
+//
+// fname:   path of the image file (any format stb_image can decode)
+// returns: image obtained from llama_img_alloc(), filled with the decoded
+//          pixels as tightly packed 8-bit RGB (3 bytes per pixel)
+// throws:  std::runtime_error if the file cannot be opened, the contents
+//          cannot be decoded, or the image cannot be allocated
+llama_img * load_image_from_file(const char * fname) {
+    std::ifstream file(fname, std::ios::binary);
+    if (!file) {
+        throw std::runtime_error("Unable to open file");
+    }
+    const std::vector<char> image_bytes(
+        std::istreambuf_iterator<char>(file),
+        std::istreambuf_iterator<char>());
+
+    // decode image to byte array
+    // requesting 3 channels makes stb_image always output RGB; nc only reports
+    // how many channels the file itself stores and must not be used for sizing
+    const int n_channels = 3;
+    int nx = 0;
+    int ny = 0;
+    int nc = 0;
+    const auto * encoded = reinterpret_cast<const unsigned char *>(image_bytes.data());
+    unsigned char * pixels = stbi_load_from_memory(
+        encoded, static_cast<int>(image_bytes.size()), &nx, &ny, &nc, n_channels);
+    if (!pixels) {
+        throw std::runtime_error("failed to decode image bytes");
+    }
+
+    llama_img * result = llama_img_alloc(nx, ny);
+    if (!result) {
+        stbi_image_free(pixels);
+        throw std::runtime_error("failed to allocate image");
+    }
+
+    // copy the decoded pixels (not the still-encoded file contents)
+    // NOTE(review): assumes llama_img_alloc reserves nx*ny*3 bytes in data - confirm
+    const size_t n_bytes = static_cast<size_t>(nx) * static_cast<size_t>(ny) * n_channels;
+    memcpy(result->data, pixels, n_bytes);
+
+    // the stb_image buffer is no longer needed once the pixels are copied
+    stbi_image_free(pixels);
+    return result;
+}
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+
+// Read the image file at fname, decode it with stb_image requesting 3 channels
+// and return it as a llama_img created with llama_img_alloc().
+// Throws std::runtime_error if the file cannot be opened or decoded.
+// NOTE(review): presumably the caller releases the result with llama_img_free - confirm
+llama_img * load_image_from_file(const char * fname);
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "vision.h"
 
 #include <vector>
 
@@ -61,6 +62,19 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
+
+
+
+    // TODO: this is for testing; DELETE ME
+    llama_img_batch ibatch;
+    ibatch.n_imgs = 1;
+    ibatch.imgs = (llama_img **) malloc(1024);
+    ibatch.imgs[0] = load_image_from_file("media/llama0-logo.png");
+    llama_vision_encode(ctx, &ibatch);
+    return 0;
+
+
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
 
@@ -234,8 +234,8 @@ extern "C" {
 
     // Input data for llama_vision_decode
     typedef struct llama_img_batch {
-        int32_t     n_imgs;
-        llama_img * imgs;
+        int32_t      n_imgs;
+        llama_img ** imgs;
     } llama_img_batch;
 
     // Input data for llama_decode
@@ -893,6 +893,10 @@ extern "C" {
     // Vision
     //
 
+    // create new RGB image for input
+    // NOTE(review): presumably the returned image owns a pixel buffer sized for
+    //               width*height RGB pixels - confirm against the implementation
+    LLAMA_API llama_img * llama_img_alloc(int width, int height);
+    // NOTE(review): presumably releases an image obtained from llama_img_alloc - confirm
+    LLAMA_API void llama_img_free(llama_img * img);
+
     // encode image into embeddings
     LLAMA_API int32_t llama_vision_encode(struct llama_context * ctx, llama_img_batch * batch);