ggml-org · Yutong-Dai · Jul 23, 2024 · Jul 24, 2024 · Jul 25, 2024 · Aug 19, 2024
diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,29 @@ poetry.toml
 
 # Test models for lora adapters
 /lora-tests
+examples/xgenmm/imgs/*.csv
+examples/xgenmm copy/clip.cpp
+examples/xgenmm copy/clip.h
+examples/xgenmm copy/CMakeLists.txt
+examples/xgenmm copy/convert.sh
+examples/xgenmm copy/debug.py
+examples/xgenmm copy/playground.ipynb
+examples/xgenmm copy/test_anyres_img.cpp
+examples/xgenmm copy/xgenmm_convert_image_encoder_to_gguf.py
+examples/xgenmm copy/xgenmm_surgery.py
+examples/xgenmm copy/xgenmm.cpp
+examples/xgenmm copy/xgenmm.h
+examples/xgenmm copy/bak/xgenmm-surgery copy.py
+examples/xgenmm copy/imgs/image_original_resize.csv
+examples/xgenmm copy/imgs/image_res_0.csv
+examples/xgenmm copy/imgs/image_res_1.csv
+examples/xgenmm copy/imgs/image_res_2.csv
+examples/xgenmm copy/imgs/image_res_3.csv
+examples/xgenmm copy/imgs/image_res_4.csv
+examples/xgenmm copy/imgs/image-1d100e9-1.jpg
+examples/xgenmm copy/imgs/image-1d100e9.jpg
+examples/xgenmm/imgs/4patches_embeddings.pt
+examples/xgenmm/imgs/attention_mask_4patchhes.pt
+examples/xgenmm/models/tokenizers/*
+models/*.inp
+models/*.out
diff --git a/Makefile b/Makefile
@@ -20,6 +20,8 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
+	xgenmm-cli\
+	test_anyres_handle_patches\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1473,6 +1475,22 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+xgenmm-cli: examples/xgenmm/xgenmm-cli.cpp \
+	examples/xgenmm/xgenmm.cpp \
+	examples/xgenmm/xgenmm.h \
+	examples/xgenmm/clip.cpp \
+	examples/xgenmm/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+test_anyres_handle_patches: examples/xgenmm/test_anyres_handle_patches.cpp \
+	examples/xgenmm/xgenmm.cpp \
+	examples/xgenmm/xgenmm.h \
+	examples/xgenmm/clip.cpp \
+	examples/xgenmm/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)

@@ -263,6 +263,9 @@ def prepare_tensors(self):
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                 continue
+            # added for xgenmm
+            if name.endswith((".additional_embedding.weight", ".additional_fc.bias", "additional_fc.weight")):
+                continue
 
             old_dtype = data_torch.dtype
 
@@ -542,18 +545,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -572,15 +569,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
-        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
-            # ref: https://huggingface.co/core42/jais-13b
-            res = "jais"
         if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
             # ref: https://huggingface.co/WisdomShell/CodeShell-7B
             res = "codeshell"
@@ -596,9 +587,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
             # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
             res = "gpt3-finnish"
-        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
-            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
-            res = "exaone"
 
         if res is None:
             logger.warning("\n")
@@ -2084,6 +2072,8 @@ def set_vocab(self):
         self.gguf_writer.add_token_types(toktypes)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.special_token_ids['eos'] = 32007
+        print("YD: set special_vocab.special_token_ids['eos'] = 32007")
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -39,6 +39,7 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
+    add_subdirectory(xgenmm)
     if (GGML_RPC)
         add_subdirectory(rpc)
     endif()

diff --git a/examples/xgenmm/CMakeLists.txt b/examples/xgenmm/CMakeLists.txt
@@ -0,0 +1,68 @@
+# Object library with the xgenmm and clip sources. The static and (optional)
+# shared libraries below are assembled from its object files.
+add_library(xgenmm OBJECT
+            xgenmm.cpp
+            xgenmm.h
+            clip.cpp
+            clip.h
+            )
+
+target_link_libraries(xgenmm PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(xgenmm PUBLIC .)
+target_include_directories(xgenmm PUBLIC ../..)
+target_include_directories(xgenmm PUBLIC ../../common)
+
+target_compile_features(xgenmm PRIVATE cxx_std_11)
+
+add_library(xgenmm_static STATIC $<TARGET_OBJECTS:xgenmm>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(xgenmm PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(xgenmm PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(xgenmm_shared SHARED $<TARGET_OBJECTS:xgenmm>)
+    target_link_libraries(xgenmm_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS xgenmm_shared LIBRARY)
+endif()
+
+if (NOT MSVC)
+    target_compile_options(xgenmm PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+if(TARGET BUILD_INFO)
+    add_dependencies(xgenmm BUILD_INFO)
+endif()
+
+
+# Test executables. Each one sets TARGET and applies every command to
+# ${TARGET}, so the C++11 requirement lands on the executable itself.
+set(TARGET test_anyres_img)
+add_executable(${TARGET} test_anyres_img.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+
+set(TARGET test_anyres_handle_patches)
+add_executable(${TARGET} test_anyres_handle_patches.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET test_patch_ops)
+add_executable(${TARGET} test_patch_ops.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+
+# not implemented yet
+# set(TARGET xgenmm-cli)
+# add_executable(xgenmm-cli xgenmm-cli.cpp)
+# install(TARGETS xgenmm-cli RUNTIME)
+# target_link_libraries(xgenmm-cli PRIVATE common xgenmm_io xgenmm ${CMAKE_THREAD_LIBS_INIT})
+# target_compile_features(xgenmm PRIVATE cxx_std_11)
+
+# add_library(xgenmm_io OBJECT
+#         xgenmm_io.cpp
+# )
+# target_link_libraries(xgenmm_io PRIVATE xgenmm ${CMAKE_THREAD_LIBS_INIT})
diff --git a/examples/xgenmm/bak/xgenmm-surgery copy.py b/examples/xgenmm/bak/xgenmm-surgery copy.py
@@ -0,0 +1,107 @@
+"""Split an xGen-MM (open-flamingo) checkpoint into its components.
+
+The script loads ``--ckpt_pth``, then writes under ``<save_pth>/<version>``:
+
+* ``xgenmm.projector``      - tensors whose key starts with ``vision_tokenizer``
+* ``xgenmm.vision_encoder`` - tensors whose key starts with ``vision_encoder``
+* ``model/``                - the language model and tokenizer (``save_pretrained``)
+* ``vit_config/``           - the vision encoder config (``save_pretrained``)
+"""
+import torch
+import argparse
+from open_flamingo import create_model_and_transforms
+from omegaconf import OmegaConf
+import os
+import time
+
+
+def get_args():
+    """Parse the command-line options (checkpoint path, output root, version tag)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_pth", type=str, default='/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt')
+    parser.add_argument('--save_pth', type=str, default='/export/share/yutong/xgenmm/llamacpp_wd')
+    parser.add_argument('--version', type=str, default='siglip_kosmos_phi3_4k_instruct', help='help identify the version of the saved ckpt')
+    return parser.parse_args()
+
+
+# Key prefixes of the three components expected in the checkpoint's state dict.
+VISION_ENCODER_KEY = 'vision_encoder'
+LLM_KEY = 'lang_model'
+PROJECTOR = 'vision_tokenizer'
+
+
+if __name__ == "__main__":
+    # load ckpt
+    args = get_args()
+    print("🟡 Loading ckpt...")
+    start = time.time()
+    # map_location="cpu" loads the tensors onto the CPU regardless of the
+    # device they were saved from, so no GPU is required to run the script.
+    ckpt = torch.load(args.ckpt_pth, map_location="cpu")["model_state_dict"]
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s] | Done with loading ckpt")
+
+    # sanity check: every key should start with one of the component prefixes
+    component_prefixes = (VISION_ENCODER_KEY, LLM_KEY, PROJECTOR)
+    unexpected_component_keys = {k for k in ckpt if not k.startswith(component_prefixes)}
+    if unexpected_component_keys:
+        print(f"❗❗❗ Unexpected component keys: {unexpected_component_keys}. Proceed with caution.")
+
+    save_dir = f"{args.save_pth}/{args.version}"
+    os.makedirs(save_dir, exist_ok=True)
+
+    # get a list vl connector keys
+    projector_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(PROJECTOR)}
+    print("🟡 Saving project ckpt...")
+    save_path = f"{save_dir}/xgenmm.projector"
+    start = time.time()
+    torch.save(projector_tensors, save_path)
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s] | Save projector ckpt at: {save_path}")
+
+    # here we use the siglip
+    vision_encoder_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(VISION_ENCODER_KEY)}
+    print("🟡 Saving vision encoder ckpt...")
+    save_path = f"{save_dir}/xgenmm.vision_encoder"
+    start = time.time()
+    torch.save(vision_encoder_tensors, save_path)
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s] | Save vision encoder ckpt at: {save_path}")
+
+    # hard code to load the model using open-flamingo
+    print("🟡 Saving llm ckpt...")
+    cfg = dict(
+        model_family = 'kosmos',
+        lm_path = 'microsoft/Phi-3-mini-4k-instruct',
+        vision_encoder_path = 'google/siglip-so400m-patch14-384',
+        vision_encoder_pretrained = 'google',
+        num_vision_tokens = 128,
+        image_aspect_ratio = 'anyres',
+        anyres_patch_sampling = True,
+        anyres_grids=[[1,2],[2,1],[2,2],[3,1],[1,3]],
+        ckpt_pth = args.ckpt_pth)
+    cfg = OmegaConf.create(cfg)
+    # Start from an empty dict so the call below is valid for any model family.
+    additional_kwargs = {}
+    if cfg.model_family in ['kosmos-instruct', 'kosmos', 'llava']:
+        additional_kwargs["image_aspect_ratio"] = cfg.image_aspect_ratio
+        if cfg.model_family in ['kosmos-instruct', 'kosmos']:
+            additional_kwargs.update({
+                "num_vision_tokens": cfg.num_vision_tokens,
+                "anyres_patch_sampling": cfg.anyres_patch_sampling,
+            })
+    model, image_processor, tokenizer = create_model_and_transforms(
+        clip_vision_encoder_path=cfg.vision_encoder_path,
+        clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
+        lang_model_path=cfg.lm_path,
+        tokenizer_path=cfg.lm_path,
+        model_family=cfg.model_family,
+        **additional_kwargs)
+    model.load_state_dict(ckpt, strict=True)
+    start = time.time()
+    model.lang_model.save_pretrained(f"{save_dir}/model")
+    tokenizer.save_pretrained(f"{save_dir}/model")
+    vision_encoder_config = model.vision_encoder.config
+    vision_encoder_config.save_pretrained(f"{save_dir}/vit_config")
+    end = time.time()
+    print(f"🟢 time used: [{end-start:.3f} s] | Save llm ckpt at: {save_dir}/model")