Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,29 @@ poetry.toml

# Test models for lora adapters
/lora-tests
examples/xgenmm/imgs/*.csv
examples/xgenmm copy/clip.cpp
examples/xgenmm copy/clip.h
examples/xgenmm copy/CMakeLists.txt
examples/xgenmm copy/convert.sh
examples/xgenmm copy/debug.py
examples/xgenmm copy/playground.ipynb
examples/xgenmm copy/test_anyres_img.cpp
examples/xgenmm copy/xgenmm_convert_image_encoder_to_gguf.py
examples/xgenmm copy/xgenmm_surgery.py
examples/xgenmm copy/xgenmm.cpp
examples/xgenmm copy/xgenmm.h
examples/xgenmm copy/bak/xgenmm-surgery copy.py
examples/xgenmm copy/imgs/image_original_resize.csv
examples/xgenmm copy/imgs/image_res_0.csv
examples/xgenmm copy/imgs/image_res_1.csv
examples/xgenmm copy/imgs/image_res_2.csv
examples/xgenmm copy/imgs/image_res_3.csv
examples/xgenmm copy/imgs/image_res_4.csv
examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
examples/xgenmm/imgs/4patches_embeddings.pt
examples/xgenmm/imgs/attention_mask_4patchhes.pt
examples/xgenmm/models/tokenizers/*
models/*.inp
models/*.out
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ BUILD_TARGETS = \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
xgenmm-cli\
test_anyres_handle_patches\
llama-lookahead \
llama-lookup \
llama-lookup-create \
Expand Down Expand Up @@ -1473,6 +1475,22 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

# xgenmm-cli: built from the CLI driver source, the xgenmm and clip sources
# and headers, and the objects listed in $(OBJ_ALL).
# In the recipe, $< is the first prerequisite (the driver .cpp); filter-out
# then drops the headers and that same file from $^ so it is not passed twice.
xgenmm-cli: examples/xgenmm/xgenmm-cli.cpp \
examples/xgenmm/xgenmm.cpp \
examples/xgenmm/xgenmm.h \
examples/xgenmm/clip.cpp \
examples/xgenmm/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

# test_anyres_handle_patches: built from its own test source, the xgenmm and
# clip sources and headers, and the objects listed in $(OBJ_ALL).
# In the recipe, $< is the first prerequisite (the test .cpp); filter-out then
# drops the headers and that same file from $^ so it is not passed twice.
test_anyres_handle_patches: examples/xgenmm/test_anyres_handle_patches.cpp \
examples/xgenmm/xgenmm.cpp \
examples/xgenmm/xgenmm.h \
examples/xgenmm/clip.cpp \
examples/xgenmm/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)
Expand Down
20 changes: 5 additions & 15 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ def prepare_tensors(self):
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# added for xgenmm
if name.endswith((".additional_embedding.weight", ".additional_fc.bias", "additional_fc.weight")):
continue

old_dtype = data_torch.dtype

Expand Down Expand Up @@ -542,18 +545,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
Expand All @@ -572,15 +569,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
res = "codeshell"
Expand All @@ -596,9 +587,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
res = "gpt3-finnish"
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"

if res is None:
logger.warning("\n")
Expand Down Expand Up @@ -2084,6 +2072,8 @@ def set_vocab(self):
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.special_token_ids['eos'] = 32007
print("YD: set special_vocab.special_token_ids['eos'] = 32007")
special_vocab.add_to_gguf(self.gguf_writer)

def set_gguf_parameters(self):
Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ else()
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
add_subdirectory(xgenmm)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
Expand Down
64 changes: 64 additions & 0 deletions examples/xgenmm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Object library holding the xgenmm and clip sources. The static and shared
# variants below are assembled from these same compiled objects.
add_library(xgenmm OBJECT
    xgenmm.cpp
    xgenmm.h
    clip.cpp
    clip.h
)

target_link_libraries(xgenmm PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

# Expose this directory, the directory two levels up, and its common/
# subdirectory to every target that links against xgenmm.
target_include_directories(xgenmm PUBLIC
    .
    ../..
    ../../common
)

target_compile_features(xgenmm PRIVATE cxx_std_11)

# The static archive is always produced.
add_library(xgenmm_static STATIC $<TARGET_OBJECTS:xgenmm>)

# The shared library is only produced when shared builds are requested; the
# objects must then be position independent.
if (BUILD_SHARED_LIBS)
    set_target_properties(xgenmm PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(xgenmm PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(xgenmm_shared SHARED $<TARGET_OBJECTS:xgenmm>)
    target_link_libraries(xgenmm_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS xgenmm_shared LIBRARY)
endif()

if (NOT MSVC)
    target_compile_options(xgenmm PRIVATE -Wno-cast-qual) # stb_image.h
endif()

# Make sure build info is generated first when that target exists.
if (TARGET BUILD_INFO)
    add_dependencies(xgenmm BUILD_INFO)
endif()


# Test executable built from test_anyres_img.cpp and linked against the
# common and xgenmm libraries.
set(TARGET test_anyres_img)
add_executable(${TARGET} test_anyres_img.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
# Apply the C++11 requirement to this executable: xgenmm declares the feature
# PRIVATE, so it does not propagate to targets that link against it.
target_compile_features(${TARGET} PRIVATE cxx_std_11)


# Test executable built from test_anyres_handle_patches.cpp and linked against
# the common and xgenmm libraries.
set(TARGET test_anyres_handle_patches)
add_executable(${TARGET} test_anyres_handle_patches.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
# Apply the C++11 requirement to this executable: xgenmm declares the feature
# PRIVATE, so it does not propagate to targets that link against it.
target_compile_features(${TARGET} PRIVATE cxx_std_11)

# Test executable built from test_patch_ops.cpp and linked against the common
# and xgenmm libraries.
set(TARGET test_patch_ops)
add_executable(${TARGET} test_patch_ops.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
# Apply the C++11 requirement to this executable: xgenmm declares the feature
# PRIVATE, so it does not propagate to targets that link against it.
target_compile_features(${TARGET} PRIVATE cxx_std_11)


# not implemented yet
# set(TARGET xgenmm-cli)
# add_executable(xgenmm-cli xgenmm-cli.cpp)
# install(TARGETS xgenmm-cli RUNTIME)
# target_link_libraries(xgenmm-cli PRIVATE common xgenmm_io xgenmm ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(xgenmm PRIVATE cxx_std_11)

# add_library(xgenmm_io OBJECT
# xgenmm_io.cpp
# )
# target_link_libraries(xgenmm_io PRIVATE xgenmm ${CMAKE_THREAD_LIBS_INIT})
101 changes: 101 additions & 0 deletions examples/xgenmm/bak/xgenmm-surgery copy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import torch
import argparse
from open_flamingo import create_model_and_transforms
from omegaconf import OmegaConf
import os
import time

def get_args():
    """Parse the command-line options of the surgery script.

    Returns:
        argparse.Namespace with ``ckpt_pth`` (checkpoint file to read),
        ``save_pth`` (root directory for the outputs) and ``version``
        (label identifying the saved checkpoint).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--ckpt_pth",
        type=str,
        default="/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt",
    )
    cli.add_argument(
        "--save_pth",
        type=str,
        default="/export/share/yutong/xgenmm/llamacpp_wd",
    )
    cli.add_argument(
        "--version",
        type=str,
        default="siglip_kosmos_phi3_4k_instruct",
        help="help identify the version of the saved ckpt",
    )
    namespace = cli.parse_args()
    return namespace

# Checkpoint state-dict key prefixes, one per model component. The script
# below selects tensors with ``key.startswith(...)`` on these values.
VISION_ENCODER_KEY = 'vision_encoder'
LLM_KEY = 'lang_model'
PROJECTOR = 'vision_tokenizer'


if __name__ == "__main__":
    # Split a combined checkpoint into three artifacts under
    # <save_pth>/<version>: the projector tensors, the vision-encoder tensors,
    # and a save_pretrained() directory for the language model and tokenizer.
    args = get_args()

    # load ckpt
    # NOTE(review): torch.load unpickles the file, so only run this on
    # checkpoints from a trusted source.
    print("🟡 Loading ckpt...")
    start = time.time()
    ckpt = torch.load(args.ckpt_pth)["model_state_dict"]
    end = time.time()
    print(f"🟢 time used: [{end-start:.3f} s] | Done with loading ckpt")

    # sanity check: every key is expected to start with one of the three
    # component prefixes; anything else is reported but not treated as fatal.
    component_prefixes = (VISION_ENCODER_KEY, LLM_KEY, PROJECTOR)
    unexpected_component_keys = {
        k for k in ckpt if not k.startswith(component_prefixes)
    }
    if unexpected_component_keys:
        print(f"❗❗❗ Unexpected component keys: {unexpected_component_keys}. Proceed with caution.")

    save_dir = f"{args.save_pth}/{args.version}"
    os.makedirs(save_dir, exist_ok=True)

    # projector (vision-language connector) tensors, cast to float32
    projector_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(PROJECTOR)}
    print("🟡 Saving projector ckpt...")
    save_path = f"{save_dir}/xgenmm.projector"
    start = time.time()
    torch.save(projector_tensors, save_path)
    end = time.time()
    print(f"🟢 time used: [{end-start:.3f} s] | Save projector ckpt at: {save_path}")

    # vision encoder tensors (here we use the siglip), cast to float32
    vision_encoder_tensors = {k: v.float() for k, v in ckpt.items() if k.startswith(VISION_ENCODER_KEY)}
    print("🟡 Saving vision encoder ckpt...")
    save_path = f"{save_dir}/xgenmm.vision_encoder"
    start = time.time()
    torch.save(vision_encoder_tensors, save_path)
    end = time.time()
    print(f"🟢 time used: [{end-start:.3f} s] | Save vision encoder ckpt at: {save_path}")

    # hard code to load the model using open-flamingo
    print("🟡 Saving llm ckpt...")
    cfg = dict(
        model_family='kosmos',
        lm_path='microsoft/Phi-3-mini-4k-instruct',
        vision_encoder_path='google/siglip-so400m-patch14-384',
        vision_encoder_pretrained='google',
        num_vision_tokens=128,
        image_aspect_ratio='anyres',
        anyres_patch_sampling=True,
        anyres_grids=[[1, 2], [2, 1], [2, 2], [3, 1], [1, 3]],
        ckpt_pth=args.ckpt_pth,
    )
    cfg = OmegaConf.create(cfg)

    # Always bind additional_kwargs so the call below cannot hit an unbound
    # name if model_family is ever changed to a value outside these lists.
    additional_kwargs = {}
    if cfg.model_family in ['kosmos-instruct', 'kosmos', 'llava']:
        additional_kwargs["image_aspect_ratio"] = cfg.image_aspect_ratio
    if cfg.model_family in ['kosmos-instruct', 'kosmos']:
        additional_kwargs.update({
            "num_vision_tokens": cfg.num_vision_tokens,
            "anyres_patch_sampling": cfg.anyres_patch_sampling,
        })

    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path=cfg.vision_encoder_path,
        clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
        lang_model_path=cfg.lm_path,
        tokenizer_path=cfg.lm_path,
        model_family=cfg.model_family,
        **additional_kwargs)
    # strict=True: fail if the checkpoint and the rebuilt model disagree on keys.
    model.load_state_dict(ckpt, strict=True)

    start = time.time()
    model.lang_model.save_pretrained(f"{save_dir}/model")
    tokenizer.save_pretrained(f"{save_dir}/model")
    vision_encoder_config = model.vision_encoder.config
    vision_encoder_config.save_pretrained(f"{save_dir}/vit_config")
    end = time.time()
    print(f"🟢 time used: [{end-start:.3f} s] | Save llm ckpt at: {save_dir}/model")
Loading