import argparse
import os
import json
import re

import torch
import numpy as np
from gguf import *
from janus.models.clip_encoder import CLIPVisionTower
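
# Convert the SigLIP vision tower of a Janus-Pro checkpoint into a GGUF image
# encoder carrying CLIP-style metadata keys (clip.vision.*) and tensors, with
# no text encoder and no LLaVA projector. Run with -m <model dir> (required),
# and optionally -o <output dir> and --use-f32.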


TEXT = "clip.text"
VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
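    # Map HF/CLIP-style tensor names to the abbreviated names stored in the GGUF
    # file, e.g. "vision_model.encoder.layers.0.self_attn.q_proj.weight"
    # becomes "v.blk.0.attn_q.weight".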
    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return (
        name.replace("text_model", "t").replace("vision_model", "v")
        .replace("encoder.layers", "blk").replace("embeddings.", "")
        .replace("_proj", "").replace("self_attn.", "attn_")
        .replace("layer_norm", "ln").replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd").replace("final", "post")
        .replace("layrnorm", "ln")
    )

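# NOTE: only needed when a text-encoder vocabulary is exported; this
# vision-only conversion never calls it.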
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example: --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711
# Example: --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
# TODO: Double check these two values
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

args = ap.parse_args()

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

vocab = None
tokens = None

# Copied from https://huggingface.co/deepseek-ai/Janus-Pro-7B/blob/main/config.json
# This config is used to initialize the `CLIPVisionTower` class
vision_config = {
    "image_size": 384,
    "model_name": "siglip_large_patch16_384",
    "select_feature": "same",
    "select_layer": -1,
}
# Copied from https://github.com/deepseek-ai/Janus/blob/main/janus/models/siglip_vit.py
# This config is used to initialize the `vision_tower` inside the `CLIPVisionTower` class
model_config = {
    "image_size": 384,
    "patch_size": 16,
    "width": 1024,
    "layers": 24,
    "heads": 16,
    "mlp_ratio": 4,
    "global_pool": "map",
    "use_checkpoint": False,
}

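# Build the SigLIP vision tower from the configs above and load its weights
# from `vision_model.pytorch.bin`, assumed to be a vision-only state dict
# extracted from the Janus-Pro checkpoint.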
model = CLIPVisionTower(**vision_config)
model.load_state_dict(torch.load(os.path.join(args.model_dir, "vision_model.pytorch.bin"), map_location="cpu"))
# Merge the two configs
v_hparams = {**vision_config, **model_config}
t_hparams = None

# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

fname_middle = ""
has_text_encoder = False
has_vision_encoder = True
has_llava_projector = False

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_llava_projector", has_llava_projector)
fout.add_file_type(ftype)
model_name = v_hparams["model_name"] if "model_name" in v_hparams else os.path.basename(dir_model)
fout.add_name(model_name)
# TODO: Add more information in the description
fout.add_description("vision-only CLIP model")

if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["width"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["width"] * v_hparams["mlp_ratio"])
    fout.add_uint32("clip.vision.projection_dim", model.vision_tower.patch_embed.proj.out_channels)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), model.vision_tower.blocks[0].norm1.eps)
    block_count = v_hparams["layers"] - 1 if has_llava_projector else v_hparams["layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

    # "image_grid_pinpoints": [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    # is flattened to:
    #   [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
    if "image_grid_pinpoints" in v_hparams:
        # flatten it
        image_grid_pinpoints = []
        for pinpoint in v_hparams["image_grid_pinpoints"]:
            for p in pinpoint:
                image_grid_pinpoints.append(p)
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    if "image_crop_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
    if "image_aspect_ratio" in v_hparams:
        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
    if "image_split_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
    if "mm_patch_merge_type" in v_hparams:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])

    image_mean = args.image_mean if args.image_mean is not None else default_image_mean
    image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

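# The SigLIP ViT uses GELU in its MLP blocks, so the activation flag is
# hardcoded rather than read from a config value.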
use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)

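# NOTE: the projector export below is dead code as the script stands, since
# has_llava_projector is hardcoded to False above.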
if has_llava_projector:
    model.vision_tower.blocks.pop(-1)
    projector = torch.load(args.llava_projector, map_location="cpu")
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)

        fout.add_tensor(name, data)

    print("Projector tensors added\n")

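# Export the vision tower tensors. 4-D (conv) weights are always stored as
# f16; 2-D weight matrices follow the requested ftype; everything else stays
# f32.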
state_dict = model.state_dict()
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name.endswith(".weight") and n_dims == 2:
            print(" Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print(f"Done. Output file: {fname_out}")