Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
res = "llada-moe"
if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
# ref: https://huggingface.co/ibm-granite/granite-docling-258M
res = "granite-docling"

if res is None:
logger.warning("\n")
Expand Down Expand Up @@ -1325,6 +1328,7 @@ def __init__(self, *args, **kwargs):
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

# load preprocessor config
self.preprocessor_config = {}
if not self.is_mistral_format:
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
Expand All @@ -1347,7 +1351,8 @@ def set_gguf_parameters(self):
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)

# vision config
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
self.image_size = self.find_vparam(["image_size"])
self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
Expand Down Expand Up @@ -2378,6 +2383,10 @@ def set_gguf_parameters(self):
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)

# Add the preprocessor longest edge size
preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)

def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
Expand Down
1 change: 1 addition & 0 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
ctx->staged_param_bufs.push_back(params_bufs);
if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
ggml_backend_webgpu_submit_queue(ctx);
ggml_backend_webgpu_wait_on_submission(ctx);
}
}
}
Expand Down Expand Up @@ -1060,6 +1061,9 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
case GGML_OP_SCALE:
ggml_webgpu_scale(ctx, src0, node);
break;
case GGML_OP_SOFT_MAX:
ggml_webgpu_soft_max(ctx, src0, src1, src2, node);
break;
default:
return false;
}
Expand Down Expand Up @@ -1806,6 +1810,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_SCALE:
supports_op = op->type == GGML_TYPE_F32;
break;
case GGML_OP_SOFT_MAX:
supports_op = op->type == GGML_TYPE_F32;
break;
default:
break;
}
Expand Down Expand Up @@ -1949,6 +1956,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
ggml_webgpu_init_rope_pipeline(ctx);
ggml_webgpu_init_glu_pipeline(ctx);
ggml_webgpu_init_scale_pipeline(ctx);
ggml_webgpu_init_soft_max_pipeline(ctx);

#ifdef GGML_WEBGPU_DEBUG
// Initialize debug buffers
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
let i2 = i / params.ne1;
let i1 = i % params.ne1;
let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
let i_dst_row = params.offset_src + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;

let elems = (params.ne0 + wg_size - 1) / wg_size;

Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
workgroupBarrier();
}
let row_max = scratch[0];
workgroupBarrier();

var sum = 0.0f;
col = lid.x;
Expand Down
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ class Clip:

class ClipVision:
IMAGE_SIZE = "clip.vision.image_size"
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,9 @@ def add_vision_attention_layernorm_eps(self, value: float) -> None:
def add_vision_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)

def add_vision_preproc_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)

def add_vision_image_mean(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)

Expand Down
5 changes: 5 additions & 0 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
case LLAMA_VOCAB_PRE_TYPE_OLMO:
case LLAMA_VOCAB_PRE_TYPE_JAIS:
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
regex_exprs = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
Expand Down Expand Up @@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "trillion") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false;
} else if (
tokenizer_pre == "granite-docling") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
clean_spaces = false;
} else if (
tokenizer_pre == "bailingmoe" ||
tokenizer_pre == "llada-moe") {
Expand Down
81 changes: 41 additions & 40 deletions src/llama-vocab.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,46 +8,47 @@

// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
};

struct LLM_KV;
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

// vision-specific
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
Expand Down
52 changes: 48 additions & 4 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ struct clip_hparams {
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
int32_t proj_scale_factor = 0; // idefics3
// idefics3
int32_t preproc_image_size = 0;
int32_t proj_scale_factor = 0;

float image_mean[3];
float image_std[3];
Expand Down Expand Up @@ -2250,6 +2252,7 @@ struct clip_model_loader {

if (is_vision) {
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
Expand Down Expand Up @@ -3551,10 +3554,51 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
return true;
}
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
// The refined size has two steps:
// 1. Resize w/ aspect-ratio preserving such that the longer side is
// the preprocessor longest size
// 2. Resize w/out preserving aspect ratio such that both sides are
// multiples of image_size (always rounding up)
//
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
original_size, params.image_size, params.preproc_image_size);

llava_uhd::slice_instructions instructions;
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
instructions.refined_size = refined_size;
instructions.grid_size = clip_image_size{
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
};
for (int y = 0; y < refined_size.height; y += params.image_size) {
for (int x = 0; x < refined_size.width; x += params.image_size) {
instructions.slices.push_back(llava_uhd::slice_coordinates{
/* x */x,
/* y */y,
/* size */clip_image_size{
std::min(params.image_size, refined_size.width - x),
std::min(params.image_size, refined_size.height - y)
}
});
}
}
auto imgs = llava_uhd::slice_image(img, instructions);

// cast and normalize to f32
for (size_t i = 0; i < imgs.size(); ++i) {
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
}

res_imgs->grid_x = instructions.grid_size.width;
res_imgs->grid_y = instructions.grid_size.height;
return true;
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
) {
clip_image_u8 resized_image;
Expand Down
Loading