Skip to content

Commit 6b9a524

Browse files
ravenouseCISCngxson
authored
model: add Janus Pro for image understanding (ggml-org#16906)
* Add support for Janus Pro * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Sigbjørn Skjæret <[email protected]> * Update gguf-py/gguf/tensor_mapping.py Co-authored-by: Sigbjørn Skjæret <[email protected]> * Address reviewer suggestions Co-authored-by: Sigbjørn Skjæret <[email protected]> * Add JANUS_PRO constant * Update clip model handling Co-authored-by: Xuan-Son Nguyen <[email protected]> * Update tools/mtmd/clip.cpp Co-authored-by: Xuan-Son Nguyen <[email protected]> * Refactor JANUS_PRO handling in clip.cpp Co-authored-by: Xuan-Son Nguyen <[email protected]> * Update tools/mtmd/clip.cpp Co-authored-by: Sigbjørn Skjæret <[email protected]> * em whitespace --------- Co-authored-by: Sigbjørn Skjæret <[email protected]> Co-authored-by: Xuan-Son Nguyen <[email protected]> Co-authored-by: Xuan-Son Nguyen <[email protected]>
1 parent 2f966b8 commit 6b9a524

File tree

5 files changed

+147
-1
lines changed

5 files changed

+147
-1
lines changed

convert_hf_to_gguf.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9802,6 +9802,113 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
98029802

98039803
return [(self.map_tensor_name(name), data_torch)]
98049804

9805+
9806+
@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
    """Janus Pro language-model converter: reuses the Llama architecture and
    filters out the multimodal (vision / aligner / generation) tensors."""

    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Vision, aligner, and image-generation tensors belong to the mmproj
        # side (or are unused for understanding) — drop them here.
        ignored_prefixes = (
            'model.vision_model.',
            'model.aligner.',
            'model.vqmodel.',
            'model.generation_embeddings.',
            'model.generation_aligner.',
            'model.generation_head.',
        )
        if name.startswith(ignored_prefixes):
            return []

        # Normalize the language-model prefix so LlamaModel's tensor mapping applies.
        for old, new in (
            ('model.language_model.', 'model.'),
            ('language_model.', ''),
        ):
            if name.startswith(old):
                name = name.replace(old, new)
                break

        return super().modify_tensors(data_torch, name, bid)
9831+
@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
    """Janus Pro vision-tower converter: emits the SigLIP-style vision encoder
    plus the aligner (projector) tensors for the mmproj GGUF."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # Some Janus configs omit intermediate_size; derive it from
        # mlp_ratio * hidden_size when both are available.
        if "intermediate_size" not in self.hparams_vision:
            mlp_ratio = self.hparams_vision.get("mlp_ratio")
            hidden_size = self.hparams_vision.get("hidden_size")
            if mlp_ratio is not None and hidden_size is not None:
                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)

        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        # Record the encoder activation; only gelu/silu are flagged.
        act = str(self.hparams_vision.get("hidden_act", "")).lower()
        if act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        elif act == "silu":
            self.gguf_writer.add_vision_use_silu(True)

    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
        """Map aligner tensors to projector format"""
        suffix = ".bias" if name.endswith(".bias") else ".weight"

        # Strip whichever aligner prefix is present.
        local_name = None
        for prefix in ("model.aligner.", "aligner."):
            if name.startswith(prefix):
                local_name = name[len(prefix):]
                break
        if local_name is None:
            raise ValueError(f"Unsupported Janus aligner prefix: {name}")

        # fc1 is projector layer 0; hidden_layers.N maps to layer N+1.
        if local_name.startswith("fc1."):
            mm_index = 0
        elif local_name.startswith("hidden_layers."):
            pieces = local_name.split(".", 2)
            if len(pieces) < 3:
                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
            mm_index = int(pieces[1]) + 1
        else:
            raise ValueError(f"Unsupported Janus aligner tensor: {name}")

        mapped = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
        return [(mapped, data_torch)]

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip language model tensors as they will be handled by `JanusProModel`
        if name.startswith(('model.language_model.', 'language_model.')):
            return []

        # Skip image-generation components (not needed for understanding).
        generation_prefixes = (
            'model.vqmodel.',
            'vqmodel.',
            'model.generation_embeddings.',
            'generation_embeddings.',
            'model.generation_aligner.',
            'generation_aligner.',
            'model.generation_head.',
            'generation_head.',
        )
        if name.startswith(generation_prefixes):
            return []

        # Aligner tensors become the mmproj projector.
        if name.startswith(('model.aligner.', 'aligner.')):
            return list(self._map_aligner_tensor(data_torch, name))

        # Vision encoder tensors map through the standard tensor-name table.
        if name.startswith(('model.vision_model.', 'vision_model.')):
            return [(self.map_tensor_name(name), data_torch)]

        # Anything else is not part of the vision tower.
        return []
9911+
98059912
###### CONVERSION LOGIC ######
98069913

98079914

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3186,6 +3186,7 @@ class VisionProjectorType:
31863186
KIMIVL = "kimivl"
31873187
LIGHTONOCR = "lightonocr"
31883188
COGVLM = "cogvlm"
3189+
JANUS_PRO = "janus_pro"
31893190

31903191

31913192
# Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,7 @@ class TensorNameMap:
11831183
"model.mm_projector.mlp.mlp.{bid}",
11841184
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4
11851185
"mlp1.{bid}", # InternVL
1186+
"model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
11861187
),
11871188

11881189
MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -1291,6 +1292,7 @@ class TensorNameMap:
12911292
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
12921293
"vpm.encoder.layers.{bid}.self_attn.out_proj",
12931294
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
1295+
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
12941296
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
12951297
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
12961298
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ enum projector_type {
155155
PROJECTOR_TYPE_KIMIVL,
156156
PROJECTOR_TYPE_LIGHTONOCR,
157157
PROJECTOR_TYPE_COGVLM,
158+
PROJECTOR_TYPE_JANUS_PRO,
158159
PROJECTOR_TYPE_UNKNOWN,
159160
};
160161

@@ -180,6 +181,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
180181
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
181182
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
182183
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
184+
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
183185
};
184186

185187
static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,15 @@ struct clip_graph {
588588
cur = ggml_gelu(ctx0, cur);
589589
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
590590
cur = ggml_add(ctx0, cur, model.mm_2_b);
591+
592+
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
593+
cur = build_ffn(cur,
594+
model.mm_0_w, model.mm_0_b,
595+
nullptr, nullptr,
596+
model.mm_1_w, model.mm_1_b,
597+
hparams.ffn_op,
598+
-1);
599+
591600
} else {
592601
GGML_ABORT("SigLIP: Unsupported projector type");
593602
}
@@ -1729,7 +1738,6 @@ struct clip_graph {
17291738

17301739
return gf;
17311740
}
1732-
17331741
// whisper encoder with custom projector
17341742
ggml_cgraph * build_whisper_enc() {
17351743
const int n_frames = img.nx;
@@ -2457,6 +2465,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
24572465
{
24582466
res = graph.build_kimivl();
24592467
} break;
2468+
case PROJECTOR_TYPE_JANUS_PRO:
2469+
{
2470+
res = graph.build_siglip();
2471+
} break;
24602472
case PROJECTOR_TYPE_COGVLM:
24612473
{
24622474
res = graph.build_cogvlm();
@@ -3158,6 +3170,13 @@ struct clip_model_loader {
31583170
model.mm_boi = get_tensor(TN_TOK_BOI);
31593171
model.mm_eoi = get_tensor(TN_TOK_EOI);
31603172
} break;
3173+
case PROJECTOR_TYPE_JANUS_PRO:
3174+
{
3175+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
3176+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
3177+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
3178+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
3179+
} break;
31613180
default:
31623181
GGML_ASSERT(false && "unknown projector type");
31633182
}
@@ -4219,6 +4238,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
42194238
res_imgs->entries.push_back(std::move(img_f32));
42204239
} break;
42214240

4241+
case PROJECTOR_TYPE_JANUS_PRO:
4242+
{
4243+
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
4244+
const std::array<uint8_t, 3> pad_color = {127, 127, 127};
4245+
clip_image_u8 resized_image;
4246+
int sz = params.image_size;
4247+
img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
4248+
clip_image_f32_ptr img_f32(clip_image_f32_init());
4249+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
4250+
res_imgs->entries.push_back(std::move(img_f32));
4251+
} break;
4252+
42224253
case PROJECTOR_TYPE_PIXTRAL:
42234254
case PROJECTOR_TYPE_LIGHTONOCR:
42244255
{
@@ -4395,6 +4426,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43954426
switch (proj) {
43964427
case PROJECTOR_TYPE_MLP:
43974428
case PROJECTOR_TYPE_MLP_NORM:
4429+
case PROJECTOR_TYPE_JANUS_PRO:
43984430
{
43994431
// do nothing
44004432
} break;
@@ -4905,6 +4937,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
49054937
case PROJECTOR_TYPE_ULTRAVOX:
49064938
case PROJECTOR_TYPE_LFM2:
49074939
case PROJECTOR_TYPE_VOXTRAL:
4940+
case PROJECTOR_TYPE_JANUS_PRO:
49084941
case PROJECTOR_TYPE_COGVLM:
49094942
{
49104943
// do nothing
@@ -4993,6 +5026,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
49935026
return ctx->model.mm_model_mlp_3_w->ne[1];
49945027
case PROJECTOR_TYPE_QWEN2VL:
49955028
case PROJECTOR_TYPE_QWEN25VL:
5029+
case PROJECTOR_TYPE_JANUS_PRO:
49965030
return ctx->model.mm_1_b->ne[0];
49975031
case PROJECTOR_TYPE_QWEN3VL:
49985032
// main path + deepstack paths

0 commit comments

Comments
 (0)