Skip to content

Commit 5471f50

Browse files
committed
Add support for Janus Pro
1 parent e3af556 commit 5471f50

File tree

4 files changed

+189
-2
lines changed

4 files changed

+189
-2
lines changed

convert_hf_to_gguf.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9493,6 +9493,116 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
94939493

94949494
return [] # skip other tensors
94959495

9496+
9497+
@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
    # Text half of Janus Pro: the language model is a plain Llama, so the
    # Llama architecture is reused verbatim.
    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch

    def set_gguf_parameters(self):
        # nothing Janus-specific on the text side; defer to LlamaModel
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Vision, aligner and image-generation tensors are converted by
        # `JanusProVisionModel`; drop them here.
        non_text_prefixes = (
            'model.vision_model.',
            'model.aligner.',
            'model.vqmodel.',
            'model.generation_embeddings.',
            'model.generation_aligner.',
            'model.generation_head.',
        )
        if name.startswith(non_text_prefixes):
            return []

        # Strip the language-model wrapper so tensor names line up with the
        # plain Llama naming that the base class expects.
        for prefix, replacement in (
            ('model.language_model.', 'model.'),
            ('language_model.', ''),
        ):
            if name.startswith(prefix):
                name = name.replace(prefix, replacement)
                break

        return super().modify_tensors(data_torch, name, bid)
9523+
9524+
9525+
@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
    # Vision half of Janus Pro: the ViT encoder plus the two-layer MLP
    # aligner that projects vision features into the LLM embedding space.

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # Some configs only carry mlp_ratio; derive intermediate_size from it
        # when it is absent so the base class can emit the FFN width.
        if "intermediate_size" not in self.hparams_vision:
            ratio = self.hparams_vision.get("mlp_ratio")
            width = self.hparams_vision.get("hidden_size")
            if ratio is not None and width is not None:
                self.hparams_vision["intermediate_size"] = int(round(width * ratio))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        act = str(self.hparams_vision.get("hidden_act", "")).lower()
        if act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        elif act == "silu":
            self.gguf_writer.add_vision_use_silu(True)

    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
        """Map aligner tensors to projector format"""
        suffix = ".bias" if name.endswith(".bias") else ".weight"

        # peel off the (possibly "model."-wrapped) aligner prefix
        for prefix in ("model.aligner.", "aligner."):
            if name.startswith(prefix):
                local_name = name[len(prefix):]
                break
        else:
            raise ValueError(f"Unsupported Janus aligner prefix: {name}")

        # fc1 is projector layer 0; hidden_layers.N maps to layer N + 1
        if local_name.startswith("fc1."):
            mm_index = 0
        elif local_name.startswith("hidden_layers."):
            parts = local_name.split(".", 2)
            if len(parts) < 3:
                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
            mm_index = int(parts[1]) + 1
        else:
            raise ValueError(f"Unsupported Janus aligner tensor: {name}")

        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
        return [(tensor_name, data_torch)]

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Skip language model tensors as they will be handled by `JanusProModel`
        if name.startswith(('model.language_model.', 'language_model.')):
            return []

        # Image-generation components (VQ decoder etc.) are not part of the
        # understanding/mmproj graph; drop them.
        generation_prefixes = (
            'model.vqmodel.',
            'vqmodel.',
            'model.generation_embeddings.',
            'generation_embeddings.',
            'model.generation_aligner.',
            'generation_aligner.',
            'model.generation_head.',
            'generation_head.',
        )
        if name.startswith(generation_prefixes):
            return []

        # Aligner tensors become the mmproj projector layers
        if name.startswith(('model.aligner.', 'aligner.')):
            return list(self._map_aligner_tensor(data_torch, name))

        # Vision encoder tensors go through the standard name mapping
        if name.startswith(('model.vision_model.', 'vision_model.')):
            return [(self.map_tensor_name(name), data_torch)]

        return []
9604+
9605+
94969606
###### CONVERSION LOGIC ######
94979607

94989608

gguf-py/gguf/tensor_mapping.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,6 +1154,7 @@ class TensorNameMap:
11541154
"model.mm_projector.mlp.mlp.{bid}",
11551155
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4
11561156
"mlp1.{bid}", # InternVL
1157+
"model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
11571158
),
11581159

11591160
MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -1170,7 +1171,7 @@ class TensorNameMap:
11701171
"vision_tower.vision_model.embeddings.patch_embedding",
11711172
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
11721173
"vpm.embeddings.patch_embedding",
1173-
"model.vision_model.embeddings.patch_embedding", # SmolVLM
1174+
"model.vision_model.embeddings.patch_embedding", # SmolVLM, Janus Pro
11741175
"vision_tower.patch_conv", # pixtral-hf
11751176
"vision_encoder.patch_conv", # pixtral
11761177
"vision_model.patch_embedding.linear", # llama 4
@@ -1182,7 +1183,7 @@ class TensorNameMap:
11821183
"vision_tower.vision_model.embeddings.position_embedding",
11831184
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
11841185
"vpm.embeddings.position_embedding",
1185-
"model.vision_model.embeddings.position_embedding", # SmolVLM
1186+
"model.vision_model.embeddings.position_embedding", # SmolVLM, Janus Pro
11861187
"vision_model.positional_embedding_vlm", # llama 4
11871188
"vision_tower.patch_embed.pos_emb", # kimi-vl
11881189
),
@@ -1252,6 +1253,7 @@ class TensorNameMap:
12521253
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
12531254
"vpm.encoder.layers.{bid}.self_attn.out_proj",
12541255
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
1256+
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
12551257
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
12561258
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
12571259
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ enum projector_type {
140140
PROJECTOR_TYPE_LFM2,
141141
PROJECTOR_TYPE_KIMIVL,
142142
PROJECTOR_TYPE_LIGHTONOCR,
143+
PROJECTOR_TYPE_JANUS_PRO,
143144
PROJECTOR_TYPE_UNKNOWN,
144145
};
145146

@@ -163,6 +164,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
163164
{ PROJECTOR_TYPE_LFM2, "lfm2"},
164165
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
165166
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
167+
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
166168
};
167169

168170
static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1509,6 +1509,45 @@ struct clip_graph {
15091509
return gf;
15101510
}
15111511

1512+
// Janus Pro: plain ViT encoder (learned absolute position embeddings, no CLS
// token) followed by the two-layer MLP aligner that projects vision features
// into the LLM embedding space.
ggml_cgraph * build_janus_pro() {
    GGML_ASSERT(model.class_embedding == nullptr); // No CLS token

    ggml_tensor * inp = build_inp();

    // gather the learned position embeddings by patch index
    const int n_pos = n_patches;
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    ggml_tensor * embeddings = build_vit(
            inp, n_patches,
            NORM_TYPE_NORMAL,
            hparams.ffn_op,
            learned_pos_embd,
            nullptr);

    // aligner layer 0: linear projection, bias is optional
    embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
    if (model.mm_0_b) {
        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
    }
    cb(embeddings, "aligner_0", -1);

    // GELU between the two aligner layers
    embeddings = ggml_gelu(ctx0, embeddings);

    // aligner layer 1: linear projection, bias is optional
    embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
    if (model.mm_1_b) {
        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
    }
    cb(embeddings, "aligner_1", -1);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
1550+
15121551
// whisper encoder with custom projector
15131552
ggml_cgraph * build_whisper_enc() {
15141553
const int n_frames = img.nx;
@@ -2126,6 +2165,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
21262165
{
21272166
res = graph.build_kimivl();
21282167
} break;
2168+
case PROJECTOR_TYPE_JANUS_PRO:
2169+
{
2170+
res = graph.build_janus_pro();
2171+
} break;
21292172
default:
21302173
{
21312174
res = graph.build_llava();
@@ -2442,6 +2485,14 @@ struct clip_model_loader {
24422485
hparams.ffn_op = FFN_GELU_ERF;
24432486
log_ffn_op = "gelu_erf"; // temporary solution for logging
24442487
} break;
2488+
case PROJECTOR_TYPE_JANUS_PRO:
2489+
{
2490+
// Janus Pro uses mean = std = [0.5, 0.5, 0.5]
2491+
// ref: https://huggingface.co/deepseek-community/Janus-Pro-1B/blob/main/preprocessor_config.json
2492+
// ref: https://huggingface.co/deepseek-community/Janus-Pro-7B/blob/main/preprocessor_config.json
2493+
hparams.image_mean[0] = hparams.image_mean[1] = hparams.image_mean[2] = 0.5f;
2494+
hparams.image_std[0] = hparams.image_std[1] = hparams.image_std[2] = 0.5f;
2495+
} break;
24452496
default:
24462497
break;
24472498
}
@@ -2777,6 +2828,13 @@ struct clip_model_loader {
27772828
model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
27782829
model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
27792830
} break;
2831+
case PROJECTOR_TYPE_JANUS_PRO:
2832+
{
2833+
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
2834+
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
2835+
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
2836+
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
2837+
} break;
27802838
default:
27812839
GGML_ASSERT(false && "unknown projector type");
27822840
}
@@ -3637,6 +3695,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
36373695
res_imgs->entries.push_back(std::move(img_f32));
36383696
return true;
36393697

3698+
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
3699+
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
3700+
const std::array<uint8_t, 3> pad_color = {127, 127, 127};
3701+
clip_image_u8 resized_image;
3702+
int sz = params.image_size; // 384
3703+
image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}, pad_color);
3704+
clip_image_f32_ptr img_f32(clip_image_f32_init());
3705+
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3706+
res_imgs->entries.push_back(std::move(img_f32));
3707+
return true;
3708+
36403709
} else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL
36413710
|| ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR
36423711
) {
@@ -3817,6 +3886,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
38173886
switch (proj) {
38183887
case PROJECTOR_TYPE_MLP:
38193888
case PROJECTOR_TYPE_MLP_NORM:
3889+
case PROJECTOR_TYPE_JANUS_PRO:
38203890
{
38213891
// do nothing
38223892
} break;
@@ -4286,6 +4356,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
42864356
set_input_i32("pos_w", pos_data);
42874357
} break;
42884358
case PROJECTOR_TYPE_GLM_EDGE:
4359+
case PROJECTOR_TYPE_JANUS_PRO:
42894360
{
42904361
// llava and other models
42914362
std::vector<int32_t> positions(n_pos);
@@ -4427,6 +4498,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
44274498
case PROJECTOR_TYPE_LFM2:
44284499
case PROJECTOR_TYPE_KIMIVL:
44294500
return ctx->model.mm_2_w->ne[1];
4501+
case PROJECTOR_TYPE_JANUS_PRO:
4502+
return ctx->model.mm_1_w->ne[1];
44304503
default:
44314504
GGML_ABORT("Unknown projector type");
44324505
}

0 commit comments

Comments
 (0)