Skip to content

Commit 67e50ad

Browse files
mxyngiosub
authored and committed
feat(model): add qwen3vl
1 parent 1467ae6 commit 67e50ad

File tree

14 files changed

+1214
-26
lines changed

14 files changed

+1214
-26
lines changed

convert/convert.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
198198
conv = &qwen2Model{}
199199
case "Qwen2_5_VLForConditionalGeneration":
200200
conv = &qwen25VLModel{}
201+
case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
202+
conv = &qwen3VLModel{}
201203
case "BertModel":
202204
conv = &bertModel{}
203205
case "CohereForCausalLM":

convert/convert_qwen3.go

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
package convert
2+
3+
import (
4+
"slices"
5+
"strings"
6+
7+
"github.com/ollama/ollama/fs/ggml"
8+
"github.com/pdevine/tensor"
9+
"github.com/pdevine/tensor/native"
10+
)
11+
12+
// qwen3Model holds the configuration values, decoded from JSON via the field
// tags below, that KV turns into metadata for the "qwen3" architecture
// ("qwen3moe" when NumExperts is non-zero).
type qwen3Model struct {
13+
ModelParameters
14+
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
15+
HiddenSize uint32 `json:"hidden_size"`
16+
HiddenLayers uint32 `json:"num_hidden_layers"`
17+
IntermediateSize uint32 `json:"intermediate_size"`
18+
NumAttentionHeads uint32 `json:"num_attention_heads"`
19+
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
20+
// HeadDim is written as both the attention key length and value length in KV.
HeadDim uint32 `json:"head_dim"`
21+
// NumExperts > 0 marks the config as mixture-of-experts; the three expert
// fields are only emitted by KV in that case.
NumExperts uint32 `json:"num_experts"`
22+
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
23+
NormTopkProb bool `json:"norm_topk_prob"`
24+
RopeTheta float32 `json:"rope_theta"`
25+
// RopeScaling.Type selects the branch taken in KV: "" (nothing written),
// "yarn" (type and factor written) or "mrope"/"default" (MropeSection written).
RopeScaling struct {
26+
Type string `json:"type"`
27+
Factor ropeFactor `json:"factor"`
28+
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
29+
MropeSection []int32 `json:"mrope_section"`
30+
} `json:"rope_scaling"`
31+
RMSNormEPS float32 `json:"rms_norm_eps"`
32+
}
33+
34+
// KV implements ModelConverter.
//
// It starts from the key/values produced by the embedded ModelParameters and
// adds the model hyperparameters read from the config. The architecture is
// "qwen3", or "qwen3moe" when NumExperts > 0; the expert keys are only written
// in the latter case. It panics when RopeScaling.Type is not one of "",
// "yarn", "mrope" or "default".
35+
func (q *qwen3Model) KV(t *Tokenizer) ggml.KV {
36+
arch := "qwen3"
37+
if q.NumExperts > 0 {
38+
arch += "moe"
39+
}
40+
41+
kv := q.ModelParameters.KV(t)
42+
kv["general.architecture"] = arch
43+
kv["block_count"] = q.HiddenLayers
44+
kv["context_length"] = q.MaxPositionEmbeddings
45+
kv["embedding_length"] = q.HiddenSize
46+
kv["feed_forward_length"] = q.IntermediateSize
47+
kv["attention.head_count"] = q.NumAttentionHeads
48+
kv["attention.head_count_kv"] = q.NumKeyValueHeads
// key and value lengths are both taken from the single HeadDim setting
49+
kv["attention.key_length"] = q.HeadDim
50+
kv["attention.value_length"] = q.HeadDim
51+
52+
if q.NumExperts > 0 {
53+
kv["expert_count"] = q.NumExperts
54+
kv["expert_used_count"] = q.NumExpertsPerToken
55+
kv["norm_top_k_prob"] = q.NormTopkProb
56+
}
57+
58+
kv["rope.freq_base"] = q.RopeTheta
59+
kv["attention.layer_norm_rms_epsilon"] = q.RMSNormEPS
60+
61+
switch q.RopeScaling.Type {
62+
case "":
63+
// no scaling
64+
case "yarn":
// only type and factor are recorded; OriginalMaxPositionEmbeddings is
// parsed from the config but not written by this function
65+
kv["rope.scaling.type"] = q.RopeScaling.Type
66+
kv["rope.scaling.factor"] = q.RopeScaling.Factor
67+
case "mrope", "default":
// neither a scaling type nor a factor is written for these types, only
// the section list
68+
kv["rope.mrope_section"] = q.RopeScaling.MropeSection
69+
default:
70+
panic("unknown rope scaling type")
71+
}
72+
return kv
73+
}
74+
75+
// Tensors implements ModelConverter.
//
// Tensors are handled by name:
//   - names containing "ffn_gate_up_exps" are split along dimension 2 into a
//     "gate" and an "up" tensor, each transposed with axes (0, 2, 1);
//   - names containing "ffn_down_exps" keep their data source but get a
//     repacker that transposes with axes (0, 2, 1) and flattens the result;
//   - everything else is passed through with its name, kind and shape unchanged.
76+
func (q *qwen3Model) Tensors(ts []Tensor) []*ggml.Tensor {
77+
var out []*ggml.Tensor
78+
79+
// TODO: handle split experts
80+
81+
for _, t := range ts {
82+
switch {
83+
case strings.Contains(t.Name(), "ffn_gate_up_exps"):
// applied by splitDim to each slice after it has been cut out
84+
afterFunc := func(t tensor.Tensor) (tensor.Tensor, error) { return tensor.Transpose(t, 0, 2, 1) }
// NOTE: the loop variable below shadows the outer t; inside the loop t is
// the *ggml.Tensor yielded by splitDim, not the input Tensor
85+
for t := range splitDim(t, 2,
86+
split{Replacer: strings.NewReplacer("gate_up", "gate"), afterFunc: afterFunc},
87+
split{Replacer: strings.NewReplacer("gate_up", "up"), afterFunc: afterFunc},
88+
) {
// swap the recorded dims 1 and 2 — presumably so the shape metadata
// matches the transposed data; confirm against splitDim
89+
t.Shape[1], t.Shape[2] = t.Shape[2], t.Shape[1]
90+
out = append(out, t)
91+
}
92+
case strings.Contains(t.Name(), "ffn_down_exps"):
// clone so the swap below does not modify the slice returned by t.Shape()
93+
shape := slices.Clone(t.Shape())
94+
shape[1], shape[2] = shape[2], shape[1]
// the repacker's shape parameter shadows the swapped shape above; it is
// whatever shape the caller of the repacker passes in
95+
t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
96+
dims := make([]int, len(shape))
97+
for i := range shape {
98+
dims[i] = int(shape[i])
99+
}
100+
101+
var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
102+
tt, err := tensor.Transpose(tt, 0, 2, 1)
103+
if err != nil {
104+
return nil, err
105+
}
106+
107+
// flatten tensor so it can be written as a vector
108+
if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
109+
return nil, err
110+
}
111+
112+
return native.VectorF32(tt.(*tensor.Dense))
113+
})
// emitted with the swapped shape computed above
114+
out = append(out, &ggml.Tensor{
115+
Name: t.Name(),
116+
Kind: t.Kind(),
117+
Shape: shape,
118+
WriterTo: t,
119+
})
120+
default:
121+
out = append(out, &ggml.Tensor{
122+
Name: t.Name(),
123+
Kind: t.Kind(),
124+
Shape: t.Shape(),
125+
WriterTo: t,
126+
})
127+
}
128+
}
129+
130+
return out
131+
}
132+
133+
// Replacements implements ModelConverter.
//
// It returns a flat list of strings used to rename tensors; the entries are
// presumably consumed as (old, new) pairs in strings.NewReplacer style —
// confirm against the caller. The targets "ffn_down_exps" and
// "ffn_gate_up_exps" are the names Tensors matches on.
134+
func (q *qwen3Model) Replacements() []string {
135+
return []string{
136+
"lm_head", "output",
137+
"model.embed_tokens", "token_embd",
138+
"model.layers", "blk",
139+
"input_layernorm", "attn_norm",
140+
"self_attn.k_proj", "attn_k",
141+
"self_attn.k_norm", "attn_k_norm",
142+
"self_attn.v_proj", "attn_v",
143+
"self_attn.q_proj", "attn_q",
144+
"self_attn.q_norm", "attn_q_norm",
145+
"self_attn.o_proj", "attn_output",
146+
"mlp.down_proj", "ffn_down",
147+
"mlp.gate_proj", "ffn_gate",
148+
"mlp.up_proj", "ffn_up",
149+
"mlp.gate.weight", "ffn_gate_inp.weight",
150+
"mlp.experts.down_proj", "ffn_down_exps.weight",
151+
"mlp.experts.gate_up_proj", "ffn_gate_up_exps.weight",
152+
"post_attention_layernorm", "ffn_norm",
153+
"model.norm", "output_norm",
154+
}
155+
}
156+
157+
// Compile-time check that *qwen3Model satisfies ModelConverter.
var _ ModelConverter = (*qwen3Model)(nil)

convert/convert_qwen3vl.go

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
package convert
2+
3+
import (
4+
"cmp"
5+
"encoding/json"
6+
"io/fs"
7+
"slices"
8+
"strings"
9+
10+
"github.com/ollama/ollama/fs/ggml"
11+
)
12+
13+
// qwen3VLModel extends qwen3Model with vision settings. The embedded text
// model is decoded from the "text_config" JSON key and VisionModel from
// "vision_config"; parseMore additionally unmarshals preprocessor_config.json
// into VisionModel.
type qwen3VLModel struct {
14+
qwen3Model `json:"text_config"`
15+
16+
VisionModel struct {
17+
Depth uint32 `json:"depth"`
18+
HiddenSize uint32 `json:"hidden_size"`
19+
NumHeads uint32 `json:"num_heads"`
20+
InChannels uint32 `json:"in_channels"`
21+
PatchSize uint32 `json:"patch_size"`
22+
SpatialMergeSize uint32 `json:"spatial_merge_size"`
23+
// WindowSize is decoded but not written by the KV method of this type.
WindowSize uint32 `json:"window_size"`
24+
// note: the Go name says RMS norm but the JSON key is "layer_norm_epsilon"
RMSNormEps float32 `json:"layer_norm_epsilon"`
25+
RopeTheta float32 `json:"rope_theta"`
26+
TemporalPatchSize uint32 `json:"temporal_patch_size"`
27+
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
28+
29+
// Size, ImageMean and ImageStd presumably come from
// preprocessor_config.json rather than vision_config — TODO confirm.
Size struct {
30+
ShortestEdge uint32 `json:"shortest_edge"`
31+
LongestEdge uint32 `json:"longest_edge"`
32+
} `json:"size"`
33+
34+
ImageMean []float32 `json:"image_mean"`
35+
ImageStd []float32 `json:"image_std"`
36+
} `json:"vision_config"`
37+
}
38+
39+
// parseMore reads "preprocessor_config.json" from fsys and unmarshals it into
// m.VisionModel, so any matching keys in that file populate (or overwrite)
// the vision settings. The read or unmarshal error is returned unwrapped; a
// missing file is therefore an error.
func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
40+
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
41+
if err != nil {
42+
return err
43+
}
44+
45+
return json.Unmarshal(bts, &m.VisionModel)
46+
}
47+
48+
// KV returns the text model's key/values with the architecture replaced by
// "qwen3vl" ("qwen3vlmoe" when NumExperts > 0) and the "vision.*" keys added.
// Several vision keys fall back to a literal default through cmp.Or, which
// yields its first non-zero argument; vision.embedding_length and
// vision.num_channels have no such fallback and are written as parsed.
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
49+
kv := m.qwen3Model.KV(t)
50+
51+
arch := "qwen3vl"
52+
if m.NumExperts > 0 {
53+
arch += "moe"
54+
}
55+
// override architecture
56+
kv["general.architecture"] = arch
57+
58+
kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
59+
kv["vision.embedding_length"] = m.VisionModel.HiddenSize
60+
kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
61+
kv["vision.num_channels"] = m.VisionModel.InChannels
62+
kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
63+
kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
64+
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
65+
kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
66+
kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
67+
kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes
68+
69+
kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
70+
kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge
71+
72+
kv["vision.image_mean"] = m.VisionModel.ImageMean
73+
kv["vision.image_std"] = m.VisionModel.ImageStd
74+
75+
return kv
76+
}
77+
78+
// Tensors handles the tensors that need vision-specific treatment and hands
// every other tensor to qwen3Model.Tensors:
//   - names containing "attn_qkv" are split along dimension 0 into attn_q,
//     attn_k and attn_v tensors;
//   - "patch_embed" weights are emitted with their first two dimensions
//     merged into one (the data source itself is left untouched).
//
// The result lists the text model's tensors first, followed by the ones
// produced here. Names are presumably already rewritten by Replacements when
// this runs (the matched "attn_qkv" is a replacement target) — confirm
// against the caller.
func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
79+
var rest []Tensor
80+
var out []*ggml.Tensor
81+
for _, t := range ts {
82+
switch {
83+
case strings.Contains(t.Name(), "attn_qkv"):
84+
out = append(out, slices.Collect(splitDim(t, 0,
85+
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
86+
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
87+
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
88+
))...)
89+
case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
// assumes the weight has at least two dimensions; shape[1] and shape[2:]
// would panic otherwise
90+
shape := t.Shape()
91+
out = append(out, &ggml.Tensor{
92+
Name: t.Name(),
93+
Kind: t.Kind(),
// appending onto a fresh one-element slice builds a new shape and does
// not write into the slice returned by t.Shape()
94+
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
95+
WriterTo: t,
96+
})
97+
default:
98+
rest = append(rest, t)
99+
}
100+
}
101+
102+
return append(m.qwen3Model.Tensors(rest), out...)
103+
}
104+
105+
// Replacements returns the text model's replacement list followed by the
// vision-specific entries below. As with the text model, the entries are
// presumably consumed as (old, new) pairs — confirm against the caller.
// "attn.qkv" maps to "attn_qkv", the name Tensors matches on when splitting
// the fused projection.
func (m *qwen3VLModel) Replacements() []string {
106+
return append(
107+
m.qwen3Model.Replacements(),
108+
"model.language_", "",
109+
"model.visual", "v",
110+
"patch_embed.proj", "patch_embed",
111+
"blocks", "blk",
112+
"attn.qkv", "attn_qkv",
113+
"attn.proj", "attn_out",
114+
"deepstack_merger_list", "deepstack_merger",
115+
)
116+
}

convert/tensor.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ type split struct {
1919
dim int
2020
slices []tensor.Slice
2121

22-
// fn is an optional function to apply to the tensor after slicing
23-
fn func(tensor.Tensor) (tensor.Tensor, error)
22+
// afterFunc is an optional function to apply to the tensor after slicing
23+
afterFunc func(tensor.Tensor) (tensor.Tensor, error)
2424
}
2525

2626
// splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
@@ -54,8 +54,8 @@ func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
5454

5555
tt = tensor.Materialize(tt)
5656

57-
if split.fn != nil {
58-
tt, err = split.fn(tt)
57+
if split.afterFunc != nil {
58+
tt, err = split.afterFunc(tt)
5959
if err != nil {
6060
return nil, err
6161
}

convert/tensor_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ func TestSplitDim(t *testing.T) {
432432
t.Run("split with transpose", func(t *testing.T) {
433433
next, stop := iter.Pull(splitDim(&r, 1,
434434
split{Replacer: strings.NewReplacer("a", "x")},
435-
split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
435+
split{Replacer: strings.NewReplacer("b", "y"), afterFunc: func(tt tensor.Tensor) (tensor.Tensor, error) {
436436
return tensor.Transpose(tt, 1, 0)
437437
}},
438438
))

fs/ggml/ggml.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,13 +242,13 @@ func (kv KV) OllamaEngineRequired() bool {
242242
return slices.Contains([]string{
243243
"gemma3",
244244
"gemma3n",
245-
"mistral3",
246-
"qwen3",
247-
"qwen3moe",
245+
"gptoss", "gpt-oss",
248246
"llama4",
247+
"mistral3",
249248
"mllama",
250249
"qwen25vl",
251-
"gptoss", "gpt-oss",
250+
"qwen3", "qwen3moe",
251+
"qwen3vl", "qwen3vlmoe",
252252
}, kv.Architecture())
253253
}
254254

0 commit comments

Comments
 (0)