1
1
package org .beehive .gpullama3 .model .loader ;
2
2
3
- import org .beehive .gpullama3 .LlamaApp ;
4
3
import org .beehive .gpullama3 .Options ;
5
- import org .beehive .gpullama3 .auxiliary .Timer ;
6
4
import org .beehive .gpullama3 .core .model .GGMLType ;
7
5
import org .beehive .gpullama3 .core .model .GGUF ;
8
6
import org .beehive .gpullama3 .core .model .tensor .ArrayFloatTensor ;
21
19
import org .beehive .gpullama3 .tokenizer .impl .Qwen3Tokenizer ;
22
20
import org .beehive .gpullama3 .tokenizer .impl .Tokenizer ;
23
21
import org .beehive .gpullama3 .tokenizer .vocabulary .Vocabulary ;
22
+ import org .beehive .gpullama3 .tornadovm .TornadoVMMasterPlan ;
24
23
import uk .ac .manchester .tornado .api .types .arrays .FloatArray ;
25
24
26
25
import java .io .IOException ;
@@ -40,11 +39,9 @@ public Model loadModel() {
40
39
Map <String , Object > metadata = gguf .getMetadata ();
41
40
String basename = (String ) metadata .get ("general.basename" );
42
41
43
- String modelName = "DeepSeek-R1-Distill-Qwen" .equals (basename )
44
- ? "DeepSeek-R1-Distill-Qwen"
45
- : "Qwen2.5" ;
42
+ String modelName = "DeepSeek-R1-Distill-Qwen" .equals (basename ) ? "DeepSeek-R1-Distill-Qwen" : "Qwen2.5" ;
46
43
47
- try ( var ignored = Timer . log ( "Load " + modelName + " model" )) {
44
+ try {
48
45
// Reuse the Qwen3 vocabulary loader.
49
46
Vocabulary vocabulary = loadQwen3Vocabulary (metadata );
50
47
boolean isDeepSeekR1DistillQwen = "DeepSeek-R1-Distill-Qwen" .equals (metadata .get ("general.basename" ));
@@ -55,11 +52,8 @@ public Model loadModel() {
55
52
contextLength = modelContextLength ;
56
53
}
57
54
58
- int numberOfKeyValueHeads = metadata .containsKey ("qwen2.attention.head_count_kv" )
59
- ? (int ) metadata .get ("qwen2.attention.head_count_kv" )
60
- : (int ) metadata .get ("qwen2.attention.head_count" );
61
- Qwen2Configuration config = new Qwen2Configuration (
62
- (int ) metadata .get ("qwen2.embedding_length" ), // dim
55
+ int numberOfKeyValueHeads = metadata .containsKey ("qwen2.attention.head_count_kv" ) ? (int ) metadata .get ("qwen2.attention.head_count_kv" ) : (int ) metadata .get ("qwen2.attention.head_count" );
56
+ Qwen2Configuration config = new Qwen2Configuration ((int ) metadata .get ("qwen2.embedding_length" ), // dim
63
57
(int ) metadata .get ("qwen2.feed_forward_length" ), // hiddendim
64
58
(int ) metadata .get ("qwen2.block_count" ), // numberOfLayers
65
59
(int ) metadata .get ("qwen2.attention.head_count" ), // numberOfHeads
@@ -68,22 +62,17 @@ public Model loadModel() {
68
62
numberOfKeyValueHeads , // numberOfHeadsKey
69
63
numberOfKeyValueHeads , // numberOfHeadsValue
70
64
71
- vocabulary .size (),
72
- modelContextLength , contextLength ,
73
- false ,
74
- (float ) metadata .get ("qwen2.attention.layer_norm_rms_epsilon" ),
75
- (float ) metadata .get ("qwen2.rope.freq_base" )
76
- );
65
+ vocabulary .size (), modelContextLength , contextLength , false , (float ) metadata .get ("qwen2.attention.layer_norm_rms_epsilon" ), (float ) metadata .get ("qwen2.rope.freq_base" ));
77
66
78
67
Weights weights = null ;
79
68
if (loadWeights ) {
80
69
Map <String , GGMLTensorEntry > tensorEntries = GGUF .loadTensors (fileChannel , gguf .getTensorDataOffset (), gguf .getTensorInfos ());
81
70
weights = loadWeights (tensorEntries , config );
82
71
}
83
72
// Qwen2.5-Coder uses <|endoftext|> as its stop token.
84
- ChatTokens chatTokens = isDeepSeekR1DistillQwen ?
85
- new ChatTokens ( "<|begin▁of▁sentence|>" , "" , "" , "<|end▁of▁sentence|>" , "" ) :
86
- new ChatTokens ( "<|im_start|>" , "<|im_end|>" , "" , "<|end_of_text|>" , "<|endoftext|>" );
73
+ ChatTokens chatTokens = isDeepSeekR1DistillQwen
74
+ ? new ChatTokens ("<|begin▁of▁sentence|>" , "" , "" , "<|end▁of▁sentence|>" , "" )
75
+ : new ChatTokens ("<|im_start|>" , "<|im_end|>" , "" , "<|end_of_text|>" , "<|endoftext|>" );
87
76
return new Qwen2 (config , tokenizer , weights , ChatFormat .create (tokenizer , chatTokens ));
88
77
} catch (IOException e ) {
89
78
throw new RuntimeException (e );
@@ -108,7 +97,9 @@ public Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, Configura
108
97
GGMLTensorEntry outputWeight = tensorEntries .getOrDefault ("output.weight" , tokenEmbeddings );
109
98
110
99
if (Options .getDefaultOptions ().useTornadovm ()) {
111
- System .out .println ("Loading model weights in TornadoVM format (loading " + outputWeight .ggmlType () + " -> " + GGMLType .F16 + ")" );
100
+ if (TornadoVMMasterPlan .ENABLE_TORNADOVM_INIT_TIME ) {
101
+ System .out .println ("Loading model weights in TornadoVM format (loading " + outputWeight .ggmlType () + " -> " + GGMLType .F16 + ")" );
102
+ }
112
103
return createTornadoVMWeights (tensorEntries , config , ropeFreqs , tokenEmbeddings , outputWeight );
113
104
} else {
114
105
return createStandardWeights (tensorEntries , config , ropeFreqs , tokenEmbeddings , outputWeight );
0 commit comments