Skip to content

Commit d053e9c

Browse files
authored
Merge pull request #37 from orionpapadakis/feat/qwen3
[models] Support for Qwen3 models
2 parents b9976d2 + 2fd98ef commit d053e9c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+3248
-493
lines changed

README.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
<strong>Llama3</strong> models written in <strong>native Java</strong> automatically accelerated on GPUs with <a href="https://github.com/beehive-lab/TornadoVM" target="_blank"><strong>TornadoVM</strong></a>.
1818
Runs Llama3 inference efficiently using TornadoVM's GPU acceleration.
1919
<br><br>
20-
Currently, supports <strong>Llama3</strong> and <strong>Mistral</strong> models in the GGUF format.
20+
Currently, supports <strong>Llama3</strong>, <strong>Mistral</strong>, and <strong>Qwen3</strong> models in the GGUF format.
2121
<br><br>
2222
Builds on <a href="https://github.com/mukel/llama3.java">Llama3.java</a> by <a href="https://github.com/mukel">Alfonso² Peterssen</a>.
2323
The previous integration of TornadoVM and Llama2 can be found in <a href="https://github.com/mikepapadim/llama2.tornadovm.java">llama2.tornadovm</a>.
@@ -187,6 +187,7 @@ llama-tornado --gpu --model beehive-llama-3.2-1b-instruct-fp16.gguf --prompt "te
187187
-Dtornado.load.tornado.implementation=uk.ac.manchester.tornado.runtime.common.Tornado \
188188
-Dtornado.load.annotation.implementation=uk.ac.manchester.tornado.annotation.ASMClassVisitor \
189189
-Dtornado.load.annotation.parallel=uk.ac.manchester.tornado.api.annotations.Parallel \
190+
-Dtornado.tvm.maxbytecodesize=65536 \
190191
-Duse.tornadovm=true \
191192
-Dtornado.threadInfo=false \
192193
-Dtornado.debug=false \
@@ -237,6 +238,12 @@ Download `FP16` quantized `Llama-3` .gguf files from:
237238
Download `FP16` quantized `Mistral` .gguf files from:
238239
- https://huggingface.co/collections/beehive-lab/mistral-gpullama3java-684afabb206136d2e9cd47e0
239240

241+
Download `FP16` quantized `Qwen3` .gguf files from:
242+
- https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF
243+
- https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF
244+
- https://huggingface.co/ggml-org/Qwen3-4B-GGUF
245+
- https://huggingface.co/ggml-org/Qwen3-8B-GGUF
246+
240247
Please be gentle with [huggingface.co](https://huggingface.co) servers:
241248

242249
**Note** FP16 models are first-class citizens for the current version.
@@ -252,6 +259,18 @@ wget https://huggingface.co/beehive-lab/Llama-3.2-8B-Instruct-GGUF-FP16/resolve/
252259
253260
# Mistral (7B) - FP16
254261
wget https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF/resolve/main/Mistral-7B-Instruct-v0.3.fp16.gguf
262+
263+
# Qwen3 (0.6B) - FP16
264+
wget https://huggingface.co/ggml-org/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-f16.gguf
265+
266+
# Qwen3 (1.7B) - FP16
267+
wget https://huggingface.co/ggml-org/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-f16.gguf
268+
269+
# Qwen3 (4B) - FP16
270+
wget https://huggingface.co/ggml-org/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-f16.gguf
271+
272+
# Qwen3 (8B) - FP16
273+
wget https://huggingface.co/ggml-org/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-f16.gguf
255274
```
256275

257276
**[Experimental]** you can download the Q8 and Q4 used in the original implementation of Llama3.java, but for now are going to be dequanted to FP16 for TornadoVM support:

external/tornadovm

llama-tornado

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ class LlamaRunner:
7575
"-Dtornado.load.tornado.implementation=uk.ac.manchester.tornado.runtime.common.Tornado",
7676
"-Dtornado.load.annotation.implementation=uk.ac.manchester.tornado.annotation.ASMClassVisitor",
7777
"-Dtornado.load.annotation.parallel=uk.ac.manchester.tornado.api.annotations.Parallel",
78+
"-Dtornado.tvm.maxbytecodesize=65536"
7879
]
7980
cmd.extend(tornado_config)
8081

src/main/java/com/example/LlamaApp.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import com.example.inference.sampler.CategoricalSampler;
66
import com.example.inference.sampler.Sampler;
77
import com.example.inference.sampler.ToppSampler;
8-
import com.example.loader.weights.ModelLoader;
8+
import com.example.model.loader.ModelLoader;
99
import com.example.model.Model;
1010
import com.example.tornadovm.FloatArrayUtils;
1111
import uk.ac.manchester.tornado.api.types.arrays.FloatArray;

src/main/java/com/example/aot/AOT.java

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import com.example.auxiliary.Timer;
44
import com.example.core.model.GGUF;
55
import com.example.core.model.tensor.GGMLTensorEntry;
6+
import com.example.model.loader.LlamaModelLoader;
67
import com.example.model.Model;
78
import com.example.Options;
9+
import com.example.model.format.LlamaChatFormat;
810
import com.example.model.llama.Llama;
9-
import com.example.loader.weights.ModelLoader;
10-
import com.example.loader.weights.Weights;
11+
import com.example.inference.weights.Weights;
12+
import com.example.tokenizer.impl.LlamaTokenizer;
1113

1214
import java.io.IOException;
1315
import java.nio.channels.FileChannel;
@@ -28,8 +30,10 @@
2830
public final class AOT {
2931
AOT.PartialModel preLoaded = AOT.PRELOADED_GGUF;
3032

33+
static LlamaModelLoader modelLoader;
3134

32-
record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {}
35+
record PartialModel(String modelFileName, Llama model, long tensorDataOffset, Map<String, GGUF.GGUFTensorInfo> tensorInfos) {
36+
}
3337

3438
private static final PartialModel PRELOADED_GGUF = preLoadGGUF(System.getProperty("llama.PreloadGGUF"));
3539

@@ -44,12 +48,9 @@ private static PartialModel preLoadGGUF(String modelPath) {
4448
}
4549
GGUF gguf = GGUF.loadModel(path);
4650
try (FileChannel fileChannel = FileChannel.open(path, StandardOpenOption.READ)) {
47-
return new PartialModel(
48-
path.getFileName().toString(),
49-
Llama.loadModel(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false), // TODO: needs proper handling for AOT
50-
gguf.getTensorDataOffset(),
51-
gguf.getTensorInfos()
52-
);
51+
modelLoader = new LlamaModelLoader(fileChannel, gguf, Options.DEFAULT_MAX_TOKENS, false);
52+
return new PartialModel(path.getFileName().toString(), modelLoader.loadModel(), // TODO: needs proper handling for AOT
53+
gguf.getTensorDataOffset(), gguf.getTensorInfos());
5354
}
5455
} catch (IOException e) {
5556
throw new RuntimeException(e);
@@ -73,12 +74,11 @@ public static Model tryUsePreLoaded(Path modelPath, int contextLength) throws IO
7374
return null;
7475
}
7576
Llama baseModel = preLoaded.model();
76-
try (var timer = Timer.log("Load tensors from pre-loaded model");
77-
var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
77+
try (var timer = Timer.log("Load tensors from pre-loaded model"); var fileChannel = FileChannel.open(modelPath, StandardOpenOption.READ)) {
7878
// Load only the tensors (mmap slices).
7979
Map<String, GGMLTensorEntry> tensorEntries = GGUF.loadTensors(fileChannel, preLoaded.tensorDataOffset(), preLoaded.tensorInfos());
80-
Weights weights = ModelLoader.loadWeights(tensorEntries, baseModel.configuration());
81-
return new Llama(baseModel.configuration().withContextLength(contextLength), baseModel.tokenizer(), weights);
80+
Weights weights = modelLoader.loadWeights(tensorEntries, baseModel.configuration());
81+
return new Llama(baseModel.configuration().withContextLength(contextLength), baseModel.tokenizer(), weights, new LlamaChatFormat((LlamaTokenizer) baseModel.tokenizer()));
8282
}
8383
}
8484
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
/**
 * Describes the leading byte of a multi-byte UTF-8 sequence.
 *
 * <p>A first byte {@code b} starts a sequence of {@code len} bytes total when
 * {@code (b & mask) == pattern}.
 */
public record Utf8Mask(int mask, int pattern, int len) {
    /** Leading-byte masks/patterns for 2-, 3- and 4-byte UTF-8 sequences. */
    public static final Utf8Mask[] MASKS = {
            new Utf8Mask(0xE0, 0xC0, 2),
            new Utf8Mask(0xF0, 0xE0, 3),
            new Utf8Mask(0xF8, 0xF0, 4)
    };
}

src/main/java/com/example/core/model/tensor/ArrayFloatTensor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public final class ArrayFloatTensor extends FloatTensor {
1313

1414
final float[] values;
1515

16-
ArrayFloatTensor(float[] values) {
16+
public ArrayFloatTensor(float[] values) {
1717
this.values = values;
1818
}
1919

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package com.example.core.model.tensor;
2+
3+
import com.example.core.model.GGMLType;
4+
import jdk.incubator.vector.FloatVector;
5+
import jdk.incubator.vector.VectorSpecies;
6+
7+
import java.lang.foreign.MemorySegment;
8+
import java.lang.foreign.ValueLayout;
9+
10+
public final class F32FloatTensor extends FloatTensor {
11+
final int size;
12+
final MemorySegment segment;
13+
14+
public F32FloatTensor(int size, MemorySegment segment) {
15+
this.size = size;
16+
this.segment = segment;
17+
}
18+
19+
@Override
20+
public int size() {
21+
return size;
22+
}
23+
24+
@Override
25+
public GGMLType type() {
26+
return GGMLType.F32;
27+
}
28+
29+
@Override
30+
public MemorySegment asMemorySegment() {
31+
return null;
32+
}
33+
34+
@Override
35+
public float getFloat(int index) {
36+
return segment.get(ValueLayout.OfFloat.JAVA_FLOAT, index * Float.BYTES);
37+
}
38+
39+
@Override
40+
public void setFloat(int index, float value) {
41+
segment.set(ValueLayout.OfFloat.JAVA_FLOAT, index * Float.BYTES, value);
42+
}
43+
44+
@Override
45+
protected FloatVector getFloatVector(VectorSpecies<Float> species, int offset) {
46+
throw new UnsupportedOperationException("getFloatVector is not yet implemented.");
47+
}
48+
}

0 commit comments

Comments
 (0)