
Commit 340b35e

Merge createTokenizer methods into Tokenizer constructors
1 parent 733815b commit 340b35e

3 files changed (+34, -51 lines)
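At a glance: the two private factory methods on ModelLoader are deleted, and each tokenizer now builds itself from GGUF metadata. Only the signatures below, all taken from the diffs that follow, change at the API surface:

    // before: private factories on ModelLoader
    private static Tokenizer createLlama3Tokenizer(Map<String, Object> metadata, Vocabulary vocabulary)
    private static Tokenizer createMistralTokenizer(Map<String, Object> metadata, Vocabulary vocabulary)

    // after: public constructors on the tokenizer classes
    public LlamaTokenizer(Map<String, Object> metadata, Vocabulary vocabulary)
    public MistralTokenizer(Map<String, Object> metadata, Vocabulary vocabulary)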

src/main/java/com/example/loader/weights/ModelLoader.java

Lines changed: 0 additions & 46 deletions
@@ -13,15 +13,7 @@
 import com.example.model.Configuration;
 import com.example.model.Model;
 import com.example.model.ModelType;
-import com.example.model.llama.LlamaConfiguration;
-import com.example.model.llama.Llama;
-import com.example.model.mistral.Mistral;
-import com.example.model.mistral.MistralConfiguration;
 import com.example.inference.operation.RoPE;
-import com.example.tokenizer.impl.LlamaTokenizer;
-import com.example.tokenizer.impl.MistralTokenizer;
-import com.example.tokenizer.impl.Tokenizer;
-import com.example.tokenizer.vocabulary.Vocabulary;
 import uk.ac.manchester.tornado.api.types.HalfFloat;
 import uk.ac.manchester.tornado.api.types.arrays.ByteArray;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
@@ -33,20 +25,13 @@
 import java.nio.channels.FileChannel;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
-import java.util.Arrays;
-import java.util.List;
 import java.util.Map;
 import java.util.function.IntFunction;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;

 public final class ModelLoader {
     private static final String TOKENIZER_LLAMA_3_MODEL = "gpt2";
     private static final String TOKENIZER_MISTRAL_MODEL = "llama";

-    private static final String LLAMA_3_PATTERN = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-    private static final String MISTRAL_PATTERN = "\\S+|\\s+";
-
     private static ModelType detectModelType(Map<String, Object> metadata) {
         String name = (String) metadata.get("general.name");
         String tokenizerModel = (String) metadata.get("tokenizer.ggml.model");
@@ -232,37 +217,6 @@ private static Weights createStandardWeights(Map<String, GGMLTensorEntry> tensor
                 FloatBuffer.wrap(ropeFreqs.first()), FloatBuffer.wrap(ropeFreqs.second()), loadQuantized(outputWeight), outputWeight.ggmlType());
     }

-    private static Tokenizer createLlama3Tokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
-        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
-        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
-                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
-
-        int allTokens = vocabulary.size();
-        int baseTokens = 128000; // assume all tokens after the base ones are special.
-        int reservedSpecialTokens = allTokens - baseTokens;
-        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
-
-        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
-
-        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(i -> specialTokensList.get(i), i -> baseTokens + i));
-
-        return new LlamaTokenizer(vocabulary, merges, LLAMA_3_PATTERN, specialTokens);
-
-    }
-
-    private static Tokenizer createMistralTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
-        int[] tokenTypes = (int[]) metadata.get("tokenizer.ggml.token_type");
-        List<Integer> specialTokensList = IntStream.range(0, vocabulary.size()).filter(t -> tokenTypes[t] != 1 && tokenTypes[t] != 6).boxed().toList();
-        Map<String, Integer> specialTokens =
-                IntStream.range(0, specialTokensList.size())
-                        .boxed()
-                        .collect(Collectors.toMap(
-                                t -> vocabulary.get(t),
-                                t -> t)
-                        );
-        return new MistralTokenizer(vocabulary, null, specialTokens, tokenTypes);
-    }
-
     public static FloatTensor loadQuantized(GGMLTensorEntry entry) {
         GGMLType ggmlType = entry.ggmlType();
         return switch (ggmlType) {
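Note that the tokenizer imports disappear from ModelLoader entirely, so tokenizer construction presumably moves to whichever code selects the model; that call site is not part of this commit. A minimal sketch of what it might look like, assuming a GGUF metadata map and a loaded Vocabulary are already in hand:

    // Hypothetical call site, not shown in this commit. How metadata and
    // vocabulary are obtained is an assumption; only the two constructor
    // signatures below are confirmed by the diffs.
    Tokenizer llamaTokenizer   = new LlamaTokenizer(metadata, vocabulary);   // Llama 3 models
    Tokenizer mistralTokenizer = new MistralTokenizer(metadata, vocabulary); // Mistral models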

src/main/java/com/example/tokenizer/impl/LlamaTokenizer.java

Lines changed: 17 additions & 2 deletions
@@ -26,6 +26,7 @@
  * <a href="https://github.com/openai/gpt-2/blob/master/src/encoder.py">GPT 2 tokenizer</a>
  */
 public class LlamaTokenizer implements Tokenizer {
+    private static final String LLAMA_3_PATTERN = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
     // general fields
     private final Pattern compiledPattern;
     private final Vocabulary vocabulary;
@@ -55,9 +56,23 @@ public boolean shouldDisplayToken(int token) {
         return !isSpecialToken(token);
     }

-    public LlamaTokenizer(Vocabulary vocabulary, List<Pair<Integer, Integer>> merges, String regexPattern, Map<String, Integer> specialTokens) {
+    public LlamaTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
+        // load from metadata
+        String[] mergeLines = (String[]) metadata.get("tokenizer.ggml.merges");
+        List<Pair<Integer, Integer>> merges = Arrays.stream(mergeLines).map(line -> line.split(" "))
+                .map(parts -> new Pair<>(vocabulary.getIndex(parts[0]).orElseThrow(), vocabulary.getIndex(parts[1]).orElseThrow())).toList();
+        int allTokens = vocabulary.size();
+        int baseTokens = 128000; // assume all tokens after the base ones are special.
+        int reservedSpecialTokens = allTokens - baseTokens;
+        List<String> specialTokensList = Arrays.stream(vocabulary.tokens(), baseTokens, allTokens).toList();
+
+        assert specialTokensList.stream().allMatch(token -> vocabulary.getIndex(token).isPresent());
+
+        Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed().collect(Collectors.toMap(i -> specialTokensList.get(i), i -> baseTokens + i));
+
+        // init tokenizer object fields
         this.vocabulary = vocabulary;
-        this.compiledPattern = regexPattern != null ? Pattern.compile(regexPattern) : null;
+        this.compiledPattern = Pattern.compile(LLAMA_3_PATTERN);
         this.specialTokens = new HashMap<>(specialTokens);
         this.merges = new HashMap<>();
         for (Pair<Integer, Integer> pair : merges) {
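To make the constructor's special-token bookkeeping concrete, here is a toy-scale, self-contained replay of the IntStream/Collectors.toMap step above. The token strings and the tiny baseTokens cutoff are invented for illustration; in the real constructor the cutoff is 128000:

    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;
    import java.util.stream.IntStream;

    public class SpecialTokenMapDemo {
        public static void main(String[] args) {
            // Stand-in vocabulary: ids 0..2 are base tokens, everything after is special.
            List<String> tokens = List.of("a", "b", "c", "<|eot|>", "<|pad|>");
            int baseTokens = 3; // plays the role of the 128000 cutoff above
            List<String> specialTokensList = tokens.subList(baseTokens, tokens.size());
            // Same idiom as the constructor: position i in the special slice
            // maps back to its absolute vocabulary id, baseTokens + i.
            Map<String, Integer> specialTokens = IntStream.range(0, specialTokensList.size()).boxed()
                    .collect(Collectors.toMap(specialTokensList::get, i -> baseTokens + i));
            System.out.println(specialTokens); // {<|eot|>=3, <|pad|>=4} (map order may vary)
        }
    }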

src/main/java/com/example/tokenizer/impl/MistralTokenizer.java

Lines changed: 17 additions & 3 deletions
@@ -5,6 +5,8 @@
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;

 /**
  * TikToken-style BPE tokenizer with byte fallback.
@@ -23,6 +25,7 @@
  * This guarantees reversibility: every string can be tokenized and decoded back exactly.
  */
 public class MistralTokenizer implements Tokenizer {
+    private static final String MISTRAL_PATTERN = "\\S+|\\s+";
     // general fields
     private final Pattern compiledPattern;
     private final Vocabulary vocabulary;
@@ -58,11 +61,22 @@ public int getTokenType(int tokenIndex) {
         return tokenType[tokenIndex];
     }

-    public MistralTokenizer(Vocabulary vocabulary, String regexPattern, Map<String, Integer> specialTokens, int[] tokenType) {
+    public MistralTokenizer(Map<String, Object> metadata, Vocabulary vocabulary) {
+        // load from metadata
+        int[] tokenTypes = (int[]) metadata.get("tokenizer.ggml.token_type");
+        List<Integer> specialTokensList = IntStream.range(0, vocabulary.size()).filter(t -> tokenTypes[t] != 1 && tokenTypes[t] != 6).boxed().toList();
+        Map<String, Integer> specialTokens =
+                IntStream.range(0, specialTokensList.size())
+                        .boxed()
+                        .collect(Collectors.toMap(
+                                t -> vocabulary.get(t),
+                                t -> t)
+                        );
+        // init tokenizer object fields
         this.vocabulary = vocabulary;
-        this.compiledPattern = regexPattern != null ? Pattern.compile(regexPattern) : null;
+        this.compiledPattern = null;
         this.specialTokens = new HashMap<>(specialTokens);
-        this.tokenType = tokenType;
+        this.tokenType = tokenTypes;
         this.byte0 = vocabulary.getIndex("<0x00>").orElseThrow();
     }
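Unlike the Llama constructor, the Mistral one derives special tokens from GGUF token types rather than from vocabulary position. A toy, self-contained replay of that filter; the type codes follow GGUF's convention as I understand it (1 = normal, 6 = byte), and the sample tokens are invented:

    import java.util.List;
    import java.util.stream.IntStream;

    public class MistralSpecialTokenDemo {
        public static void main(String[] args) {
            // Invented sample vocabulary with GGUF-style token types.
            String[] tokens = {"<s>", "</s>", "hello", "<0x00>"};
            int[] tokenTypes = {3, 3, 1, 6}; // 3 = control, 1 = normal, 6 = byte (assumed GGUF codes)
            // Same filter as the constructor: anything that is neither a
            // normal token (1) nor a byte-fallback token (6) counts as special.
            List<Integer> specialTokensList = IntStream.range(0, tokens.length)
                    .filter(t -> tokenTypes[t] != 1 && tokenTypes[t] != 6)
                    .boxed().toList();
            System.out.println(specialTokensList); // [0, 1], i.e. "<s>" and "</s>"
        }
    }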