
Commit 41a1733

Add comments for tokenizers
1 parent 726eec2 commit 41a1733

File tree

2 files changed: +28 −3 lines changed

src/main/java/com/example/tokenizer/impl/LlamaTokenizer.java

Lines changed: 15 additions & 0 deletions

@@ -10,6 +10,21 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+/**
+ * GPT-2-style BPE tokenizer (even though it is called "llama") with an explicit merges list.
+ * <p>
+ * BPE (Byte Pair Encoding):
+ * A sub-word tokenization algorithm that iteratively merges the most frequent pairs of symbols in a corpus to build a vocabulary of common character sequences.
+ * <p>
+ * GPT-2-style tokenization:
+ * Applies BPE at the byte level, ensuring all UTF-8 inputs are representable, and uses tokens that preserve leading spaces (e.g. 'Ġthe').
+ * <p>
+ * Explicit merges list:
+ * A fixed sequence of learned merge rules that deterministically reconstructs the tokenizer's vocabulary during inference without retraining.
+ * <p>
+ * Based on <a href="https://github.com/karpathy/minbpe">minbpe</a>; algorithmically follows the
+ * <a href="https://github.com/openai/gpt-2/blob/master/src/encoder.py">GPT-2 tokenizer</a>.
+ */
 public class LlamaTokenizer implements Tokenizer {
     // general fields
     private final Pattern compiledPattern;
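The merge step that the new Javadoc describes can be sketched in a few lines of Java. This is a hypothetical minimal example (the class and method names are invented; it is not the repo's LlamaTokenizer): it counts adjacent token-id pairs and applies one merge rule, the core operation that an explicit merges list replays deterministically at inference time.

```java
import java.util.*;

// Minimal BPE sketch (hypothetical, for illustration only):
// find the most frequent adjacent pair of token ids and replace
// every occurrence of it with a newly allocated id.
public class BpeSketch {
    // Count how often each adjacent pair of ids occurs in the sequence.
    static Map<List<Integer>, Integer> pairCounts(List<Integer> tokens) {
        Map<List<Integer>, Integer> counts = new HashMap<>();
        for (int i = 0; i + 1 < tokens.size(); i++) {
            counts.merge(List.of(tokens.get(i), tokens.get(i + 1)), 1, Integer::sum);
        }
        return counts;
    }

    // Replace every occurrence of `pair` with the single id `newId`.
    static List<Integer> merge(List<Integer> tokens, List<Integer> pair, int newId) {
        List<Integer> out = new ArrayList<>();
        int i = 0;
        while (i < tokens.size()) {
            if (i + 1 < tokens.size()
                    && tokens.get(i).equals(pair.get(0))
                    && tokens.get(i + 1).equals(pair.get(1))) {
                out.add(newId);
                i += 2; // skip both members of the merged pair
            } else {
                out.add(tokens.get(i));
                i += 1;
            }
        }
        return out;
    }

    public static void main(String[] args) {
        // "aaab" as raw byte values: 97, 97, 97, 98
        List<Integer> tokens = new ArrayList<>(List.of(97, 97, 97, 98));
        List<Integer> top = Collections.max(
                pairCounts(tokens).entrySet(), Map.Entry.comparingByValue()).getKey();
        // Byte-level BPE reserves ids 0..255 for raw bytes, so the
        // first merged token gets id 256.
        tokens = merge(tokens, top, 256);
        System.out.println(tokens); // prints [256, 97, 98]
    }
}
```

Training repeats this pick-and-merge loop on a corpus and records each chosen pair in order; that recorded sequence is the "explicit merges list" the Javadoc refers to.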

src/main/java/com/example/tokenizer/impl/MistralTokenizer.java

Lines changed: 13 additions & 3 deletions

@@ -7,10 +7,20 @@
 import java.util.regex.Pattern;
 
 /**
- * Byte Pair Encoding tokenizer.
+ * TikToken-style BPE tokenizer with byte fallback.
  * <p>
- * Based on <a href="https://github.com/karpathy/minbpe">minbpe</a>, algorithmically follows along the
- * <a href="https://github.com/openai/gpt-2/blob/master/src/encoder.py">GPT 2 tokenizer</a>
+ * TikToken-style:
+ * A Byte Pair Encoding (BPE) strategy that converts text to UTF-8 bytes.
+ * Frequent pairs of bytes (or tokens) are merged according to a learned vocabulary.
+ * This reduces long words to common subwords or whole-word tokens.
+ * If a word or character is not found, it falls back to byte-level tokens.
+ * <p>
+ * Byte fallback:
+ * A fail-safe mechanism.
+ * It ensures every byte has a token, so any input (even unknown words, misspellings, foreign languages, emojis, or binary) can be tokenized.
+ * If a token is not found in the merges or vocabulary, it falls back to the individual bytes.
+ * Each byte is wrapped as a special token like <0xF0>; these tokens are part of the tokenizer's extended vocabulary.
+ * This guarantees reversibility: every string can be tokenized and decoded back exactly.
  */
 public class MistralTokenizer implements Tokenizer {
     // general fields
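The byte-fallback behaviour described in this Javadoc can be illustrated with a small sketch. This is a hypothetical example, not the repo's MistralTokenizer: a piece that is not in the (toy) vocabulary is split into one `<0xNN>` token per UTF-8 byte, so every input stays encodable.

```java
import java.nio.charset.StandardCharsets;
import java.util.*;

// Hypothetical byte-fallback sketch: look a piece up in a toy vocabulary,
// and emit one <0xNN> byte token per UTF-8 byte when it is unknown.
public class ByteFallbackSketch {
    static List<String> encode(String piece, Set<String> vocab) {
        if (vocab.contains(piece)) {
            return List.of(piece); // known piece: one token
        }
        // Unknown piece: fall back to one special token per UTF-8 byte.
        List<String> out = new ArrayList<>();
        for (byte b : piece.getBytes(StandardCharsets.UTF_8)) {
            // Mask with 0xFF because Java bytes are signed.
            out.add(String.format("<0x%02X>", b & 0xFF));
        }
        return out;
    }

    public static void main(String[] args) {
        Set<String> vocab = Set.of("hello", "world");
        System.out.println(encode("hello", vocab)); // prints [hello]
        // U+1F642 is the 4-byte UTF-8 sequence F0 9F 99 82.
        System.out.println(encode("\uD83D\uDE42", vocab)); // prints [<0xF0>, <0x9F>, <0x99>, <0x82>]
    }
}
```

Because each `<0xNN>` token maps back to exactly one byte, decoding simply concatenates the bytes and re-interprets them as UTF-8, which is what makes the round trip lossless.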
