Skip to content

Commit 9592e1f

Browse files
committed
fix: uint16→uint32 dtype detection for large-vocab tokenizers
Root cause: the Qwen tokenizer (vocab size ~151K > 65535) requires uint32 token storage, but the data loaders only switched dtype when tokenizer_type == 'tiktoken'. All data loaders now also check vocab_size > 65535 from meta.pkl.
1 parent 36ab1af commit 9592e1f

File tree

3 files changed

+16
-6
lines changed

3 files changed

+16
-6
lines changed

supergpt/training/distill.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,12 +159,20 @@ def distill(
159159
print(f" Student: {student_params/1e6:.1f}M params "
160160
f"({teacher_params/student_params:.1f}× compression)")
161161

162-
# Load data
162+
# Load data — auto-detect dtype from meta.pkl
163163
block_size = teacher_config.block_size
164+
data_dtype = np.uint16 # default
165+
meta_path = os.path.join(data_dir, "meta.pkl")
166+
if os.path.exists(meta_path):
167+
import pickle
168+
with open(meta_path, "rb") as f:
169+
meta = pickle.load(f)
170+
if meta.get("vocab_size", 0) > 65535 or meta.get("tokenizer_type") == "tiktoken":
171+
data_dtype = np.uint32
164172
train_data = np.memmap(os.path.join(data_dir, "train.bin"),
165-
dtype=np.uint16, mode="r")
173+
dtype=data_dtype, mode="r")
166174
val_data = np.memmap(os.path.join(data_dir, "val.bin"),
167-
dtype=np.uint16, mode="r")
175+
dtype=data_dtype, mode="r")
168176

169177
def get_batch(split):
170178
data = train_data if split == "train" else val_data

supergpt/training/finetune.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ def load_data(data_dir: str, split: str, block_size: int, batch_size: int, devic
4141
if os.path.exists(meta_path):
4242
with open(meta_path, "rb") as f:
4343
meta = pickle.load(f)
44-
if meta.get("tokenizer_type") == "tiktoken":
44+
vocab_size = meta.get("vocab_size", 0)
45+
if vocab_size > 65535 or meta.get("tokenizer_type") == "tiktoken":
4546
dtype = np.uint32
4647
else:
4748
dtype = np.uint16

supergpt/training/train.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,12 +343,13 @@ def load_data(data_dir: str, split: str, block_size: int, batch_size: int, devic
343343
"""Load a batch of data from the memory-mapped binary file."""
344344
data_path = os.path.join(data_dir, f"{split}.bin")
345345

346-
# Detect dtype from meta.pkl
346+
# Detect dtype from meta.pkl — use uint32 if vocab > 65535
347347
meta_path = os.path.join(data_dir, "meta.pkl")
348348
if os.path.exists(meta_path):
349349
with open(meta_path, "rb") as f:
350350
meta = pickle.load(f)
351-
if meta.get("tokenizer_type") == "tiktoken":
351+
vocab_size = meta.get("vocab_size", 0)
352+
if vocab_size > 65535 or meta.get("tokenizer_type") == "tiktoken":
352353
dtype = np.uint32
353354
else:
354355
dtype = np.uint16

0 commit comments

Comments (0)