
Commit 3146703

Add null tokenizer (NVIDIA-NeMo#11789)
* Add null tokenizer

  Signed-off-by: Sangkug Lym <slym@nvidia.com>

* Apply isort and black reformatting

  Signed-off-by: erhoo82 <erhoo82@users.noreply.github.com>

* cleanup

  Signed-off-by: Sangkug Lym <slym@nvidia.com>

---------

Signed-off-by: Sangkug Lym <slym@nvidia.com>
Signed-off-by: erhoo82 <erhoo82@users.noreply.github.com>
Co-authored-by: erhoo82 <erhoo82@users.noreply.github.com>
1 parent 6db1bfe commit 3146703

File tree

2 files changed: +81 -0 lines changed

  nemo/collections/common/tokenizers/null_tokenizer.py
  nemo/collections/nlp/modules/common/tokenizer_utils.py
nemo/collections/common/tokenizers/null_tokenizer.py

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer


class NullTokenizer(MegatronTokenizer):
    """
    Synthetic tokenizer for performance benchmarking and debugging

    Args:
        vocab_size: vocabulary size for embedding
    """

    def __init__(self, vocab_size):
        super().__init__(None, vocab_size=vocab_size)
        self._vocab_size_without_eod = int(vocab_size)
        self._eod_id = self._vocab_size_without_eod

    def tokenize(self, text):
        return [int(x) for x in text.split(' ')]

    def detokenize(self, ids):
        text = [str(x) for x in ids]
        return ' '.join(text)

    def offsets(self, ids: list[int], text: str) -> list[int]:
        offsets, start_idx = [], 0
        for id_ in ids:
            offsets.append(start_idx)
            start_idx += 1 + len(str(id_))
        return offsets

    @property
    def vocab_size(self):
        return self._vocab_size_without_eod + 1

    @property
    def vocab(self):
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    @property
    def cls(self):
        return -1

    @property
    def sep(self):
        return -1

    @property
    def mask(self):
        return -1

    @property
    def eod(self):
        return self._eod_id

    @property
    def additional_special_tokens_ids(self):
        return None
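
A minimal usage sketch, not part of the commit itself: the tokenizer simply round-trips whitespace-separated integer strings, and the EOD id sits one past the configured vocabulary size. The vocab_size value below is illustrative.

    # Illustrative only -- exercises the NullTokenizer added in this commit.
    from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer

    tokenizer = NullTokenizer(vocab_size=256000)

    ids = tokenizer.tokenize("17 4 982")    # -> [17, 4, 982]
    text = tokenizer.detokenize(ids)        # -> "17 4 982"
    offsets = tokenizer.offsets(ids, text)  # -> [0, 3, 5], start index of each token in text
    assert tokenizer.vocab_size == 256001   # raw vocab + 1 slot for the EOD token
    assert tokenizer.eod == 256000          # EOD id is placed just past the raw vocab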

nemo/collections/nlp/modules/common/tokenizer_utils.py

Lines changed: 6 additions & 0 deletions
@@ -156,6 +156,7 @@ def get_nmt_tokenizer(
     delimiter: Optional[str] = None,
     trust_remote_code: Optional[bool] = False,
     chat_template: Optional[Dict] = None,
+    vocab_size: Optional[int] = None,
 ):
     """
     Args:
@@ -246,6 +247,11 @@ def get_nmt_tokenizer(
         from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer

         return TiktokenTokenizer(vocab_file=vocab_file)
+    elif library == 'null':
+        assert vocab_size is not None
+        from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer
+
+        return NullTokenizer(vocab_size)
     else:
         raise NotImplementedError(
             'Currently we only support "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer'
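
With the factory hook above, the null tokenizer can be requested by name; a minimal sketch, assuming the remaining get_nmt_tokenizer arguments keep their defaults:

    # Illustrative only -- selects the new 'null' library via the factory;
    # vocab_size must be passed or the assert in the new branch fires.
    from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

    tokenizer = get_nmt_tokenizer(library='null', vocab_size=256000)
    print(tokenizer.tokenize("1 2 3"))  # [1, 2, 3]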
