Commit f66cd3f

Tokenizer over custom fields and w/o IDs; BOS/EOS tokens. (#266)

Authored by soldni and Copilot
* pass type and name
* new tests
* adding tests
* more PRers
* tests
* Refactor tokenizer functions to improve type annotations and enhance tokenization output. Updated `make_spec_from_fields` and `recursively_make_struct` to return `type[msgspec.Struct]`. Modified `tokenize_file` to yield `TokenizerOutput` with dtype parameter.
* Refactor tokenizer initialization to use `make_tokenizer` for improved dtype validation. Added a new test case to check for dtype mismatch errors during tokenization.
* documentation.
* Update tests/python/test_tokenizer.py
* Update CI workflow to use `uv` for environment management and command execution. Refactor type annotations in tokenizer-related files to use `Optional` for nullable fields. Enhance S3 utility functions to improve type safety.
* Add `uv venv` command to CI workflow for environment setup
* Update dependencies in pyproject.toml, enhance CI workflow with UV logging format, and modify record_info.py to handle optional fastwarc import with error handling.
* Enhance CI workflow by adding a step to install the latest version of the toolkit, ensuring up-to-date dependencies are used during the build process.
* removed unnecessary deps + better rust caching. Refactor dependency management in pyproject.toml by removing the unnecessary cached-path entry and updating PII detection comments. Enhance CI workflow to cache Rust targets alongside the virtual environment for improved build efficiency. Update imports in various modules to use the new cached_path location.
* one final thing whatever
* Refactor type annotations in test files for improved clarity. Updated type annotations in `test_tokenizer.py` to specify the type of `extracted_sequences` as `list[list[int]]`. Removed unnecessary type ignore comment in `test_nested_struct.py` for better code readability.
* sorting
* typo
* style
* mypy madness
* Disable test for CodeProseCompositionClassifier until path issue is resolved
* sorting
* Update Python version in CI workflow from 3.9 to 3.10
* Refactor error handling in tokenize_file function to improve logging and maintainability. Moved try-except blocks to streamline error management and added logging for line processing errors.
* Remove unused import of 'exception' from the logging module in tokenizer.py to clean up the code.
* 3.10 doesn't delete=false
* removed older langid

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 492fcfb commit f66cd3f

File tree

18 files changed: +619 -70 lines changed

.github/workflows/CI.yml (1 addition, 1 deletion)

@@ -100,7 +100,7 @@ jobs:
       if: steps.cache-venv.outputs.cache-hit != 'true'
       uses: actions/setup-python@v4
       with:
-        python-version: "3.9"
+        python-version: "3.10"
         architecture: "x64"

     - name: Create a new Python environment & install maturin

docs/tokenize.md (4 additions, 0 deletions)

@@ -44,3 +44,7 @@ The following parameters are supported either via CLI (e.g. `dolma tokens --para
 |`work_dir.output`|No| Path to a local scratch directory where temporary output files can be placed. If not provided, Dolma will make one for you and delete it upon completion. |
 |`dryrun`|No| If true, only print the configuration and exit without running the tokenizer. |
 |`seed`|No| Seed for random number generation. |
+|`fields.text_field_name`|No| Name of the text field in the input files. Can be a nested field (e.g. "text.nested"). Defaults to "text". |
+|`fields.text_field_type`|No| Type of the text field in the input files. Defaults to "str". |
+|`fields.id_field_name`|No| Name of the id field in the input files. Can be a nested field (e.g. "id.nested.more"). Can be set to null to disable the id field. Defaults to "id". |
+|`fields.id_field_type`|No| Type of the id field in the input files. Defaults to "str". |
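The dotted names above select nested keys in each JSON line. A minimal sketch of that lookup (the `get_nested_field` helper is hypothetical, not Dolma's actual implementation):

```python
import json


def get_nested_field(doc: dict, dotted_name: str):
    """Walk a dict following a dotted path such as 'text.nested'."""
    value = doc
    for key in dotted_name.split("."):
        value = value[key]
    return value


line = '{"text": {"nested": "hello world"}, "id": "doc-1"}'
doc = json.loads(line)
print(get_nested_field(doc, "text.nested"))  # hello world
print(get_nested_field(doc, "id"))           # doc-1
```

A path like "text.nested" thus reads `doc["text"]["nested"]`, while a plain name like "id" behaves as before.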

pyproject.toml (15 additions, 10 deletions)

@@ -1,15 +1,16 @@
 [project]
 name = "dolma"
-version = "1.1.2"
-description = "Data filters"
+version = "1.2.0"
+description = "Toolkit for pre-processing LLM training data."
 license = { text = "Apache-2.0" }
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10,<3.13"
 dependencies = [
     "anyascii>=0.3.2",
     "blingfire==0.1.8",
-    "boto3>=1.28",
-    "cached-path>=1.5.1",
+    # "boto3>=1.28",
+    "boto3",
+    # "cached-path>=1.5.1", # no longer needed
     # "fasttext==0.9.2", # broken with new version of setuptools; using fasttext-wheel instead
     "fasttext-wheel==0.9.2",
     "fsspec>=2023.6.0",
@@ -26,7 +27,7 @@ dependencies = [
     "requests",
     "rich",
     "s3fs==2023.6.0",
-    "smart-open",
+    "smart-open>=7.0.4",
     "tokenizers>=0.15.0,<=0.19.1",
     "tqdm",
     "uniseg",
@@ -118,14 +119,18 @@ dev = [
 # extension to process code
 code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", "regex"]
 # extension to detect PIIs using presidio
-pii = ["presidio_analyzer==2.2.32", "regex"]
+pii = [
+    # "presidio_analyzer==2.2.32", # presidio causes too many issues with installation, asking users to install it manually
+    "regex",
+]

 # language detection; by default, we use fastttext, everything else is optional
 lang = [
     "fasttext-wheel==0.9.2",
-    "LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon
+    # "LTpycld2==0.42", # LTpycld2/pycld2 all so buggy; recommending user install them on their own
+    "pycld2",
     "lingua-language-detector>=2.0.0",
-    "langdetect>=1.0.9",
+    # "langdetect>=1.0.9",
 ]

 # extension to parse warc files
@@ -227,7 +232,7 @@ recursive = true
 aggressive = 3

 [tool.mypy]
-python_version = "3.9"
+python_version = "3.10"
 ignore_missing_imports = true
 no_site_packages = true
 allow_redefinition = false

python/dolma/cli/resolvers.py (1 addition, 2 deletions)

@@ -1,11 +1,10 @@
 import multiprocessing
 from typing import List, TypeVar

-from cached_path import cached_path
 from omegaconf.omegaconf import OmegaConf as om
 from omegaconf.omegaconf import Resolver

-from ..core.paths import glob_path
+from ..core.paths import cached_path, glob_path

 __all__ = ["cache", "glob", "processes"]
python/dolma/cli/tokenizer.py (13 additions, 0 deletions)

@@ -94,6 +94,14 @@ def deprecated_init(cls, tokenizer_name_or_path: str) -> "TokenizerConfig":
     )


+@dataclass
+class FieldsConfig:
+    text_field_name: str = field(default="text", help="Name of the text field in the input files.")
+    text_field_type: str = field(default="str", help="Type of the text field in the input files.")
+    id_field_name: Optional[str] = field(default="id", help="Name of the id field in the input files.")
+    id_field_type: str = field(default="str", help="Type of the id field in the input files.")
+
+
 @dataclass
 class TokenizationConfig:
     documents: List[str] = field(
@@ -131,6 +139,7 @@ class TokenizationConfig:
         help="Number of sequences to tokenize before writing to disk.",
     )
     ring_size: int = field(default=8, help="Number of files to open in parallel for tokenization.")
+    fields: FieldsConfig = field(default=FieldsConfig(), help="Configuration for the fields in the input files.")
     sample_ring_prop: bool = field(
         default=False,
         help="Whether to sample the ring proportionally to the number of documents in each source.",
@@ -221,4 +230,8 @@ def run(cls, parsed_config: TokenizationConfig):
         sample_ring_prop=parsed_config.sample_ring_prop,
         use_fast_tokenizer=parsed_config.tokenizer.fast,
         refresh_tokenizer=parsed_config.tokenizer.refresh,
+        text_field_name=parsed_config.fields.text_field_name,
+        text_field_type=parsed_config.fields.text_field_type,
+        id_field_name=parsed_config.fields.id_field_name,
+        id_field_type=parsed_config.fields.id_field_type,
     )
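The semantics of `FieldsConfig` can be sketched with a stdlib-only approximation (Dolma's real implementation decodes via dynamically built `msgspec.Struct` types; the `decode_doc` helper below is hypothetical):

```python
import json
from dataclasses import dataclass
from typing import Optional


@dataclass
class FieldsConfig:
    text_field_name: str = "text"
    text_field_type: type = str
    id_field_name: Optional[str] = "id"  # None disables the id field entirely
    id_field_type: type = str


def decode_doc(line: str, cfg: FieldsConfig):
    """Pull the configured text (and optionally id) field out of a JSON line."""
    raw = json.loads(line)
    text = raw[cfg.text_field_name]
    assert isinstance(text, cfg.text_field_type), "text field has unexpected type"
    doc_id = None
    if cfg.id_field_name is not None:
        doc_id = raw[cfg.id_field_name]
        assert isinstance(doc_id, cfg.id_field_type), "id field has unexpected type"
    return text, doc_id


print(decode_doc('{"text": "hi", "id": "1"}', FieldsConfig()))
print(decode_doc('{"text": "hi"}', FieldsConfig(id_field_name=None)))
```

Setting `id_field_name` to `None` is what enables tokenizing files that have no id field at all, one of the headline changes of this commit.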

python/dolma/core/ft_tagger.py (1 addition, 1 deletion)

@@ -11,11 +11,11 @@
 from typing import Iterable, Literal, NamedTuple, Optional

 import smart_open
-from cached_path import cached_path
 from fasttext import train_supervised
 from fasttext.FastText import _FastText

 from .data_types import DocResult, Document, Span, TextSlice
+from .paths import cached_path
 from .taggers import BaseTagger
 from .utils import split_paragraphs, split_sentences
python/dolma/core/utils.py (18 additions, 0 deletions)

@@ -34,6 +34,24 @@
 logger = get_logger(__name__)


+TYPES_MAP = {
+    "object": dict,
+    "dict": dict,
+    "array": list,
+    "list": list,
+    "string": str,
+    "str": str,
+    "number": float,
+    "float": float,
+    "integer": int,
+    "int": int,
+    "boolean": bool,
+    "bool": bool,
+    "null": type(None),
+    "None": type(None),
+}
+
+
 def make_variable_name(name: str, remove_multiple_underscores: bool = False) -> str:
     # use underscores for any non-valid characters in variable name
     name = re.sub(r"[^a-zA-Z0-9_]", "_", name)
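`TYPES_MAP` is what turns the string type names from the CLI/config (e.g. `fields.text_field_type: "str"`) into actual Python types. A self-contained sketch of that lookup, with the map re-declared locally and a hypothetical `resolve_field_type` wrapper standing in for the executor's inline assertions:

```python
# Same mapping the commit adds to python/dolma/core/utils.py: both JSON-style
# names ("string", "integer") and Python-style names ("str", "int") are accepted.
TYPES_MAP = {
    "object": dict, "dict": dict,
    "array": list, "list": list,
    "string": str, "str": str,
    "number": float, "float": float,
    "integer": int, "int": int,
    "boolean": bool, "bool": bool,
    "null": type(None), "None": type(None),
}


def resolve_field_type(name: str) -> type:
    """Map a config string to a Python type, rejecting unknown names."""
    if name not in TYPES_MAP:
        raise ValueError(f"Invalid field type: {name}")
    return TYPES_MAP[name]


print(resolve_field_type("str"))      # <class 'str'>
print(resolve_field_type("integer"))  # <class 'int'>
```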

python/dolma/taggers/language.py (4 additions, 2 deletions)

@@ -24,7 +24,6 @@
 if CLD2_AVAILABLE or TYPE_CHECKING:
     import pycld2 as cld2  # pyright:ignore pylint:disable=import-error

-
 with necessary.necessary("langdetect", soft=True) as LANGDETECT_AVAILABLE:
     if LANGDETECT_AVAILABLE or TYPE_CHECKING:
         from langdetect import PROFILES_DIRECTORY, DetectorFactory, LangDetectException
@@ -98,7 +97,10 @@ class Cld2LanguageTagger(BaseLanguageTagger):
     def __init__(self) -> None:
         super().__init__()
         if not CLD2_AVAILABLE:
-            raise ImportError("pycld2 is not installed, please run `pip install dolma[lang]`.")
+            raise ImportError(
+                "pycld2 is not available, please run `pip install pycld2` "
+                "or `pip install LTpycld2` (whichever works)."
+            )

     def _sanitize_input(self, text: str) -> str:
         return self.RE_BAD_CHARS.sub("", text)

python/dolma/taggers/pii.py (1 addition, 1 deletion)

@@ -68,7 +68,7 @@ def __init__(
         # presidio
         if self.method == self.PRESIDIO:
             if not PRESIDIO_AVAILABLE:
-                raise RuntimeError("Presidio is not available; please run `pip install dolma[pii]`")
+                raise RuntimeError("Presidio is not available; please run `pip install presidio-analyzer`")
             self.analyzer = AnalyzerEngine()

     def predict(self, doc: Document) -> DocResult:

python/dolma/tokenizer/executor.py (40 additions, 9 deletions)

@@ -14,9 +14,10 @@
 from ..core.loggers import get_logger
 from ..core.parallel import BaseParallelProcessor, QueueType
 from ..core.paths import get_size, glob_path, join_path, mkdir_p
+from ..core.utils import TYPES_MAP
 from .data_types import TokenizerOutput  # pylint: disable=unused-import
 from .memmap_writer import MemmapWriter
-from .tokenizer import Tokenizer, tokenize_file
+from .tokenizer import make_tokenizer, tokenize_file

 TokenizedSeqsQueueType: TypeAlias = "Queue[List[TokenizerOutput]]"
 PathsQueueType: TypeAlias = "Queue[str]"
@@ -89,6 +90,18 @@ def process_single(cls, source_path: str, destination_path: str, queue: QueueTyp
         # whether to split the special tokens into separate tokens, e.g. <s> -> < s >
         tokenizer_kwargs["encode_special_tokens"] = kwargs.pop("encode_special_tokens", None) or False

+        # name of the text and id fields in the input files
+        tokenizer_kwargs["text_field_name"] = kwargs.pop("text_field_name", None) or "text"
+        tokenizer_kwargs["id_field_name"] = kwargs.pop("id_field_name", None)
+
+        # type of the text and id fields in the input files
+        text_field_type_str = kwargs.pop("text_field_type", None) or "str"
+        assert text_field_type_str in TYPES_MAP, f"Invalid text field type: {text_field_type_str}"
+        tokenizer_kwargs["text_field_type"] = TYPES_MAP[text_field_type_str]
+        id_field_type_str = kwargs.pop("id_field_type", None) or "str"
+        assert id_field_type_str in TYPES_MAP, f"Invalid id field type: {id_field_type_str}"
+        tokenizer_kwargs["id_field_type"] = TYPES_MAP[id_field_type_str]
+
         # this is useful for making sure the queue does not grows too much
         cpu_count = multiprocessing.cpu_count()

@@ -305,6 +318,10 @@ def tokenize_in_parallel(
     sample_ring_prop: bool = False,
     refresh_tokenizer: int = 0,
     use_fast_tokenizer: bool = True,
+    text_field_name: str = "text",
+    text_field_type: str = "str",
+    id_field_name: Optional[str] = "id",
+    id_field_type: str = "str",
 ):
     """
     Tokenizes the input sources in parallel using multiple writers and readers.
@@ -334,18 +351,28 @@ def tokenize_in_parallel(
         refresh_tokenizer (int, optional): Number of batches after which to refresh the tokenizer.
             Defaults to 0, which means the tokenizer will not be refreshed.
         use_fast_tokenizer (bool, optional): Whether to use the fast tokenizer. Defaults to True.
+        text_field_name (str, optional): Name of the text field in the input files. Defaults to "text".
+        text_field_type (str, optional): Type of the text field in the input files. Defaults to "str".
+        id_field_name (str, optional): Name of the id field in the input files. Defaults to "id". Set to None if
+            the input files do not have an id field.
+        id_field_type (str, optional): Type of the id field in the input files. Defaults to "str".
     """
     # variables to avoid issues with parallelism
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

-    # do it once so it gets cached (unless it's local path, so no need)
-    if not os.path.exists(tokenizer_name_or_path):
-        Tokenizer.from_pretrained(
-            identifier=tokenizer_name_or_path,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            pad_token_id=pad_token_id,
-            use_fast=use_fast_tokenizer,
+    # do it once so it gets cached, and we can check if dtype is correct
+
+    tokenizer = make_tokenizer(
+        tokenizer_name_or_path,
+        bos_token_id=bos_token_id,
+        eos_token_id=eos_token_id,
+        pad_token_id=pad_token_id,
+        use_fast=use_fast_tokenizer,
+    )
+    if tokenizer.dtype != np.dtype(dtype):
+        raise TypeError(
+            f"Numpy type mismatch: provided dtype '{dtype}' does not match "
+            f"inferred dtype '{tokenizer.dtype}' based on vocab size {tokenizer.vocab_size:,}!"
         )

     # get a run hash
@@ -380,4 +407,8 @@ def tokenize_in_parallel(
         sample_ring_prop=sample_ring_prop,
         use_fast_tokenizer=use_fast_tokenizer,
         refresh_tokenizer=refresh_tokenizer,
+        text_field_name=text_field_name,
+        text_field_type=text_field_type,
+        id_field_name=id_field_name,
+        id_field_type=id_field_type,
     )
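The new dtype check above compares the user-supplied dtype against the one the tokenizer infers from its vocab size. The inference logic is not shown in this diff; a plausible sketch (helper names `infer_dtype`/`check_dtype` are hypothetical, and the exact rule Dolma uses may differ):

```python
import numpy as np


def infer_dtype(vocab_size: int) -> np.dtype:
    """Smallest unsigned integer dtype that can hold ids in [0, vocab_size)."""
    for dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
        if vocab_size - 1 <= np.iinfo(dtype).max:
            return np.dtype(dtype)
    raise ValueError(f"Vocab size {vocab_size:,} too large for any integer dtype")


def check_dtype(provided: str, vocab_size: int) -> None:
    """Raise, as tokenize_in_parallel now does, on a provided/inferred mismatch."""
    inferred = infer_dtype(vocab_size)
    if np.dtype(provided) != inferred:
        raise TypeError(
            f"Numpy type mismatch: provided dtype '{provided}' does not match "
            f"inferred dtype '{inferred}' based on vocab size {vocab_size:,}!"
        )


check_dtype("uint16", 50_254)  # ok: 50,254 ids fit in uint16
```

Catching this up front avoids silently writing truncated token ids to the memmap output when the vocab is larger than the chosen dtype can represent.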
