14 changes: 6 additions & 8 deletions src/transformers/models/bertweet/tokenization_bertweet.py
@@ -18,8 +18,6 @@
import os
import re

import regex

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging

@@ -513,17 +511,17 @@ def add_from_file(self, f):
######################################################################
# This is the core tokenizing regex:

WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
HANG_RE = re.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
ENT_RE = re.compile(r"&(#?(x?))([^&;\s]+);")


######################################################################
@@ -663,15 +661,15 @@ def reduce_lengthening(text):
"""
Replace repeated character sequences of length 3 or greater with sequences of length 3.
"""
pattern = regex.compile(r"(.)\1{2,}")
pattern = re.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
pattern = regex.compile(
pattern = re.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
# Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
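Note on the bertweet hunks above: the compiled patterns that are visible here (HANG_RE, the reduce_lengthening pattern, and the remove_handles pattern) rely only on constructs the standard-library re engine supports, such as backreferences and fixed-width lookbehind, so the regex-to-re swap should be behaviour-preserving for them. A small illustrative check with made-up inputs and snippet-local names, not part of the diff:

import re

# Same pattern as reduce_lengthening: collapse runs of 3+ identical characters down to 3.
print(re.sub(r"(.)\1{2,}", r"\1\1\1", "waaaaay coooool"))  # -> 'waaay coool'

# The remove_handles pattern also compiles and matches with the standard library.
handle_re = re.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))"
    r"|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
print(handle_re.sub(" ", "thanks @user for the fix"))  # the @-handle is replaced by a space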
(next changed file; filename not shown)
@@ -15,8 +15,7 @@

import json
import os

import regex as re
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging
13 changes: 3 additions & 10 deletions src/transformers/models/clvp/number_normalizer.py
@@ -14,14 +14,7 @@

"""English Normalizer class for CLVP."""

import sys


if sys.version_info >= (3, 11):
# Atomic grouping support was only added to the core RE in Python 3.11
import re
else:
import regex as re
import re


class EnglishNormalizer:
@@ -208,8 +201,8 @@ def normalize_numbers(self, text: str) -> str:
text = re.sub(r"([0-9][0-9,]+[0-9])", self._remove_commas, text)
text = re.sub(r"£([0-9,]*[0-9])", r"\1 pounds", text)
text = re.sub(r"\$([0-9.,]*[0-9])", self._expand_dollars, text)
text = re.sub(r"([0-9]++\.[0-9]+)", self._expand_decimal_point, text)
text = re.sub(r"[0-9]++(st|nd|rd|th)", self._expand_ordinal, text)
text = re.sub(r"([0-9]+\.[0-9]+)", self._expand_decimal_point, text)
text = re.sub(r"[0-9]+(st|nd|rd|th)", self._expand_ordinal, text)
text = re.sub(r"[0-9]+", self._expand_number, text)
return text

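The conditional import deleted in the first hunk existed because the original patterns used possessive quantifiers ([0-9]++), which the standard-library re engine only accepts from Python 3.11 onward. With the quantifiers rewritten as plain [0-9]+, the patterns compile on every supported Python version, and because the text that must follow the digits (a literal dot, or st/nd/rd/th) can never match a digit, dropping the possessive form does not change what these two substitutions match. A small sketch with assumed example text, names local to this snippet:

import re

decimal_re = re.compile(r"([0-9]+\.[0-9]+)")
ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")

print(decimal_re.findall("pi is roughly 3.14159"))    # -> ['3.14159']
print(ordinal_re.search("the 2nd of June").group(0))  # -> '2nd'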
3 changes: 1 addition & 2 deletions src/transformers/models/clvp/tokenization_clvp.py
@@ -14,10 +14,9 @@
"""Tokenization class for CLVP."""

import json
import re
from functools import lru_cache

import regex as re

from ...tokenization_python import AddedToken, PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNormalizer
3 changes: 1 addition & 2 deletions src/transformers/models/ctrl/tokenization_ctrl.py
@@ -14,8 +14,7 @@
"""Tokenization classes for Salesforce CTRL."""

import json

import regex as re
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import gc
import json
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import HFValidationError
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import gc
import json
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import HFValidationError
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch
from huggingface_hub import hf_hub_download

(next changed file; filename not shown)
@@ -15,8 +15,7 @@

import json
import os

import regex
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging, requires_backends
@@ -93,16 +92,16 @@ def get_vocab(self):

def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# expand symbols
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
text = re.sub(";", ",", text)
text = re.sub(":", ",", text)
text = re.sub("-", " ", text)
text = re.sub("&", "and", text)

# strip unnecessary symbols
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
text = re.sub(r"[\(\)\[\]\<\>\"]+", "", text)

# strip whitespaces
text = regex.sub(r"\s+", " ", text)
text = re.sub(r"\s+", " ", text)

text = text.upper()

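The substitutions in prepare_for_tokenization above are plain literals and simple character classes, so they behave identically under the standard-library re module. A standalone sketch of the same cleanup steps applied to a made-up input (the string and the loop are illustrative only, not code from the diff):

import re

text = "rock & roll - live (remastered)"  # hypothetical input
for pattern, repl in [(";", ","), (":", ","), ("-", " "), ("&", "and"),
                      (r"[\(\)\[\]\<\>\"]+", ""), (r"\s+", " ")]:
    text = re.sub(pattern, repl, text)
print(text.upper())  # -> 'ROCK AND ROLL LIVE REMASTERED'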
(next changed file; filename not shown)
@@ -16,8 +16,8 @@
import gc
import glob
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
(next changed file; filename not shown)
@@ -16,9 +16,9 @@
import gc
import json
import os
import re
from pathlib import Path

import regex as re
import tiktoken
import torch
from safetensors.torch import load_file as safe_load
(next changed file; filename not shown)
@@ -17,8 +17,8 @@
import json
import math
import os
import re

import regex as re
import torch
import torch.nn.functional as F

(next changed file; filename not shown)
@@ -13,8 +13,8 @@
import argparse
import json
import os
import re

import regex as re
import torch
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from safetensors.torch import load_file as safe_load_file
(next changed file; filename not shown)
@@ -29,7 +29,7 @@
)
GemmaTokenizerFast = None

import regex as re
import re


"""
2 changes: 1 addition & 1 deletion src/transformers/models/sam3/convert_sam3_to_hf.py
@@ -21,8 +21,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch

from transformers import CLIPTokenizerFast, Sam3Config, Sam3ImageProcessorFast, Sam3Model, Sam3Processor
(next changed file; filename not shown)
@@ -20,8 +20,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch

from transformers import CLIPTokenizerFast
(next changed file; filename not shown)
@@ -262,8 +262,6 @@ def rename_keys(s_dict):

def convert_gin_to_config(gin_file, num_experts):
# Convert a google style config to the hugging face format
import regex as re

with open(gin_file, "r") as f:
raw_gin = f.read()

3 changes: 1 addition & 2 deletions src/transformers/utils/auto_docstring.py
@@ -14,13 +14,12 @@

import inspect
import os
import re
import textwrap
from pathlib import Path
from types import UnionType
from typing import Union, get_args, get_origin

import regex as re

from .doc import (
MODELS_TO_PIPELINE,
PIPELINE_TASKS_TO_SAMPLE_DOCSTRINGS,