14 changes: 6 additions & 8 deletions src/transformers/models/bertweet/tokenization_bertweet.py
@@ -18,8 +18,6 @@
import os
import re

import regex

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging

@@ -513,17 +511,17 @@ def add_from_file(self, f):
######################################################################
# This is the core tokenizing regex:

WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)
WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
HANG_RE = re.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
ENT_RE = re.compile(r"&(#?(x?))([^&;\s]+);")


######################################################################
@@ -663,15 +661,15 @@ def reduce_lengthening(text):
"""
Replace repeated character sequences of length 3 or greater with sequences of length 3.
"""
pattern = regex.compile(r"(.)\1{2,}")
pattern = re.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
pattern = regex.compile(
pattern = re.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
# Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
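Note on the bertweet hunks above: the compiled patterns that are visible here (HANG_RE, the reduce_lengthening pattern, and the remove_handles pattern) rely only on constructs the standard-library re engine supports, such as backreferences and fixed-width lookbehind, so the regex-to-re swap should be behaviour-preserving for them. A small illustrative check with made-up inputs and snippet-local names, not part of the diff:

import re

# Same pattern as reduce_lengthening: collapse runs of 3+ identical characters down to 3.
print(re.sub(r"(.)\1{2,}", r"\1\1\1", "waaaaay coooool"))  # -> 'waaay coool'

# The remove_handles pattern also compiles and matches with the standard library.
handle_re = re.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))"
    r"|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
print(handle_re.sub(" ", "thanks @user for the fix"))  # the @-handle is replaced by a space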
(next changed file; filename not shown)
@@ -15,8 +15,7 @@

import json
import os

import regex as re
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging
13 changes: 3 additions & 10 deletions src/transformers/models/clvp/number_normalizer.py
@@ -14,14 +14,7 @@

"""English Normalizer class for CLVP."""

import sys


if sys.version_info >= (3, 11):
# Atomic grouping support was only added to the core RE in Python 3.11
import re
else:
import regex as re
import re


class EnglishNormalizer:
@@ -208,8 +201,8 @@ def normalize_numbers(self, text: str) -> str:
text = re.sub(r"([0-9][0-9,]+[0-9])", self._remove_commas, text)
text = re.sub(r"£([0-9,]*[0-9])", r"\1 pounds", text)
text = re.sub(r"\$([0-9.,]*[0-9])", self._expand_dollars, text)
text = re.sub(r"([0-9]++\.[0-9]+)", self._expand_decimal_point, text)
text = re.sub(r"[0-9]++(st|nd|rd|th)", self._expand_ordinal, text)
text = re.sub(r"([0-9]+\.[0-9]+)", self._expand_decimal_point, text)
text = re.sub(r"[0-9]+(st|nd|rd|th)", self._expand_ordinal, text)
text = re.sub(r"[0-9]+", self._expand_number, text)
return text

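The conditional import deleted in the first hunk existed because the original patterns used possessive quantifiers ([0-9]++), which the standard-library re engine only accepts from Python 3.11 onward. With the quantifiers rewritten as plain [0-9]+, the patterns compile on every supported Python version, and because the text that must follow the digits (a literal dot, or st/nd/rd/th) can never match a digit, dropping the possessive form does not change what these two substitutions match. A small sketch with assumed example text, names local to this snippet:

import re

decimal_re = re.compile(r"([0-9]+\.[0-9]+)")
ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")

print(decimal_re.findall("pi is roughly 3.14159"))    # -> ['3.14159']
print(ordinal_re.search("the 2nd of June").group(0))  # -> '2nd'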
3 changes: 1 addition & 2 deletions src/transformers/models/clvp/tokenization_clvp.py
@@ -14,10 +14,9 @@
"""Tokenization class for CLVP."""

import json
import re
from functools import lru_cache

import regex as re

from ...tokenization_python import AddedToken, PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNormalizer
3 changes: 1 addition & 2 deletions src/transformers/models/ctrl/tokenization_ctrl.py
@@ -14,8 +14,7 @@
"""Tokenization classes for Salesforce CTRL."""

import json

import regex as re
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import gc
import json
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import HFValidationError
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import gc
import json
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import HFValidationError
(next changed file; filename not shown)
@@ -15,8 +15,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch
from huggingface_hub import hf_hub_download

(next changed file; filename not shown)
@@ -15,8 +15,7 @@

import json
import os

import regex
import re

from ...tokenization_python import PreTrainedTokenizer
from ...utils import logging, requires_backends
@@ -93,16 +92,16 @@ def get_vocab(self):

def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
# expand symbols
text = regex.sub(";", ",", text)
text = regex.sub(":", ",", text)
text = regex.sub("-", " ", text)
text = regex.sub("&", "and", text)
text = re.sub(";", ",", text)
text = re.sub(":", ",", text)
text = re.sub("-", " ", text)
text = re.sub("&", "and", text)

# strip unnecessary symbols
text = regex.sub(r"[\(\)\[\]\<\>\"]+", "", text)
text = re.sub(r"[\(\)\[\]\<\>\"]+", "", text)

# strip whitespaces
text = regex.sub(r"\s+", " ", text)
text = re.sub(r"\s+", " ", text)

text = text.upper()

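The substitutions in prepare_for_tokenization above are plain literals and simple character classes, so they behave identically under the standard-library re module. A standalone sketch of the same cleanup steps applied to a made-up input (the string and the loop are illustrative only, not code from the diff):

import re

text = "rock & roll - live (remastered)"  # hypothetical input
for pattern, repl in [(";", ","), (":", ","), ("-", " "), ("&", "and"),
                      (r"[\(\)\[\]\<\>\"]+", ""), (r"\s+", " ")]:
    text = re.sub(pattern, repl, text)
print(text.upper())  # -> 'ROCK AND ROLL LIVE REMASTERED'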
(next changed file; filename not shown)
@@ -16,8 +16,8 @@
import gc
import glob
import os
import re

import regex as re
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
(next changed file; filename not shown)
@@ -16,9 +16,9 @@
import gc
import json
import os
import re
from pathlib import Path

import regex as re
import tiktoken
import torch
from safetensors.torch import load_file as safe_load
(next changed file; filename not shown)
@@ -17,8 +17,8 @@
import json
import math
import os
import re

import regex as re
import torch
import torch.nn.functional as F

(next changed file; filename not shown)
@@ -13,8 +13,8 @@
import argparse
import json
import os
import re

import regex as re
import torch
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from safetensors.torch import load_file as safe_load_file
(next changed file; filename not shown)
@@ -29,7 +29,7 @@
)
GemmaTokenizerFast = None

import regex as re
import re


"""
2 changes: 1 addition & 1 deletion src/transformers/models/sam3/convert_sam3_to_hf.py
@@ -21,8 +21,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch

from transformers import CLIPTokenizerFast, Sam3Config, Sam3ImageProcessorFast, Sam3Model, Sam3Processor
(next changed file; filename not shown)
@@ -20,8 +20,8 @@
import argparse
import gc
import os
import re

import regex as re
import torch

from transformers import CLIPTokenizerFast
(next changed file; filename not shown)
@@ -262,8 +262,6 @@ def rename_keys(s_dict):

def convert_gin_to_config(gin_file, num_experts):
# Convert a google style config to the hugging face format
import regex as re

with open(gin_file, "r") as f:
raw_gin = f.read()

3 changes: 1 addition & 2 deletions src/transformers/utils/auto_docstring.py
@@ -14,13 +14,12 @@

import inspect
import os
import re
import textwrap
from pathlib import Path
from types import UnionType
from typing import Union, get_args, get_origin

import regex as re

from .doc import (
MODELS_TO_PIPELINE,
PIPELINE_TASKS_TO_SAMPLE_DOCSTRINGS,