|
defmodule Tokenizers.Normalizer do
  @moduledoc """
  A Normalizer is in charge of pre-processing the input string
  in order to normalize it as relevant for a given use case.

  Some common examples of normalization are the Unicode normalization
  algorithms (NFD, NFKD, NFC & NFKC), lowercasing, etc.
  The specificity of tokenizers is that we keep track of the alignment
  while normalizing. This is essential to allow mapping from the
  generated tokens back to the input text.

  The Normalizer is optional.
  """

  alias Tokenizers.Native

  defstruct [:resource]

  @type t() :: %__MODULE__{resource: reference()}

  @typedoc """
  Options for BERT normalizer initialisation. All values are optional.

    * `:clean_text` (default `true`) - Whether to clean the text, by removing
      any control characters and replacing all whitespaces by the classic one.
    * `:handle_chinese_chars` (default `true`) - Whether to handle chinese
      chars by putting spaces around them.
    * `:strip_accents` - Whether to strip all accents. If this option is not
      specified, then it will be determined by the value for lowercase
      (as in the original Bert).
    * `:lowercase` (default `true`) - Whether to lowercase.
  """
  @type bert_opts() :: [
          clean_text: boolean(),
          handle_chinese_chars: boolean(),
          strip_accents: boolean(),
          lowercase: boolean()
        ]

  @typedoc """
  Options for Strip normalizer initialisation. All values are optional.

    * `:left` (default `true`) - Whether to strip left side.
    * `:right` (default `true`) - Whether to strip right side.
  """
  @type strip_opts() :: [
          left: boolean(),
          right: boolean()
        ]

  @doc """
  Normalizes the input, presented as a string, into a new string.
  """
  @spec normalize(normalizer :: t(), input :: String.t()) :: {:ok, String.t()}
  defdelegate normalize(normalizer, input),
    to: Native,
    as: :normalizers_normalize

  @doc """
  Takes care of normalizing raw text before giving it to a Bert model.

  This includes cleaning the text, handling accents, chinese chars and
  lowercasing. See `t:bert_opts/0` for the accepted options.
  """
  @spec bert_normalizer(opts :: bert_opts()) :: t()
  defdelegate bert_normalizer(opts \\ []),
    to: Native,
    as: :normalizers_bert_normalizer

  @doc """
  NFD Unicode Normalizer.
  """
  @spec nfd() :: t()
  defdelegate nfd(),
    to: Native,
    as: :normalizers_nfd

  @doc """
  NFKD Unicode Normalizer.
  """
  @spec nfkd() :: t()
  defdelegate nfkd(),
    to: Native,
    as: :normalizers_nfkd

  @doc """
  NFC Unicode Normalizer.
  """
  @spec nfc() :: t()
  defdelegate nfc(),
    to: Native,
    as: :normalizers_nfc

  @doc """
  NFKC Unicode Normalizer.
  """
  @spec nfkc() :: t()
  defdelegate nfkc(),
    to: Native,
    as: :normalizers_nfkc

  @doc """
  Strip normalizer.

  Removes all whitespace characters on the specified sides (left, right or
  both) of the input. See `t:strip_opts/0` for the accepted options.
  """
  @spec strip(opts :: strip_opts()) :: t()
  defdelegate strip(opts \\ []),
    to: Native,
    as: :normalizers_strip

  @doc """
  Prepend normalizer.
  """
  @spec prepend(prepend :: String.t()) :: t()
  defdelegate prepend(prepend),
    to: Native,
    as: :normalizers_prepend

  @doc """
  Strip Accent normalizer.

  Removes all accent symbols in unicode (to be used with NFD for consistency).
  """
  @spec strip_accents() :: t()
  defdelegate strip_accents(),
    to: Native,
    as: :normalizers_strip_accents

  @doc """
  Composes multiple normalizers that will run in the provided order.
  """
  @spec sequence(normalizers :: [t()]) :: t()
  defdelegate sequence(normalizers),
    to: Native,
    as: :normalizers_sequence

  @doc """
  Replaces all uppercase with lowercase.
  """
  @spec lowercase() :: t()
  defdelegate lowercase(),
    to: Native,
    as: :normalizers_lowercase

  @doc """
  Replaces a custom string or regexp with the given content.
  """
  @spec replace(pattern :: String.t(), content :: String.t()) :: t()
  defdelegate replace(pattern, content),
    to: Native,
    as: :normalizers_replace

  @doc """
  Nmt normalizer.
  """
  @spec nmt() :: t()
  defdelegate nmt(),
    to: Native,
    as: :normalizers_nmt

  @doc """
  Precompiled normalizer.

  Do not use it manually; it is used for compatibility with SentencePiece.
  """
  @spec precompiled(data :: binary()) :: {:ok, t()} | {:error, any()}
  defdelegate precompiled(data),
    to: Native,
    as: :normalizers_precompiled
end
| 130 | + |
defimpl Inspect, for: Tokenizers.Normalizer do
  alias Inspect.Algebra

  # Renders a normalizer as `#Tokenizers.Normalizer<...>`, where the inner part
  # is the keyword list built from `Tokenizers.Native.normalizers_info/1`.
  def inspect(normalizer, opts) do
    info = Tokenizers.Native.normalizers_info(normalizer)

    # NOTE(review): keys are turned into atoms with `String.to_atom/1`;
    # presumably the native side only reports a small, fixed set of key
    # names - confirm, since atoms are never garbage collected.
    attrs = Keyword.new(info, fn {key, value} -> {String.to_atom(key), value} end)

    Algebra.concat(["#Tokenizers.Normalizer<", Algebra.to_doc(attrs, opts), ">"])
  end
end
0 commit comments