Skip to content

Commit 9210206

Browse files
author
Źmićer Rubinštejn
authored
Add steps for pipeline (#43)
1 parent c2477cb commit 9210206

File tree

14 files changed

+2119
-1
lines changed

14 files changed

+2119
-1
lines changed

lib/tokenizers/decoder.ex

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
defmodule Tokenizers.Decoder do
  @moduledoc """
  The Decoder knows how to go from the IDs used by the Tokenizer, back to a readable piece of text.
  Some Normalizer and PreTokenizer use special characters or identifiers that need to be reverted.
  """

  # Holds a reference to the native decoder resource (managed by Tokenizers.Native).
  defstruct [:resource]
  @type t() :: %__MODULE__{resource: reference()}

  @doc """
  Decodes tokens into string with provided decoder.
  """
  @spec decode(t(), [String.t()]) :: {:ok, String.t()} | {:error, any()}
  defdelegate decode(decoder, tokens), to: Tokenizers.Native, as: :decoders_decode

  @typedoc """
  Options for BPE decoder initialization. All options can be omitted.

    * `suffix` - The suffix to add to the end of each word, defaults to `</w>`
  """
  @type bpe_options :: [suffix: String.t()]

  @doc """
  Creates new BPE decoder.
  """
  @spec bpe(bpe_options :: bpe_options()) :: t()
  defdelegate bpe(options \\ []), to: Tokenizers.Native, as: :decoders_bpe

  @doc """
  Creates new ByteFallback decoder.
  """
  @spec byte_fallback() :: t()
  defdelegate byte_fallback(), to: Tokenizers.Native, as: :decoders_byte_fallback

  @doc """
  Creates new ByteLevel decoder.
  """
  @spec byte_level() :: t()
  defdelegate byte_level(), to: Tokenizers.Native, as: :decoders_byte_level

  @typedoc """
  Options for CTC decoder initialization. All options can be omitted.

    * `pad_token` - The token used for padding, defaults to `<pad>`
    * `word_delimiter_token` - The token used for word delimiter, defaults to `|`
    * `cleanup` - Whether to cleanup tokenization artifacts, defaults to `true`
  """
  @type ctc_options :: [
          pad_token: String.t(),
          word_delimiter_token: String.t(),
          cleanup: boolean()
        ]

  @doc """
  Creates new CTC decoder.
  """
  @spec ctc(ctc_options :: ctc_options()) :: t()
  defdelegate ctc(options \\ []), to: Tokenizers.Native, as: :decoders_ctc

  @doc """
  Creates new Fuse decoder.
  """
  @spec fuse() :: t()
  defdelegate fuse(), to: Tokenizers.Native, as: :decoders_fuse

  @typedoc """
  Options for Metaspace decoder initialization. All options can be omitted.

    * `replacement` - The replacement character, defaults to `▁` (as char)
    * `add_prefix_space` - Whether to add a space to the first word, defaults to `true`
  """
  @type metaspace_options :: [
          replacement: char(),
          add_prefix_space: boolean()
        ]

  @doc """
  Creates new Metaspace decoder.
  """
  @spec metaspace(metaspace_options :: metaspace_options()) :: t()
  defdelegate metaspace(options \\ []),
    to: Tokenizers.Native,
    as: :decoders_metaspace

  @doc """
  Creates new Replace decoder.
  """
  @spec replace(pattern :: String.t(), content :: String.t()) :: t()
  defdelegate replace(pattern, content), to: Tokenizers.Native, as: :decoders_replace

  @doc """
  Creates new Sequence decoder, which applies the given decoders in order.
  """
  @spec sequence(decoders :: [t()]) :: t()
  defdelegate sequence(decoders), to: Tokenizers.Native, as: :decoders_sequence

  @doc """
  Creates new Strip decoder.

  It expects a character and the number of times to strip the
  character on `left` and `right` sides.
  """
  @spec strip(content :: char(), left :: non_neg_integer(), right :: non_neg_integer()) :: t()
  defdelegate strip(content, left, right), to: Tokenizers.Native, as: :decoders_strip

  @typedoc """
  Options for WordPiece decoder initialization. All options can be omitted.

    * `prefix` - The prefix to use for subwords, defaults to `##`
    * `cleanup` - Whether to cleanup tokenization artifacts, defaults to `true`
  """
  @type word_piece_options :: [
          prefix: String.t(),
          cleanup: boolean()
        ]

  @doc """
  Creates new WordPiece decoder.
  """
  @spec word_piece(word_piece_options :: word_piece_options()) :: t()
  defdelegate word_piece(options \\ []),
    to: Tokenizers.Native,
    as: :decoders_wordpiece
end
126+
127+
defimpl Inspect, for: Tokenizers.Decoder do
  import Inspect.Algebra

  # Renders the decoder as `#Tokenizers.Decoder<[...]>`, with attributes
  # fetched from the native resource via `Tokenizers.Native.decoders_info/1`.
  @spec inspect(Tokenizers.Decoder.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
  def inspect(decoder, opts) do
    info = Tokenizers.Native.decoders_info(decoder)

    # Native info comes back with string keys; convert them to atoms for
    # idiomatic keyword-list display.
    attrs = Keyword.new(info, fn {key, value} -> {String.to_atom(key), value} end)

    concat(["#Tokenizers.Decoder<", to_doc(attrs, opts), ">"])
  end
end

lib/tokenizers/native.ex

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,22 @@ defmodule Tokenizers.Native do
1515
#
1616
def added_token_info(_added_token), do: err()
1717

18+
# Decoders
#
# NIF stubs for the decoder API. These bodies only run when the native
# library is not loaded; `err/0` is defined elsewhere in this module —
# presumably it raises a "NIF not loaded" error (TODO(review): confirm).
# Names and arities must match the native bindings exactly.
def decoders_decode(_decoder, _tokens), do: err()
#
def decoders_info(_decoder), do: err()
#
def decoders_byte_level(), do: err()
def decoders_replace(_pattern, _content), do: err()
def decoders_wordpiece(_options), do: err()
def decoders_byte_fallback(), do: err()
def decoders_fuse(), do: err()
def decoders_strip(_content, _left, _right), do: err()
def decoders_metaspace(_options), do: err()
def decoders_bpe(_options), do: err()
def decoders_ctc(_options), do: err()
def decoders_sequence(_decoders), do: err()
33+
1834
# Models
1935
def models_save(_model, _folder, _opts), do: err()
2036
#
@@ -35,6 +51,51 @@ defmodule Tokenizers.Native do
3551
def models_unigram_init(_vocab, _options), do: err()
3652
def models_unigram_empty(), do: err()
3753

54+
# Normalizers
#
# NIF stubs for the normalizer API. Bodies are only reached when the
# native library is not loaded; `err/0` is defined elsewhere in this
# module (presumably raising — TODO(review): confirm its semantics).
def normalizers_normalize(_normalizer, _input), do: err()
#
def normalizers_info(_normalizer), do: err()
#
def normalizers_bert_normalizer(_opts), do: err()
def normalizers_nfd(), do: err()
def normalizers_nfkd(), do: err()
def normalizers_nfc(), do: err()
def normalizers_nfkc(), do: err()
def normalizers_strip(_opts), do: err()
def normalizers_prepend(_prepend), do: err()
def normalizers_strip_accents(), do: err()
def normalizers_sequence(_normalizers), do: err()
def normalizers_lowercase(), do: err()
def normalizers_replace(_pattern, _content), do: err()
def normalizers_nmt(), do: err()
def normalizers_precompiled(_data), do: err()
72+
73+
# PreTokenizers
#
# NIF stubs for the pre-tokenizer API; replaced by native implementations
# at load time (`err/0` defined elsewhere in this module).
def pre_tokenizers_pre_tokenize(_pre_tokenizer, _input), do: err()
#
def pre_tokenizers_info(_pre_tokenizer), do: err()
#
def pre_tokenizers_byte_level(_opts), do: err()
def pre_tokenizers_byte_level_alphabet(), do: err()
def pre_tokenizers_whitespace(), do: err()
def pre_tokenizers_whitespace_split(), do: err()
def pre_tokenizers_bert(), do: err()
def pre_tokenizers_metaspace(_opts), do: err()
def pre_tokenizers_char_delimiter_split(_delimiter), do: err()
def pre_tokenizers_split(_pattern, _behavior, _options), do: err()
def pre_tokenizers_punctuation(_behavior), do: err()
def pre_tokenizers_sequence(_pre_tokenizers), do: err()
def pre_tokenizers_digits(_options), do: err()
89+
90+
# PostProcessors
#
# NIF stubs for the post-processor API; replaced by native implementations
# at load time (`err/0` defined elsewhere in this module).
def post_processors_info(_post_processor), do: err()
#
def post_processors_bert(_sep, _cls), do: err()
def post_processors_roberta(_sep, _cls, _opts), do: err()
def post_processors_byte_level(_opts), do: err()
def post_processors_template(_opts), do: err()
def post_processors_sequence(_post_processors), do: err()
98+
3899
# Trainers
39100
def trainers_info(_trainer), do: err()
40101
#

lib/tokenizers/normalizer.ex

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
defmodule Tokenizers.Normalizer do
  @moduledoc """
  A Normalizer is in charge of pre-processing the input string
  in order to normalize it as relevant for a given use case.

  Some common examples of normalization are the Unicode normalization algorithms
  (NFD, NFKD, NFC & NFKC), lowercasing etc...
  The specificity of tokenizers is that we keep track of the alignment while normalizing.
  This is essential to allow mapping from the generated tokens back to the input text.

  The Normalizer is optional.
  """

  # Holds a reference to the native normalizer resource (managed by Tokenizers.Native).
  @type t() :: %__MODULE__{resource: reference()}
  defstruct [:resource]

  @doc """
  Normalizes the input presented as string into new string.
  """
  @spec normalize(normalizer :: t(), input :: String.t()) :: {:ok, String.t()}
  defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize

  @typedoc """
  Options for BERT normalizer initialisation. All values are optional.

    * `:clean_text` (default `true`) - Whether to clean the text, by removing any control characters and replacing all whitespaces by the classic one.
    * `:handle_chinese_chars` (default `true`) - Whether to handle chinese chars by putting spaces around them.
    * `:strip_accents` - Whether to strip all accents. If this option is not specified, then it will be determined by the value for lowercase (as in the original Bert).
    * `:lowercase` (default `true`) - Whether to lowercase.
  """
  @type bert_opts() :: [
          clean_text: boolean(),
          handle_chinese_chars: boolean(),
          strip_accents: boolean(),
          lowercase: boolean()
        ]

  @doc """
  Takes care of normalizing raw text before giving it to a Bert model. This includes cleaning the text, handling accents, chinese chars and lowercasing.
  """
  @spec bert_normalizer(opts :: bert_opts()) :: t()
  defdelegate bert_normalizer(opts \\ []),
    to: Tokenizers.Native,
    as: :normalizers_bert_normalizer

  @doc """
  NFD Unicode Normalizer.
  """
  @spec nfd :: t()
  defdelegate nfd(), to: Tokenizers.Native, as: :normalizers_nfd

  @doc """
  NFKD Unicode Normalizer.
  """
  @spec nfkd :: t()
  defdelegate nfkd(), to: Tokenizers.Native, as: :normalizers_nfkd

  @doc """
  NFC Unicode Normalizer.
  """
  @spec nfc :: t()
  defdelegate nfc(), to: Tokenizers.Native, as: :normalizers_nfc

  @doc """
  NFKC Unicode Normalizer.
  """
  @spec nfkc :: t()
  defdelegate nfkc(), to: Tokenizers.Native, as: :normalizers_nfkc

  @typedoc """
  Options for Strip normalizer initialisation. All values are optional.

    * `:left` (default `true`) - Whether to strip left side.
    * `:right` (default `true`) - Whether to strip right side.
  """
  @type strip_opts() :: [
          left: boolean(),
          right: boolean()
        ]

  @doc """
  Strip normalizer. Removes all whitespace characters on the specified sides (left, right or both) of the input.
  """
  @spec strip(opts :: strip_opts()) :: t()
  defdelegate strip(opts \\ []), to: Tokenizers.Native, as: :normalizers_strip

  @doc """
  Prepend normalizer.
  """
  @spec prepend(prepend :: String.t()) :: t()
  defdelegate prepend(prepend), to: Tokenizers.Native, as: :normalizers_prepend

  @doc """
  Strip Accent normalizer. Removes all accent symbols in unicode (to be used with NFD for consistency).
  """
  @spec strip_accents :: t()
  defdelegate strip_accents(), to: Tokenizers.Native, as: :normalizers_strip_accents

  @doc """
  Composes multiple normalizers that will run in the provided order.
  """
  @spec sequence(normalizers :: [t()]) :: t()
  defdelegate sequence(normalizers), to: Tokenizers.Native, as: :normalizers_sequence

  @doc """
  Replaces all uppercase to lowercase.
  """
  @spec lowercase :: t()
  defdelegate lowercase(), to: Tokenizers.Native, as: :normalizers_lowercase

  @doc """
  Replaces a custom string or regexp and changes it with given content.
  """
  @spec replace(pattern :: String.t(), content :: String.t()) :: t()
  defdelegate replace(pattern, content),
    to: Tokenizers.Native,
    as: :normalizers_replace

  @doc """
  Nmt normalizer.
  """
  @spec nmt :: t()
  defdelegate nmt(), to: Tokenizers.Native, as: :normalizers_nmt

  @doc """
  Precompiled normalizer. Don't use it manually; it is used for compatibility with SentencePiece.
  """
  @spec precompiled(data :: binary()) :: {:ok, t()} | {:error, any()}
  defdelegate precompiled(data), to: Tokenizers.Native, as: :normalizers_precompiled
end
130+
131+
defimpl Inspect, for: Tokenizers.Normalizer do
  import Inspect.Algebra

  # Renders the normalizer as `#Tokenizers.Normalizer<[...]>`, with attributes
  # fetched from the native resource via `Tokenizers.Native.normalizers_info/1`.
  # Note: the local variable was previously named `decoder` (copy-paste from the
  # Decoder impl); renamed for clarity. Also adds a @spec for parity with the
  # sibling Inspect implementation.
  @spec inspect(Tokenizers.Normalizer.t(), Inspect.Opts.t()) :: Inspect.Algebra.t()
  def inspect(normalizer, opts) do
    attrs =
      normalizer
      |> Tokenizers.Native.normalizers_info()
      |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)

    concat(["#Tokenizers.Normalizer<", to_doc(attrs, opts), ">"])
  end
end

0 commit comments

Comments
 (0)