Skip to content

Commit 20295cf

Browse files
author
Źmićer Rubinštejn
authored
Quickstart (#45)
1 parent 8052b96 commit 20295cf

File tree

10 files changed

+404
-118
lines changed

10 files changed

+404
-118
lines changed

lib/tokenizers/added_token.ex

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ defmodule Tokenizers.AddedToken do
2828
The notion of "inside of a word" is defined by the word boundaries pattern
2929
in regular expressions (i.e. the token should start and end with word boundaries).
3030
31-
* `:lstrip` (default `false`) - defines whether this token should strip all potential
31+
* `:lstrip` (default `false`) - defines whether this token should strip all potential
3232
whitespaces on its left side.
3333
If `true`, this token will greedily match any whitespace on its left.
3434
For example if we try to match the token `[MASK]` with `lstrip=true`,
@@ -38,7 +38,7 @@ defmodule Tokenizers.AddedToken do
3838
whitespaces on its right side.
3939
If `true`, this token will greedily match any whitespace on its right.
4040
It works just like `lstrip` but on the right.
41-
41+
4242
* `:normalized` (default `true` for non-special tokens, `false` for special tokens) -
4343
defines whether this token should match against the normalized version of the input text.
4444
For example, with the added token `"yesterday"`,
@@ -53,7 +53,7 @@ defmodule Tokenizers.AddedToken do
5353
@doc """
5454
Retrieves information about an added token.
5555
"""
56-
@spec info(added_token :: __MODULE__.t()) :: map()
56+
@spec info(added_token :: t()) :: map()
5757
defdelegate info(model), to: Tokenizers.Native, as: :added_token_info
5858
end
5959

lib/tokenizers/decoder.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ defmodule Tokenizers.Decoder do
9292
@doc """
9393
Creates a new Sequence decoder.
9494
"""
95-
@spec sequence(decoders :: [Tokenizers.Decoder.t()]) :: t()
95+
@spec sequence(decoders :: [t()]) :: t()
9696
defdelegate sequence(decoders), to: Tokenizers.Native, as: :decoders_sequence
9797

9898
@doc """

lib/tokenizers/encoding.ex

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,114 +11,114 @@ defmodule Tokenizers.Encoding do
1111
@doc """
1212
Get the number of tokens in an encoding.
1313
"""
14-
@spec get_length(Encoding.t()) :: non_neg_integer()
14+
@spec get_length(t()) :: non_neg_integer()
1515
defdelegate get_length(encoding), to: Tokenizers.Native, as: :encoding_get_length
1616

1717
@doc """
1818
Return the number of sequences combined in this Encoding
1919
"""
20-
@spec get_n_sequences(Encoding.t()) :: non_neg_integer()
20+
@spec get_n_sequences(t()) :: non_neg_integer()
2121
defdelegate get_n_sequences(encoding), to: Tokenizers.Native, as: :encoding_get_n_sequences
2222

2323
@doc """
2424
Set the given sequence id for the whole range of tokens contained in this Encoding.
2525
"""
26-
@spec set_sequence_id(Encoding.t(), non_neg_integer()) :: Encoding.t()
26+
@spec set_sequence_id(t(), non_neg_integer()) :: t()
2727
defdelegate set_sequence_id(encoding, id), to: Tokenizers.Native, as: :encoding_set_sequence_id
2828

2929
@doc """
3030
Get the ids from an encoding.
3131
"""
32-
@spec get_ids(Encoding.t()) :: [integer()]
32+
@spec get_ids(t()) :: [integer()]
3333
defdelegate get_ids(encoding), to: Tokenizers.Native, as: :encoding_get_ids
3434

3535
@doc """
3636
Same as `get_ids/1`, but returns binary with u32 values.
3737
"""
38-
@spec get_u32_ids(Encoding.t()) :: binary()
38+
@spec get_u32_ids(t()) :: binary()
3939
defdelegate get_u32_ids(encoding), to: Tokenizers.Native, as: :encoding_get_u32_ids
4040

4141
@doc """
4242
Get token type ids from an encoding.
4343
"""
44-
@spec get_type_ids(Encoding.t()) :: [integer()]
44+
@spec get_type_ids(t()) :: [integer()]
4545
defdelegate get_type_ids(encoding), to: Tokenizers.Native, as: :encoding_get_type_ids
4646

4747
@doc """
4848
Same as `get_type_ids/1`, but returns binary with u32 values.
4949
"""
50-
@spec get_u32_type_ids(Encoding.t()) :: binary()
50+
@spec get_u32_type_ids(t()) :: binary()
5151
defdelegate get_u32_type_ids(encoding), to: Tokenizers.Native, as: :encoding_get_u32_type_ids
5252

5353
@doc """
5454
Get the attention mask from an encoding.
5555
"""
56-
@spec get_attention_mask(Encoding.t()) :: [integer()]
56+
@spec get_attention_mask(t()) :: [integer()]
5757
defdelegate get_attention_mask(encoding),
5858
to: Tokenizers.Native,
5959
as: :encoding_get_attention_mask
6060

6161
@doc """
6262
Same as `get_attention_mask/1`, but returns binary with u32 values.
6363
"""
64-
@spec get_u32_attention_mask(Encoding.t()) :: binary()
64+
@spec get_u32_attention_mask(t()) :: binary()
6565
defdelegate get_u32_attention_mask(encoding),
6666
to: Tokenizers.Native,
6767
as: :encoding_get_u32_attention_mask
6868

6969
@doc """
7070
Get the special tokens mask from an encoding.
7171
"""
72-
@spec get_special_tokens_mask(Encoding.t()) :: [integer()]
72+
@spec get_special_tokens_mask(t()) :: [integer()]
7373
defdelegate get_special_tokens_mask(encoding),
7474
to: Tokenizers.Native,
7575
as: :encoding_get_special_tokens_mask
7676

7777
@doc """
7878
Same as `get_special_tokens_mask/1`, but returns binary with u32 values.
7979
"""
80-
@spec get_u32_special_tokens_mask(Encoding.t()) :: binary()
80+
@spec get_u32_special_tokens_mask(t()) :: binary()
8181
defdelegate get_u32_special_tokens_mask(encoding),
8282
to: Tokenizers.Native,
8383
as: :encoding_get_u32_special_tokens_mask
8484

8585
@doc """
8686
Get the tokens from an encoding.
8787
"""
88-
@spec get_tokens(Encoding.t()) :: [binary()]
88+
@spec get_tokens(t()) :: [binary()]
8989
defdelegate get_tokens(encoding), to: Tokenizers.Native, as: :encoding_get_tokens
9090

9191
@doc """
9292
Get word ids from an encoding.
9393
"""
94-
@spec get_word_ids(Encoding.t()) :: [non_neg_integer() | nil]
94+
@spec get_word_ids(t()) :: [non_neg_integer() | nil]
9595
defdelegate get_word_ids(encoding), to: Tokenizers.Native, as: :encoding_get_word_ids
9696

9797
@doc """
9898
Get sequence ids from an encoding.
9999
"""
100-
@spec get_sequence_ids(Encoding.t()) :: [non_neg_integer() | nil]
100+
@spec get_sequence_ids(t()) :: [non_neg_integer() | nil]
101101
defdelegate get_sequence_ids(encoding), to: Tokenizers.Native, as: :encoding_get_sequence_ids
102102

103103
@doc """
104104
Get offsets from an encoding.
105105
106106
The offsets are expressed in terms of UTF-8 bytes.
107107
"""
108-
@spec get_offsets(Encoding.t()) :: [{integer(), integer()}]
108+
@spec get_offsets(t()) :: [{integer(), integer()}]
109109
defdelegate get_offsets(encoding), to: Tokenizers.Native, as: :encoding_get_offsets
110110

111111
@doc """
112112
Get the overflow from an encoding.
113113
"""
114-
@spec get_overflowing(Encoding.t()) :: [Encoding.t()]
114+
@spec get_overflowing(t()) :: [t()]
115115
defdelegate get_overflowing(encoding), to: Tokenizers.Native, as: :encoding_get_overflowing
116116

117117
@doc """
118118
Get the encoded tokens corresponding to the word at the given index in the input sequence,
119119
with the form (start_token, end_token + 1)
120120
"""
121-
@spec word_to_tokens(Encoding.t(), non_neg_integer(), non_neg_integer()) ::
121+
@spec word_to_tokens(t(), non_neg_integer(), non_neg_integer()) ::
122122
{non_neg_integer(), non_neg_integer()} | nil
123123
defdelegate word_to_tokens(encoding, word, seq_id),
124124
to: Tokenizers.Native,
@@ -127,7 +127,7 @@ defmodule Tokenizers.Encoding do
127127
@doc """
128128
Get the offsets of the word at the given index in the input sequence.
129129
"""
130-
@spec word_to_chars(Encoding.t(), non_neg_integer(), non_neg_integer()) ::
130+
@spec word_to_chars(t(), non_neg_integer(), non_neg_integer()) ::
131131
{non_neg_integer(), non_neg_integer()} | nil
132132
defdelegate word_to_chars(encoding, word, seq_id),
133133
to: Tokenizers.Native,
@@ -136,29 +136,29 @@ defmodule Tokenizers.Encoding do
136136
@doc """
137137
Returns the index of the sequence containing the given token
138138
"""
139-
@spec token_to_sequence(Encoding.t(), non_neg_integer()) :: non_neg_integer() | nil
139+
@spec token_to_sequence(t(), non_neg_integer()) :: non_neg_integer() | nil
140140
defdelegate token_to_sequence(encoding, token),
141141
to: Tokenizers.Native,
142142
as: :encoding_token_to_sequence
143143

144144
@doc """
145145
Get the offsets of the token at the given index.
146146
"""
147-
@spec token_to_chars(Encoding.t(), non_neg_integer()) ::
147+
@spec token_to_chars(t(), non_neg_integer()) ::
148148
{non_neg_integer(), {non_neg_integer(), non_neg_integer()}} | nil
149149
defdelegate token_to_chars(encoding, token), to: Tokenizers.Native, as: :encoding_token_to_chars
150150

151151
@doc """
152152
Get the word that contains the token at the given index.
153153
"""
154-
@spec token_to_word(Encoding.t(), non_neg_integer()) ::
154+
@spec token_to_word(t(), non_neg_integer()) ::
155155
{non_neg_integer(), non_neg_integer()} | nil
156156
defdelegate token_to_word(encoding, token), to: Tokenizers.Native, as: :encoding_token_to_word
157157

158158
@doc """
159159
Get the token that contains the given char.
160160
"""
161-
@spec char_to_token(Encoding.t(), non_neg_integer(), non_neg_integer()) ::
161+
@spec char_to_token(t(), non_neg_integer(), non_neg_integer()) ::
162162
non_neg_integer() | nil
163163
defdelegate char_to_token(encoding, position, seq_id),
164164
to: Tokenizers.Native,
@@ -167,7 +167,7 @@ defmodule Tokenizers.Encoding do
167167
@doc """
168168
Get the word that contains the given char.
169169
"""
170-
@spec char_to_word(Encoding.t(), non_neg_integer(), non_neg_integer()) ::
170+
@spec char_to_word(t(), non_neg_integer(), non_neg_integer()) ::
171171
non_neg_integer() | nil
172172
defdelegate char_to_word(encoding, position, seq_id),
173173
to: Tokenizers.Native,
@@ -192,7 +192,7 @@ defmodule Tokenizers.Encoding do
192192
@doc """
193193
Pad the encoding to the given length.
194194
"""
195-
@spec pad(Encoding.t(), non_neg_integer(), padding_opts()) :: Encoding.t()
195+
@spec pad(t(), non_neg_integer(), padding_opts()) :: t()
196196
defdelegate pad(encoding, target_length, opts \\ []),
197197
to: Tokenizers.Native,
198198
as: :encoding_pad
@@ -208,15 +208,15 @@ defmodule Tokenizers.Encoding do
208208
@doc """
209209
Truncate the encoding to the given length.
210210
"""
211-
@spec truncate(Encoding.t(), non_neg_integer(), truncation_opts()) :: Encoding.t()
211+
@spec truncate(t(), non_neg_integer(), truncation_opts()) :: t()
212212
defdelegate truncate(encoding, max_length, opts \\ []),
213213
to: Tokenizers.Native,
214214
as: :encoding_truncate
215215

216216
@doc """
217-
Returns the number of tokens in an `Encoding.t()`.
217+
Returns the number of tokens in an encoding (`t()`).
218218
"""
219-
@spec n_tokens(encoding :: Encoding.t()) :: non_neg_integer()
219+
@spec n_tokens(encoding :: t()) :: non_neg_integer()
220220
defdelegate n_tokens(encoding), to: Tokenizers.Native, as: :encoding_get_length
221221
end
222222

lib/tokenizers/model.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ defmodule Tokenizers.Model do
1414
1515
Information retrieved differs per model but all include `model_type`.
1616
"""
17-
@spec info(model :: __MODULE__.t()) :: map()
17+
@spec info(model :: t()) :: map()
1818
defdelegate info(model), to: Tokenizers.Native, as: :models_info
1919

2020
@typedoc """

0 commit comments

Comments
 (0)