Commit 3961417

Fix typos and improve language (#59)

1 parent 09f3f25 commit 3961417

File tree

15 files changed: +30 -30 lines changed

lib/tokenizers/added_token.ex

Lines changed: 1 addition & 1 deletion
@@ -65,6 +65,6 @@ defimpl Inspect, for: Tokenizers.AddedToken do
       |> Tokenizers.Native.added_token_info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
-    concat(["#Tokenizers.PreTokenizer<", to_doc(attrs, opts), ">"])
+    concat(["#Tokenizers.AddedToken<", to_doc(attrs, opts), ">"])
   end
 end
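
With this fix, inspecting an added token reports the right struct name. A minimal sketch of the effect, assuming a `Tokenizers.AddedToken.new/2` constructor (not shown in this diff):

    # `new/2` and its options are assumptions for illustration.
    token = Tokenizers.AddedToken.new("[MASK]", special: true)
    IO.inspect(token)
    #=> #Tokenizers.AddedToken<[content: "[MASK]", ...]>
    # (previously mislabeled as #Tokenizers.PreTokenizer<...>)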

lib/tokenizers/decoder.ex

Lines changed: 7 additions & 7 deletions
@@ -24,7 +24,7 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `suffix` - the suffix to add to the end of each word. Defaults
+    * `:suffix` - the suffix to add to the end of each word. Defaults
       to `</w>`
 
   """
@@ -48,12 +48,12 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `pad_token` - the token used for padding. Defaults to `<pad>`
+    * `:pad_token` - the token used for padding. Defaults to `<pad>`
 
-    * `word_delimiter_token` - the token used for word delimiter.
+    * `:word_delimiter_token` - the token used for word delimiter.
       Defaults to `|`
 
-    * `cleanup` - whether to cleanup tokenization artifacts, defaults
+    * `:cleanup` - whether to cleanup tokenization artifacts, defaults
       to `true`
 
   """
@@ -71,7 +71,7 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `replacement` - the replacement character. Defaults to `▁`
+    * `:replacement` - the replacement character. Defaults to `▁`
       (as char)
 
     * `:prepend_scheme` - whether to add a space to the first word if there
@@ -112,9 +112,9 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `prefix` - The prefix to use for subwords. Defaults to `##`
+    * `:prefix` - The prefix to use for subwords. Defaults to `##`
 
-    * `cleanup` - Whether to cleanup tokenization artifacts. Defaults
+    * `:cleanup` - Whether to cleanup tokenization artifacts. Defaults
      to `true`
 
   """

lib/tokenizers/encoding/transformation.ex

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ defmodule Tokenizers.Encoding.Transformation do
   @moduledoc """
   Module containing handy functions to build the transformations list.
 
-  This list is aplied to an encoding using `Tokenizers.Encoding.transform/2`.
+  This list is applied to an encoding using `Tokenizers.Encoding.transform/2`.
   """
 
   @type t :: [
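
`Tokenizers.Encoding.transform/2` is named by the moduledoc itself; a hedged sketch of building such a list, assuming helper constructors like `truncate/2` and `pad/2` exist in this module:

    # `truncate/2`, `pad/2`, and their options are assumptions for illustration.
    transformations = [
      Tokenizers.Encoding.Transformation.truncate(128),
      Tokenizers.Encoding.Transformation.pad(128, pad_token: "[PAD]")
    ]
    encoding = Tokenizers.Encoding.transform(encoding, transformations)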

lib/tokenizers/model/bpe.ex

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ defmodule Tokenizers.Model.BPE do
       the result of the merge operations for a number of words.
       Defaults to `10_000`
 
-    * `:dropout` - The BPE dropout to use. Must be an float between
+    * `:dropout` - The BPE dropout to use. Must be a float between
       0 and 1
 
     * `:unk_token` - The unknown token to be used by the model
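
Since `:dropout` must be a float between 0 and 1, a call might look like the sketch below, assuming an `init/3` constructor taking a vocab map and a merges list:

    # `init/3`, the argument order, and the return shape are assumptions.
    vocab = %{"<unk>" => 0, "he" => 1, "llo" => 2}
    merges = [{"he", "llo"}]
    {:ok, model} = Tokenizers.Model.BPE.init(vocab, merges, dropout: 0.1, unk_token: "<unk>")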

lib/tokenizers/model/unigram.ex

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ defmodule Tokenizers.Model.Unigram do
   """
   @type options() :: [
           byte_fallback: boolean(),
-          unk_id: float()
+          unk_id: integer()
         ]
 
   @doc """

lib/tokenizers/model/wordpiece.ex

Lines changed: 4 additions & 4 deletions
@@ -2,14 +2,14 @@ defmodule Tokenizers.Model.WordPiece do
   @typedoc """
   Options for model initialisation.
 
-    * `:unk_token` - the unknown token to be used by the model. 
+    * `:unk_token` - the unknown token to be used by the model.
       Defaults to `"[UNK]"`
 
     * `:max_input_chars_per_word` - the maximum number of characters
-      to authorize in a single word. Defaults to `100`
+      to allow in a single word. Defaults to `100`
 
-    * `:continuing_subword_prefix` - the prefix to attach to subword
-      units that don't represent a beginning of word Defaults to `"##"`
+    * `:continuing_subword_prefix` - the prefix to attach to subword
+      units that don't represent a beginning of word. Defaults to `"##"`.
 
   """
   @type options() :: [
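
A sketch of passing these options, assuming an `init/2` constructor over a token-to-id vocab map:

    # `init/2` and the vocab shape are assumptions; option names come from the typedoc.
    vocab = %{"[UNK]" => 0, "play" => 1, "##ing" => 2}
    {:ok, model} =
      Tokenizers.Model.WordPiece.init(vocab,
        unk_token: "[UNK]",
        max_input_chars_per_word: 100,
        continuing_subword_prefix: "##"
      )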

lib/tokenizers/normalizer.ex

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ defmodule Tokenizers.Normalizer do
   defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize
 
   @doc """
-  Takes care of normalizing raw text before giving it to a Bert model.
+  Takes care of normalizing raw text before giving it to a BERT model.
 
-  This includes cleaning the text, handling accents, chinese chars and
+  This includes cleaning the text, handling accents, Chinese chars and
   lowercasing.
 
   ## Options
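
`normalize/2` is the delegate shown above; a sketch of pairing it with the BERT normalizer, assuming a `bert_normalizer/1` constructor and an `{:ok, binary}` return:

    # `bert_normalizer/1`, its options, and the return shape are assumptions.
    normalizer = Tokenizers.Normalizer.bert_normalizer(lowercase: true)
    {:ok, text} = Tokenizers.Normalizer.normalize(normalizer, "Héllo Wörld")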

lib/tokenizers/post_processor.ex

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ defmodule Tokenizers.PostProcessor do
 
   ## Options
 
-    * `:trim_offest` - whether to trim the whitespaces in the produced
+    * `:trim_offsets` - whether to trim the whitespaces in the produced
       offsets. Defaults to `true`
 
     * `:add_prefix_space` - whether add_prefix_space was ON during the
@@ -47,7 +47,7 @@ defmodule Tokenizers.PostProcessor do
   @doc """
   Creates a Template post-processor.
 
-  Let’s you easily template the post processing, adding special tokens
+  Lets you easily template the post processing, adding special tokens
   and specifying the type id for each sequence/special token. The
   template is given two strings representing the single sequence and
   the pair of sequences, as well as a set of special tokens to use.
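
A sketch of the templating described above, assuming a `template/1` constructor with `:single`, `:pair`, and `:special_tokens` options mirroring the prose:

    # Option names and token ids are assumptions for illustration.
    post =
      Tokenizers.PostProcessor.template(
        single: "[CLS] $A [SEP]",
        pair: "[CLS] $A [SEP] $B:1 [SEP]",
        special_tokens: [{"[CLS]", 101}, {"[SEP]", 102}]
      )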

lib/tokenizers/pre_tokenizer.ex

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ defmodule Tokenizers.PreTokenizer do
   @doc """
   Creates a BertPreTokenizer pre-tokenizer.
 
-  Splits for use in Bert models.
+  Splits for use in BERT models.
   """
   @spec bert_pre_tokenizer() :: t()
   defdelegate bert_pre_tokenizer(), to: Tokenizers.Native, as: :pre_tokenizers_bert
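
`bert_pre_tokenizer/0` is confirmed by the delegate above; a sketch of using it, assuming a `pre_tokenize/2` function returning `{:ok, pieces}`:

    # `pre_tokenize/2` and its return shape are assumptions.
    pre = Tokenizers.PreTokenizer.bert_pre_tokenizer()
    {:ok, pieces} = Tokenizers.PreTokenizer.pre_tokenize(pre, "Hey friend! How are you?")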

native/ex_tokenizers/src/models.rs

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ pub fn models_save(
         .iter()
         .map(|path| {
             path.to_str()
-                // Unwraping here, because we are sure that pathes are valid
+                // Unwraping here, because we are sure that paths are valid
                 .unwrap()
                 .to_owned()
         })
