Commit 3961417

Fix typos and improve language (#59)

1 parent 09f3f25 commit 3961417

File tree

15 files changed: +30 -30 lines changed

lib/tokenizers/added_token.ex

Lines changed: 1 addition & 1 deletion
@@ -65,6 +65,6 @@ defimpl Inspect, for: Tokenizers.AddedToken do
       |> Tokenizers.Native.added_token_info()
       |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)
 
-    concat(["#Tokenizers.PreTokenizer<", to_doc(attrs, opts), ">"])
+    concat(["#Tokenizers.AddedToken<", to_doc(attrs, opts), ">"])
   end
 end
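
With this fix, inspecting an added token reports the right struct name. A minimal sketch of the effect, assuming a `Tokenizers.AddedToken.new/2` constructor (not shown in this diff):

    # `new/2` and its options are assumptions for illustration.
    token = Tokenizers.AddedToken.new("[MASK]", special: true)
    IO.inspect(token)
    #=> #Tokenizers.AddedToken<[content: "[MASK]", ...]>
    # (previously mislabeled as #Tokenizers.PreTokenizer<...>)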

lib/tokenizers/decoder.ex

Lines changed: 7 additions & 7 deletions
@@ -24,7 +24,7 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `suffix` - the suffix to add to the end of each word. Defaults
+    * `:suffix` - the suffix to add to the end of each word. Defaults
       to `</w>`
 
   """
@@ -48,12 +48,12 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `pad_token` - the token used for padding. Defaults to `<pad>`
+    * `:pad_token` - the token used for padding. Defaults to `<pad>`
 
-    * `word_delimiter_token` - the token used for word delimiter.
+    * `:word_delimiter_token` - the token used for word delimiter.
       Defaults to `|`
 
-    * `cleanup` - whether to cleanup tokenization artifacts, defaults
+    * `:cleanup` - whether to cleanup tokenization artifacts, defaults
       to `true`
 
   """
@@ -71,7 +71,7 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `replacement` - the replacement character. Defaults to `▁`
+    * `:replacement` - the replacement character. Defaults to `▁`
       (as char)
 
     * `:prepend_scheme` - whether to add a space to the first word if there
@@ -112,9 +112,9 @@ defmodule Tokenizers.Decoder do
 
   ## Options
 
-    * `prefix` - The prefix to use for subwords. Defaults to `##`
+    * `:prefix` - The prefix to use for subwords. Defaults to `##`
 
-    * `cleanup` - Whether to cleanup tokenization artifacts. Defaults
+    * `:cleanup` - Whether to cleanup tokenization artifacts. Defaults
      to `true`
 
   """

lib/tokenizers/encoding/transformation.ex

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ defmodule Tokenizers.Encoding.Transformation do
   @moduledoc """
   Module containing handy functions to build the transformations list.
 
-  This list is aplied to an encoding using `Tokenizers.Encoding.transform/2`.
+  This list is applied to an encoding using `Tokenizers.Encoding.transform/2`.
   """
 
   @type t :: [
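
`Tokenizers.Encoding.transform/2` is named by the moduledoc itself; a hedged sketch of building such a list, assuming helper constructors like `truncate/2` and `pad/2` exist in this module:

    # `truncate/2`, `pad/2`, and their options are assumptions for illustration.
    transformations = [
      Tokenizers.Encoding.Transformation.truncate(128),
      Tokenizers.Encoding.Transformation.pad(128, pad_token: "[PAD]")
    ]
    encoding = Tokenizers.Encoding.transform(encoding, transformations)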

lib/tokenizers/model/bpe.ex

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ defmodule Tokenizers.Model.BPE do
       the result of the merge operations for a number of words.
       Defaults to `10_000`
 
-    * `:dropout` - The BPE dropout to use. Must be an float between
+    * `:dropout` - The BPE dropout to use. Must be a float between
       0 and 1
 
     * `:unk_token` - The unknown token to be used by the model
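
Since `:dropout` must be a float between 0 and 1, a call might look like the sketch below, assuming an `init/3` constructor taking a vocab map and a merges list:

    # `init/3`, the argument order, and the return shape are assumptions.
    vocab = %{"<unk>" => 0, "he" => 1, "llo" => 2}
    merges = [{"he", "llo"}]
    {:ok, model} = Tokenizers.Model.BPE.init(vocab, merges, dropout: 0.1, unk_token: "<unk>")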

lib/tokenizers/model/unigram.ex

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ defmodule Tokenizers.Model.Unigram do
   """
   @type options() :: [
           byte_fallback: boolean(),
-          unk_id: float()
+          unk_id: integer()
         ]
 
   @doc """

lib/tokenizers/model/wordpiece.ex

Lines changed: 4 additions & 4 deletions
@@ -2,14 +2,14 @@ defmodule Tokenizers.Model.WordPiece do
   @typedoc """
   Options for model initialisation.
 
-    * `:unk_token` - the unknown token to be used by the model. 
+    * `:unk_token` - the unknown token to be used by the model.
       Defaults to `"[UNK]"`
 
     * `:max_input_chars_per_word` - the maximum number of characters
-      to authorize in a single word. Defaults to `100`
+      to allow in a single word. Defaults to `100`
 
-    * `:continuing_subword_prefix` - the prefix to attach to subword
-      units that don't represent a beginning of word Defaults to `"##"`
+    * `:continuing_subword_prefix` - the prefix to attach to subword
+      units that don't represent a beginning of word. Defaults to `"##"`.
 
   """
   @type options() :: [
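
A sketch of passing these options, assuming an `init/2` constructor over a token-to-id vocab map:

    # `init/2` and the vocab shape are assumptions; option names come from the typedoc.
    vocab = %{"[UNK]" => 0, "play" => 1, "##ing" => 2}
    {:ok, model} =
      Tokenizers.Model.WordPiece.init(vocab,
        unk_token: "[UNK]",
        max_input_chars_per_word: 100,
        continuing_subword_prefix: "##"
      )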

lib/tokenizers/normalizer.ex

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ defmodule Tokenizers.Normalizer do
   defdelegate normalize(normalizer, input), to: Tokenizers.Native, as: :normalizers_normalize
 
   @doc """
-  Takes care of normalizing raw text before giving it to a Bert model.
+  Takes care of normalizing raw text before giving it to a BERT model.
 
-  This includes cleaning the text, handling accents, chinese chars and
+  This includes cleaning the text, handling accents, Chinese chars and
   lowercasing.
 
   ## Options
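
`normalize/2` is the delegate shown above; a sketch of pairing it with the BERT normalizer, assuming a `bert_normalizer/1` constructor and an `{:ok, binary}` return:

    # `bert_normalizer/1`, its options, and the return shape are assumptions.
    normalizer = Tokenizers.Normalizer.bert_normalizer(lowercase: true)
    {:ok, text} = Tokenizers.Normalizer.normalize(normalizer, "Héllo Wörld")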

lib/tokenizers/post_processor.ex

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ defmodule Tokenizers.PostProcessor do
 
   ## Options
 
-    * `:trim_offest` - whether to trim the whitespaces in the produced
+    * `:trim_offsets` - whether to trim the whitespaces in the produced
       offsets. Defaults to `true`
 
     * `:add_prefix_space` - whether add_prefix_space was ON during the
@@ -47,7 +47,7 @@ defmodule Tokenizers.PostProcessor do
   @doc """
   Creates a Template post-processor.
 
-  Let’s you easily template the post processing, adding special tokens
+  Lets you easily template the post processing, adding special tokens
   and specifying the type id for each sequence/special token. The
   template is given two strings representing the single sequence and
   the pair of sequences, as well as a set of special tokens to use.
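
A sketch of the templating described above, assuming a `template/1` constructor with `:single`, `:pair`, and `:special_tokens` options mirroring the prose:

    # Option names and token ids are assumptions for illustration.
    post =
      Tokenizers.PostProcessor.template(
        single: "[CLS] $A [SEP]",
        pair: "[CLS] $A [SEP] $B:1 [SEP]",
        special_tokens: [{"[CLS]", 101}, {"[SEP]", 102}]
      )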

lib/tokenizers/pre_tokenizer.ex

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ defmodule Tokenizers.PreTokenizer do
   @doc """
   Creates a BertPreTokenizer pre-tokenizer.
 
-  Splits for use in Bert models.
+  Splits for use in BERT models.
   """
   @spec bert_pre_tokenizer() :: t()
   defdelegate bert_pre_tokenizer(), to: Tokenizers.Native, as: :pre_tokenizers_bert
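
`bert_pre_tokenizer/0` is confirmed by the delegate above; a sketch of using it, assuming a `pre_tokenize/2` function returning `{:ok, pieces}`:

    # `pre_tokenize/2` and its return shape are assumptions.
    pre = Tokenizers.PreTokenizer.bert_pre_tokenizer()
    {:ok, pieces} = Tokenizers.PreTokenizer.pre_tokenize(pre, "Hey friend! How are you?")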

native/ex_tokenizers/src/models.rs

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ pub fn models_save(
         .iter()
         .map(|path| {
             path.to_str()
-                // Unwraping here, because we are sure that pathes are valid
+                // Unwraping here, because we are sure that paths are valid
                 .unwrap()
                 .to_owned()
         })
