Skip to content

Commit cb9738f

Browse files
committed
fixing indentation.
1 parent f21b1ca commit cb9738f

File tree

2 files changed

+119
-119
lines changed

2 files changed

+119
-119
lines changed

+bert/+tokenizer/+internal/FullTokenizer.m

Lines changed: 37 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,41 +1,41 @@
11
classdef FullTokenizer < bert.tokenizer.internal.Tokenizer
2-
% FullTokenizer A tokenizer based on word-piece tokenization.
3-
%
4-
% tokenizer = FullTokenizer(vocabFile) constructs a FullTokenizer
5-
% using the vocabulary specified in the newline delimited txt file
6-
% vocabFile.
7-
%
8-
% tokenizer = FullTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
9-
% specifies the optional parameter name/value pairs:
10-
%
11-
% 'BasicTokenizer' - Tokenizer used to split text into words.
12-
% If not specified, a default
13-
% BasicTokenizer is constructed.
14-
%
15-
% 'IgnoreCase' - A logical value to control if the
16-
% FullTokenizer is case sensitive or not.
17-
% The default value is true.
18-
%
19-
% FullTokenizer methods:
20-
% tokenize - tokenize text
21-
% encode - encode tokens
22-
% decode - decode encoded tokens
23-
%
24-
% Example:
25-
% % Save a file named fakeVocab.txt with the text on the next 3 lines:
26-
% fake
27-
% vo
28-
% ##cab
29-
%
30-
% % Now create a FullTokenizer
31-
% tokenizer = bert.tokenizer.internal.FullTokenizer('fakeVocab.txt');
32-
% tokens = tokenizer.tokenize("This tokenizer has a fake vocab")
33-
% % Note that most tokens are unknown as they are not in the
34-
% % vocabulary and neither are any sub-tokens. However "fake" is
35-
% % detected and "vocab" is split into "vo" and "##cab".
36-
% tokenizer.encode(tokens)
37-
% % This returns the encoded form of the tokens - each token is
38-
% % replaced by its corresponding line number in the fakeVocab.txt
2+
% FullTokenizer A tokenizer based on word-piece tokenization.
3+
%
4+
% tokenizer = FullTokenizer(vocabFile) constructs a FullTokenizer
5+
% using the vocabulary specified in the newline delimited txt file
6+
% vocabFile.
7+
%
8+
% tokenizer = FullTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
9+
% specifies the optional parameter name/value pairs:
10+
%
11+
% 'BasicTokenizer' - Tokenizer used to split text into words.
12+
% If not specified, a default
13+
% BasicTokenizer is constructed.
14+
%
15+
% 'IgnoreCase' - A logical value to control if the
16+
% FullTokenizer is case sensitive or not.
17+
% The default value is true.
18+
%
19+
% FullTokenizer methods:
20+
% tokenize - tokenize text
21+
% encode - encode tokens
22+
% decode - decode encoded tokens
23+
%
24+
% Example:
25+
% % Save a file named fakeVocab.txt with the text on the next 3 lines:
26+
% fake
27+
% vo
28+
% ##cab
29+
%
30+
% % Now create a FullTokenizer
31+
% tokenizer = bert.tokenizer.internal.FullTokenizer('fakeVocab.txt');
32+
% tokens = tokenizer.tokenize("This tokenizer has a fake vocab")
33+
% % Note that most tokens are unknown as they are not in the
34+
% % vocabulary and neither are any sub-tokens. However "fake" is
35+
% % detected and "vocab" is split into "vo" and "##cab".
36+
% tokenizer.encode(tokens)
37+
% % This returns the encoded form of the tokens - each token is
38+
% % replaced by its corresponding line number in the fakeVocab.txt
3939

4040
% Copyright 2021-2023 The MathWorks, Inc.
4141

+bert/+tokenizer/BERTTokenizer.m

Lines changed: 82 additions & 82 deletions
Original file line number | Diff line number | Diff line change
@@ -1,45 +1,45 @@
11
classdef BERTTokenizer
2-
% BERTTokenizer Construct a tokenizer to use with BERT
3-
% models.
4-
%
5-
% tokenizer = BERTTokenizer() Constructs a case-insensitive
6-
% BERTTokenizer using the BERT-Base vocabulary file.
7-
%
8-
% tokenizer = BERTTokenizer(vocabFile) Constructs a
9-
% case-insensitive BERTTokenizer using the file vocabFile as
10-
% the vocabulary.
11-
%
12-
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
13-
% specifies the optional parameter name/value pairs:
14-
%
15-
% 'IgnoreCase' - A logical value to control if the
16-
% BERTTokenizer is case sensitive or not.
17-
% The default value is true.
18-
%
19-
% 'FullTokenizer' - The underlying word-piece tokenizer.
20-
% If not specified, a default
21-
% FullTokenizer is constructed.
22-
%
23-
% BERTTokenizer properties:
24-
% FullTokenizer - The underlying word-piece tokenizer.
25-
% PaddingToken - The string "[PAD]"
26-
% StartToken - The string "[CLS]"
27-
% SeparatorToken - The string "[SEP]"
28-
% MaskToken - The string "[MASK]"
29-
% PaddingCode - The encoded PaddingToken
30-
% StartCode - The encoded StartToken
31-
% SeparatorCode - The encoded SeparatorToken
32-
% MaskCode - The encoded MaskToken
33-
%
34-
% BERTTokenizer methods:
35-
% tokenize - Tokenize strings
36-
% encode - Tokenize and encode strings
37-
% encodeTokens - Encode pre-tokenized token sequences
38-
% decode - Decode an encoded sequence to string
39-
%
40-
% Example:
41-
% tokenizer = bert.tokenizer.BERTTokenizer();
42-
% sequences = tokenizer.encode("Hello World!")
2+
% BERTTokenizer Construct a tokenizer to use with BERT
3+
% models.
4+
%
5+
% tokenizer = BERTTokenizer() Constructs a case-insensitive
6+
% BERTTokenizer using the BERT-Base vocabulary file.
7+
%
8+
% tokenizer = BERTTokenizer(vocabFile) Constructs a
9+
% case-insensitive BERTTokenizer using the file vocabFile as
10+
% the vocabulary.
11+
%
12+
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
13+
% specifies the optional parameter name/value pairs:
14+
%
15+
% 'IgnoreCase' - A logical value to control if the
16+
% BERTTokenizer is case sensitive or not.
17+
% The default value is true.
18+
%
19+
% 'FullTokenizer' - The underlying word-piece tokenizer.
20+
% If not specified, a default
21+
% FullTokenizer is constructed.
22+
%
23+
% BERTTokenizer properties:
24+
% FullTokenizer - The underlying word-piece tokenizer.
25+
% PaddingToken - The string "[PAD]"
26+
% StartToken - The string "[CLS]"
27+
% SeparatorToken - The string "[SEP]"
28+
% MaskToken - The string "[MASK]"
29+
% PaddingCode - The encoded PaddingToken
30+
% StartCode - The encoded StartToken
31+
% SeparatorCode - The encoded SeparatorToken
32+
% MaskCode - The encoded MaskToken
33+
%
34+
% BERTTokenizer methods:
35+
% tokenize - Tokenize strings
36+
% encode - Tokenize and encode strings
37+
% encodeTokens - Encode pre-tokenized token sequences
38+
% decode - Decode an encoded sequence to string
39+
%
40+
% Example:
41+
% tokenizer = bert.tokenizer.BERTTokenizer();
42+
% sequences = tokenizer.encode("Hello World!")
4343

4444
% Copyright 2021-2023 The MathWorks, Inc.
4545

@@ -60,47 +60,47 @@
6060

6161
methods
6262
function this = BERTTokenizer(vocabFile,nvp)
63-
% BERTTokenizer Construct a tokenizer to use with BERT
64-
% models.
65-
%
66-
% tokenizer = BERTTokenizer() Constructs a case-insensitive
67-
% BERTTokenizer using the BERT-Base vocabulary file.
68-
%
69-
% tokenizer = BERTTokenizer(vocabFile) Constructs a
70-
% case-insensitive BERTTokenizer using the file vocabFile as
71-
% the vocabulary.
72-
%
73-
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
74-
% specifies the optional parameter name/value pairs:
75-
%
76-
% 'IgnoreCase' - A logical value to control if the
77-
% BERTTokenizer is case sensitive or not.
78-
% The default value is true.
79-
%
80-
% 'FullTokenizer' - The underlying word-piece tokenizer.
81-
% If not specified, a default
82-
% FullTokenizer is constructed.
83-
%
84-
% BERTTokenizer properties:
85-
% FullTokenizer - The underlying word-piece tokenizer.
86-
% PaddingToken - The string "[PAD]"
87-
% StartToken - The string "[CLS]"
88-
% SeparatorToken - The string "[SEP]"
89-
% MaskToken - The string "[MASK]"
90-
% PaddingCode - The encoded PaddingToken
91-
% StartCode - The encoded StartToken
92-
% SeparatorCode - The encoded SeparatorToken
93-
% MaskCode - The encoded MaskToken
94-
%
95-
% BERTTokenizer methods:
96-
% tokenize - Tokenize strings
97-
% encode - Tokenize and encode strings
98-
% encodeTokens - Encode pre-tokenized token sequences
99-
% decode - Decode an encoded sequence to string
100-
%
101-
% Example:
102-
% tokenizer = bert.tokenizer.BERTTokenizer();
103-
% sequences = tokenizer.encode("Hello World!")
63+
% BERTTokenizer Construct a tokenizer to use with BERT
64+
% models.
65+
%
66+
% tokenizer = BERTTokenizer() Constructs a case-insensitive
67+
% BERTTokenizer using the BERT-Base vocabulary file.
68+
%
69+
% tokenizer = BERTTokenizer(vocabFile) Constructs a
70+
% case-insensitive BERTTokenizer using the file vocabFile as
71+
% the vocabulary.
72+
%
73+
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
74+
% specifies the optional parameter name/value pairs:
75+
%
76+
% 'IgnoreCase' - A logical value to control if the
77+
% BERTTokenizer is case sensitive or not.
78+
% The default value is true.
79+
%
80+
% 'FullTokenizer' - The underlying word-piece tokenizer.
81+
% If not specified, a default
82+
% FullTokenizer is constructed.
83+
%
84+
% BERTTokenizer properties:
85+
% FullTokenizer - The underlying word-piece tokenizer.
86+
% PaddingToken - The string "[PAD]"
87+
% StartToken - The string "[CLS]"
88+
% SeparatorToken - The string "[SEP]"
89+
% MaskToken - The string "[MASK]"
90+
% PaddingCode - The encoded PaddingToken
91+
% StartCode - The encoded StartToken
92+
% SeparatorCode - The encoded SeparatorToken
93+
% MaskCode - The encoded MaskToken
94+
%
95+
% BERTTokenizer methods:
96+
% tokenize - Tokenize strings
97+
% encode - Tokenize and encode strings
98+
% encodeTokens - Encode pre-tokenized token sequences
99+
% decode - Decode an encoded sequence to string
100+
%
101+
% Example:
102+
% tokenizer = bert.tokenizer.BERTTokenizer();
103+
% sequences = tokenizer.encode("Hello World!")
104104
arguments
105105
vocabFile (1,1) string {mustBeFile} = bert.internal.getSupportFilePath("base","vocab.txt")
106106
nvp.IgnoreCase (1,1) logical = true

0 commit comments

Comments
 (0)