|
1 | 1 | classdef BERTTokenizer |
2 | | - % BERTTokenizer Construct a tokenizer to use with BERT |
3 | | - % models. |
4 | | - % |
5 | | - % tokenizer = BERTTokenizer() Constructs a case-insensitive |
6 | | - % BERTTokenizer using the BERT-Base vocabulary file. |
7 | | - % |
8 | | - % tokenizer = BERTTokenizer(vocabFile) Constructs a |
9 | | - % case-insensitive BERTTokenizer using the file vocabFile as |
10 | | - % the vocabulary. |
11 | | - % |
12 | | - % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...) |
13 | | - % specifies the optional parameter name/value pairs: |
14 | | - % |
15 | | - % 'IgnoreCase' - A logical value to control if the |
16 | | - % BERTTokenizer is case sensitive or not. |
17 | | - % The default value is true. |
18 | | - % |
19 | | - % 'FullTokenizer' - The underlying word-piece tokenizer. |
20 | | - % If not specified, a default |
21 | | - % FullTokenizer is constructed. |
22 | | - % |
23 | | - % BERTTokenizer properties: |
24 | | - % FullTokenizer - The underlying word-piece tokenizer. |
25 | | - % PaddingToken - The string "[PAD]" |
26 | | - % StartToken - The string "[CLS]" |
27 | | - % SeparatorToken - The string "[SEP]" |
28 | | - % MaskToken - The string "[MASK]" |
29 | | - % PaddingCode - The encoded PaddingToken |
30 | | - % StartCode - The encoded StartToken |
31 | | - % SeparatorCode - The encoded SeparatorToken |
32 | | - % MaskCode - The encoded MaskToken |
33 | | - % |
34 | | - % BERTTokenizer methods: |
35 | | - % tokenize - Tokenize strings |
36 | | - % encode - Tokenize and encode strings |
37 | | - % encodeTokens - Encode pre-tokenized token sequences |
38 | | - % decode - Decode an encoded sequence to string |
39 | | - % |
40 | | - % Example: |
41 | | - % tokenizer = bert.tokenizer.BERTTokenizer(); |
42 | | - % sequences = tokenizer.encode("Hello World!") |
| 2 | + % BERTTokenizer Construct a tokenizer to use with BERT |
| 3 | + % models. |
| 4 | + % |
| 5 | + % tokenizer = BERTTokenizer() Constructs a case-insensitive |
| 6 | + % BERTTokenizer using the BERT-Base vocabulary file. |
| 7 | + % |
| 8 | + % tokenizer = BERTTokenizer(vocabFile) Constructs a |
| 9 | + % case-insensitive BERTTokenizer using the file vocabFile as |
| 10 | + % the vocabulary. |
| 11 | + % |
| 12 | + % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...) |
| 13 | + % specifies the optional parameter name/value pairs: |
| 14 | + % |
| 15 | + % 'IgnoreCase' - A logical value to control if the |
| 16 | + % BERTTokenizer is case sensitive or not. |
| 17 | + % The default value is true. |
| 18 | + % |
| 19 | + % 'FullTokenizer' - The underlying word-piece tokenizer. |
| 20 | + % If not specified, a default |
| 21 | + % FullTokenizer is constructed. |
| 22 | + % |
| 23 | + % BERTTokenizer properties: |
| 24 | + % FullTokenizer - The underlying word-piece tokenizer. |
| 25 | + % PaddingToken - The string "[PAD]" |
| 26 | + % StartToken - The string "[CLS]" |
| 27 | + % SeparatorToken - The string "[SEP]" |
| 28 | + % MaskToken - The string "[MASK]" |
| 29 | + % PaddingCode - The encoded PaddingToken |
| 30 | + % StartCode - The encoded StartToken |
| 31 | + % SeparatorCode - The encoded SeparatorToken |
| 32 | + % MaskCode - The encoded MaskToken |
| 33 | + % |
| 34 | + % BERTTokenizer methods: |
| 35 | + % tokenize - Tokenize strings |
| 36 | + % encode - Tokenize and encode strings |
| 37 | + % encodeTokens - Encode pre-tokenized token sequences |
| 38 | + % decode - Decode an encoded sequence to string |
| 39 | + % |
| 40 | + % Example: |
| 41 | + % tokenizer = bert.tokenizer.BERTTokenizer(); |
| 42 | + % sequences = tokenizer.encode("Hello World!") |
43 | 43 |
|
44 | 44 | % Copyright 2021-2023 The MathWorks, Inc. |
45 | 45 |
|
|
60 | 60 |
|
61 | 61 | methods |
62 | 62 | function this = BERTTokenizer(vocabFile,nvp) |
63 | | - % BERTTokenizer Construct a tokenizer to use with BERT |
64 | | - % models. |
65 | | - % |
66 | | - % tokenizer = BERTTokenizer() Constructs a case-insensitive |
67 | | - % BERTTokenizer using the BERT-Base vocabulary file. |
68 | | - % |
69 | | - % tokenizer = BERTTokenizer(vocabFile) Constructs a |
70 | | - % case-insensitive BERTTokenizer using the file vocabFile as |
71 | | - % the vocabulary. |
72 | | - % |
73 | | - % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...) |
74 | | - % specifies the optional parameter name/value pairs: |
75 | | - % |
76 | | - % 'IgnoreCase' - A logical value to control if the |
77 | | - % BERTTokenizer is case sensitive or not. |
78 | | - % The default value is true. |
79 | | - % |
80 | | - % 'FullTokenizer' - The underlying word-piece tokenizer. |
81 | | - % If not specified, a default |
82 | | - % FullTokenizer is constructed. |
83 | | - % |
84 | | - % BERTTokenizer properties: |
85 | | - % FullTokenizer - The underlying word-piece tokenizer. |
86 | | - % PaddingToken - The string "[PAD]" |
87 | | - % StartToken - The string "[CLS]" |
88 | | - % SeparatorToken - The string "[SEP]" |
89 | | - % MaskToken - The string "[MASK]" |
90 | | - % PaddingCode - The encoded PaddingToken |
91 | | - % StartCode - The encoded StartToken |
92 | | - % SeparatorCode - The encoded SeparatorToken |
93 | | - % MaskCode - The encoded MaskToken |
94 | | - % |
95 | | - % BERTTokenizer methods: |
96 | | - % tokenize - Tokenize strings |
97 | | - % encode - Tokenize and encode strings |
98 | | - % encodeTokens - Encode pre-tokenized token sequences |
99 | | - % decode - Decode an encoded sequence to string |
100 | | - % |
101 | | - % Example: |
102 | | - % tokenizer = bert.tokenizer.BERTTokenizer(); |
103 | | - % sequences = tokenizer.encode("Hello World!") |
| 63 | + % BERTTokenizer Construct a tokenizer to use with BERT |
| 64 | + % models. |
| 65 | + % |
| 66 | + % tokenizer = BERTTokenizer() Constructs a case-insensitive |
| 67 | + % BERTTokenizer using the BERT-Base vocabulary file. |
| 68 | + % |
| 69 | + % tokenizer = BERTTokenizer(vocabFile) Constructs a |
| 70 | + % case-insensitive BERTTokenizer using the file vocabFile as |
| 71 | + % the vocabulary. |
| 72 | + % |
| 73 | + % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...) |
| 74 | + % specifies the optional parameter name/value pairs: |
| 75 | + % |
| 76 | + % 'IgnoreCase' - A logical value to control if the |
| 77 | + % BERTTokenizer is case sensitive or not. |
| 78 | + % The default value is true. |
| 79 | + % |
| 80 | + % 'FullTokenizer' - The underlying word-piece tokenizer. |
| 81 | + % If not specified, a default |
| 82 | + % FullTokenizer is constructed. |
| 83 | + % |
| 84 | + % BERTTokenizer properties: |
| 85 | + % FullTokenizer - The underlying word-piece tokenizer. |
| 86 | + % PaddingToken - The string "[PAD]" |
| 87 | + % StartToken - The string "[CLS]" |
| 88 | + % SeparatorToken - The string "[SEP]" |
| 89 | + % MaskToken - The string "[MASK]" |
| 90 | + % PaddingCode - The encoded PaddingToken |
| 91 | + % StartCode - The encoded StartToken |
| 92 | + % SeparatorCode - The encoded SeparatorToken |
| 93 | + % MaskCode - The encoded MaskToken |
| 94 | + % |
| 95 | + % BERTTokenizer methods: |
| 96 | + % tokenize - Tokenize strings |
| 97 | + % encode - Tokenize and encode strings |
| 98 | + % encodeTokens - Encode pre-tokenized token sequences |
| 99 | + % decode - Decode an encoded sequence to string |
| 100 | + % |
| 101 | + % Example: |
| 102 | + % tokenizer = bert.tokenizer.BERTTokenizer(); |
| 103 | + % sequences = tokenizer.encode("Hello World!") |
104 | 104 | arguments |
105 | 105 | vocabFile (1,1) string {mustBeFile} = bert.internal.getSupportFilePath("base","vocab.txt") |
106 | 106 | nvp.IgnoreCase (1,1) logical = true |
|
0 commit comments