14
14
15
15
"""Keras Layers for BERT-specific preprocessing."""
16
16
# pylint: disable=g-import-not-at-top
17
- from typing import Any , Dict , List , Optional , Union
17
+ from typing import Any , Dict , List , Mapping , Optional , Text , Union
18
18
19
19
from absl import logging
20
20
import tensorflow as tf
@@ -71,8 +71,9 @@ class BertTokenizer(tf.keras.layers.Layer):
71
71
72
72
def __init__ (self , * ,
73
73
vocab_file : str ,
74
- lower_case : bool ,
74
+ lower_case : Optional [ bool ] = None ,
75
75
tokenize_with_offsets : bool = False ,
76
+ tokenizer_kwargs : Optional [Mapping [Text , Any ]] = None ,
76
77
** kwargs ):
77
78
"""Initialize a `BertTokenizer` layer.
78
79
@@ -81,15 +82,18 @@ def __init__(self, *,
81
82
This is a text file with newline-separated wordpiece tokens.
82
83
This layer initializes a lookup table from it that gets used with
83
84
`text.BertTokenizer`.
84
- lower_case: A Python boolean forwarded to `text.BertTokenizer`.
85
+ lower_case: Optional boolean forwarded to `text.BertTokenizer`.
85
86
If true, input text is converted to lower case (where applicable)
86
87
before tokenization. This must be set to match the way in which
87
- the `vocab_file` was created.
88
+ the `vocab_file` was created. If not None, this overrides whatever value
89
+ may have been passed in `tokenizer_kwargs`.
88
90
tokenize_with_offsets: A Python boolean. If true, this layer calls
89
91
`text.BertTokenizer.tokenize_with_offsets()` instead of plain
90
92
`text.BertTokenizer.tokenize()` and outputs a triple of
91
93
`(tokens, start_offsets, limit_offsets)`
92
94
instead of just tokens.
95
+ tokenizer_kwargs: Optional mapping with keyword arguments to forward to
96
+ `text.BertTokenizer`'s constructor.
93
97
**kwargs: Standard arguments to `Layer()`.
94
98
95
99
Raises:
@@ -111,8 +115,11 @@ def __init__(self, *,
111
115
self ._special_tokens_dict = self ._create_special_tokens_dict (
112
116
self ._vocab_table , vocab_file )
113
117
super ().__init__ (** kwargs )
114
- self ._bert_tokenizer = text .BertTokenizer (
115
- self ._vocab_table , lower_case = lower_case )
118
+ tokenizer_kwargs = dict (tokenizer_kwargs or {})
119
+ if lower_case is not None :
120
+ tokenizer_kwargs ["lower_case" ] = lower_case
121
+ self ._bert_tokenizer = text .BertTokenizer (self ._vocab_table ,
122
+ ** tokenizer_kwargs )
116
123
117
124
@property
118
125
def vocab_size (self ):
0 commit comments