Skip to content

Commit ad230eb

Browse files
committed
RFC: End-to-end text preprocessing with TF.Text #283
PiperOrigin-RevId: 347726784
1 parent ec3b3c0 commit ad230eb

17 files changed

+2563
-100
lines changed

tensorflow_text/BUILD

Lines changed: 173 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,12 +66,16 @@ py_library(
6666
":greedy_constrained_sequence_op",
6767
":hub_module_splitter",
6868
":hub_module_tokenizer",
69+
":item_selector_ops",
70+
":masking_ops",
6971
":mst_ops",
7072
":ngrams_op",
7173
":normalize_ops",
7274
":pad_along_dimension_op",
75+
":pad_model_inputs_ops",
7376
":pointer_ops",
7477
":regex_split_ops",
78+
":segment_combiner_ops",
7579
":sentence_breaking_ops",
7680
":sentencepiece_tokenizer",
7781
":sliding_window_op",
@@ -83,6 +87,7 @@ py_library(
8387
":text_similarity_metric_ops",
8488
":todense_layer",
8589
":tokenization",
90+
":trimmer_ops",
8691
":unicode_char_tokenizer",
8792
":unicode_script_tokenizer",
8893
":viterbi_constrained_sequence_op",
@@ -101,7 +106,6 @@ py_library(
101106
##########################
102107
# Individual tf.text ops #
103108
##########################
104-
105109
# The py libraries are ordered alphabetically and are grouped with their corresponding tests.
106110

107111
py_library(
@@ -220,6 +224,86 @@ py_test(
220224
],
221225
)
222226

227+
py_library(
228+
name = "item_selector_ops",
229+
srcs = ["python/ops/item_selector_ops.py"],
230+
deps = [
231+
# python:array_ops tensorflow dep,
232+
# python:control_flow_ops tensorflow dep,
233+
# python:dtypes tensorflow dep,
234+
# python:framework_ops tensorflow dep,
235+
# python:framework_test_lib tensorflow dep,
236+
# python:lookup_ops tensorflow dep,
237+
# python:map_fn tensorflow dep,
238+
# python:math_ops tensorflow dep,
239+
# python:random_ops tensorflow dep,
240+
# python:sort_ops tensorflow dep,
241+
# python/ops/ragged:ragged_array_ops tensorflow dep,
242+
# python/ops/ragged:ragged_batch_gather_ops tensorflow dep,
243+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
244+
# python/ops/ragged:ragged_functional_ops tensorflow dep,
245+
# python/ops/ragged:ragged_map_ops tensorflow dep,
246+
# python/ops/ragged:ragged_math_ops tensorflow dep,
247+
# python/ops/ragged:ragged_tensor tensorflow dep,
248+
# python/ops/ragged:ragged_tensor_shape tensorflow dep,
249+
# python/ops/ragged:ragged_where_op tensorflow dep,
250+
],
251+
)
252+
253+
py_test(
254+
name = "item_selector_ops_test",
255+
size = "medium",
256+
srcs = ["python/ops/item_selector_ops_test.py"],
257+
python_version = "PY3",
258+
srcs_version = "PY3",
259+
deps = [
260+
":item_selector_ops",
261+
# python:array_ops tensorflow dep,
262+
# python:client_testlib tensorflow dep,
263+
# python:dtypes tensorflow dep,
264+
# python:framework_test_lib tensorflow dep,
265+
# python:math_ops tensorflow dep,
266+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
267+
],
268+
)
269+
270+
py_library(
271+
name = "masking_ops",
272+
srcs = ["python/ops/masking_ops.py"],
273+
deps = [
274+
# python:array_ops tensorflow dep,
275+
# python:dtypes tensorflow dep,
276+
# python:map_fn tensorflow dep,
277+
# python:math_ops tensorflow dep,
278+
# python:random_ops tensorflow dep,
279+
# python:sort_ops tensorflow dep,
280+
# python/ops/ragged:ragged_batch_gather_ops tensorflow dep,
281+
# python/ops/ragged:ragged_functional_ops tensorflow dep,
282+
# python/ops/ragged:ragged_map_ops tensorflow dep,
283+
# python/ops/ragged:ragged_math_ops tensorflow dep,
284+
# python/ops/ragged:ragged_tensor tensorflow dep,
285+
# python/ops/ragged:ragged_where_op tensorflow dep,
286+
],
287+
)
288+
289+
py_test(
290+
name = "masking_ops_test",
291+
size = "medium",
292+
srcs = ["python/ops/masking_ops_test.py"],
293+
python_version = "PY3",
294+
srcs_version = "PY3",
295+
deps = [
296+
":item_selector_ops",
297+
":masking_ops",
298+
"@absl_py//absl/testing:parameterized",
299+
# python:array_ops tensorflow dep,
300+
# python:client_testlib tensorflow dep,
301+
# python:constant_op tensorflow dep,
302+
# python:framework_test_lib tensorflow dep,
303+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
304+
],
305+
)
306+
223307
py_tf_text_library(
224308
name = "mst_ops",
225309
srcs = ["python/ops/mst_ops.py"],
@@ -313,6 +397,37 @@ py_test(
313397
],
314398
)
315399

400+
py_library(
401+
name = "pad_model_inputs_ops",
402+
srcs = ["python/ops/pad_model_inputs_ops.py"],
403+
deps = [
404+
# python:array_ops tensorflow dep,
405+
# python:constant_op tensorflow dep,
406+
# python:control_flow_ops tensorflow dep,
407+
# python:dtypes tensorflow dep,
408+
# python:math_ops tensorflow dep,
409+
# python:tensor_array_ops tensorflow dep,
410+
# python/ops/ragged:ragged_array_ops tensorflow dep,
411+
# python/ops/ragged:ragged_map_ops tensorflow dep,
412+
# python/ops/ragged:ragged_tensor tensorflow dep,
413+
],
414+
)
415+
416+
py_test(
417+
name = "pad_model_inputs_ops_test",
418+
srcs = ["python/ops/pad_model_inputs_ops_test.py"],
419+
python_version = "PY3",
420+
srcs_version = "PY3",
421+
deps = [
422+
":pad_model_inputs_ops",
423+
"@absl_py//absl/testing:parameterized",
424+
# python:client_testlib tensorflow dep,
425+
# python:constant_op tensorflow dep,
426+
# python:framework_test_lib tensorflow dep,
427+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
428+
],
429+
)
430+
316431
py_library(
317432
name = "pad_along_dimension_op",
318433
srcs = ["python/ops/pad_along_dimension_op.py"],
@@ -420,6 +535,7 @@ py_tf_text_library(
420535
cc_op_defs = ["core/ops/regex_split_ops.cc"],
421536
cc_op_kernels = ["//tensorflow_text/core/kernels:regex_split_kernels"],
422537
deps = [
538+
":splitter",
423539
# python/ops/ragged:ragged_tensor tensorflow dep,
424540
],
425541
)
@@ -446,6 +562,32 @@ py_library(
446562
],
447563
)
448564

565+
py_library(
566+
name = "segment_combiner_ops",
567+
srcs = ["python/ops/segment_combiner_ops.py"],
568+
deps = [
569+
# python:array_ops tensorflow dep,
570+
# python:dtypes tensorflow dep,
571+
# python:framework_ops tensorflow dep,
572+
# python:math_ops tensorflow dep,
573+
],
574+
)
575+
576+
py_test(
577+
name = "segment_combiner_ops_test",
578+
srcs = ["python/ops/segment_combiner_ops_test.py"],
579+
python_version = "PY3",
580+
srcs_version = "PY3",
581+
deps = [
582+
":segment_combiner_ops",
583+
"@absl_py//absl/testing:parameterized",
584+
# python:client_testlib tensorflow dep,
585+
# python:constant_op tensorflow dep,
586+
# python:dtypes tensorflow dep,
587+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
588+
],
589+
)
590+
449591
py_tf_text_library(
450592
name = "text_similarity_metric_ops",
451593
srcs = ["python/metrics/text_similarity_metric_ops.py"],
@@ -705,6 +847,36 @@ py_test(
705847
],
706848
)
707849

850+
py_library(
851+
name = "trimmer_ops",
852+
srcs = ["python/ops/trimmer_ops.py"],
853+
deps = [
854+
":item_selector_ops",
855+
# python:array_ops tensorflow dep,
856+
# python:constant_op tensorflow dep,
857+
# python:control_flow_ops tensorflow dep,
858+
# python:dtypes tensorflow dep,
859+
# python:functional_ops tensorflow dep,
860+
# python/ops/ragged:ragged_map_ops tensorflow dep,
861+
# python/ops/ragged:ragged_tensor tensorflow dep,
862+
],
863+
)
864+
865+
py_test(
866+
name = "trimmer_ops_test",
867+
srcs = ["python/ops/trimmer_ops_test.py"],
868+
python_version = "PY3",
869+
srcs_version = "PY3",
870+
deps = [
871+
":trimmer_ops",
872+
"@absl_py//absl/testing:parameterized",
873+
# python:client_testlib tensorflow dep,
874+
# python:constant_op tensorflow dep,
875+
# python:framework_test_lib tensorflow dep,
876+
# python/ops/ragged:ragged_factory_ops tensorflow dep,
877+
],
878+
)
879+
708880
py_library(
709881
name = "hub_module_splitter",
710882
srcs = ["python/ops/hub_module_splitter.py"],

tensorflow_text/__init__.py

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -26,43 +26,51 @@
2626
# increasing order of their lowercase version.
2727
_allowed_symbols = [
2828
"BertTokenizer",
29+
"Detokenizer",
30+
"FirstNItemSelector",
31+
"HubModuleSplitter",
32+
"HubModuleTokenizer",
33+
"MaskValuesChooser",
34+
"RandomItemSelector",
35+
"Reduction",
36+
"RegexSplitter",
37+
"SentencepieceTokenizer",
38+
"SplitMergeFromLogitsTokenizer",
39+
"SplitMergeTokenizer",
40+
"Splitter",
41+
"StateBasedSentenceBreaker",
42+
"Tokenizer",
43+
"TokenizerWithOffsets",
44+
"UnicodeCharTokenizer",
45+
"UnicodeScriptTokenizer",
46+
"WaterfallTrimmer",
47+
"WhitespaceTokenizer",
48+
"WordShape",
49+
"WordpieceTokenizer",
2950
"case_fold_utf8",
3051
"coerce_to_structurally_valid_utf8",
31-
"Detokenizer",
52+
"combine_segments",
3253
"find_source_offsets",
3354
"gather_with_default",
3455
"greedy_constrained_sequence",
35-
"HubModuleSplitter",
36-
"HubModuleTokenizer",
3756
"keras",
57+
"mask_language_model",
3858
"max_spanning_tree",
3959
"max_spanning_tree_gradient",
4060
"metrics",
4161
"ngrams",
4262
"normalize_utf8",
4363
"normalize_utf8_with_offsets_map",
4464
"pad_along_dimension",
45-
"Reduction",
65+
"pad_model_inputs",
4666
"regex_split",
4767
"regex_split_with_offsets",
4868
"sentence_fragments",
49-
"SentencepieceTokenizer",
5069
"sliding_window",
5170
"span_alignment",
5271
"span_overlaps",
53-
"SplitMergeFromLogitsTokenizer",
54-
"SplitMergeTokenizer",
55-
"Splitter",
56-
"StateBasedSentenceBreaker",
57-
"Tokenizer",
58-
"TokenizerWithOffsets",
59-
"UnicodeCharTokenizer",
60-
"UnicodeScriptTokenizer",
6172
"viterbi_constrained_sequence",
62-
"WhitespaceTokenizer",
63-
"WordpieceTokenizer",
6473
"wordshape",
65-
"WordShape",
6674
]
6775

6876
remove_undocumented(__name__, _allowed_symbols)

tensorflow_text/python/ops/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,10 @@
2525
from tensorflow_text.python.ops.greedy_constrained_sequence_op import greedy_constrained_sequence
2626
from tensorflow_text.python.ops.hub_module_splitter import HubModuleSplitter
2727
from tensorflow_text.python.ops.hub_module_tokenizer import HubModuleTokenizer
28+
from tensorflow_text.python.ops.item_selector_ops import FirstNItemSelector
29+
from tensorflow_text.python.ops.item_selector_ops import RandomItemSelector
30+
from tensorflow_text.python.ops.masking_ops import mask_language_model
31+
from tensorflow_text.python.ops.masking_ops import MaskValuesChooser
2832
from tensorflow_text.python.ops.mst_ops import max_spanning_tree
2933
from tensorflow_text.python.ops.mst_ops import max_spanning_tree_gradient
3034
from tensorflow_text.python.ops.ngrams_op import ngrams
@@ -34,11 +38,14 @@
3438
from tensorflow_text.python.ops.normalize_ops import normalize_utf8
3539
from tensorflow_text.python.ops.normalize_ops import normalize_utf8_with_offsets_map
3640
from tensorflow_text.python.ops.pad_along_dimension_op import pad_along_dimension
41+
from tensorflow_text.python.ops.pad_model_inputs_ops import pad_model_inputs
3742
from tensorflow_text.python.ops.pointer_ops import gather_with_default
3843
from tensorflow_text.python.ops.pointer_ops import span_alignment
3944
from tensorflow_text.python.ops.pointer_ops import span_overlaps
4045
from tensorflow_text.python.ops.regex_split_ops import regex_split
4146
from tensorflow_text.python.ops.regex_split_ops import regex_split_with_offsets
47+
from tensorflow_text.python.ops.regex_split_ops import RegexSplitter
48+
from tensorflow_text.python.ops.segment_combiner_ops import combine_segments
4249
from tensorflow_text.python.ops.sentence_breaking_ops import sentence_fragments
4350
from tensorflow_text.python.ops.sentencepiece_tokenizer import SentencepieceTokenizer
4451
from tensorflow_text.python.ops.sliding_window_op import sliding_window
@@ -50,6 +57,7 @@
5057
from tensorflow_text.python.ops.tokenization import Detokenizer
5158
from tensorflow_text.python.ops.tokenization import Tokenizer
5259
from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets
60+
from tensorflow_text.python.ops.trimmer_ops import WaterfallTrimmer
5361
from tensorflow_text.python.ops.unicode_char_tokenizer import UnicodeCharTokenizer
5462
from tensorflow_text.python.ops.unicode_script_tokenizer import UnicodeScriptTokenizer
5563
from tensorflow_text.python.ops.viterbi_constrained_sequence_op import viterbi_constrained_sequence

0 commit comments

Comments
 (0)