Commit d480f11

Add conversion for SentencepieceTokenizeOp (#1309)
Signed-off-by: Tom Wildenhain <[email protected]>
1 parent b18067c commit d480f11

File tree

4 files changed, 56 insertions(+), 0 deletions(-)

tests/completed_perf_testing_models.yaml

Lines changed: 19 additions & 0 deletions
@@ -3359,3 +3359,22 @@ covid-twitter-bert:
   atol: 0.0005
   tag: "serve"
   signature_def: "serving_default"
+
+universal-sentence-encoder-multilingual:
+  disabled: false
+  skip_conversion: false
+  model: "C:/Users/tomwi/Documents/tfhubmodels/universal-sentence-encoder-multilingual"
+  converted_model: "C:/Users/tomwi/Documents/tfhubmodels/universal-sentence-encoder-multilingual/model.onnx"
+  model_type: saved_model
+  large_model: false
+  run_tf_frozen: false
+  use_custom_ops: true
+  input_get: get_sentences
+  inputs:
+    "inputs:0": [100]
+  outputs:
+    - Identity:0
+  rtol: 0.05
+  atol: 0.0005
+  tag: "serve"
+  signature_def: "serving_default"
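
This entry adds the multilingual Universal Sentence Encoder, whose preprocessing tokenizes with SentencePiece via tensorflow_text, to the pretrained-model tests, with use_custom_ops enabled so the contrib-domain ops in the converted graph are actually executed during the accuracy check. A rough sketch of running just this entry with the test runner (flag names and paths are assumptions, not part of this commit):

    # Hypothetical invocation; adjust flags and paths to your checkout.
    python tests/run_pretrained_models.py \
        --config tests/completed_perf_testing_models.yaml \
        --tests universal-sentence-encoder-multilingual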

tests/run_pretrained_models.py

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,11 @@
     # not needed for tf-2.0
     pass
 
+try:
+    import tensorflow_text  # pylint: disable=unused-import
+except ModuleNotFoundError:
+    pass
+
 from tf2onnx import tf_loader, logging, optimizer, utils, tf_utils, constants
 from tf2onnx.tfonnx import process_tf_graph
 from tf2onnx.tf_loader import tf_session, tf_reset_default_graph

tf2onnx/convert.py

Lines changed: 6 additions & 0 deletions
@@ -143,6 +143,12 @@ def main():
     if using_tf_opset:
         extra_opset.append(constants.TENSORFLOW_OPSET)
 
+    if any(opset.domain == constants.CONTRIB_OPS_DOMAIN for opset in extra_opset):
+        try:
+            import tensorflow_text  # pylint: disable=import-outside-toplevel
+        except ModuleNotFoundError:
+            logger.warning("tensorflow_text not installed. Model will fail to load if tensorflow_text ops are used.")
+
     # get the frozen tensorflow model from graphdef, checkpoint or saved_model.
     if args.graphdef:
         graph_def, inputs, outputs = tf_loader.from_graphdef(args.graphdef, args.inputs, args.outputs)
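
The warning above only fires when the contrib ops domain was requested via --extra_opset. A hedged example of such an invocation (paths and opset numbers are placeholders, and ai.onnx.contrib is assumed to be the string behind constants.CONTRIB_OPS_DOMAIN):

    # Placeholder paths/opset values, not taken from this commit.
    python -m tf2onnx.convert \
        --saved-model ./universal-sentence-encoder-multilingual \
        --output model.onnx \
        --opset 12 \
        --extra_opset ai.onnx.contrib:1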

tf2onnx/custom_opsets/string_ops.py

Lines changed: 26 additions & 0 deletions
@@ -120,3 +120,29 @@ def version_1(cls, ctx, node, **kwargs):
         not_node = ctx.insert_new_node_on_output("Not", output_name, name=utils.make_name(node.name))
         ctx.copy_shape(output_name, not_node.output[0])
         ctx.copy_dtype(output_name, not_node.output[0])
+
+@tf_op("SentencepieceOp", domain=constants.CONTRIB_OPS_DOMAIN)
+class SentencepieceOp:
+    @classmethod
+    def version_1(cls, ctx, node, **kwargs):
+        # This op will be removed when its consumer is converted
+        pass
+
+@tf_op("SentencepieceTokenizeOp", domain=constants.CONTRIB_OPS_DOMAIN)
+class SentencepieceTokenizeOp:
+    @classmethod
+    def version_1(cls, ctx, node, **kwargs):
+        node.domain = constants.CONTRIB_OPS_DOMAIN
+        input_node = node.inputs[0]
+        utils.make_sure(input_node.type == "SentencepieceOp", "Input 0 to node %s is not SentencepieceOp", node.name)
+        ctx.remove_input(node, node.input[0], 0)
+
+        nbest_size_cast = ctx.make_node("Cast", [node.input[1]], attr={'to': TensorProto.INT64}).output[0]
+        ctx.replace_input(node, node.input[1], nbest_size_cast, 1)
+        for i in range(1, len(node.input)):
+            unsqueeze = GraphBuilder(ctx).make_unsqueeze({'data': node.input[i], 'axes': [0]})
+            ctx.replace_input(node, node.input[i], unsqueeze, i)
+        node.set_attr("model", input_node.attr['model'].s)
+        node.type = "SentencepieceTokenizer"
+        if ctx.is_safe_to_remove_nodes([input_node]):
+            ctx.remove_node(input_node.name)
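
After this rewrite the node becomes a SentencepieceTokenizer in the contrib ops domain, so the exported model needs a runtime that supplies those custom ops. A minimal sketch of running the converted model with onnxruntime plus the onnxruntime-extensions custom-op library (the model path, sample sentences, and the TF-style tensor names taken from the test config above are illustrative, not part of this commit):

    # Sketch only: assumes onnxruntime and onnxruntime-extensions are installed.
    import numpy as np
    import onnxruntime as ort
    from onnxruntime_extensions import get_library_path

    so = ort.SessionOptions()
    # SentencepieceTokenizer is provided by the onnxruntime-extensions library
    # under the ai.onnx.contrib domain.
    so.register_custom_ops_library(get_library_path())

    sess = ort.InferenceSession("model.onnx", so)
    sentences = np.array(["Hello world", "How are you?"], dtype=object)
    input_name = sess.get_inputs()[0].name  # "inputs:0" in the test config above
    embeddings = sess.run(None, {input_name: sentences})[0]
    print(embeddings.shape)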
