add pure keras tests from keras2onnx (#1574)

TomWildenhain-Microsoft · web-flow · commit 8a61c99fbc39 · 2021-06-21T13:53:44.000-04:00
* add pure keras tests from keras2onnx

Signed-off-by: Tom Wildenhain <tomwi@microsoft.com>

* Disable some failing pure keras tests for old tf versions

Signed-off-by: Tom Wildenhain <tomwi@microsoft.com>

* polish changes

Signed-off-by: Tom Wildenhain <tomwi@microsoft.com>
diff --git a/ci_build/azure_pipelines/keras2onnx_unit_test.yml b/ci_build/azure_pipelines/keras2onnx_unit_test.yml
@@ -8,6 +8,7 @@ jobs:
   strategy:
     matrix:
 
+      ############ TF Keras Unit Tests ############
       Python36-tf1.15:
         python.version: '3.6'
         ONNX_PATH: onnx==1.5.0
@@ -38,6 +39,36 @@ jobs:
         TENSORFLOW_PATH: tensorflow-cpu==2.5.0
         INSTALL_ORT: pip install onnxruntime==1.8.0
 
+      ############ Pure Keras Unit Tests ############
+      # Keras-Py36-tf1.15.0:  # Failing, will enable soon.
+      #   python.version: '3.6'
+      #   ONNX_PATH: onnx==1.5.0
+      #   KERAS: keras==2.2.5
+      #   TENSORFLOW_PATH: tensorflow==1.15.0
+      #   INSTALL_ORT: pip install onnxruntime==1.8.0
+
+      Keras-Py37-tf1.15.0:
+        python.version: '3.7'
+        ONNX_PATH: onnx==1.9.0
+        KERAS: keras==2.4.3
+        TENSORFLOW_PATH: tensorflow==1.15.0
+        INSTALL_ORT: pip install onnxruntime==1.8.0
+
+      # Unit tests for keras 2.3 need tensorflow <= 2.0.0
+      Keras-Py37-tf2.0.0:
+        python.version: '3.7'
+        ONNX_PATH: onnx==1.6.0
+        KERAS: keras==2.3.1
+        TENSORFLOW_PATH: tensorflow==2.0.0
+        INSTALL_ORT: pip install onnxruntime==1.8.0
+
+      Keras-Py38-tf2.2.0:
+        python.version: '3.8'
+        ONNX_PATH: onnx==1.7.0
+        KERAS: keras==2.4.3
+        TENSORFLOW_PATH: tensorflow==2.2.0
+        INSTALL_ORT: pip install onnxruntime==1.8.0
+
   steps:
   - script: sudo install -d -m 0777 /home/vsts/.conda/envs
     displayName: Fix Conda permissions
@@ -55,6 +86,10 @@ jobs:
       pip install h5py==2.9.0
       pip install numpy==1.19
       pip install $(TENSORFLOW_PATH)
+      if [[ ! -z $KERAS ]];
+      then
+        pip install $(KERAS)
+      fi
       pip install git+https://github.com/microsoft/onnxconverter-common
       pip install -r requirements.txt
       pip install -r requirements-dev.txt
@@ -66,6 +101,10 @@ jobs:
       pip install -e .
       python -c "import onnxruntime"
       python -c "import onnxconverter_common"
+      if [[ ! -z $KERAS ]];
+      then
+        export TF_KERAS=0
+      fi
       pytest keras2onnx_tests --doctest-modules --junitxml=junit/test-results.xml
     displayName: 'pytest'
 
diff --git a/keras2onnx_tests/test_cgan.py b/keras2onnx_tests/test_cgan.py
@@ -4,7 +4,7 @@
 import tensorflow as tf
 import mock_keras2onnx
 import numpy as np
-from mock_keras2onnx.proto import keras, is_tf_keras
+from mock_keras2onnx.proto import keras, is_tf_keras, is_tensorflow_older_than
 from tf2onnx.keras2onnx_api import convert_keras
 from distutils.version import StrictVersion
 
@@ -118,6 +118,8 @@ def build_discriminator(self):
 @pytest.mark.skipif(mock_keras2onnx.proto.tfcompat.is_tf2 and is_tf_keras, reason="Tensorflow 1.x only tests.")
 @pytest.mark.skipif(is_tf_keras and StrictVersion(tf.__version__.split('-')[0]) < StrictVersion("1.14.0"),
                     reason="Not supported before tensorflow 1.14.0 for tf_keras")
+@pytest.mark.skipif(mock_keras2onnx.proto.tfcompat.is_tf2 and is_tensorflow_older_than('2.2'),
+                    reason="Variable freezing fails to replace ResourceGather op")
 def test_CGAN(runner):
     keras_model = CGAN().combined
     batch = 5
diff --git a/keras2onnx_tests/test_layers.py b/keras2onnx_tests/test_layers.py
@@ -2,7 +2,7 @@
 
 import pytest
 import numpy as np
-from onnxconverter_common.onnx_ex import get_maximum_opset_supported
+from tf2onnx.keras2onnx_api import get_maximum_opset_supported
 from mock_keras2onnx.proto.tfcompat import is_tf2, tensorflow as tf
 from mock_keras2onnx.proto import (keras, is_tf_keras,
                                    is_tensorflow_older_than, is_tensorflow_later_than,
@@ -1633,6 +1633,8 @@ def test_padding(misc_conv_runner):
     misc_conv_runner(layer, ishape)
 
 
+@pytest.mark.skipif(is_tf2 and is_tensorflow_older_than('2.2'),
+                    reason="Variable freezing fails to replace ResourceGather op")
 def test_embedding(runner):
     model = keras.Sequential()
     model.add(Embedding(1000, 64, input_length=10))
@@ -1853,6 +1855,8 @@ def test_GRU(runner):
         assert runner(onnx_model.graph.name, onnx_model, [data, init_state_onnx], expected)
 
 
+@pytest.mark.skipif(not is_tf_keras and is_tf2 and is_tensorflow_older_than('2.2'),
+                    reason="Fails due to some reason involving bad graph captures. Works in new versions and tf_keras")
 def test_GRU_2(runner):
     model = keras.Sequential(name='TestGRU')
     model.add(keras.layers.GRU(400, reset_after=True, input_shape=(1, 257)))
@@ -2109,6 +2113,8 @@ def test_bidirectional_with_initial_states(runner, rnn_class):
 @pytest.mark.skipif(get_maximum_opset_supported() < 5,
                     reason="None seq_length Bidirectional LSTM is not supported before opset 5.")
 @pytest.mark.parametrize("rnn_class", RNN_CLASSES)
+@pytest.mark.skipif(is_tf2 and is_tensorflow_older_than('2.2'),
+                    reason="Variable freezing fails to replace GatherResource op")
 def test_bidirectional_seqlen_none(runner, rnn_class):
     model = Sequential()
     model.add(Embedding(39, 128))
@@ -2199,6 +2205,8 @@ def test_separable_convolution(runner):
     assert runner('separable_convolution_2', onnx_model, x, expected)
 
 
+@pytest.mark.skipif(is_tf2 and is_tensorflow_older_than('2.2'),
+                    reason="Variable freezing fails to replace GatherResource op")
 def test_shared_embed(runner):
     max_cont_length = 5
     max_ques_length = 7
diff --git a/tf2onnx/convert.py b/tf2onnx/convert.py
@@ -291,19 +291,65 @@ def tensor_names_from_structed(concrete_func, input_names, output_names):
     return tensors_to_rename
 
 
+def _rename_duplicate_keras_model_names(model):
+    """
+    Make model.output_names unique: a rare keras bug can give several outputs the same name, breaking the TF trace.
+    A name already taken by an earlier output gets "_1", "_2", ... appended until it is unused.
+    IMPORTANT: model may be edited. Returns the old output_names (None if unchanged); assign it back to restore.
+    """
+    old_out_names = None
+    if model.output_names and len(set(model.output_names)) != len(model.output_names):
+        # Keep the original list so the caller can restore it; model.output_names is rebound to a
+        # fresh list below rather than mutated in place.
+        old_out_names = model.output_names
+        used_names = set()
+        new_out_names = []
+        for name in model.output_names:
+            new_name = name
+            i = 0
+            while new_name in used_names:  # bump the numeric suffix until the candidate is unused
+                i += 1
+                new_name = name + "_" + str(i)
+            used_names.add(new_name)
+            new_out_names.append(new_name)
+        model.output_names = new_out_names
+    return old_out_names
+
+
+def _is_legacy_keras_model(model):
+    """Return True if model is a legacy (standalone) keras.Model, False for tf.keras.Model or unrecognized types."""
+
+    logger = logging.getLogger(constants.TF2ONNX_PACKAGE_NAME)
+    unknown_type_err = "model is not instance of tf.keras.Model or keras.Model"
+    if isinstance(model, tf.keras.Model):  # checked first, before standalone keras is imported
+        return False
+    try:
+        import keras  # pylint: disable=import-outside-toplevel
+        if isinstance(model, keras.Model):
+            return True
+        logger.warning(unknown_type_err)  # keras imported, but model is neither kind of Model
+    except ImportError:  # standalone keras could not be imported, so the type cannot be confirmed
+        logger.warning(unknown_type_err)
+    return False
+
+
 def _from_keras_tf1(model, input_signature=None, opset=None, custom_ops=None, custom_op_handlers=None,
                     custom_rewriter=None, inputs_as_nchw=None, extra_opset=None, shape_override=None,
                     target=None, large_model=False, output_path=None):
     """from_keras for tf 1.15"""
-
     input_names = [t.name for t in model.inputs]
     output_names = [t.name for t in model.outputs]
+    old_out_names = _rename_duplicate_keras_model_names(model)
     tensors_to_rename = dict(zip(input_names, model.input_names))
-    if len(set(model.output_names)) == len(model.output_names):
-        # In very rare cases, keras has a bug where it will give multiple outputs the same name
-        tensors_to_rename.update(zip(output_names, model.output_names))
+    tensors_to_rename.update(zip(output_names, model.output_names))
+    if old_out_names is not None:
+        model.output_names = old_out_names
 
-    sess = tf.keras.backend.get_session(model.outputs)
+    if _is_legacy_keras_model(model):
+        import keras  # pylint: disable=import-outside-toplevel
+        sess = keras.backend.get_session()
+    else:
+        sess = tf.keras.backend.get_session(model.outputs)
 
     with tf.device("/cpu:0"):
         frozen_graph, initialized_tables = tf_loader.freeze_session(sess, input_names, output_names, get_tables=True)
@@ -351,6 +397,7 @@ def from_keras(model, input_signature=None, opset=None, custom_ops=None, custom_
     Returns:
         An ONNX model_proto and an external_tensor_storage dict.
     """
+    old_out_names = _rename_duplicate_keras_model_names(model)
     if LooseVersion(tf.__version__) < "2.0":
         return _from_keras_tf1(model, input_signature, opset, custom_ops, custom_op_handlers, custom_rewriter,
                                inputs_as_nchw, extra_opset, shape_override, target, large_model, output_path)
@@ -370,9 +417,21 @@ def wrap_call(*args, training=False, **kwargs):
             return model_call(*args, **kwargs)
         model.call = wrap_call
         function = _saving_utils.trace_model_call(model, input_signature)
-        concrete_func = function.get_concrete_function()
-        # Put it back
-        model.call = model_call
+        try:
+            # Legacy keras can make TF erroneously enter eager mode when it should be making symbolic tensors
+            import tensorflow_core  # pylint: disable=import-outside-toplevel
+            old_get_learning_phase = tensorflow_core.python.keras.backend.learning_phase
+            tensorflow_core.python.keras.backend.learning_phase = \
+                tensorflow_core.python.keras.backend.symbolic_learning_phase
+        except ImportError:
+            old_get_learning_phase = None
+        try:
+            concrete_func = function.get_concrete_function()
+        finally:
+            # Put everything back
+            model.call = model_call
+            if old_get_learning_phase is not None:
+                tensorflow_core.python.keras.backend.learning_phase = old_get_learning_phase
 
     # These inputs will be removed during freezing (includes resources, etc.)
     graph_captures = concrete_func.graph._captures  # pylint: disable=protected-access
@@ -392,6 +451,9 @@ def wrap_call(*args, training=False, **kwargs):
         # Other models specify output order using the key order of structured_outputs
         output_names = [reverse_lookup[out] for out in concrete_func.structured_outputs.keys()]
 
+    if old_out_names is not None:
+        model.output_names = old_out_names
+
     with tf.device("/cpu:0"):
         frozen_graph, initialized_tables = \
             tf_loader.from_trackable(model, concrete_func, input_names, output_names, large_model)
diff --git a/tf2onnx/tf_loader.py b/tf2onnx/tf_loader.py
@@ -113,6 +113,19 @@ def inputs_without_resource(sess, input_names):
 def convert_variables_to_constants_large_model(func):
     # For large models we use internal tf methods as a hack
 
+    if tf.__version__.startswith("2.1.") or tf.__version__.startswith("2.0."):
+        from tensorflow.python.framework import convert_to_constants
+        orig_fn = convert_to_constants._construct_concrete_function  # pylint: disable=protected-access
+        def fake_construct_fn(func, output_graph_def, converted_input_indices):
+            # Return graph_def without loading it to avoid crash. Will fix errors in graph_def later.
+            return output_graph_def
+        convert_to_constants._construct_concrete_function = fake_construct_fn  # pylint: disable=protected-access
+        try:
+            frozen_graph_def = convert_to_constants.convert_variables_to_constants_v2(func, lower_control_flow=False)
+        finally:
+            convert_to_constants._construct_concrete_function = orig_fn  # pylint: disable=protected-access
+        return frozen_graph_def
+
     if tf.__version__.startswith("2.2."):
         try:
             from tensorflow.python.framework.convert_to_constants import \
@@ -156,9 +169,9 @@ def make_tensor_proto_wrapped(values, dtype=None, shape=None, verify_shape=False
 def fix_freezing_errors(graph_def):
     assign_var_ops = []
     for i in reversed(range(len(graph_def.node))):
-        if graph_def.node[i].op == "AssignVariableOp":
+        if graph_def.node[i].op in ["AssignVariableOp", "AssignSubVariableOp"]:
             assign_var_ops.append(graph_def.node.pop(i).name)
-            logger.warning("Removed AssignVariableOp %s", assign_var_ops[-1])
+            logger.warning("Removed %s %s", graph_def.node[i].op, assign_var_ops[-1])
     names_to_remove = set(assign_var_ops)
     for n in graph_def.node:
         for i in reversed(range(len(n.input))):
@@ -218,9 +231,9 @@ def from_function(func, input_names, output_names, large_model=False):
             frozen_func = convert_variables_to_constants_v2(func, lower_control_flow=False, aggressive_inlining=True)
     except ValueError as e:
         if "incompatible with expected resource" in str(e):
-            frozen_func = convert_variables_to_constants_large_model(func)
+            bad_graph_def = convert_variables_to_constants_large_model(func)
             logger.warning("TF freezing failed. Attempting to fix freezing errors.")
-            graph_def = fix_freezing_errors(frozen_func)
+            graph_def = fix_freezing_errors(bad_graph_def)
         else:
             raise e
     else: