# SPDX-License-Identifier: Apache-2.0


"""QDQ optimizer.
   Pushes QuantizeLinear ops up and DequantizeLinear ops down to maximize
   DQ -> op -> Q patterns for ONNX Runtime (ORT).
   Does not work for per-channel quantization yet.
"""

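# Illustrative sketch of the intended rewrite (assuming matching quantization params):
#   before:  ... -> DequantizeLinear -> Transpose -> QuantizeLinear -> ...
#   after:   ... -> Transpose -> ...   (Transpose now runs directly on the quantized tensor)
# Pushing QuantizeLinear up and DequantizeLinear down through such tensor-shuffling ops
# lets ORT keep the data quantized and fuse the remaining DQ -> op -> Q groups.
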
from .optimizer_base import GraphOptimizerBase

# pylint: disable=logging-not-lazy,unused-argument,missing-docstring


class QDQOptimizer(GraphOptimizerBase):

    def __init__(self):  # pylint: disable=useless-super-delegation
        super(QDQOptimizer, self).__init__()

    def _optimize(self, graph):
        return self._apply_optimization(graph, self._optimize_at_current_graph_level)

    def _optimize_at_current_graph_level(self, graph):
        graph_changed = True
        while graph_changed:
            graph_changed = False
            ops = graph.get_nodes()
            for op in ops:
                if op.type == "QuantizeLinear" and self._optimize_quantize(op, graph):
                    graph_changed = True
                    self.graph_been_opt = True
                elif op.type == "DequantizeLinear" and self._optimize_dequantize(op, graph):
                    graph_changed = True
                    self.graph_been_opt = True
        return graph

    def _optimize_quantize(self, quant_node, graph):
        if 'axis' in quant_node.attr:
            # Per-channel quantization (axis attribute) is not handled yet
            return False
        node = quant_node.inputs[0]
        if node.type == "DequantizeLinear":
            # Remove a redundant DQ -> Q pair: when the quantization params match,
            # consumers of the Q can read the original quantized tensor directly.
            if not self.has_same_quantization_params(quant_node, node):
                return False
            if quant_node.output[0] in graph.outputs or node.output[0] in graph.outputs:
                return False
            graph.replace_all_inputs(quant_node.output[0], node.input[0])
            if not graph.find_output_consumers(quant_node.output[0]):
                graph.remove_node(quant_node.name)
            if not graph.find_output_consumers(node.output[0]):
                graph.remove_node(node.name)
            return True

        # Push quantize nodes up
        tensor_idx = is_tensor_op(graph, node)
        if tensor_idx is None:
            return False
        inp_indices, out_indices = tensor_idx
        for i in out_indices:
            consumers = graph.find_output_consumers(node.output[i])
            if node.output[i] in graph.outputs:
                return False
            for c in consumers:
                if c.type != "QuantizeLinear":
                    return False
                if not self.has_same_quantization_params(c, quant_node):
                    return False
                if c.output[0] in graph.outputs:
                    return False
        # All outputs are quantized. Push quantization up to input.
        for i in inp_indices:
            inp_q = self.make_q_or_dq(graph, "QuantizeLinear", node.input[i], quant_node, node.name)
            graph.replace_input(node, node.input[i], inp_q.output[0], i)

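        # The node's inputs are now quantized, so its outputs already carry quantized
        # data; the downstream QuantizeLinear consumers are redundant and are bypassed
        # by rewiring their consumers to read the node's output directly.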
        for i in out_indices:
            graph.copy_dtype(quant_node.output[0], node.output[i])
            consumers = graph.find_output_consumers(node.output[i])
            for c in consumers:
                graph.replace_all_inputs(c.output[0], node.output[i])

        return True

    def _optimize_dequantize(self, dequant_node, graph):
        if 'axis' in dequant_node.attr:
            # Per-channel quantization (axis attribute) is not handled yet
            return False
        # Push dequantize nodes down
        consumers = graph.find_output_consumers(dequant_node.output[0])
        for node in consumers:
            if self._optimize_dequantize_and_node(dequant_node, node, graph):
                return True
        return False

    def _optimize_dequantize_and_node(self, dequant_node, node, graph):
        tensor_idx = is_tensor_op(graph, node)
        if tensor_idx is None:
            return False
        inp_indices, out_indices = tensor_idx
        for i in inp_indices:
            inp = node.inputs[i]
            if inp.type != "DequantizeLinear":
                return False
            if not self.has_same_quantization_params(inp, dequant_node):
                return False
            if inp.output[0] in graph.outputs:
                return False
        for i in out_indices:
            if node.output[i] in graph.outputs:
                return False
        # All inputs are dequantized. Push dequantization down to output.
        for i in inp_indices:
            # Skip the dequantize on the input
            graph.replace_input(node, node.input[i], node.inputs[i].input[0], i)

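        # The node now consumes the quantized tensors directly, so a DequantizeLinear
        # with the original quantization params is re-created on each of its outputs.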
        for i in out_indices:
            graph.copy_dtype(dequant_node.input[0], node.output[i])
            out_dq = self.make_q_or_dq(graph, "DequantizeLinear", node.output[i], dequant_node, node.name)
            graph.insert_node_on_output(out_dq, node.output[i])

        return True

    def has_same_quantization_params(self, node1, node2):
        if node1.get_attr_value("axis") != node2.get_attr_value("axis"):
            return False
        # Inputs 1 and 2 are the scale and zero-point tensors. Constant merging will
        # ensure these are the same nodes if they are equal, so comparing names suffices.
        return node1.input[1:] == node2.input[1:]

    def make_q_or_dq(self, graph, op_type, inp, reference_node, name_scope):
        """Makes a QuantizeLinear or DequantizeLinear with quantization params copied from the reference_node"""
        axis = reference_node.get_attr_value("axis")
        if axis is None:
            attr = {}
        else:
            attr = {'axis': axis}
        return graph.make_node(op_type, [inp] + reference_node.input[1:], attr=attr, op_name_scope=name_scope)


def is_tensor_op(g, node):
    """Detects ops that reshape/shuffle tensor elements without computing/changing them (Transpose, Gather, etc.)
    Returns None or a tuple (inp_indices, out_indices) such that each listed output of the node depends only on
    elements of the listed inputs, and all other inputs/outputs are passed through unchanged.
    WARNING: the transpose optimizer pushes Transpose down, so be careful when swapping ops to avoid an infinite loop."""
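    # For example, Transpose only rearranges the elements of input 0 into output 0, so it
    # reports ([0], [0]); Concat mixes all of its inputs into output 0.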
    if node.type in ["Identity", "Reshape", "Flatten", "Expand", "Transpose", "Squeeze", "Unsqueeze", "Slice"]:
        return ([0], [0])
    if node.type in ["Gather", "GatherND", "GatherElements"]:
        # Output elements are taken from the data input (0) as long as indices (input 1) are left unchanged
        return ([0], [0])
    if node.type in ["Scatter", "ScatterND", "ScatterElements"]:
        # Output elements are taken from data (input 0) and updates (input 2) as long as indices are left unchanged
        return ([0, 2], [0])
    if node.type == "Concat":
        return (list(range(len(node.input))), [0])
    if node.type == "Split":
        return ([0], list(range(len(node.output))))
    if node.type in ["Compress", "Tile", "ReverseSequence", "DepthToSpace"]:
        return ([0], [0])
    return None