CodeLinaro
diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py
Lines changed: 99 additions & 41 deletions
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
Lines changed: 2 additions & 2 deletions
@@ -1447,7 +1447,7 @@ def add_output_qk_to_mha(model: OnnxModel, dtype: int = 0, skip_node_idxs: list[
     return model
 
 
-def fix_past_sequence_length(model: ModelProto):
+def fix_past_sequence_length(model: OnnxModel):
     # Modify total_sequence_length = past_sequence_length + curr_sequence_length subgraph to calculate
     # past_sequence_length from the new `past_sequence_length` input of size 1D and type int32 instead of
     # from `past_key_self_0` since DecoderMaskedMultiHeadAttention (DMMHA) uses buffer sharing and
@@ -1480,56 +1480,119 @@ def fix_past_sequence_length(model: ModelProto):
     #                |
     #               Add
 
+    # Tensor names for the new `past_sequence_length` model input and the Squeeze/Cast values derived from it
+    past_seq_len_name = "past_sequence_length"
+    past_seq_len_int32 = "past_seq_len_int32"
+    past_seq_len_int64 = "past_seq_len_int64"
+
     node = list(filter(lambda n: n.op_type == "LayerNormalization", model.model.graph.node))[0]  # noqa: RUF015
 
-    base_path = model.match_parent_path(
+    base_path_hf = model.match_parent_path(
+        node,
+        ["Add", "Gather", "Tile", "Expand", "Unsqueeze", "Range"],
+        [0, 1, 1, 0, 0, 0],
+    )
+    base_path_oai = model.match_parent_path(
         node,
         ["Add", "Slice"],
         [0, 1],
     )
-    if base_path is None:
+    if base_path_hf is not None:
+        base_path = base_path_hf
+    elif base_path_oai is not None:
+        base_path = base_path_oai
+    else:
+        logger.info("Cannot identify base path for fixing past_sequence_length subgraph")
         return
+    base_node = base_path[-1]
 
-    left_path = model.match_parent_path(
-        base_path[-1],
-        ["Unsqueeze", "Add", "Gather", "Shape"],
-        [2, 0, 0, 0],
-    )
-    right_path = model.match_parent_path(
-        base_path[-1],
-        ["Unsqueeze", "Gather", "Shape"],
-        [1, 0, 0],
-    )
-    long_right_path = model.match_parent_path(
-        base_path[-1],
-        ["Unsqueeze", "Gather", "Shape", "Reshape", "Transpose"],
-        [1, 0, 0, 0, 0],
-    )
-    if left_path is None or right_path is None or left_path[-2:] != right_path[-2:]:
-        return
+    if base_node.op_type == "Range":
+        # Hugging Face implementation
+        range_node = base_path[-1]
+
+        gather_path = model.match_parent_path(
+            range_node,
+            ["Gather", "Shape"],
+            [0, 0],
+        )
+        if gather_path is None:
+            logger.info("Cannot identify gather path for fixing past_sequence_length subgraph")
+            return
+
+        add_path = model.match_parent_path(
+            range_node,
+            ["Add", "Gather", "Shape"],
+            [1, 0, 0],
+        )
+        if add_path is None:
+            logger.info("Cannot identify add path for fixing past_sequence_length subgraph")
+            return
+        add_node = add_path[0]
+
+        if gather_path != add_path[1:]:
+            logger.info("Gather path and add path do not share the same nodes for calculating the past_sequence_length")
+            return
+
+        # Remove `past_key_self_0 --> Shape --> Gather` connection
+        constant_in_gather = list(filter(lambda n: n.output[0] == gather_path[0].input[1], model.model.graph.node))[0]  # noqa: RUF015
+        model.model.graph.node.remove(constant_in_gather)
+        model.model.graph.node.remove(gather_path[0])
+        model.model.graph.node.remove(gather_path[1])
+
+        # Add `past_seq_len_int64` as an input name to existing nodes
+        range_node.input[0] = past_seq_len_int64
+        add_node.input[0] = past_seq_len_int64
 
-    # Remove `past_key_self_0 --> [Transpose --> Reshape] --> Shape --> Gather` connection
-    # where `Transpose --> Reshape` part may or may not exist. The OpenAI implementation of
-    # Whisper has an extra `Transpose --> Reshape` connection to remove.
-    constant_node = list(filter(lambda n: n.output[0] == left_path[-2].input[1], model.model.graph.node))[0]  # noqa: RUF015
-    model.model.graph.node.remove(left_path[-2])
-    model.model.graph.node.remove(left_path[-1])
-    model.model.graph.node.remove(constant_node)
-    if long_right_path is not None:
-        # Remove `Transpose --> Reshape` part
-        model.model.graph.node.remove(long_right_path[-2])
-        model.model.graph.node.remove(long_right_path[-1])
+    else:
+        # OpenAI implementation
+        input_ids_path = model.match_parent_path(
+            base_node,
+            ["Unsqueeze", "Add", "Gather", "Shape", "Reshape", "Transpose"],
+            [2, 0, 0, 0, 0, 0],
+        )
+        if input_ids_path is None:
+            logger.info("Cannot identify input_ids path for fixing past_sequence_length subgraph")
+            return
+        add_node = input_ids_path[1]
+
+        past_key_path = model.match_parent_path(
+            base_node,
+            ["Unsqueeze", "Gather", "Shape", "Reshape", "Transpose"],
+            [1, 0, 0, 0, 0],
+        )
+        if past_key_path is None:
+            logger.info("Cannot identify past_key path for fixing past_sequence_length subgraph")
+            return
+        unsqueeze_node = past_key_path[0]
+
+        if input_ids_path[2:] != past_key_path[1:]:
+            logger.info(
+                "The input_ids path and past_key path do not share the same nodes for calculating the past_sequence_length"
+            )
+            return
+
+        # Remove `past_key_self_0 --> Transpose --> Reshape --> Shape --> Gather` connection
+        constant_in_gather = list(filter(lambda n: n.output[0] == past_key_path[1].input[1], model.model.graph.node))[0]  # noqa: RUF015
+        model.model.graph.node.remove(constant_in_gather)
+        constant_in_reshape = list(filter(lambda n: n.output[0] == past_key_path[-2].input[1], model.model.graph.node))[  # noqa: RUF015
+            0
+        ]
+        model.model.graph.node.remove(constant_in_reshape)
+        model.model.graph.node.remove(past_key_path[1])
+        model.model.graph.node.remove(past_key_path[2])
+        model.model.graph.node.remove(past_key_path[3])
+        model.model.graph.node.remove(past_key_path[4])
+
+        # Add `past_seq_len_int64` as an input name to existing nodes
+        unsqueeze_node.input[0] = past_seq_len_int64
+        add_node.input[0] = past_seq_len_int64
 
     # Add `past_sequence_length` as model input
-    past_seq_len_name = "past_sequence_length"
     model.model.graph.input.append(
         onnx.helper.make_tensor_value_info(past_seq_len_name, TensorProto.INT32, shape=[1]),
     )
 
     # Add `past_sequence_length --> Squeeze --> Cast` connection
-    past_seq_len_int32 = "past_seq_len_int32"
-    past_seq_len_int64 = "past_seq_len_int64"
-
     squeeze_node = onnx.helper.make_node(
         "Squeeze",
         inputs=[past_seq_len_name],
@@ -1546,14 +1609,9 @@ def fix_past_sequence_length(model: ModelProto):
     )
     cast_output = onnx.helper.make_tensor_value_info(past_seq_len_int64, TensorProto.INT64, shape=[])
 
-    model.model.graph.value_info.extend([squeeze_output, cast_output])
-
-    # Add `past_seq_len_int64` as an input name to existing nodes
-    left_path[1].input[0] = past_seq_len_int64
-    right_path[0].input[0] = past_seq_len_int64
-
     # Add new nodes to graph
     model.model.graph.node.extend([squeeze_node, cast_node])
+    model.model.graph.value_info.extend([squeeze_output, cast_output])
     model.topological_sort()
     return model, past_seq_len_name
 
 
@@ -663,12 +663,12 @@ def create_attention_node(
         first_input: str,
         output: str,
         add_qk_str: str = "",
+        causal: bool = False,
         past_k: str = "",
         past_v: str = "",
         present_k: str = "",
         present_v: str = "",
         scale: float | None = None,
-        causal: bool = False,
     ) -> NodeProto | None:
         """Create an Attention node.
 
@@ -685,12 +685,12 @@ def create_attention_node(
             first_input (str): first input name
             output (str): output name
             add_qk_str (str): name of Add node after Q x K'
+            causal (bool): whether to use a uni-directional (causal) mask.
             past_k (str): name of input for past K value
             past_v (str): name of input for past V value
             present_k (str): name of output to store present K value
             present_v (str): name of output to store present V value
             scale: scale before softmax
-            causal: whether it is uni-directional mask.
 
         Returns:
             Union[NodeProto, None]: the node created or None if failed.