
Commit 83e0b47

[Bug fixes] fix inference dybatch d2s (#6998)
* fix inference dybatch d2s
* remove encoder-output

1 parent: a43138b

File tree

paddlenlp/experimental/transformers/generation_utils.py
paddlenlp/experimental/transformers/llama/modeling.py

2 files changed: +43 -15 lines

paddlenlp/experimental/transformers/generation_utils.py

Lines changed: 23 additions & 14 deletions
@@ -297,11 +297,14 @@ def sample(
         step_idx_ori = paddle.full(shape=[1], dtype="int64", fill_value=1)
         batch_idx = paddle.full(shape=[1], dtype="int32", fill_value=-1)
 
+        # fake temp next_tokens
+        next_tokens = paddle.full(shape=[paddle.shape(input_ids).shape[0], 1], dtype="int32", fill_value=0)
+
         # let inputs_embeds enter into model_kwargs.
         # because the code below directly use the model_kwargs as a parameter without using inputs_embeds.
         model_kwargs["inputs_embeds"] = inputs_embeds
         model_kwargs["all_input_ids"] = input_ids
-        logits_processors = model_kwargs["logits_processors"]
+        logits_processors = model_kwargs.pop("logits_processors")
 
         def _forward_(**args):
             # cache_kvs is never empty because it is passed as a parameter in def sample.
@@ -367,18 +370,25 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
 
             return next_tokens, model_kwargs
 
-        # encoder
-        outputs = _forward_(**model_kwargs)
-        # first decoder
-        next_tokens, model_kwargs = _post_process_(
-            outputs,
-            top_p,
-            temperature,
-            step_idx_ori,
-            model_kwargs,
-        )
-        step_idx_ori += 1
-        encoder_output = outputs
+        if paddle.max(model_kwargs["seq_len_encoder"]) > 0:
+            # encoder
+            outputs = _forward_(**model_kwargs)
+            # first decoder
+            next_tokens, model_kwargs = _post_process_(
+                outputs,
+                top_p,
+                temperature,
+                step_idx_ori,
+                model_kwargs,
+            )
+            step_idx_ori += 1
+        else:
+            outputs = None
+            # first decoder
+            next_tokens = None
+            model_kwargs["next_tokens"] = next_tokens
+            step_idx_ori += 0
+
         # gives it a value, means we will entered into decoder phase.
         model_kwargs["cache"] = 0
 
@@ -402,5 +412,4 @@ def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs):
             paddle.cast(model_kwargs["stop_flags"], "int32"),
             model_kwargs["seq_len_decoder"],
             model_kwargs["tgt_pos"],
-            encoder_output,
         )
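
For context, the generation_utils.py change guards the encoder (prefill) pass so that, under dynamic batching, a step with no encoder work skips _forward_ entirely while still leaving next_tokens and outputs bound on every path, which keeps the traced program consistent for dynamic-to-static (d2s) conversion. Below is a minimal sketch of that guard pattern, assuming model_kwargs carries a seq_len_encoder tensor and treating _forward_ and _post_process_ as opaque callables; the helper name first_step and the placeholder shape are illustrative, not part of the commit.

import paddle

def first_step(model_kwargs, _forward_, _post_process_, top_p, temperature, step_idx_ori):
    # Placeholder so `next_tokens` is defined on both branches (shape is illustrative).
    next_tokens = paddle.full(shape=[1, 1], dtype="int32", fill_value=0)
    if paddle.max(model_kwargs["seq_len_encoder"]) > 0:
        # At least one sequence in the batch still needs its prefill/encoder pass.
        outputs = _forward_(**model_kwargs)
        next_tokens, model_kwargs = _post_process_(
            outputs, top_p, temperature, step_idx_ori, model_kwargs
        )
        step_idx_ori += 1
    else:
        # Pure decode step: skip the encoder pass but keep the same names bound.
        outputs = None
        model_kwargs["next_tokens"] = next_tokens
    return next_tokens, model_kwargs

The second commit message bullet ("remove encoder-output") shows up in the last hunk above: encoder_output is no longer assigned on every path, so it is dropped from the argument list.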

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 20 additions & 1 deletion
@@ -286,7 +286,26 @@ def forward(
             input_ids, position_ids, self.head_dim_shape_tensor, position_offset, True
         )
 
-        with paddle.base.framework._stride_in_no_check_dy2st_diff():
+        if hasattr(paddle.framework, "_no_check_dy2st_diff"):
+            # TODO(daisiming): _no_check_dy2st_diff is used to turn off the checking of behavior
+            # inconsistency between dynamic graph and static graph. _no_check_dy2st_diff should be
+            # removed after static graphs support inplace and stride.
+            with paddle.framework._no_check_dy2st_diff():
+                hidden_states, _ = self.transformer_block(
+                    input_ids,
+                    hidden_states,
+                    cum_offsets=cum_offsets,
+                    padding_offset=padding_offset,
+                    attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype),
+                    caches=cache_kvs,
+                    pre_caches=pre_caches,
+                    pre_caches_length=position_offset,
+                    seq_lens=seq_lens,
+                    rotary_embs=new_rope,
+                    rotary_emb_dims=1,
+                    time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None,
+                )
+        else:
             hidden_states, _ = self.transformer_block(
                 input_ids,
                 hidden_states,
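
The modeling.py change makes the dy2st guard version-tolerant: it feature-checks for paddle.framework._no_check_dy2st_diff with hasattr and only enters the context manager when it exists, instead of depending on paddle.base.framework._stride_in_no_check_dy2st_diff. The commit repeats the transformer_block call in both branches; the sketch below shows the same compatibility idea with a single call site, using contextlib.nullcontext as the no-op fallback. The helper names are hypothetical and the nullcontext variant is an assumption about intent, not what the commit writes.

import contextlib

import paddle

def _dy2st_diff_guard():
    # Prefer the private context manager when this Paddle build provides it;
    # otherwise return a no-op context so the surrounding code runs unchanged.
    if hasattr(paddle.framework, "_no_check_dy2st_diff"):
        return paddle.framework._no_check_dy2st_diff()
    return contextlib.nullcontext()

def call_transformer_block(transformer_block, *args, **kwargs):
    # Single call site instead of repeating the transformer_block invocation per branch.
    with _dy2st_diff_guard():
        return transformer_block(*args, **kwargs)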

0 commit comments
