[BugFix] Wrap sequence parallel utils imports in try/except (#8189) (#8274)

DesmonDay · web-flow · commit 6c1f4493654d · 2024-04-15T16:45:23.000+08:00
* Wrap sequence parallel (sp) imports in try/except

* Fix sequence parallel (sp) import
diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/model_zoo/gpt-3/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
@@ -48,13 +48,16 @@
     MinLengthLogitsProcessor,
     RepetitionPenaltyLogitsProcessor,
 )
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 
 from paddlenlp.transformers.segment_parallel_utils  import ReshardLayer
 
diff --git a/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py b/model_zoo/gpt-3/ppfleetx/models/language_model/language_module.py
@@ -24,9 +24,12 @@
 from ppfleetx.core.module.basic_module import BasicModule
 from ppfleetx.data.tokenizers import GPTTokenizer
 from ppfleetx.distributed.apis import env
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    register_sequence_parallel_allreduce_hooks,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        register_sequence_parallel_allreduce_hooks,
+    )
+except:
+    pass
 from ppfleetx.utils.log import logger
 
 # TODO(haohongxiang): to solve the problem of cross-reference
diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
@@ -29,16 +29,20 @@
 from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from .image_processing_utils import ImageProcessingMixin
 from .attention_utils import create_bigbird_rand_mask_idx_list
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    GatherOp,
-    ScatterOp,
-    AllGatherOp,
-    ReduceScatterOp,
-    ColumnSequenceParallelLinear,
-    RowSequenceParallelLinear,
-    mark_as_sequence_parallel_parameter,
-    register_sequence_parallel_allreduce_hooks,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        GatherOp,
+        ScatterOp,
+        AllGatherOp,
+        ReduceScatterOp,
+        ColumnSequenceParallelLinear,
+        RowSequenceParallelLinear,
+        mark_as_sequence_parallel_parameter,
+        register_sequence_parallel_allreduce_hooks,
+    )
+except:
+    pass
 from .export import export_model
 
 # isort: split
diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py
@@ -29,13 +29,17 @@
 from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from paddle.utils import try_import
 
diff --git a/paddlenlp/transformers/gpt/modeling_auto.py b/paddlenlp/transformers/gpt/modeling_auto.py
@@ -30,10 +30,14 @@
 from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 
 from ...utils.converter import StateDictNameMapping
 from .. import PretrainedModel, register_base_model
diff --git a/paddlenlp/transformers/gpt/modeling_pp.py b/paddlenlp/transformers/gpt/modeling_pp.py
@@ -19,9 +19,13 @@
     SharedLayerDesc,
 )
 from paddle.distributed.fleet.utils import recompute
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    mark_as_sequence_parallel_parameter,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 
 from paddlenlp.transformers.model_utils import PipelinePretrainedModel
 
diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py
@@ -45,13 +45,16 @@ def swiglu(x, y=None):
         return F.silu(x) * y
 
 
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 from paddle.utils import try_import
 
 from paddlenlp.transformers.conversion_utils import (
diff --git a/paddlenlp/transformers/mc2_seqence_parallel_linear.py b/paddlenlp/transformers/mc2_seqence_parallel_linear.py
@@ -23,10 +23,14 @@
 
 from paddle import distributed as dist
 from paddle.autograd import PyLayer
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    RowSequenceParallelLinear,
-)
+
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        RowSequenceParallelLinear,
+    )
+except:
+    pass
 
 __all_gather_recomputation__ = False
 if int(os.getenv("MC2_Recompute", 0)):
diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py
@@ -33,13 +33,16 @@
 except ImportError:
     fused_rotary_position_embedding = None
 
-from paddle.distributed.fleet.utils.sequence_parallel_utils import (
-    ColumnSequenceParallelLinear,
-    GatherOp,
-    RowSequenceParallelLinear,
-    ScatterOp,
-    mark_as_sequence_parallel_parameter,
-)
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        ColumnSequenceParallelLinear,
+        GatherOp,
+        RowSequenceParallelLinear,
+        ScatterOp,
+        mark_as_sequence_parallel_parameter,
+    )
+except:
+    pass
 
 from paddlenlp.transformers.conversion_utils import (
     StateDictNameMapping,