@@ -667,16 +667,17 @@ All parameter, weight, gradient are variables in Paddle.
    ExecutionStrategy allows the user to more precisely control how to run
    the program in ParallelExecutor by setting the property.

-    The available properties include:
-    use_cuda (bool): Whether to use CUDA or not. Default True.
-    num_threads (int): The number of threads that used to run the
-        operators in ParallelExecutor. If it is not set, it will be
-        set in ParallelExecutor according to the device count.
-        Default 0.
-    allow_op_delay (bool): Whether to delay the communication operators
-        to run. Default False.
-    num_iteration_per_drop_scope (int): how many iterations between
-        the two dropping local scopes. Default 100.
+    Examples:
+        .. code-block:: python
+
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.num_threads = 4
+
+            train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                               loss_name=loss.name,
+                                               exec_strategy=exec_strategy)
+
+            train_loss, = train_exe.run([loss.name], feed=feed_dict)

    )DOC");

@@ -686,27 +687,54 @@ All parameter, weight, gradient are variables in Paddle.
          [](const ExecutionStrategy &self) { return self.num_threads_; },
          [](ExecutionStrategy &self, size_t num_threads) {
            self.num_threads_ = num_threads;
-          })
+          },
+          R"DOC(The type is INT, num_threads represents the size of the thread pool that is
+            used to run the operators of the current program in ParallelExecutor.
+            If :math:`num\_threads=1`, all the operators will execute one by one,
+            but the order may differ between iterations.
+            If it is not set, it will be set in ParallelExecutor according to the
+            device type and device count: for GPU, :math:`num\_threads=device\_count*4`; for CPU,
+            :math:`num\_threads=CPU\_NUM*4`. The explanation of :math:`CPU\_NUM` is in ParallelExecutor;
+            if it is not set, ParallelExecutor will get the CPU count by calling
+            `multiprocessing.cpu_count()`. Default 0.)DOC")
      .def_property(
          "use_cuda",
          [](const ExecutionStrategy &self) { return self.use_cuda_; },
          [](ExecutionStrategy &self, bool use_cuda) {
            self.use_cuda_ = use_cuda;
-          })
+          })  // FIXME(chengduo): No doc is added for 'use_cuda' because it may
+              // confuse users: ParallelExecutor also has a parameter named
+              // 'use_cuda', and in the current implementation ParallelExecutor's
+              // 'use_cuda' will overwrite ExecutionStrategy's 'use_cuda'.
      .def_property(
          "allow_op_delay",
          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
          [](ExecutionStrategy &self, bool allow_op_delay) {
            self.allow_op_delay_ = allow_op_delay;
-          })
+          },
+          R"DOC(The type is BOOL, allow_op_delay represents whether to delay the
+            communication operators to run, which may make the execution faster.
+            Note that in some models, allow_op_delay may cause the program to hang. Default False.)DOC")
      .def_property(
          "num_iteration_per_drop_scope",
          [](const ExecutionStrategy &self) {
            return self.num_iteration_per_drop_scope_;
          },
          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          });
+          },
+          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
+            many iterations pass between two clean-ups of the temporary variables
+            that are generated during execution. It may make the execution faster,
+            because the temporary variables' shapes may be the same between two iterations. Default 100.
+
+            NOTES:
+                1. If you fetch data when calling 'run', the ParallelExecutor
+                   will clean up the temporary variables at the end of the current iteration.
+                2. In some NLP models, it may cause GPU memory to be insufficient;
+                   in this case, you should reduce `num_iteration_per_drop_scope`.
+            )DOC");
+
  exec_strategy.def_property(
      "use_experimental_executor",
      [](const ExecutionStrategy &self) {
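Not part of the diff: a hedged usage sketch pulling together the three ExecutionStrategy properties documented in the hunk above. The num_threads value mirrors the documented CPU default (CPU_NUM * 4, with multiprocessing.cpu_count() as the fallback); loss and feed_dict are placeholders, as in the docstring examples, and the chosen values are illustrative rather than recommended.

    .. code-block:: python

        import multiprocessing
        import paddle.fluid as fluid

        exec_strategy = fluid.ExecutionStrategy()
        # Mirror the documented CPU default: num_threads = CPU_NUM * 4,
        # where CPU_NUM falls back to multiprocessing.cpu_count().
        exec_strategy.num_threads = multiprocessing.cpu_count() * 4
        # Delaying communication operators can speed execution up,
        # but the docstring warns it may hang some models.
        exec_strategy.allow_op_delay = False
        # Clean up temporary variables more often than the default 100 iterations
        # if GPU memory runs short (e.g. in NLP models).
        exec_strategy.num_iteration_per_drop_scope = 10

        train_exe = fluid.ParallelExecutor(use_cuda=True,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy)
        train_loss, = train_exe.run([loss.name], feed=feed_dict)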
@@ -721,20 +749,17 @@ All parameter, weight, gradient are variables in Paddle.
    BuildStrategy allows the user to more precisely control how to
    build the SSA Graph in ParallelExecutor by setting the property.

-    The available properties include:
-    reduce_strategy (str): There are two reduce strategies, 'AllReduce'
-        and 'Reduce'. If you want that all parameters will be optimized
-        on all devices, you can choose 'AllReduce'; if you choose
-        'Reduce', all parameters will be evenly allocated to different
-        devices for optimization, and then broadcast the optimized
-        parameter to other devices. Default 'AllReduce'.
-    gradient_scale_strategy (str): There are two ways of defining loss@grad,
-        'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor
-        sets the loss@grad according to the number of devices. If you want
-        to customize loss@grad, you can choose 'Customized'.
-        Default 'CoeffNumDevice'.
-    debug_graphviz_path (str): Whether to write the SSA Graph to file in the
-        form of graphviz. It is useful for debugging. Default "".
+    Examples:
+        .. code-block:: python
+
+            build_strategy = fluid.BuildStrategy()
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+            train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                               loss_name=loss.name,
+                                               build_strategy=build_strategy)
+
+            train_loss, = train_exe.run([loss.name], feed=feed_dict)
    )DOC");

  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
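Not part of the diff: a hedged sketch that combines the two strategy objects, since the docstring examples above pass exec_strategy and build_strategy to ParallelExecutor separately. It assumes the two keyword arguments can be supplied together; loss and feed_dict remain placeholders.

    .. code-block:: python

        import paddle.fluid as fluid

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 4

        build_strategy = fluid.BuildStrategy()
        # 'Reduce' spreads parameter optimization across devices and broadcasts the result.
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

        # Assumption: ParallelExecutor accepts both strategies at once.
        train_exe = fluid.ParallelExecutor(use_cuda=True,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           build_strategy=build_strategy)
        train_loss, = train_exe.run([loss.name], feed=feed_dict)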
@@ -753,31 +778,51 @@ All parameter, weight, gradient are variables in Paddle.
          [](const BuildStrategy &self) { return self.reduce_; },
          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
            self.reduce_ = strategy;
-          })
+          },
+          R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor,
+            'AllReduce' and 'Reduce'. If you want all the parameters'
+            optimization to be done on all devices independently, you should choose 'AllReduce';
+            if you choose 'Reduce', the parameters' optimization will be evenly distributed
+            to different devices, and the optimized parameters will then be broadcast to the other devices.
+            In some models, `Reduce` is faster. Default 'AllReduce'.)DOC")
      .def_property(
          "gradient_scale_strategy",
          [](const BuildStrategy &self) { return self.gradient_scale_; },
          [](BuildStrategy &self,
             BuildStrategy::GradientScaleStrategy strategy) {
            self.gradient_scale_ = strategy;
-          })
+          },
+          R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in
+            ParallelExecutor: 'CoeffNumDevice', 'One' and 'Customized'. By default,
+            ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
+            If you want to customize :math:`loss@grad`, you can choose 'Customized'.
+            Default 'CoeffNumDevice'.)DOC")
      .def_property(
          "debug_graphviz_path",
          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
          [](BuildStrategy &self, const std::string &path) {
            self.debug_graphviz_path_ = path;
-          })
+          },
+          R"DOC(The type is STR, debug_graphviz_path indicates the path to which
+            the SSA Graph will be written, in graphviz format.
+            It is useful for debugging. Default "")DOC")
      .def_property(
          "enable_data_balance",
          [](const BuildStrategy &self) { return self.enable_data_balance_; },
-          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
-      .def_property("fuse_elewise_add_act_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_elewise_add_act_ops_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.fuse_elewise_add_act_ops_ = b;
-                    })
+          [](BuildStrategy &self, bool b) {
+            self.enable_data_balance_ = b;
+          })  // FIXME(chengduo): enable_data_balance seems not important
+      .def_property(
+          "fuse_elewise_add_act_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_elewise_add_act_ops_;
+          },
+          [](BuildStrategy &self, bool b) {
+            self.fuse_elewise_add_act_ops_ = b;
+          },
+          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicates whether
+            to fuse elementwise_add_op and activation_op,
+            which may make the execution faster. Default False)DOC")
      .def("_create_passes_from_strategy",
           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
             return self.CreatePassesFromStrategy();
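Not part of the diff: a short sketch of the BuildStrategy properties documented above that no docstring example exercises, fuse_elewise_add_act_ops and debug_graphviz_path. The property names and types come from the docstrings in this hunk; the output path is an illustrative placeholder, and loss and feed_dict are placeholders as before.

    .. code-block:: python

        import paddle.fluid as fluid

        build_strategy = fluid.BuildStrategy()
        # Fuse elementwise_add_op with the following activation_op; may speed up execution.
        build_strategy.fuse_elewise_add_act_ops = True
        # Write the SSA Graph in graphviz format for debugging; "" (the default) disables it.
        build_strategy.debug_graphviz_path = "./ssa_graph_debug"  # illustrative path

        train_exe = fluid.ParallelExecutor(use_cuda=True,
                                           loss_name=loss.name,
                                           build_strategy=build_strategy)
        train_loss, = train_exe.run([loss.name], feed=feed_dict)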