@@ -596,34 +596,78 @@ All parameter, weight, gradient are variables in Paddle.
  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
+    ExecutionStrategy allows the user to more precisely control how to run
+    the program in ParallelExecutor by setting its properties.
+
+    Examples:
+        .. code-block:: python
+
+          exec_strategy = fluid.ExecutionStrategy()
+          exec_strategy.num_threads = 4
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             exec_strategy=exec_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+
+    )DOC");
+
   exec_strategy.def(py::init())
       .def_property(
           "num_threads",
           [](const ExecutionStrategy &self) { return self.num_threads_; },
           [](ExecutionStrategy &self, size_t num_threads) {
             self.num_threads_ = num_threads;
-          })
+          },
+ R"DOC( The type is INT, num_threads represents the size of thread pool that
625
+ used to run the operators of the current program in ParallelExecutor.
626
+ If :math:`num\_threads=1`, all the operators will execute one by one,
627
+ but the order maybe difference between iterations.
628
+ If it is not set, it will be set in ParallelExecutor according to the
629
+ device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU,
630
+ :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor.
631
+ if it is not set, ParallelExecutor will get the cpu count by calling
632
+ `multiprocessing.cpu_count()`. Default 0.)DOC" )
       .def_property(
           "use_cuda",
           [](const ExecutionStrategy &self) { return self.use_cuda_; },
           [](ExecutionStrategy &self, bool use_cuda) {
             self.use_cuda_ = use_cuda;
-          })
+          })  // FIXME(chengduo): No doc is added for 'use_cuda' because it may
+              // confuse users: ParallelExecutor also has a parameter named
+              // 'use_cuda', and in the current implementation ParallelExecutor's
+              // 'use_cuda' overwrites ExecutionStrategy's 'use_cuda'.
       .def_property(
           "allow_op_delay",
           [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
           [](ExecutionStrategy &self, bool allow_op_delay) {
             self.allow_op_delay_ = allow_op_delay;
-          })
+          },
+ R"DOC( The type is BOOL, allow_op_delay represents whether to delay the
649
+ communication operators to run, it may make the execution faster.
650
+ Note that in some models, allow_op_delay may cause program hang. Default False.)DOC" )
       .def_property(
           "num_iteration_per_drop_scope",
           [](const ExecutionStrategy &self) {
             return self.num_iteration_per_drop_scope_;
           },
           [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
             self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          });
+          },
+ R"DOC( The type is INT, num_iteration_per_drop_scope indicates how
660
+ many iterations to clean up the temp variables which
661
+ is generated during execution. It may make the execution faster,
662
+ because the temp variable's shape maybe the same between two iterations. Default 100.
663
+
664
+ NOTES:
665
+ 1. If you fetch data when calling the 'run', the ParallelExecutor
666
+ will clean up the temp variables at the end of the current iteration.
667
+ 2. In some NLP model, it may cause the GPU memory is insufficient,
668
+ in this case, you should reduce `num_iteration_per_drop_scope`.
669
+ )DOC" );
+
   exec_strategy.def_property(
       "use_experimental_executor",
       [](const ExecutionStrategy &self) {
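The docstrings added in this hunk describe how ExecutionStrategy is meant to be tuned from the Python side. A minimal usage sketch consistent with those docs (it assumes `loss` and `feed_dict` already exist in a built fluid program, as in the docstring example above):

    .. code-block:: python

        import paddle.fluid as fluid

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 4                     # size of the operator thread pool
        exec_strategy.allow_op_delay = False              # do not delay communication ops
        exec_strategy.num_iteration_per_drop_scope = 100  # drop temp variables every 100 iterations

        # `loss` and `feed_dict` are assumed to come from the surrounding program.
        train_exe = fluid.ParallelExecutor(use_cuda=True,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy)
        train_loss, = train_exe.run([loss.name], feed=feed_dict)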
@@ -634,7 +678,22 @@ All parameter, weight, gradient are variables in Paddle.
               : ExecutionStrategy::kDefault;
     });

-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
+    BuildStrategy allows the user to more precisely control how to
+    build the SSA Graph in ParallelExecutor by setting its properties.
+
+    Examples:
+        .. code-block:: python
+
+          build_strategy = fluid.BuildStrategy()
+          build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             build_strategy=build_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+    )DOC");

   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
       .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
@@ -652,31 +711,51 @@ All parameter, weight, gradient are variables in Paddle.
       [](const BuildStrategy &self) { return self.reduce_; },
       [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
         self.reduce_ = strategy;
-      })
+      },
+ R"DOC( The type is STR, there are two reduce strategies in ParallelExecutor,
716
+ 'AllReduce' and 'Reduce'. If you want that all the parameters'
717
+ optimization are done on all devices independently, you should choose 'AllReduce';
718
+ if you choose 'Reduce', all the parameters' optimization will be evenly distributed
719
+ to different devices, and then broadcast the optimized parameter to other devices.
720
+ In some models, `Reduce` is faster. Default 'AllReduce'. )DOC" )
       .def_property(
           "gradient_scale_strategy",
           [](const BuildStrategy &self) { return self.gradient_scale_; },
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
             self.gradient_scale_ = strategy;
-          })
+          },
+ R"DOC( The type is STR, there are three ways of defining :math:`loss@grad` in
729
+ ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default,
730
+ ParallelExecutor sets the :math:`loss@grad` according to the number of devices.
731
+ If you want to customize :math:`loss@grad`, you can choose 'Customized'.
732
+ Default 'CoeffNumDevice'.)DOC" )
       .def_property(
           "debug_graphviz_path",
           [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
           [](BuildStrategy &self, const std::string &path) {
             self.debug_graphviz_path_ = path;
-          })
+          },
+ R"DOC( The type is STR, debug_graphviz_path indicate the path that
740
+ writing the SSA Graph to file in the form of graphviz, you.
741
+ It is useful for debugging. Default "")DOC" )
       .def_property(
           "enable_data_balance",
           [](const BuildStrategy &self) { return self.enable_data_balance_; },
-          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
-      .def_property("fuse_elewise_add_act_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_elewise_add_act_ops_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.fuse_elewise_add_act_ops_ = b;
-                    });
+          [](BuildStrategy &self, bool b) {
+            self.enable_data_balance_ = b;
+          })  // FIXME(chengduo): enable_data_balance does not seem important
+      .def_property(
+          "fuse_elewise_add_act_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_elewise_add_act_ops_;
+          },
+          [](BuildStrategy &self, bool b) {
+            self.fuse_elewise_add_act_ops_ = b;
+          },
+ R"DOC( The type is BOOL, fuse_elewise_add_act_ops indicate whether
757
+ to fuse elementwise_add_op and activation_op,
758
+ it may make the execution faster. Default False)DOC" );

   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &,
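The remaining BuildStrategy properties documented in this hunk would be set from Python in the same way; a brief sketch (the graphviz path is illustrative only, and the GradientScaleStrategy enum members are assumed to match the names listed in the property doc):

    .. code-block:: python

        import paddle.fluid as fluid

        build_strategy = fluid.BuildStrategy()

        # Scale loss@grad by the device count (the documented default).
        build_strategy.gradient_scale_strategy = \
            fluid.BuildStrategy.GradientScaleStrategy.CoeffNumDevice

        # Write the SSA Graph in graphviz format for debugging (illustrative path).
        build_strategy.debug_graphviz_path = "/tmp/parallel_executor_ssa_graph.dot"

        # Fuse elementwise_add and activation ops, which may speed up execution.
        build_strategy.fuse_elewise_add_act_ops = True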