@@ -156,7 +156,50 @@ PYBIND11_PLUGIN(core) {
      .def("_get_double_element", TensorGetElement<double>)
      .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); });

-  py::class_<LoDTensor, Tensor>(m, "LoDTensor")
+  py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
+    LoDTensor is a Tensor with optional LoD information.
+
+    np.array(lod_tensor) can convert a LoDTensor to a numpy array.
+    lod_tensor.lod() can retrieve the LoD information.
+
+    LoD is short for Level of Details and is usually used for varied sequence
+    lengths. You can skip the following comment if you don't need optional LoD.
+
+    For example:
+      A LoDTensor X can look like the example below. It contains 2 sequences.
+      The first has length 2 and the second has length 3, as described by x.lod.
+
+      The first tensor dimension 5=2+3 is calculated from the LoD if it is
+      available. It is the total number of sequence elements. In X, each
+      element has 2 columns, hence [5, 2].
+
+      x.lod   = [[2, 3]]
+      x.data  = [[1, 2], [3, 4],            // seq 1
+                 [5, 6], [7, 8], [9, 10]]   // seq 2
+      x.shape = [5, 2]
+
+    LoD can have multiple levels (for example, a paragraph can have multiple
+    sentences and a sentence can have multiple words). In the following
+    LoDTensor Y, the lod_level is 2, which means there are 2 sequences: the
+    first sequence has length 2 (it has 2 sub-sequences) and the second has
+    length 1. The first sequence's 2 sub-sequences have lengths 2 and 2,
+    respectively, and the second sequence's single sub-sequence has length 3.
+
+      y.lod   = [[2, 1], [2, 2, 3]]
+      y.shape = [2+2+3, ...]
+
+    Note:
+      In the description above, LoD is length-based. In Paddle's internal
+      implementation, lod is offset-based. Hence, internally, y.lod is
+      represented as [[0, 2, 3], [0, 2, 4, 7]] (the length-based equivalent
+      would be [[2-0, 3-2], [2-0, 4-2, 7-4]]).
+
+      Sometimes LoD is called recursive_sequence_length to be more
+      self-explanatory. In that case, it must be length-based. For historical
+      reasons, when LoD is called lod in public APIs, it might be offset-based.
+      Users should be careful about this.
+    )DOC")
      .def_buffer(
          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
      .def("__init__",
@@ -596,34 +639,78 @@ All parameter, weight, gradient are variables in Paddle.

  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
+    ExecutionStrategy allows the user to control more precisely how the
+    program is run in ParallelExecutor by setting its properties.
+
+    Examples:
+        .. code-block:: python
+
+          exec_strategy = fluid.ExecutionStrategy()
+          exec_strategy.num_threads = 4
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             exec_strategy=exec_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+
+    )DOC");
+
  exec_strategy.def(py::init())
      .def_property(
          "num_threads",
          [](const ExecutionStrategy &self) { return self.num_threads_; },
          [](ExecutionStrategy &self, size_t num_threads) {
            self.num_threads_ = num_threads;
-          })
+          },
+          R"DOC(The type is INT. num_threads represents the size of the
+            thread pool used to run the operators of the current program in
+            ParallelExecutor. If :math:`num\_threads=1`, all the operators will
+            execute one by one, but the order may differ between iterations.
+            If it is not set, it will be set in ParallelExecutor according to
+            the device type and device count: for GPU,
+            :math:`num\_threads=device\_count*4`; for CPU,
+            :math:`num\_threads=CPU\_NUM*4`. The explanation of :math:`CPU\_NUM`
+            is in ParallelExecutor; if it is not set, ParallelExecutor will get
+            the CPU count by calling `multiprocessing.cpu_count()`. Default 0.)DOC")
      .def_property(
          "use_cuda",
          [](const ExecutionStrategy &self) { return self.use_cuda_; },
          [](ExecutionStrategy &self, bool use_cuda) {
            self.use_cuda_ = use_cuda;
-          })
+          })  // FIXME(chengduo): Don't add a doc for 'use_cuda'; it may
+              // confuse users, because ParallelExecutor also has a parameter
+              // named 'use_cuda'. In the current implementation,
+              // ParallelExecutor's 'use_cuda' overwrites ExecutionStrategy's
+              // 'use_cuda'.
      .def_property(
          "allow_op_delay",
          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
          [](ExecutionStrategy &self, bool allow_op_delay) {
            self.allow_op_delay_ = allow_op_delay;
-          })
+          },
+          R"DOC(The type is BOOL. allow_op_delay represents whether to delay
+            running the communication operators, which may make the execution
+            faster. Note that in some models allow_op_delay may cause the
+            program to hang. Default False.)DOC")
      .def_property(
          "num_iteration_per_drop_scope",
          [](const ExecutionStrategy &self) {
            return self.num_iteration_per_drop_scope_;
          },
          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
-          });
+          },
+          R"DOC(The type is INT. num_iteration_per_drop_scope indicates how
+            many iterations pass between clean-ups of the temporary variables
+            generated during execution. It may make the execution faster,
+            because a temporary variable's shape may stay the same across
+            iterations. Default 100.
+
+            NOTES:
+              1. If you fetch data when calling 'run', ParallelExecutor
+                 will clean up the temporary variables at the end of the
+                 current iteration.
+              2. In some NLP models it may cause GPU memory to run out;
+                 in that case, you should reduce num_iteration_per_drop_scope.
+            )DOC");
+
  exec_strategy.def_property(
      "use_experimental_executor",
      [](const ExecutionStrategy &self) {
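
Taken together, the ExecutionStrategy properties documented above can be exercised from Python roughly as follows. This is a hedged sketch extending the docstring's own example; `loss` and `feed_dict` are placeholders assumed to come from the surrounding training program.

    import paddle.fluid as fluid

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 4                    # operator thread pool size
    exec_strategy.allow_op_delay = False             # don't delay communication ops
    exec_strategy.num_iteration_per_drop_scope = 10  # clean temp variables less often

    train_exe = fluid.ParallelExecutor(use_cuda=True,
                                       loss_name=loss.name,  # placeholder
                                       exec_strategy=exec_strategy)
    train_loss, = train_exe.run([loss.name], feed=feed_dict)  # placeholder feed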
@@ -634,7 +721,22 @@ All parameter, weight, gradient are variables in Paddle.
                       : ExecutionStrategy::kDefault;
      });

-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
+    BuildStrategy allows the user to control more precisely how the
+    SSA Graph is built in ParallelExecutor by setting its properties.
+
+    Examples:
+        .. code-block:: python
+
+          build_strategy = fluid.BuildStrategy()
+          build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True,
+                                             loss_name=loss.name,
+                                             build_strategy=build_strategy)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+    )DOC");

  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
@@ -652,31 +754,51 @@ All parameter, weight, gradient are variables in Paddle.
          [](const BuildStrategy &self) { return self.reduce_; },
          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
            self.reduce_ = strategy;
-          })
+          },
+          R"DOC(The type is STR. There are two reduce strategies in
+            ParallelExecutor: 'AllReduce' and 'Reduce'. If you want all
+            parameters to be optimized on each device independently, choose
+            'AllReduce'; if you choose 'Reduce', the optimization of the
+            parameters is evenly distributed across the devices, and each
+            optimized parameter is then broadcast to the other devices.
+            In some models, 'Reduce' is faster. Default 'AllReduce'.)DOC")
      .def_property(
          "gradient_scale_strategy",
          [](const BuildStrategy &self) { return self.gradient_scale_; },
          [](BuildStrategy &self,
             BuildStrategy::GradientScaleStrategy strategy) {
            self.gradient_scale_ = strategy;
-          })
+          },
+          R"DOC(The type is STR. There are three ways of defining
+            :math:`loss@grad` in ParallelExecutor: 'CoeffNumDevice', 'One'
+            and 'Customized'. By default, ParallelExecutor sets
+            :math:`loss@grad` according to the number of devices. If you want
+            to customize :math:`loss@grad`, choose 'Customized'.
+            Default 'CoeffNumDevice'.)DOC")
      .def_property(
          "debug_graphviz_path",
          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
          [](BuildStrategy &self, const std::string &path) {
            self.debug_graphviz_path_ = path;
-          })
+          },
+          R"DOC(The type is STR. debug_graphviz_path indicates the path where
+            the SSA Graph is written to file in graphviz format, which is
+            useful for debugging. Default "".)DOC")
      .def_property(
          "enable_data_balance",
          [](const BuildStrategy &self) { return self.enable_data_balance_; },
-          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
-      .def_property("fuse_elewise_add_act_ops",
-                    [](const BuildStrategy &self) {
-                      return self.fuse_elewise_add_act_ops_;
-                    },
-                    [](BuildStrategy &self, bool b) {
-                      self.fuse_elewise_add_act_ops_ = b;
-                    });
+          [](BuildStrategy &self, bool b) {
+            self.enable_data_balance_ = b;
+          })  // FIXME(chengduo): enable_data_balance seems not important
+      .def_property(
+          "fuse_elewise_add_act_ops",
+          [](const BuildStrategy &self) {
+            return self.fuse_elewise_add_act_ops_;
+          },
+          [](BuildStrategy &self, bool b) {
+            self.fuse_elewise_add_act_ops_ = b;
+          },
+          R"DOC(The type is BOOL. fuse_elewise_add_act_ops indicates whether
+            to fuse elementwise_add ops and activation ops; it may make the
+            execution faster. Default False.)DOC");

  pe.def(py::init<const std::vector<platform::Place> &,
                  const std::unordered_set<std::string> &,
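
The BuildStrategy properties documented in this hunk combine as below. This is a hedged sketch only: the enum attribute names follow the bindings above, and `loss`/`feed_dict` are placeholders from the surrounding training program.

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.CoeffNumDevice  # the default
    build_strategy.debug_graphviz_path = "./ssa_graph"  # dump SSA Graph for debugging
    build_strategy.fuse_elewise_add_act_ops = True      # fuse elementwise_add + activation

    train_exe = fluid.ParallelExecutor(use_cuda=True,
                                       loss_name=loss.name,  # placeholder
                                       build_strategy=build_strategy)
    train_loss, = train_exe.run([loss.name], feed=feed_dict)  # placeholder feed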