@@ -39,15 +39,16 @@ In the backward pass
This implementation allows writing mixed-device programs like this:

```python
- # get embedding feature on CPU
- feature = some_cpu_only_op(data)
+ W1 = fluid.tensor(size=[100, 20], parameter=True)
+ W2 = fluid.tensor(size=[20, 15], parameter=True)

- gpu_places = get_place(use_gpu=True)
+ data = layers.data()
+
+ gpu_places = layers.get_place(use_gpu=True)
# parallel processing on multiple GPUs
pd = ParallelDo(gpu_places)
- with pd.do():
-     read_input(feature)
-     prediction = my_net(feature)
+ with pd.do(input=data):
+     prediction = softmax(fc(fc(data, W1), W2))
      write_output(prediction)
prediction = pd()
loss = cross_entropy(prediction, label)
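The new example above builds a small two-layer network and runs it under `ParallelDo`. As a rough illustration of the data-parallel pattern it expresses (scatter the mini-batch over the places, run the same sub-net on each shard, gather the per-place outputs), here is a minimal NumPy sketch; `fc`, `softmax`, and `parallel_do` below are simplified stand-ins written for this note, not the Fluid operators.

```python
import numpy as np

def fc(x, w):
    # simplified fully connected layer: no bias, no activation
    return x.dot(w)

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def parallel_do(data, weights, num_places):
    """Scatter the batch, run the same sub-net per shard, gather the outputs."""
    w1, w2 = weights
    shards = np.array_split(data, num_places)              # one shard per place
    outputs = [softmax(fc(fc(shard, w1), w2)) for shard in shards]
    return np.concatenate(outputs)                         # merged prediction

W1 = np.random.randn(100, 20)
W2 = np.random.randn(20, 15)
data = np.random.randn(64, 100)
prediction = parallel_do(data, (W1, W2), num_places=4)     # shape (64, 15)
```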
@@ -66,20 +67,20 @@ start_program
main_program
{
block0 {
- vars: data, places, w1, w2
+ vars: data, places, w1, w2, w1_grad, w2_grad
  ops: data, get_place, parallel_do(block1),
       parallel_do_grad(block2),
       sgd(w2, w2_grad),
       sgd(w1, w1_grad)
}
- block1 {
+ block1 { # the forward pass
  parent_block: 0
  vars: data, h1, h2, loss
  ops: fc, fc, softmax
}
- block2 {
+ block2 { # the backward pass
  parent_block: 1
- vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
+ vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
  ops: softmax_grad,
       fc_grad
       fc_grad
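In the compiled program above, each place runs `block2` and produces its own `local_w1_grad`/`local_w2_grad`, while `block0` holds the `w1_grad`/`w2_grad` consumed by the `sgd` ops. The sketch below illustrates one plausible reading of that hand-off: the local gradients are merged by summation (an AllReduce in a multi-device run) and then applied by SGD. The merge rule and all names here are assumptions for illustration, not the Fluid implementation.

```python
import numpy as np

def parallel_do_grad(local_grads):
    # assumed merge rule: sum the per-place local gradients into global ones
    w1_grad = sum(g[0] for g in local_grads)
    w2_grad = sum(g[1] for g in local_grads)
    return w1_grad, w2_grad

def sgd(w, w_grad, lr=0.01):
    # stands in for the sgd(w, w_grad) ops listed in block0
    return w - lr * w_grad

# four places, each holding local gradients shaped like W1 (100x20) and W2 (20x15)
local_grads = [(np.random.randn(100, 20), np.random.randn(20, 15)) for _ in range(4)]
w1_grad, w2_grad = parallel_do_grad(local_grads)
W1 = sgd(np.random.randn(100, 20), w1_grad)
W2 = sgd(np.random.randn(20, 15), w2_grad)
```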