diff --git a/_typos.toml b/_typos.toml
index 50f9018e38c..65fc3e52d59 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -76,9 +76,6 @@ Transfomed = "Transfomed"
 Tthe = "Tthe"
 Ture = "Ture"
 Useage = "Useage"
-Varialble = "Varialble"
-Varible = "Varible"
-Varient = "Varient"
 Wether = "Wether"
 accordding = "accordding"
 accoustic = "accoustic"
@@ -257,12 +254,6 @@ unqiue = "unqiue"
 unsupport = "unsupport"
 updte = "updte"
 utill = "utill"
-varialbes = "varialbes"
-varibale = "varibale"
-varibales = "varibales"
-varience = "varience"
-varient = "varient"
-visting = "visting"
 warpped = "warpped"
 wether = "wether"
 wiht = "wiht"
diff --git a/docs/design/concepts/tensor.md b/docs/design/concepts/tensor.md
index 3fdd8b35de2..3d08c52e5e2 100644
--- a/docs/design/concepts/tensor.md
+++ b/docs/design/concepts/tensor.md
@@ -161,7 +161,7 @@ Please reference the section of `Learn from Majel` for more details.
 
 `ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView. And the `end()` method returns an iterator pointing to the pass-the-end element in the ArrayView.
 
-`ArrayView` make the visting and manipulating an array more efficiently, flexibly and safely.
+`ArrayView` make the visiting and manipulating an array more efficiently, flexibly and safely.
 
 A global function `make_view` is provided to transform an array to corresponding arrayview.
 
diff --git a/docs/design/concepts/tensor_array.md b/docs/design/concepts/tensor_array.md
index 98e9cff4e6c..bed7ba9eb42 100644
--- a/docs/design/concepts/tensor_array.md
+++ b/docs/design/concepts/tensor_array.md
@@ -212,7 +212,7 @@ class TensorArray:
 ```
 
 ## DenseTensor-related Supports
-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too.
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and output sequences too.
 
 Since each step of RNN can only take a tensor-represented batch of data as input,
 some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches.
@@ -244,10 +244,10 @@ def pack(level, indices_map):
     pass
 ```
 
-With these two methods, a varience-length sentence supported RNN can be implemented like
+With these two methods, a variable-length sentence supported RNN can be implemented like
 
 ```c++
-// input is the varient-length data
+// input is the variable-length data
 LodTensor sentence_input(xxx);
 TensorArray ta;
 Tensor indice_map;
@@ -268,4 +268,4 @@ for (int step = 0; step = ta.size(); step++) {
 DenseTensor rnn_output = ta.pack(ta, indice_map);
 ```
 the code above shows that by embedding the DenseTensor-related preprocess operations into `TensorArray`,
-the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
+the implementation of a RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
diff --git a/docs/design/dynamic_rnn/rnn_design_en.md b/docs/design/dynamic_rnn/rnn_design_en.md
index 82c298afcca..31153595f0b 100644
--- a/docs/design/dynamic_rnn/rnn_design_en.md
+++ b/docs/design/dynamic_rnn/rnn_design_en.md
@@ -1,4 +1,4 @@
-# Varient Length supported RNN Design
+# Variable Length supported RNN Design
 
 For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
 Different-length sequences in a mini-batch will be padded with zeros and transformed to same length.
diff --git a/docs/design/modules/backward.md b/docs/design/modules/backward.md
index 397fea5ce14..d2aa7a0bfcc 100644
--- a/docs/design/modules/backward.md
+++ b/docs/design/modules/backward.md
@@ -61,7 +61,7 @@ def _append_backward_ops_(target,
         target_block(Block): the block which is going to hold new generated grad ops
         no_grad_dict(dict):
             key(int) block index
-            val(set) a set of varibale names. These varibales have no gradient
+            val(set) a set of variable names. These variables have no gradient
         grad_to_var(dict)(output argument):
             key(str): grad variable name
             val(str): corresponding forward variable name
diff --git a/docs/design/modules/net_op_design.md b/docs/design/modules/net_op_design.md
index e64ac2fb1c6..15f44185a4d 100644
--- a/docs/design/modules/net_op_design.md
+++ b/docs/design/modules/net_op_design.md
@@ -90,7 +90,7 @@ class PlainNet : public Net {
   // Create a network describe by `def`. NetDesc is the definition of a network.
   PlainNet(const NetDesc &def);
 
-  // Infer all the operators' input and output varialbes' shapes, will be called before every mini-batch
+  // Infer all the operators' input and output variables' shapes, will be called before every mini-batch
   training.
   virtual Error InferShape(Scope *scope) override;
 
diff --git a/docs/design/others/gan_api.md b/docs/design/others/gan_api.md
index 090bd6eaa78..f46b9634d71 100644
--- a/docs/design/others/gan_api.md
+++ b/docs/design/others/gan_api.md
@@ -58,7 +58,7 @@ class DCGAN:
     self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
     self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
     self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
-    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.D_W2 = pd.Variable(np.random.rand(128, 1))
     self.D_b2 = pd.Variable(np.zeros(128))
     self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
 
@@ -67,7 +67,7 @@
     self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
     self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
     self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data
-    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.G_W2 = pd.Variable(np.random.rand(128, 1))
     self.G_b2 = pd.Variable(np.zeros(128))
     self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
 ```
diff --git a/docs/design/others/graph_survey.md b/docs/design/others/graph_survey.md
index e20834eee5e..cbf0bbb434d 100644
--- a/docs/design/others/graph_survey.md
+++ b/docs/design/others/graph_survey.md
@@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs):
 
 
 
-Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null.
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null.
 
 Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph.
 
diff --git a/docs/guides/06_distributed_training/group_sharded_parallel_cn.rst b/docs/guides/06_distributed_training/group_sharded_parallel_cn.rst
index 6d3b8433b37..2a84c331ec0 100644
--- a/docs/guides/06_distributed_training/group_sharded_parallel_cn.rst
+++ b/docs/guides/06_distributed_training/group_sharded_parallel_cn.rst
@@ -16,7 +16,7 @@
 1.1 GroupSharded
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-GroupSharded 实现了类似 ZeRO-DP 的训练策略,将模型状态包括:模型参数(parameter)、参数梯度(gradient)、参数对应的优化器状态(以 Adam 为例 moment 和 varience)切分到每一张 GPU 上。让模型参数部分所占的显存随并行卡数的增加而减少。
+GroupSharded 实现了类似 ZeRO-DP 的训练策略,将模型状态包括:模型参数(parameter)、参数梯度(gradient)、参数对应的优化器状态(以 Adam 为例 moment 和 variance)切分到每一张 GPU 上。让模型参数部分所占的显存随并行卡数的增加而减少。
 通过 paddle.distributed.sharding.group_sharded_parallel 提供的简单易用接口, 用户只需要添加几行代码就可将策略加入到原有的训练中。
 
 模型训练过程中的显存消耗主要由两大部分组成:模型参数及优化器状态、训练产生的中间变量(activations)。