diff --git a/_typos.toml b/_typos.toml
index f6fe08c0129..07f2d250b8a 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -28,11 +28,5 @@ datas = "datas"
 feeded = "feeded"
 
 # These words need to be fixed
-setted = "setted"
-shoule = "shoule"
-similary = "similary"
-simplier = "simplier"
 softwares = "softwares"
-sperated = "sperated"
 splitted = "splitted"
-szie = "szie"
diff --git a/ci_scripts/ci_start.sh b/ci_scripts/ci_start.sh
index 764a1f3e779..3014dab284c 100644
--- a/ci_scripts/ci_start.sh
+++ b/ci_scripts/ci_start.sh
@@ -4,7 +4,7 @@ export DIR_PATH=${PWD}
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 source ${SCRIPT_DIR}/utils.sh
 set +x
-# 1 decide PADDLE_WHL if not setted.
+# 1 decide PADDLE_WHL if not set.
 if [ -z "${PADDLE_WHL}" ] ; then
     docs_pr_info=$(get_repo_pr_info "PaddlePaddle/docs" ${GIT_PR_ID})
     paddle_pr_id=$(get_paddle_pr_num_from_docs_pr_info ${docs_pr_info})
@@ -48,8 +48,8 @@ if [ "${BUILD_DOC}" = "true" ] &&  [ -x /usr/local/bin/sphinx-build ] ; then
     fi
 
     is_shell_attribute_set x
-    xdebug_setted=$?
-    if [ $xdebug_setted ] ; then
+    xdebug_set=$?
+    if [ $xdebug_set ] ; then
         set +x
     fi
     # clean git workspace
@@ -57,7 +57,7 @@ if [ "${BUILD_DOC}" = "true" ] &&  [ -x /usr/local/bin/sphinx-build ] ; then
     git reset --hard && git clean -dfx
     cd ${DIR_PATH}
 
-    if [ $xdebug_setted ] ; then
+    if [ $xdebug_set ] ; then
         set -x
     fi
 
diff --git a/ci_scripts/ci_start_en.sh b/ci_scripts/ci_start_en.sh
index ff37507d1b1..a1a035082c7 100644
--- a/ci_scripts/ci_start_en.sh
+++ b/ci_scripts/ci_start_en.sh
@@ -6,7 +6,7 @@ source ${SCRIPT_DIR}/utils.sh
 export OUTPUTDIR=/docs
 export VERSIONSTR=$(echo ${BRANCH} | sed 's@release/@@g')
 
-# 1 decide PADDLE_WHL if not setted.
+# 1 decide PADDLE_WHL if not set.
 if [ -z "${PADDLE_WHL}" ] ; then
     # docs_pr_info=$(get_repo_pr_info "PaddlePaddle/docs" ${GIT_PR_ID})
     paddle_pr_id=${GIT_PR_ID}
@@ -51,8 +51,8 @@ if [ "${BUILD_DOC}" = "true" ] &&  [ -x /usr/local/bin/sphinx-build ] ; then
     fi
 
     is_shell_attribute_set x
-    xdebug_setted=$?
-    if [ $xdebug_setted ] ; then
+    xdebug_set=$?
+    if [ $xdebug_set ] ; then
         set +x
     fi
     # clean git workspace
@@ -64,7 +64,7 @@ if [ "${BUILD_DOC}" = "true" ] &&  [ -x /usr/local/bin/sphinx-build ] ; then
         echo "Ak = ${BOS_CREDENTIAL_AK}" >> ${BCECMD_CONFIG}/credentials
         echo "Sk = ${BOS_CREDENTIAL_SK}" >> ${BCECMD_CONFIG}/credentials
     fi
-    if [ $xdebug_setted ] ; then
+    if [ $xdebug_set ] ; then
         set -x
     fi
 
diff --git a/docs/design/concurrent/select_op.md b/docs/design/concurrent/select_op.md
index 1aef49b883c..d27e584b0e9 100644
--- a/docs/design/concurrent/select_op.md
+++ b/docs/design/concurrent/select_op.md
@@ -14,7 +14,7 @@ creating a ***select_op***.
 ## How to use it
 
 The **select_op** is available as a c++ operator.  However most users
-will prefer to use the much simplier Python API.
+will prefer to use the much simpler Python API.
 
 - **fluid.Select()**: Creates a select operator and adds it to the current
 block within the main program.  Also creates a sub block and adds it to the
diff --git a/docs/design/others/auto_gradient_check.md b/docs/design/others/auto_gradient_check.md
index 773b7b6a767..4cc7ebd4177 100644
--- a/docs/design/others/auto_gradient_check.md
+++ b/docs/design/others/auto_gradient_check.md
@@ -3,7 +3,7 @@
 ## Background：
 - Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
   1. The formula for backpropagation formula should be correct according to the forward computation.
-  2. The Implementation of the above shoule be correct in CPP.
+  2. The implementation of the above should be correct in CPP.
   3. It is difficult to prepare an unbiased test data.
 
 - Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
diff --git a/docs/design/others/gan_api.md b/docs/design/others/gan_api.md
index 4aabf6ebe1b..a0a7b38b6ae 100644
--- a/docs/design/others/gan_api.md
+++ b/docs/design/others/gan_api.md
@@ -153,7 +153,7 @@ class DCGAN:
     self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
     self.d_loss = self.d_loss_real + self.d_loss_fake
 
-    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size))
 ```
 
 If we do not have dependency engine but blocks, the module building our GAN model will be like this:
@@ -175,7 +175,7 @@ class DCGAN:
       else: # original version of GAN
         self.G = self.generator(self.z)
         self.D_g = self.discriminator(self.G, self.y)
-      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size))
 
     with pd.default_block().d_block():
       if self.y_dim: # if conditional GAN, includes label
diff --git "a/docs/eval/\343\200\220Hackathon No.111\343\200\221PR.md" "b/docs/eval/\343\200\220Hackathon No.111\343\200\221PR.md"
index c69b7de231d..275f261375c 100644
--- "a/docs/eval/\343\200\220Hackathon No.111\343\200\221PR.md"	
+++ "b/docs/eval/\343\200\220Hackathon No.111\343\200\221PR.md"	
@@ -173,7 +173,7 @@ show_collage(examples)
 ```
 ## 3.2、构建训练数据
 
-图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个(image, class)这样的形式。而是（image0, image1, similary_or_not)的形式，即，每
+图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个(image, class)这样的形式。而是(image0, image1, similar_or_not)的形式，即，每
 
 一个训练样本由两张图片组成，而其 label 是这两张图片是否相似的标志位（0 或者 1）。
 
diff --git a/docs/practices/cv/image_search.ipynb b/docs/practices/cv/image_search.ipynb
index 502c5d4939e..062077d946f 100755
--- a/docs/practices/cv/image_search.ipynb
+++ b/docs/practices/cv/image_search.ipynb
@@ -223,7 +223,7 @@
    "source": [
     "### 3.3 构建训练数据\n",
     "\n",
-    "图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个`(image, class)`这样的形式。而是（image0, image1, similary_or_not)的形式，即，每一个训练样本由两张图片组成，而其`label`是这两张图片是否相似的标志位（0或者1）。\n",
+    "图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个`(image, class)`这样的形式。而是(image0, image1, similar_or_not)的形式，即，每一个训练样本由两张图片组成，而其`label`是这两张图片是否相似的标志位（0或者1）。\n",
     "\n",
     "很自然的能够想到，来自同一个类别的两张图片，是相似的图片，而来自不同类别的两张图片，应该是不相似的图片。\n",
     "\n",
diff --git a/docs/practices/jit/image_search_with_jit.ipynb b/docs/practices/jit/image_search_with_jit.ipynb
index cbcb528432a..efdf2a19f0d 100755
--- a/docs/practices/jit/image_search_with_jit.ipynb
+++ b/docs/practices/jit/image_search_with_jit.ipynb
@@ -161,7 +161,7 @@
    },
    "source": [
     "### 3.2 构建训练数据\n",
-    "图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个(image, class)这样的形式。而是（image0, image1, similary_or_not)的形式，即，每一个训练样本由两张图片组成，而其label是这两张图片是否相似的标志位（0或者1）。\n",
+    "图片检索的模型的训练样本跟常见的分类任务的训练样本不太一样的地方在于，每个训练样本并不是一个(image, class)这样的形式。而是(image0, image1, similar_or_not)的形式，即，每一个训练样本由两张图片组成，而其label是这两张图片是否相似的标志位（0或者1）。\n",
     "\n",
     "很自然的能够想到，来自同一个类别的两张图片，是相似的图片，而来自不同类别的两张图片，应该是不相似的图片。\n",
     "\n",
diff --git a/docs/practices/nlp/transformer_in_English-to-Spanish.ipynb b/docs/practices/nlp/transformer_in_English-to-Spanish.ipynb
index ff819b762b4..d5f4c377840 100644
--- a/docs/practices/nlp/transformer_in_English-to-Spanish.ipynb
+++ b/docs/practices/nlp/transformer_in_English-to-Spanish.ipynb
@@ -1170,7 +1170,7 @@
    "source": [
     "### 4.2 Encoder\n",
     "Encoder部分主要包含了多头注意力机制、归一化层以及前馈神经网络。输入会依次经过多头注意力模块、归一化层构成的残差模块、前馈神经网络模块、归一化层构成的残差模块。\n",
-    "* 多头注意力机制（MultiHeadAttention）：使用[paddle.nn.MultiHeadAttention](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/MultiHeadAttention_cn.html#multiheadattention)实现多头注意力机制，需要注意其掩码attn_mask需要的shape是[batch_szie,num_heads,sequence_length,sequence_length]。\n",
+    "* 多头注意力机制（MultiHeadAttention）：使用[paddle.nn.MultiHeadAttention](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/nn/MultiHeadAttention_cn.html#multiheadattention)实现多头注意力机制，需要注意其掩码attn_mask需要的shape是[batch_size,num_heads,sequence_length,sequence_length]。\n",
     "* 前馈神经网络（Feed Forward）：输入经过MultiHeadAttention层后，经过一层feed forward层。模型中的feed forward，采用的是一种position-wise feed-forward的方法，即先对输入加一个全连接网络，之后使用Relu激活，之后再加一个全连接网络。\n",
     "* 残差网络：由归一化（LayerNorm）后的结果与之前时刻的输入相加组成。LayerNorm会在每一个样本上计算均值和方差。\n"
    ]
@@ -1492,7 +1492,7 @@
     "        cost = paddle.squeeze(cost, axis=[2])\n",
     "\n",
     "        # trg_mask 的形状[batch_size,suqence_len]\n",
-    "        # * 这个星号应该是对应位置相乘，返回结果的形状 [bathc_szie,sequence_len]\n",
+    "        # * 这个星号应该是对应位置相乘，返回结果的形状 [batch_size,sequence_len]\n",
     "        masked_cost = cost * trg_mask\n",
     "\n",
     "        return paddle.mean(masked_cost)"