fillna with mean added

nirmalya8 · nirmalya8 · commit f5bef705d748 · 2021-10-05T12:43:18.000+05:30
diff --git a/2-Working-With-Data/08-data-preparation/notebook.ipynb b/2-Working-With-Data/08-data-preparation/notebook.ipynb
@@ -1630,12 +1630,12 @@
     {
       "cell_type": "code",
       "metadata": {
-        "id": "MY5faq4yLdpQ",
-        "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 204
-        }
+        },
+        "id": "MY5faq4yLdpQ",
+        "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc"
       },
       "source": [
         "fill_with_mode = pd.DataFrame([[1,2,\"True\"],\n",
@@ -1736,11 +1736,11 @@
     {
       "cell_type": "code",
       "metadata": {
-        "id": "WKy-9Y2tN5jv",
-        "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f",
         "colab": {
           "base_uri": "https://localhost:8080/"
-        }
+        },
+        "id": "WKy-9Y2tN5jv",
+        "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f"
       },
       "source": [
         "fill_with_mode[2].value_counts()"
@@ -1784,12 +1784,12 @@
     {
       "cell_type": "code",
       "metadata": {
-        "id": "tvas7c9_OPWE",
-        "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 204
-        }
+        },
+        "id": "tvas7c9_OPWE",
+        "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164"
       },
       "source": [
         "fill_with_mode"
@@ -1894,19 +1894,252 @@
         "\n",
         "We replace with Median, in case of skewed data with outliers. This is beacuse median is robust to outliers.\n",
         "\n",
-        "When the data is normalized, we can use mean, as in that case, mean and median would be pretty close."
+        "When the data is normally distributed, we can use the mean, as in that case the mean and the median would be pretty close.\n",
+        "\n",
+        "First, let us take a column that is normally distributed and fill its missing value with the mean of the column."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
-        "id": "09HM_2feOj5Y"
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        },
+        "id": "09HM_2feOj5Y",
+        "outputId": "ade42fec-dc40-45d0-e22c-974849ea8664"
       },
       "source": [
-        ""
+        "# Column 0 is symmetric around zero and has one missing entry (row 2).\n",
+        "fill_with_mean = pd.DataFrame(\n",
+        "    [[-2, 0, 1], [-1, 2, 3], [np.nan, 4, 5], [1, 6, 7], [2, 8, 9]]\n",
+        ")\n",
+        "\n",
+        "# Show the frame before imputation.\n",
+        "fill_with_mean"
       ],
-      "execution_count": null,
-      "outputs": []
+      "execution_count": 33,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>-2.0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>-1.0</td>\n",
+              "      <td>2</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>NaN</td>\n",
+              "      <td>4</td>\n",
+              "      <td>5</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>6</td>\n",
+              "      <td>7</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>8</td>\n",
+              "      <td>9</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "     0  1  2\n",
+              "0 -2.0  0  1\n",
+              "1 -1.0  2  3\n",
+              "2  NaN  4  5\n",
+              "3  1.0  6  7\n",
+              "4  2.0  8  9"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 33
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ka7-wNfzSxbx"
+      },
+      "source": [
+        "The mean of column `0` (ignoring the missing value) is:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XYtYEf5BSxFL",
+        "outputId": "1e79aeea-6baf-4572-dcd1-23e5ec742036",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "source": [
+        "fill_with_mean[0].mean()"
+      ],
+      "execution_count": 34,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "0.0"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 34
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "oBSRGxKRS39K"
+      },
+      "source": [
+        "Now let us fill the missing value with this mean:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "FzncQLmuS5jh",
+        "outputId": "75f33b25-e6b3-41bb-8049-1ed2e085efe2",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        }
+      },
+      "source": [
+        "fill_with_mean[0] = fill_with_mean[0].fillna(fill_with_mean[0].mean())  # assign back: chained inplace fillna may only modify a copy\n",
+        "fill_with_mean"
+      ],
+      "execution_count": 35,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>0</th>\n",
+              "      <th>1</th>\n",
+              "      <th>2</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>-2.0</td>\n",
+              "      <td>0</td>\n",
+              "      <td>1</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>-1.0</td>\n",
+              "      <td>2</td>\n",
+              "      <td>3</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>0.0</td>\n",
+              "      <td>4</td>\n",
+              "      <td>5</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>1.0</td>\n",
+              "      <td>6</td>\n",
+              "      <td>7</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>2.0</td>\n",
+              "      <td>8</td>\n",
+              "      <td>9</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "     0  1  2\n",
+              "0 -2.0  0  1\n",
+              "1 -1.0  2  3\n",
+              "2  0.0  4  5\n",
+              "3  1.0  6  7\n",
+              "4  2.0  8  9"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 35
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CwpVFCrPTC5z"
+      },
+      "source": [
+        "As we can see, the missing value has been replaced with the mean of the column."
+      ]
     },
     {
       "cell_type": "code",