|
1630 | 1630 | { |
1631 | 1631 | "cell_type": "code", |
1632 | 1632 | "metadata": { |
1633 | | - "id": "MY5faq4yLdpQ", |
1634 | | - "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc", |
1635 | 1633 | "colab": { |
1636 | 1634 | "base_uri": "https://localhost:8080/", |
1637 | 1635 | "height": 204 |
1638 | | - } |
| 1636 | + }, |
| 1637 | + "id": "MY5faq4yLdpQ", |
| 1638 | + "outputId": "c3838b07-0d15-471e-8dad-370de91d4bdc" |
1639 | 1639 | }, |
1640 | 1640 | "source": [ |
1641 | 1641 | "fill_with_mode = pd.DataFrame([[1,2,\"True\"],\n", |
|
1736 | 1736 | { |
1737 | 1737 | "cell_type": "code", |
1738 | 1738 | "metadata": { |
1739 | | - "id": "WKy-9Y2tN5jv", |
1740 | | - "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f", |
1741 | 1739 | "colab": { |
1742 | 1740 | "base_uri": "https://localhost:8080/" |
1743 | | - } |
| 1741 | + }, |
| 1742 | + "id": "WKy-9Y2tN5jv", |
| 1743 | + "outputId": "41f5064e-502d-4aec-dc2d-86f885068b4f" |
1744 | 1744 | }, |
1745 | 1745 | "source": [ |
1746 | 1746 | "fill_with_mode[2].value_counts()" |
|
1784 | 1784 | { |
1785 | 1785 | "cell_type": "code", |
1786 | 1786 | "metadata": { |
1787 | | - "id": "tvas7c9_OPWE", |
1788 | | - "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164", |
1789 | 1787 | "colab": { |
1790 | 1788 | "base_uri": "https://localhost:8080/", |
1791 | 1789 | "height": 204 |
1792 | | - } |
| 1790 | + }, |
| 1791 | + "id": "tvas7c9_OPWE", |
| 1792 | + "outputId": "7282c4f7-0e59-4398-b4f2-5919baf61164" |
1793 | 1793 | }, |
1794 | 1794 | "source": [ |
1795 | 1795 | "fill_with_mode" |
|
1894 | 1894 | "\n", |
1895 | 1895 | "We replace with the median in the case of skewed data with outliers. This is because the median is robust to outliers.\n", |
1896 | 1896 | "\n", |
1897 | | - "When the data is normalized, we can use mean, as in that case, mean and median would be pretty close." |
| 1897 | + "When the data is normally distributed (roughly symmetric, without strong outliers), we can use the mean, since in that case the mean and the median are very close.\n", |
| 1898 | + "\n", |
| 1899 | + "First, let us take a column whose values are symmetrically distributed and fill its missing value with the mean of the column." |
1898 | 1900 | ] |
1899 | 1901 | }, |
1900 | 1902 | { |
1901 | 1903 | "cell_type": "code", |
1902 | 1904 | "metadata": { |
1903 | | - "id": "09HM_2feOj5Y" |
| 1905 | + "colab": { |
| 1906 | + "base_uri": "https://localhost:8080/", |
| 1907 | + "height": 204 |
| 1908 | + }, |
| 1909 | + "id": "09HM_2feOj5Y", |
| 1910 | + "outputId": "ade42fec-dc40-45d0-e22c-974849ea8664" |
1904 | 1911 | }, |
1905 | 1912 | "source": [ |
1906 | | - "" |
| 1913 | + "fill_with_mean = pd.DataFrame([[-2,0,1],\n", |
| 1914 | + " [-1,2,3],\n", |
| 1915 | + " [np.nan,4,5],\n", |
| 1916 | + " [1,6,7],\n", |
| 1917 | + " [2,8,9]])\n", |
| 1918 | + "\n", |
| 1919 | + "fill_with_mean" |
1907 | 1920 | ], |
1908 | | - "execution_count": null, |
1909 | | - "outputs": [] |
| 1921 | + "execution_count": 33, |
| 1922 | + "outputs": [ |
| 1923 | + { |
| 1924 | + "output_type": "execute_result", |
| 1925 | + "data": { |
| 1926 | + "text/html": [ |
| 1927 | + "<div>\n", |
| 1928 | + "<style scoped>\n", |
| 1929 | + " .dataframe tbody tr th:only-of-type {\n", |
| 1930 | + " vertical-align: middle;\n", |
| 1931 | + " }\n", |
| 1932 | + "\n", |
| 1933 | + " .dataframe tbody tr th {\n", |
| 1934 | + " vertical-align: top;\n", |
| 1935 | + " }\n", |
| 1936 | + "\n", |
| 1937 | + " .dataframe thead th {\n", |
| 1938 | + " text-align: right;\n", |
| 1939 | + " }\n", |
| 1940 | + "</style>\n", |
| 1941 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 1942 | + " <thead>\n", |
| 1943 | + " <tr style=\"text-align: right;\">\n", |
| 1944 | + " <th></th>\n", |
| 1945 | + " <th>0</th>\n", |
| 1946 | + " <th>1</th>\n", |
| 1947 | + " <th>2</th>\n", |
| 1948 | + " </tr>\n", |
| 1949 | + " </thead>\n", |
| 1950 | + " <tbody>\n", |
| 1951 | + " <tr>\n", |
| 1952 | + " <th>0</th>\n", |
| 1953 | + " <td>-2.0</td>\n", |
| 1954 | + " <td>0</td>\n", |
| 1955 | + " <td>1</td>\n", |
| 1956 | + " </tr>\n", |
| 1957 | + " <tr>\n", |
| 1958 | + " <th>1</th>\n", |
| 1959 | + " <td>-1.0</td>\n", |
| 1960 | + " <td>2</td>\n", |
| 1961 | + " <td>3</td>\n", |
| 1962 | + " </tr>\n", |
| 1963 | + " <tr>\n", |
| 1964 | + " <th>2</th>\n", |
| 1965 | + " <td>NaN</td>\n", |
| 1966 | + " <td>4</td>\n", |
| 1967 | + " <td>5</td>\n", |
| 1968 | + " </tr>\n", |
| 1969 | + " <tr>\n", |
| 1970 | + " <th>3</th>\n", |
| 1971 | + " <td>1.0</td>\n", |
| 1972 | + " <td>6</td>\n", |
| 1973 | + " <td>7</td>\n", |
| 1974 | + " </tr>\n", |
| 1975 | + " <tr>\n", |
| 1976 | + " <th>4</th>\n", |
| 1977 | + " <td>2.0</td>\n", |
| 1978 | + " <td>8</td>\n", |
| 1979 | + " <td>9</td>\n", |
| 1980 | + " </tr>\n", |
| 1981 | + " </tbody>\n", |
| 1982 | + "</table>\n", |
| 1983 | + "</div>" |
| 1984 | + ], |
| 1985 | + "text/plain": [ |
| 1986 | + " 0 1 2\n", |
| 1987 | + "0 -2.0 0 1\n", |
| 1988 | + "1 -1.0 2 3\n", |
| 1989 | + "2 NaN 4 5\n", |
| 1990 | + "3 1.0 6 7\n", |
| 1991 | + "4 2.0 8 9" |
| 1992 | + ] |
| 1993 | + }, |
| 1994 | + "metadata": {}, |
| 1995 | + "execution_count": 33 |
| 1996 | + } |
| 1997 | + ] |
| 1998 | + }, |
| 1999 | + { |
| 2000 | + "cell_type": "markdown", |
| 2001 | + "metadata": { |
| 2002 | + "id": "ka7-wNfzSxbx" |
| 2003 | + }, |
| 2004 | + "source": [ |
| 2005 | + "The mean of column `0` (the missing value is skipped when computing it) is:" |
| 2006 | + ] |
| 2007 | + }, |
| 2008 | + { |
| 2009 | + "cell_type": "code", |
| 2010 | + "metadata": { |
| 2011 | + "id": "XYtYEf5BSxFL", |
| 2012 | + "outputId": "1e79aeea-6baf-4572-dcd1-23e5ec742036", |
| 2013 | + "colab": { |
| 2014 | + "base_uri": "https://localhost:8080/" |
| 2015 | + } |
| 2016 | + }, |
| 2017 | + "source": [ |
| 2018 | + "np.mean(fill_with_mean[0])" |
| 2019 | + ], |
| 2020 | + "execution_count": 34, |
| 2021 | + "outputs": [ |
| 2022 | + { |
| 2023 | + "output_type": "execute_result", |
| 2024 | + "data": { |
| 2025 | + "text/plain": [ |
| 2026 | + "0.0" |
| 2027 | + ] |
| 2028 | + }, |
| 2029 | + "metadata": {}, |
| 2030 | + "execution_count": 34 |
| 2031 | + } |
| 2032 | + ] |
| 2033 | + }, |
| 2034 | + { |
| 2035 | + "cell_type": "markdown", |
| 2036 | + "metadata": { |
| 2037 | + "id": "oBSRGxKRS39K" |
| 2038 | + }, |
| 2039 | + "source": [ |
| 2040 | + "Filling with mean" |
| 2041 | + ] |
| 2042 | + }, |
| 2043 | + { |
| 2044 | + "cell_type": "code", |
| 2045 | + "metadata": { |
| 2046 | + "id": "FzncQLmuS5jh", |
| 2047 | + "outputId": "75f33b25-e6b3-41bb-8049-1ed2e085efe2", |
| 2048 | + "colab": { |
| 2049 | + "base_uri": "https://localhost:8080/", |
| 2050 | + "height": 204 |
| 2051 | + } |
| 2052 | + }, |
| 2053 | + "source": [ |
| 2054 | + "fill_with_mean[0].fillna(np.mean(fill_with_mean[0]),inplace=True)\n", |
| 2055 | + "fill_with_mean" |
| 2056 | + ], |
| 2057 | + "execution_count": 35, |
| 2058 | + "outputs": [ |
| 2059 | + { |
| 2060 | + "output_type": "execute_result", |
| 2061 | + "data": { |
| 2062 | + "text/html": [ |
| 2063 | + "<div>\n", |
| 2064 | + "<style scoped>\n", |
| 2065 | + " .dataframe tbody tr th:only-of-type {\n", |
| 2066 | + " vertical-align: middle;\n", |
| 2067 | + " }\n", |
| 2068 | + "\n", |
| 2069 | + " .dataframe tbody tr th {\n", |
| 2070 | + " vertical-align: top;\n", |
| 2071 | + " }\n", |
| 2072 | + "\n", |
| 2073 | + " .dataframe thead th {\n", |
| 2074 | + " text-align: right;\n", |
| 2075 | + " }\n", |
| 2076 | + "</style>\n", |
| 2077 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 2078 | + " <thead>\n", |
| 2079 | + " <tr style=\"text-align: right;\">\n", |
| 2080 | + " <th></th>\n", |
| 2081 | + " <th>0</th>\n", |
| 2082 | + " <th>1</th>\n", |
| 2083 | + " <th>2</th>\n", |
| 2084 | + " </tr>\n", |
| 2085 | + " </thead>\n", |
| 2086 | + " <tbody>\n", |
| 2087 | + " <tr>\n", |
| 2088 | + " <th>0</th>\n", |
| 2089 | + " <td>-2.0</td>\n", |
| 2090 | + " <td>0</td>\n", |
| 2091 | + " <td>1</td>\n", |
| 2092 | + " </tr>\n", |
| 2093 | + " <tr>\n", |
| 2094 | + " <th>1</th>\n", |
| 2095 | + " <td>-1.0</td>\n", |
| 2096 | + " <td>2</td>\n", |
| 2097 | + " <td>3</td>\n", |
| 2098 | + " </tr>\n", |
| 2099 | + " <tr>\n", |
| 2100 | + " <th>2</th>\n", |
| 2101 | + " <td>0.0</td>\n", |
| 2102 | + " <td>4</td>\n", |
| 2103 | + " <td>5</td>\n", |
| 2104 | + " </tr>\n", |
| 2105 | + " <tr>\n", |
| 2106 | + " <th>3</th>\n", |
| 2107 | + " <td>1.0</td>\n", |
| 2108 | + " <td>6</td>\n", |
| 2109 | + " <td>7</td>\n", |
| 2110 | + " </tr>\n", |
| 2111 | + " <tr>\n", |
| 2112 | + " <th>4</th>\n", |
| 2113 | + " <td>2.0</td>\n", |
| 2114 | + " <td>8</td>\n", |
| 2115 | + " <td>9</td>\n", |
| 2116 | + " </tr>\n", |
| 2117 | + " </tbody>\n", |
| 2118 | + "</table>\n", |
| 2119 | + "</div>" |
| 2120 | + ], |
| 2121 | + "text/plain": [ |
| 2122 | + " 0 1 2\n", |
| 2123 | + "0 -2.0 0 1\n", |
| 2124 | + "1 -1.0 2 3\n", |
| 2125 | + "2 0.0 4 5\n", |
| 2126 | + "3 1.0 6 7\n", |
| 2127 | + "4 2.0 8 9" |
| 2128 | + ] |
| 2129 | + }, |
| 2130 | + "metadata": {}, |
| 2131 | + "execution_count": 35 |
| 2132 | + } |
| 2133 | + ] |
| 2134 | + }, |
| 2135 | + { |
| 2136 | + "cell_type": "markdown", |
| 2137 | + "metadata": { |
| 2138 | + "id": "CwpVFCrPTC5z" |
| 2139 | + }, |
| 2140 | + "source": [ |
| 2141 | + "As we can see, the missing value in column `0` has been replaced with the mean of that column (0.0)." |
| 2142 | + ] |
1910 | 2143 | }, |
1911 | 2144 | { |
1912 | 2145 | "cell_type": "code", |
|
0 commit comments