Commit abde30f: Initial Commit (1 parent: 7f2aa89)

51 files changed, +2790 −0 lines changed
Lines changed: 42 additions & 0 deletions

# read the dataset
# check how many missing values we have per column
# check the percentage of missing values per column
# check the percentage of missing values in the whole dataset

"""
Is a value missing because it wasn't recorded, or
because it doesn't exist?

Answer: If a value is missing because it doesn't exist
(like the height of the oldest child of someone who
doesn't have any children), then it doesn't make sense
to try to guess what it might be.

Those values you probably do want to keep as NaN. On the
other hand, if a value is missing because it wasn't
recorded, then you can try to guess what it might have
been based on the other values in that column and row.
"""

# Counting the Missing Values and Their Percentage #

import pandas as pd
import numpy as np

np.random.seed(0)

df = pd.read_csv('filepath')

# Counting how many missing values each column has
df.isnull().sum()

# Counting the percentage of missing values
# for each column
df.isnull().sum() / len(df)           # as a fraction
df.isnull().sum() * 100 / len(df)     # as a percentage

# Counting the percentage of missing values
# for the whole dataset
total_missing = df.isnull().sum().sum()
total_cells = np.prod(df.shape)  # np.product is deprecated; np.prod does the same

percent_missing = (total_missing / total_cells) * 100
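
# To make the keep-as-NaN vs. guess distinction from the
# docstring concrete, a minimal sketch of both strategies;
# the column names ('height_oldest_child', 'salary') and
# the 50% threshold are illustrative assumptions, not part
# of the dataset above.

# Drop columns where more than half of the values are missing
percent_per_column = df.isnull().sum() * 100 / len(df)
df = df.drop(columns=percent_per_column[percent_per_column > 50].index)

# Missing because it doesn't exist (the height of the oldest
# child of someone with no children): keep it as NaN,
# so no action for a column like 'height_oldest_child'

# Missing because it wasn't recorded: guess (impute) it,
# e.g. with the column's median
df['salary'] = df['salary'].fillna(df['salary'].median())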
Lines changed: 106 additions & 0 deletions

"""
0 - Scaling

It's used to change the RANGE of the data: after scaling,
each feature's range goes from 0 to 1.

----

About the models: you'll need to scale the data when
you're using methods based on measures of how far apart
data points are, like these models:

/ Gradient Descent Optimization
/ Support Vector Machines (SVM)
/ K-Nearest Neighbors (KNN)
"""

from sklearn.preprocessing import MinMaxScaler

scaler_1 = MinMaxScaler()
train_scaled = scaler_1.fit_transform(df_train)  # fit on the training set only
val_scaled = scaler_1.transform(df_val)          # reuse the training statistics

"""
1 - Standardization

It's like Scaling, but the resulting range doesn't go
from 0 to 1; it varies (features are recentered to mean 0
and rescaled to unit variance).

----

About the models: the same distance-based methods listed
above also need standardized data:

/ Gradient Descent Optimization
/ Support Vector Machines (SVM)
/ K-Nearest Neighbors (KNN)
"""

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# Robust Scaler >> less sensitive to outliers
scaler_2 = RobustScaler()
train_robust = scaler_2.fit_transform(df_train)
val_robust = scaler_2.transform(df_val)

# Standard Scaler >> used when the mean is near 0
scaler_3 = StandardScaler()
train_standard = scaler_3.fit_transform(df_train)
val_standard = scaler_3.transform(df_val)

"""
2 - Normalization

It's used to change the DISTRIBUTION of the data.

In a nutshell, normalization reshapes the distribution of
the data in order to get a Normal Distribution (Gaussian
Distribution, or Bell Curve).

----

About the models: you'll need to normalize the data when
using:

/ Linear Discriminant Analysis (LDA)
/ Gaussian Naive Bayes

Tip: any method with "Gaussian" in the name probably
needs you to normalize the data.

Caveat: scikit-learn's Normalizer rescales each ROW to
unit norm; it does not reshape a feature toward a bell
curve (a power transform does that; see the sketch below).
"""

from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
train_normalized = normalizer.fit_transform(df_train)
val_normalized = normalizer.transform(df_val)
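
# As the caveat above says, Normalizer does unit-norm rows,
# not Gaussian features. A minimal sketch of a transform
# that actually pushes a skewed feature toward a bell
# curve, using scikit-learn's PowerTransformer
# (df_train/df_val are the same placeholder frames as above):

from sklearn.preprocessing import PowerTransformer

# 'yeo-johnson' handles zero and negative values;
# method='box-cox' requires strictly positive data
power = PowerTransformer(method='yeo-johnson')
train_gaussian = power.fit_transform(df_train)
val_gaussian = power.transform(df_val)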

#########

"""
***********
** Notes **
***********

Explanation: Scaling/Standardization

It's like converting reais (R$) to dollars (US$), where
1 dollar is worth about 5 reais nowadays. If we don't
scale, the model will treat 1 dollar as equal to 1 real,
and that's not true.

Another example is height and weight, where we have to
put the measurements on a common scale: 1 inch equals
2.54 cm, and 1 pound equals 0.45 kg.

-*-*-*-*-

Another Explanation, Just to Get the Feeling

Scaling, Standardization, and Normalization prevent the
model from considering some features more important than
others just because of their scale, like considering
salary (from 40,000 to 210,000) more important than age
(from 18 to 100).
"""
Lines changed: 73 additions & 0 deletions

import pandas as pd
import numpy as np
import seaborn as sns  # used for the distribution check at the end

np.random.seed(0)

"""
** Parsing Dates **

Transforming the 'object' dtype into a 'datetime' one.
"""

# Checking out the 'date' column
# of an imaginary dataset

df = pd.read_csv('filepath')

df['date'].head()
# > 01/05/99
# > 02/05/99
# > 03/05/99
# > 04/05/99
# > 05/05/99

df['date'].dtype
# > object

####

# Formatting to:
#
# day/month/two-digit-year
# %d/%m/%y

df['formatted_date'] = pd.to_datetime(df['date'],
                                      format='%d/%m/%y')

df['formatted_date'].dtype
# > datetime64[ns]


# When the column has more than one datetime format,
# use 'infer_datetime_format=True' so that pandas
# guesses the correct format for each row
#
# - Problem 1: pandas can't recognize the correct format
#   for all cases;
# - Problem 2: it takes more time than specifying the
#   format yourself
#
# (Note: newer pandas versions deprecate this flag and
# infer the format by default.)
df['formatted_date'] = pd.to_datetime(df['date'],
                                      infer_datetime_format=True)
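
# When pandas can't recognize a format for every row
# (Problem 1 above), errors='coerce' is one way to locate
# the offending rows: unparseable entries become NaT
# instead of raising. A minimal sketch on the same column:

parsed = pd.to_datetime(df['date'], format='%d/%m/%y',
                        errors='coerce')

# Show the raw strings that failed to parse
df.loc[parsed.isnull(), 'date']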

########

# Extracting information from the dates

df['formatted_date'].dt.day
# > 1
# > 2
# > 3
# > 4
# > 5

#######

# Checking out the day distribution in order to check
# whether pandas mistakenly parsed the months as days
#
# See "0 - Good Days Distribution.png" for an example
# of a correct distribution!!
sns.distplot(df['formatted_date'].dt.day,
             kde=False,
             bins=31)
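
# distplot was deprecated and later removed from seaborn;
# the same check with the current API (histplot) would be,
# as a sketch:
sns.histplot(df['formatted_date'].dt.day, bins=31)
# If months had been parsed as days, almost all values
# would fall between 1 and 12 and days 13-31 would be empty.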
Lines changed: 40 additions & 0 deletions

"""
************************
** Character Encoding **
************************

When you read a CSV file that's not in the 'UTF-8'
charset, you'll get an error like this one:

/ UnicodeDecodeError: 'utf-8' codec can't decode byte
  0x99 in position 7955: invalid start byte

To solve this, you've got to convert the file to UTF-8
following the steps below:

1 - find out the file's charset;
2 - read the file with the correct charset;
3 - save the file with pandas (UTF-8 is the default
    charset for pandas)
"""

import pandas as pd
import chardet  # library to guess the file's charset

# Guessing the File's Charset #

with open('filepath', 'rb') as file:

    # read the first 10,000 bytes of the file
    # to guess the charset
    guessed_charset = chardet.detect(file.read(10000))

print(guessed_charset)
# > {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
# so there's a 73% chance that the charset is Windows-1252

# Reading the File with the Correct Charset #
df = pd.read_csv('filepath', encoding='Windows-1252')

# Saving the File in UTF-8 #
df.to_csv('new_file_name')  # pandas writes UTF-8 by default
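
# A 73% confidence guess can be wrong. One option, as a
# sketch, is to feed chardet the whole file before trusting
# the result (slower, but usually more confident):

with open('filepath', 'rb') as file:
    guessed_charset = chardet.detect(file.read())

print(guessed_charset)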
