11"""
2-
3- **********************************
4- ** Principal Component Analysis **
5- **********************************
2+
3+ **********************************
4+ ** Principal Component Analysis **
5+ **********************************
66
7- Principal Component Analysis (PCA) is used to create new Features
7+ Principal Component Analysis (PCA) is used to create new Features
88combining other Features. In general, we get these new Features
99by tracing diagonal lines (axes) over the scatter plot between the
1010two features we would like calculate the PCA.
1111
12- After that, the model will calculate the correlation and the
12+ After that, the model will calculate the correlation and the
1313variance between these two features and return the Components
1414(new Features).
1515
16- { image 4 }
16+ { image 1.0 }
1717
18- These new features are called the principal components of the
18+ These new features are called the principal components of the
1919data. The weights themselves are called loadings. There will be
2020as many principal components as there are features in the
2121original dataset: if we had used ten features instead of two,
2222we would have ended up with ten components.

-*-*-*-*-

- PCA Best Practices:

/ PCA only works with numeric features, like continuous
quantities or counts, so don't forget to encode the categorical
features;

/ PCA is sensitive to scale. It's good practice to standardize
your data before applying PCA, unless you know you have a good
reason not to;

/ Consider removing or constraining outliers, since they can
have an undue influence on the results.

-*-*-*-*-

- When to use PCA:

/ when the dataset has many features (dataset compression);
/ when the features are multicollinear (there is a significant
number of linear correlations between them);
/ when our goal is to apply denoising;
/ when you want to check whether clusters have similar
properties and attributes.
"""

# ---- Importing Libraries and Defining Functions ----
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
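
# \ a tiny illustration (a sketch on synthetic data; the underscore
# names are ours, not part of the original script): with two correlated
# features, the first principal axis follows the direction of maximum
# variance, so PC1 captures most of the spread
_rng = np.random.default_rng(0)
_x1 = _rng.normal(size=200)
_demo = np.c_[_x1, 0.8 * _x1 + _rng.normal(scale=0.3, size=200)]
print(PCA(n_components=2).fit(_demo).explained_variance_ratio_)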

def plot_variance(pca, width=8, dpi=100):
    # plots the explained and the cumulative variance of each component
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))
    # cumulative variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))
    fig.set(figwidth=width, dpi=dpi)
    return axs

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

"""
We've selected four features that cover a range of properties.
Each of these features also has a high MI score with the target,
[...]
are higher.
"""

# ---- Reading the Dataset and Treating the Features ----
df = pd.read_csv("../input/fe-course-data/autos.csv")
features = ["highway_mpg", "engine_size", "horsepower", "curb_weight"]
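
# \ the four features above are all numeric; per the best practices in
# the docstring, a categorical column would have to be encoded before
# PCA, e.g. with one-hot encoding (a sketch; "fuel_type" is an assumed
# column name, so the guard keeps the example safe to run)
if "fuel_type" in df.columns:
    print(pd.get_dummies(df["fuel_type"], prefix="fuel").head())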

X = df.copy()
y = X.pop("price")  # target ("price", as in the FE course dataset)
X = X.loc[:, features]

# standardize: PCA is sensitive to scale (see the best practices above)
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
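
# \ a near-equivalent using sklearn (a sketch; "X_scaled_skl" is an
# illustrative name): note that StandardScaler divides by the population
# std (ddof=0) while pandas .std() uses ddof=1, so the two results
# differ by a factor of sqrt(n / (n - 1))
from sklearn.preprocessing import StandardScaler
X_scaled_skl = pd.DataFrame(
    StandardScaler().fit_transform(X), columns=X.columns, index=X.index
)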

# ---- Calculating PCA ----
pca = PCA(n_components=2)

X_pca = pca.fit_transform(X_scaled)
component_names = [f"PC{i + 1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca.head()
print(pca.explained_variance_ratio_)  # variance ratio
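
# \ a quick follow-up check (sketch): the cumulative explained variance,
# handy for choosing n_components when compressing a wider dataset
print(np.cumsum(pca.explained_variance_ratio_))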

# ---- Getting the Loadings ----
#
# \ loadings are the weights each original feature contributes to each
# created component (every component is a weighted sum of the features)
loadings = pd.DataFrame(
    pca.components_.T,  # transpose the matrix of loadings
    columns=component_names,  # so the columns are the principal components
    index=X.columns,  # and the rows are the original features
)
loadings
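
# \ sanity check (a sketch; "pc1_from_loadings" is an illustrative name):
# each component really is a weighted sum of the scaled features, with
# the loadings as the weights
pc1_from_loadings = X_scaled @ loadings["PC1"]
print(np.allclose(pc1_from_loadings, X_pca["PC1"]))  # expected: True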

# ---- Calculating Mutual Info Scores and Plotting the Results ----
mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
mi_scores

plot_variance(pca);
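
# \ sketch: with only 2 of the 4 components kept, inverse_transform maps
# the components back to an approximation of the scaled features; this
# round trip is the mechanism behind the compression and denoising uses
# listed at the top ("X_approx" is an illustrative name)
X_approx = pd.DataFrame(
    pca.inverse_transform(X_pca), columns=X.columns, index=X.index
)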

"""

{ image 1.1 }

This table of loadings tells us that, in the Size component,
Height and Diameter vary in the same direction (same sign), but