
Commit aa4f45d

🎉 Added
> Dimensionality Reduction: \ PCA \ t-SNE \ UMAP
1 parent 88940a1 commit aa4f45d

8 files changed: +290 -0 lines changed
Lines changed: 55 additions & 0 deletions
"""

******************************
** Dimensionality Reduction **
******************************

Dimensionality Reduction is the process of decreasing the number
of features in a DataSet by combining them into a new, smaller
set of features called "Components".

-*-*-*-*-

To apply this technique, your DataSet must meet the following
requirements (a minimal sketch follows this docstring):

/ all Categorical Features must be Encoded, since Dimensionality
Reduction works only with Numerical Features;

/ the features must be standardized, unless you know you have a
good reason not to, such as the DataSet already being standardized
by default;

/ outliers must be treated (removed or constrained), since
they can have an undue influence on the results.

-*-*-*-*-

Situations when you can use Dimensionality Reduction:

/ when you want to check whether clusters have similar
properties and attributes;

/ when the DataSet contains a lot of features (DataSet
compression down to two or three features);

/ when the features are multicollinear (there is a significant
number of Linear Correlations between them);

/ when your goal is to apply denoising.

-*-*-*-*-

Variations of Dimensionality Reduction:

/ Principal Component Analysis (PCA): maximizes the variance;

/ t-Distributed Stochastic Neighbor Embedding (t-SNE): creates a
reduced feature space where similar samples are modeled by nearby
points and dissimilar samples are modeled by distant points with
high probability;

/ Uniform Manifold Approximation and Projection (UMAP): applies
Nearest Neighbors to cluster the data and then reduces the
dimensions.
"""
Lines changed: 124 additions & 0 deletions
"""

**********************************
** Principal Component Analysis **
**********************************

Principal Component Analysis (PCA) is used to create new Features
by combining other Features. In general, we get these new Features
by tracing diagonal lines (axes) over the scatter plot between the
two features for which we would like to calculate the PCA.

After that, the model will calculate the correlation and the
variance between these two features and return the Components
(new Features).

{ image 1.0 }

These new features are called the principal components of the
data. The weights themselves are called loadings. There will be
as many principal components as there are features in the
original dataset: if we had used ten features instead of two,
we would have ended up with ten components.
"""

# ---- Importing Libraries and Defining Functions ----
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

def plot_variance(pca, width=8, dpi=100):

    # Create figure #
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance #
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))

    # Cumulative Variance #
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))

    # Set up figure #
    fig.set(figwidth=width, dpi=dpi)
    return axs

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

"""
61+
We've selected four features that cover a range of properties.
62+
Each of these features also has a high MI score with the target,
63+
price. We'll standardize the data since these features aren't
64+
naturally on the same scale.
65+
66+
We say that the features are not in the same scale when their
67+
ratio are different in a highly way, such as: person's age and
68+
salary, while a person's age varies from 0 - 100, the salary can
69+
vary between 1,000 - 1,000,000. There's a huge gap between them,
70+
so we gotta scale the features in order to tthe model doesn't
71+
think that salary is more important than age just because the values
72+
are higher.
73+
"""

# ---- Reading DataSet and Treating the Features ----
df = pd.read_csv("../input/fe-course-data/autos.csv")
features = ["highway_mpg", "engine_size", "horsepower", "curb_weight"]

X = df.copy()
y = X.pop('price')
X = X.loc[:, features]

X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)


# ---- Calculating PCA ----
pca = PCA(n_components=2)

X_pca = pca.fit_transform(X_scaled)
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca.head()
print(pca.explained_variance_ratio_)  # variance ratio

# ---- Getting the Loadings ----
#
# \ loadings are the weights of each original feature in each
# created component
loadings = pd.DataFrame(
    pca.components_.T,        # transpose the matrix of loadings
    columns=component_names,  # so the columns are the principal components
    index=X.columns,          # and the rows are the original features
)
loadings

# ---- Calculating Mutual Info Scores and Plotting the Results ----
mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
mi_scores

plot_variance(pca);

"""
114+
115+
{ image 1.1 }
116+
117+
This table of loadings is telling us that in the Size component,
118+
Height and Diameter vary in the same direction (same sign), but
119+
in the Shape component they vary in opposite directions (opposite
120+
sign).
121+
122+
In each component, the loadings are all of the same magnitude
123+
and so the features contribute equally in both.
124+
"""
Lines changed: 50 additions & 0 deletions
"""

*************************************************
** t-Distributed Stochastic Neighbor Embedding **
*************************************************

Suppose we had a dataset composed of 3 distinct classes in a
2D plot and we wanted to convert it to a 1D plot while keeping
the differences and distances between the clusters.

{ image 2.0 }
{ image 2.1 }
"""

# ---- Import Libraries ----
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# ---- Applying t-SNE ----
#
# \ n_components: number of components / dimensions
#
# \ verbose: logger (1 >> true / 0 >> false)
#
# \ perplexity: number of nearest neighbors used by Manifold
# Learning Algorithms. This value should be between 5 and 50,
# and the larger the DataSet, the larger it should be
#
# \ n_iter: number of iterations to run the algorithm's learning
# process
#
# \ data_subset is assumed to be an already scaled feature matrix
# defined elsewhere

tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(data_subset)
print('Shape of X_tsne: ', X_tsne.shape)

# ---- Plotting the Result ----
#
# \ y is assumed to hold the class labels of data_subset
palette = sns.color_palette("bright", 10)

sns.scatterplot(
    x=X_tsne[:, 0]
    , y=X_tsne[:, 1]
    , hue=y
    , legend='full'
    , palette=palette
)
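
# ---- Minimal Sketch: a complete, runnable t-SNE example ----
#
# \ a hedged example with a concrete dataset, since `data_subset` and
#   `y` above are assumed to be defined elsewhere; the scikit-learn
#   digits dataset is an assumption made only for illustration
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt

digits = load_digits()
data_subset, y = digits.data, digits.target   # features and class labels

tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(data_subset)

sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y,
                legend='full', palette=sns.color_palette("bright", 10))
plt.show()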
Lines changed: 61 additions & 0 deletions
"""

***************************************************
** Uniform Manifold Approximation and Projection **
***************************************************

UMAP is another Dimensionality Reduction technique that, unlike
PCA and t-SNE, applies Nearest Neighbors to cluster the data and
then reduces the dimensions.
"""

# ---- Importing Libraries ----
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP  # pip install umap-learn

# ---- Applying UMAP (Unsupervised Learning) ----
reducer = UMAP(
    n_neighbors=100,           # default 15, the size of the local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
    n_components=3,            # default 2, the dimension of the space to embed into.
    metric='euclidean',        # default 'euclidean', the metric used to compute distances in high dimensional space.
    n_epochs=1000,             # default None, the number of training epochs used to optimize the low dimensional embedding. Larger values result in more accurate embeddings.
    learning_rate=1.0,         # default 1.0, the initial learning rate for the embedding optimization.
    init='spectral',           # default 'spectral', how to initialize the low dimensional embedding. Options are: {'spectral', 'random', a numpy array of initial embedding positions}.
    min_dist=0.1,              # default 0.1, the effective minimum distance between embedded points.
    spread=1.0,                # default 1.0, the effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
    low_memory=False,          # default False, for some datasets the nearest neighbor computation can consume a lot of memory. If UMAP is failing due to memory constraints, consider setting this to True.
    set_op_mix_ratio=1.0,      # default 1.0, should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
    local_connectivity=1,      # default 1, the local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
    repulsion_strength=1.0,    # default 1.0, weighting applied to negative samples in low dimensional embedding optimization.
    negative_sample_rate=5,    # default 5, increasing this value results in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
    transform_queue_size=4.0,  # default 4.0, larger values result in slower performance but more accurate nearest neighbor evaluation.
    a=None,                    # default None, more specific parameter controlling the embedding. If None, set automatically as determined by ``min_dist`` and ``spread``.
    b=None,                    # default None, more specific parameter controlling the embedding. If None, set automatically as determined by ``min_dist`` and ``spread``.
    random_state=42,           # default None, if int, random_state is the seed used by the random number generator.
    metric_kwds=None,          # default None, arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
    angular_rp_forest=False,   # default False, whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
    target_n_neighbors=-1,     # default -1, the number of nearest neighbors used to construct the target simplicial set. If set to -1, use the ``n_neighbors`` value.
    #target_metric='categorical',  # default 'categorical', the metric used to measure distance for a target array when doing supervised dimension reduction. By default this is 'categorical', which measures distance in terms of whether categories match or are different.
    #target_metric_kwds=None,      # dict, default None, keyword arguments to pass to the target metric when performing supervised dimension reduction. If None, no arguments are passed on.
    #target_weight=0.5,            # default 0.5, weighting factor between data topology and target topology.
    transform_seed=42,         # default 42, random seed used for the stochastic aspects of the transform operation.
    verbose=False,             # default False, controls verbosity of logging.
    unique=False,              # default False, controls whether the rows of your data should be uniqued before being embedded.
)

X_trans = reducer.fit_transform(X)   # X is assumed to be a scaled feature matrix defined elsewhere
print('Shape of X_trans: ', X_trans.shape)

# ---- Applying UMAP (Supervised Learning) ----
reducer2 = UMAP(
    n_neighbors=100, n_components=3, n_epochs=1000,
    min_dist=0.5, local_connectivity=2, random_state=42,
)

# X_train, y_train and X_test are assumed to be defined elsewhere
X_train_res = reducer2.fit_transform(X_train, y_train)
X_test_res = reducer2.transform(X_test)

print('Shape of X_train_res: ', X_train_res.shape)
print('Shape of X_test_res: ', X_test_res.shape)
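
# ---- Minimal Sketch: a compact, runnable UMAP example ----
#
# \ a hedged example, since `X`, `X_train`, `y_train` and `X_test` above
#   are assumed to be defined elsewhere; the scikit-learn digits dataset
#   and a 2-component embedding are assumptions made only for illustration
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from umap import UMAP
import matplotlib.pyplot as plt

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=42
)

# supervised UMAP: the training labels guide the embedding
sup_reducer = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
X_train_2d = sup_reducer.fit_transform(X_train, y_train)
X_test_2d = sup_reducer.transform(X_test)

plt.scatter(X_train_2d[:, 0], X_train_2d[:, 1], c=y_train, s=5, cmap="Spectral")
plt.title("Supervised UMAP embedding of the digits dataset")
plt.show()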
4 image attachments (binary files, not shown)
