Skip to content

Commit 1f47a7b

Browse files
authored
Add Principal Component Analysis in R (#233)
1 parent f3cb0ba commit 1f47a7b

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

data_preprocessing/pca.r

Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
# ==============================================
2+
# Principal Component Analysis (PCA)
3+
# ==============================================
4+
# Algorithm: Dimensionality reduction using orthogonal linear transformation
5+
# Framework: Base R
6+
#
7+
# Purpose:
8+
# - Reduce high-dimensional data into a smaller set of uncorrelated variables (principal components)
9+
# - Preserve as much variance as possible
10+
#
11+
# Steps:
12+
# 1. Standardize the dataset (zero mean, unit variance)
13+
# 2. Compute the covariance matrix of the standardized data
14+
# 3. Compute eigenvalues and eigenvectors of the covariance matrix
15+
# 4. Sort eigenvectors by decreasing eigenvalues (most variance first)
16+
# 5. Project original data onto top k eigenvectors to get reduced data
17+
#
18+
# Complexity:
19+
# - Time: O(n * d^2 + d^3) where n = samples, d = features
20+
# - Space: O(d^2 + n * d)
21+
#
22+
# Applications:
23+
# - Data visualization, noise reduction, feature extraction
24+
# - Preprocessing for machine learning models
25+
# ==============================================
26+
27+
# PCA Algorithm Implementation (Algorithm only)
28+
#' Principal Component Analysis via eigen decomposition.
#'
#' Standardizes `X`, computes the covariance matrix of the standardized
#' data (i.e. the correlation matrix of `X`), eigen-decomposes it, and
#' projects the standardized data onto the top `k` eigenvectors.
#'
#' @param X Numeric matrix (samples in rows, features in columns) or a
#'   numeric vector (treated as a single-column matrix).
#' @param k Number of principal components to keep: a single whole
#'   number in `[1, ncol(X)]`.
#' @return A list with `reduced_data` (n x k projected data),
#'   `components` (d x k eigenvector matrix) and `eigenvalues`
#'   (all d eigenvalues, decreasing).
pca_algorithm <- function(X, k) {
  # Promote a plain numeric vector to a one-column matrix
  if (is.vector(X)) {
    X <- matrix(X, ncol = 1)
  }
  if (!is.matrix(X) || !is.numeric(X)) {
    stop("Input 'X' must be a numeric matrix or vector")
  }
  # cov()/sd() need at least two observations to be defined
  if (nrow(X) < 2) {
    stop("'X' must have at least 2 rows (samples)")
  }
  d <- ncol(X)
  # Reject vector-valued or fractional k instead of silently truncating
  # it via 1:k as the previous implementation did
  if (length(k) != 1 || !is.numeric(k) || is.na(k) || k != trunc(k)) {
    stop("'k' must be a single whole number")
  }
  if (k <= 0 || k > d) {
    stop("'k' must be between 1 and the number of columns of X")
  }

  # A constant column has sd = 0; scale() would divide by zero and
  # silently fill the column with NaN, poisoning the covariance matrix.
  feature_sds <- apply(X, 2, sd)
  if (any(feature_sds == 0)) {
    stop("'X' contains constant (zero-variance) columns; remove them before PCA")
  }

  # Step 1: Standardize the data (zero mean, unit variance per feature)
  X_std <- scale(X)

  # Step 2: Covariance of standardized data == correlation matrix of X
  cov_matrix <- cov(X_std)

  # Step 3: Eigen decomposition. symmetric = TRUE exploits the known
  # symmetry of the covariance matrix, guaranteeing real eigenvalues /
  # eigenvectors returned in decreasing eigenvalue order.
  eig <- eigen(cov_matrix, symmetric = TRUE)
  eig_values <- eig$values
  eig_vectors <- eig$vectors

  # Step 4: Select top k principal components (eigenvectors)
  top_vectors <- eig_vectors[, seq_len(k), drop = FALSE]

  # Step 5: Project standardized data onto top k components
  X_reduced <- X_std %*% top_vectors

  list(
    reduced_data = X_reduced,
    components = top_vectors,
    eigenvalues = eig_values
  )
}
64+
65+
# Example usage (algorithm only)
66+
# set.seed(42)
67+
# X <- matrix(rnorm(50 * 5), nrow = 50, ncol = 5)
68+
# pca_result <- pca_algorithm(X, k = 2)
69+
# head(pca_result$reduced_data)

0 commit comments

Comments
 (0)