diff --git a/DIRECTORY.md b/DIRECTORY.md index a7a1e1b5..217b3e6f 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -3,7 +3,10 @@ * [Apriori](https://github.com/TheAlgorithms/R/blob/HEAD/association_algorithms/apriori.r) ## Biomedical + * [Chi Square Tests](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/chi_square_tests.r) + * [Correlation Analysis](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/correlation_analysis.r) * [Mann Whitney U Test](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/mann_whitney_u_test.r) + * [T Tests Comprehensive](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/t_tests_comprehensive.r) * [Wilcoxon Signed Rank Test](https://github.com/TheAlgorithms/R/blob/HEAD/biomedical/wilcoxon_signed_rank_test.r) ## Classification Algorithms diff --git a/biomedical/Rplots.pdf b/biomedical/Rplots.pdf new file mode 100644 index 00000000..0bc6ce11 Binary files /dev/null and b/biomedical/Rplots.pdf differ diff --git a/biomedical/chi_square_tests.r b/biomedical/chi_square_tests.r new file mode 100644 index 00000000..cf3cb817 --- /dev/null +++ b/biomedical/chi_square_tests.r @@ -0,0 +1,715 @@ +# Chi-Square Tests for Biomedical Data Analysis +# +# This implementation provides comprehensive chi-square testing capabilities: +# 1. Chi-square goodness of fit test (compare observed vs expected frequencies) +# 2. Chi-square test of independence (test association between categorical variables) +# 3. Chi-square test for homogeneity (compare distributions across groups) +# 4. McNemar's test (for paired categorical data) +# +# Applications in biomedical research: +# - Clinical trials: testing treatment response patterns +# - Epidemiology: testing associations between risk factors and diseases +# - Genetics: testing Hardy-Weinberg equilibrium, allele frequencies +# - Public health: analyzing categorical health outcomes +# - Quality control: testing distribution patterns in lab results +# +# Time Complexity: O(n) for data processing, O(r*c) for contingency tables +# Space Complexity: O(r*c) for storing contingency tables + +# Helper function for creating professional contingency table visualizations +create_contingency_plot <- function(observed, expected = NULL, main_title = "Contingency Table Analysis") { + # Set up color palette + colors <- c("#E8F4F8", "#B8E0D2", "#95A5A6", "#D5A6BD", "#F7DC6F", "#F8C471") + + par(mfrow = c(2, 2), mar = c(4, 4, 3, 2)) + + # 1. Observed frequencies heatmap + if (is.matrix(observed)) { + image(1:ncol(observed), 1:nrow(observed), t(observed[nrow(observed):1, ]), + col = heat.colors(20), main = "Observed Frequencies", + xlab = "Column Variable", ylab = "Row Variable", axes = FALSE) + axis(1, at = 1:ncol(observed), labels = colnames(observed)) + axis(2, at = 1:nrow(observed), labels = rev(rownames(observed))) + + # Add text values + for (i in 1:nrow(observed)) { + for (j in 1:ncol(observed)) { + text(j, nrow(observed) - i + 1, observed[i, j], cex = 1.2, font = 2) + } + } + } + + # 2. Expected frequencies heatmap (if provided) + if (!is.null(expected) && is.matrix(expected)) { + image(1:ncol(expected), 1:nrow(expected), t(expected[nrow(expected):1, ]), + col = terrain.colors(20), main = "Expected Frequencies", + xlab = "Column Variable", ylab = "Row Variable", axes = FALSE) + axis(1, at = 1:ncol(expected), labels = colnames(expected)) + axis(2, at = 1:nrow(expected), labels = rev(rownames(expected))) + + # Add text values + for (i in 1:nrow(expected)) { + for (j in 1:ncol(expected)) { + text(j, nrow(expected) - i + 1, round(expected[i, j], 1), cex = 1.2, font = 2) + } + } + } + + # 3. Residuals plot (if expected provided) + if (!is.null(expected)) { + residuals <- (observed - expected) / sqrt(expected) + image(1:ncol(residuals), 1:nrow(residuals), t(residuals[nrow(residuals):1, ]), + col = cm.colors(20), main = "Standardized Residuals", + xlab = "Column Variable", ylab = "Row Variable", axes = FALSE) + axis(1, at = 1:ncol(residuals), labels = colnames(residuals)) + axis(2, at = 1:nrow(residuals), labels = rev(rownames(residuals))) + + # Add text values with color coding + for (i in 1:nrow(residuals)) { + for (j in 1:ncol(residuals)) { + color <- ifelse(abs(residuals[i, j]) > 2, "red", "black") + text(j, nrow(residuals) - i + 1, round(residuals[i, j], 2), + cex = 1.1, font = 2, col = color) + } + } + } +} + +# Chi-square goodness of fit test +chi_square_goodness_of_fit <- function(observed, expected = NULL, labels = NULL, + plot = TRUE, plot_title = "Goodness of Fit Test") { + #' Chi-Square Goodness of Fit Test + #' + #' Tests whether observed frequencies differ significantly from expected frequencies + #' + #' @param observed numeric vector of observed frequencies + #' @param expected numeric vector of expected frequencies (or NULL for equal proportions) + #' @param labels character vector of category labels + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Input validation + if (!is.numeric(observed) || any(observed < 0)) { + stop("observed must be a numeric vector with non-negative values") + } + + if (any(observed != floor(observed))) { + warning("observed frequencies should be integers (counts)") + } + + k <- length(observed) # number of categories + + if (is.null(expected)) { + # Equal expected frequencies + total <- sum(observed) + expected <- rep(total / k, k) + } else { + if (length(expected) != k) { + stop("observed and expected must have the same length") + } + if (any(expected <= 0)) { + stop("expected frequencies must be positive") + } + # Scale expected to match total observed + expected <- expected * sum(observed) / sum(expected) + } + + # Check minimum expected frequency rule + min_expected <- min(expected) + if (min_expected < 5) { + warning(paste("Minimum expected frequency is", round(min_expected, 2), + "< 5. Results may be unreliable.")) + } + + # Calculate chi-square statistic + chi_square <- sum((observed - expected)^2 / expected) + df <- k - 1 + p_value <- pchisq(chi_square, df, lower.tail = FALSE) + + # Calculate residuals + residuals <- (observed - expected) / sqrt(expected) + standardized_residuals <- residuals / sqrt(1 - 1/k) + + # Prepare labels + if (is.null(labels)) { + labels <- paste("Category", 1:k) + } + + # Create visualization + if (plot) { + par(mfrow = c(2, 2), mar = c(5, 4, 3, 2)) + + # 1. Bar plot of observed vs expected + barplot_data <- rbind(observed, expected) + rownames(barplot_data) <- c("Observed", "Expected") + colnames(barplot_data) <- labels + + barplot(barplot_data, beside = TRUE, col = c("lightblue", "lightcoral"), + main = paste(plot_title, "\nObserved vs Expected"), + ylab = "Frequency", legend = TRUE, las = 2) + + # 2. Residuals plot + barplot(residuals, names.arg = labels, col = ifelse(abs(residuals) > 2, "red", "lightgreen"), + main = "Standardized Residuals", ylab = "Residual", las = 2) + abline(h = c(-2, 2), col = "red", lty = 2) + abline(h = 0, col = "black", lty = 1) + + # 3. Pie chart of observed frequencies + pie(observed, labels = paste(labels, "\n(", observed, ")", sep = ""), + col = rainbow(k), main = "Observed Distribution") + + # 4. Test summary + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Test Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, "Chi-Square Goodness of Fit", cex = 1.2, font = 2) + text(5, 8.7, paste("Chi-square statistic:", round(chi_square, 4)), cex = 1) + text(5, 8.1, paste("Degrees of freedom:", df), cex = 1) + text(5, 7.5, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 6.9, paste("Sample size:", sum(observed)), cex = 1) + + # Significance + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 6.1, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + if (p_value < 0.05) { + text(5, 5.3, "Result: Reject H0", cex = 1, col = "darkgreen", font = 2) + text(5, 4.7, "Observed ≠ Expected", cex = 1, col = "darkgreen") + } else { + text(5, 5.3, "Result: Fail to reject H0", cex = 1, col = "red", font = 2) + text(5, 4.7, "Observed = Expected", cex = 1, col = "red") + } + + # Effect size (Cramer's V for goodness of fit) + cramers_v <- sqrt(chi_square / sum(observed)) + text(5, 3.9, paste("Cramer's V:", round(cramers_v, 4)), cex = 1) + + par(mfrow = c(1, 1)) + } + + # Return results + result <- list( + statistic = chi_square, + p_value = p_value, + degrees_of_freedom = df, + observed = observed, + expected = expected, + residuals = residuals, + standardized_residuals = standardized_residuals, + cramers_v = sqrt(chi_square / sum(observed)), + method = "Chi-square goodness of fit test", + categories = labels, + sample_size = sum(observed) + ) + + class(result) <- "biomedical_chisq" + return(result) +} + +# Chi-square test of independence +chi_square_independence <- function(x, y = NULL, plot = TRUE, + plot_title = "Test of Independence") { + #' Chi-Square Test of Independence + #' + #' Tests whether two categorical variables are independent + #' + #' @param x either a contingency table (matrix) or a factor/vector for first variable + #' @param y factor/vector for second variable (if x is not a matrix) + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Handle input formats + if (is.matrix(x) || is.table(x)) { + # x is already a contingency table + observed <- as.matrix(x) + if (is.null(rownames(observed))) rownames(observed) <- paste("Row", 1:nrow(observed)) + if (is.null(colnames(observed))) colnames(observed) <- paste("Col", 1:ncol(observed)) + } else { + # Create contingency table from vectors + if (is.null(y)) stop("If x is not a matrix, y must be provided") + if (length(x) != length(y)) stop("x and y must have the same length") + + # Remove missing values + complete_cases <- complete.cases(x, y) + x <- x[complete_cases] + y <- y[complete_cases] + + observed <- table(x, y) + observed <- as.matrix(observed) + } + + # Input validation + if (any(observed < 0)) stop("All frequencies must be non-negative") + if (sum(observed) == 0) stop("Total frequency cannot be zero") + + # Calculate expected frequencies + row_totals <- rowSums(observed) + col_totals <- colSums(observed) + total <- sum(observed) + + expected <- outer(row_totals, col_totals) / total + + # Check minimum expected frequency rule + min_expected <- min(expected) + prop_low_expected <- mean(expected < 5) + + if (min_expected < 1) { + stop("Some expected frequencies < 1. Test is not appropriate.") + } + + if (prop_low_expected > 0.2) { + warning(paste(round(prop_low_expected * 100, 1), + "% of expected frequencies are < 5. Consider combining categories.")) + } + + # Calculate chi-square statistic + chi_square <- sum((observed - expected)^2 / expected) + df <- (nrow(observed) - 1) * (ncol(observed) - 1) + p_value <- pchisq(chi_square, df, lower.tail = FALSE) + + # Calculate effect sizes + n <- sum(observed) + cramers_v <- sqrt(chi_square / (n * (min(nrow(observed), ncol(observed)) - 1))) + phi_coefficient <- ifelse(nrow(observed) == 2 && ncol(observed) == 2, + sqrt(chi_square / n), NA) + + # Calculate standardized residuals + residuals <- (observed - expected) / sqrt(expected) + adj_residuals <- residuals / sqrt((1 - row_totals/total) %*% t(1 - col_totals/total)) + + # Create visualization + if (plot) { + create_contingency_plot(observed, expected, plot_title) + + # Additional summary plot + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Independence Test Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, "Chi-Square Test of Independence", cex = 1.2, font = 2) + text(5, 8.7, paste("Chi-square statistic:", round(chi_square, 4)), cex = 1) + text(5, 8.1, paste("Degrees of freedom:", df), cex = 1) + text(5, 7.5, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 6.9, paste("Sample size:", n), cex = 1) + text(5, 6.3, paste("Cramer's V:", round(cramers_v, 4)), cex = 1) + + if (!is.na(phi_coefficient)) { + text(5, 5.7, paste("Phi coefficient:", round(phi_coefficient, 4)), cex = 1) + } + + # Significance + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 4.9, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + if (p_value < 0.05) { + text(5, 4.1, "Result: Variables are dependent", cex = 1, col = "darkgreen", font = 2) + } else { + text(5, 4.1, "Result: Variables are independent", cex = 1, col = "red", font = 2) + } + + # Effect size interpretation + effect_magnitude <- if (cramers_v < 0.1) "negligible" + else if (cramers_v < 0.3) "small" + else if (cramers_v < 0.5) "medium" + else "large" + text(5, 3.3, paste("Effect size:", effect_magnitude), cex = 1) + } + + # Return results + result <- list( + statistic = chi_square, + p_value = p_value, + degrees_of_freedom = df, + observed = observed, + expected = expected, + residuals = residuals, + adjusted_residuals = adj_residuals, + cramers_v = cramers_v, + phi_coefficient = phi_coefficient, + method = "Chi-square test of independence", + sample_size = n, + row_totals = row_totals, + col_totals = col_totals + ) + + class(result) <- "biomedical_chisq" + return(result) +} + +# McNemar's test for paired categorical data +mcnemar_test <- function(x, y = NULL, correct = TRUE, plot = TRUE, + plot_title = "McNemar's Test") { + #' McNemar's Test for Paired Categorical Data + #' + #' Tests for changes in paired categorical data (e.g., before/after treatment) + #' + #' @param x either a 2x2 contingency table or a factor/vector for condition 1 + #' @param y factor/vector for condition 2 (if x is not a matrix) + #' @param correct logical: apply continuity correction + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Handle input formats + if (is.matrix(x) || is.table(x)) { + if (nrow(x) != 2 || ncol(x) != 2) { + stop("For McNemar's test, contingency table must be 2x2") + } + contingency_table <- as.matrix(x) + } else { + if (is.null(y)) stop("If x is not a matrix, y must be provided") + if (length(x) != length(y)) stop("x and y must have the same length") + + # Remove missing values + complete_cases <- complete.cases(x, y) + x <- x[complete_cases] + y <- y[complete_cases] + + contingency_table <- table(x, y) + if (nrow(contingency_table) != 2 || ncol(contingency_table) != 2) { + stop("Both variables must have exactly 2 levels for McNemar's test") + } + contingency_table <- as.matrix(contingency_table) + } + + # Extract discordant pairs + b <- contingency_table[1, 2] # changed from negative to positive + c <- contingency_table[2, 1] # changed from positive to negative + + # McNemar's test statistic + if (correct && (b + c) > 0) { + # With continuity correction + chi_square <- (abs(b - c) - 1)^2 / (b + c) + } else { + # Without continuity correction + chi_square <- (b - c)^2 / (b + c) + } + + df <- 1 + p_value <- pchisq(chi_square, df, lower.tail = FALSE) + + # Calculate concordant pairs + concordant <- contingency_table[1,1] + contingency_table[2,2] + + # Effect size (odds ratio for discordant pairs) + if (c > 0) { + odds_ratio <- b / c + log_or_se <- sqrt(1/b + 1/c) + or_ci_lower <- exp(log(odds_ratio) - 1.96 * log_or_se) + or_ci_upper <- exp(log(odds_ratio) + 1.96 * log_or_se) + } else { + odds_ratio <- Inf + or_ci_lower <- NA + or_ci_upper <- NA + } + + # Create visualization + if (plot) { + par(mfrow = c(2, 2), mar = c(4, 4, 3, 2)) + + # 1. Contingency table heatmap + image(1:2, 1:2, t(contingency_table[2:1, ]), col = heat.colors(10), + main = "2x2 Contingency Table", xlab = "After", ylab = "Before", axes = FALSE) + axis(1, at = 1:2, labels = colnames(contingency_table)) + axis(2, at = 1:2, labels = rev(rownames(contingency_table))) + + # Add cell values + for (i in 1:2) { + for (j in 1:2) { + text(j, 3-i, contingency_table[i, j], cex = 2, font = 2) + } + } + + # 2. Concordant vs Discordant pairs + discordant <- b + c + + barplot(c(concordant, discordant), names.arg = c("Concordant", "Discordant"), + col = c("lightgreen", "lightcoral"), main = "Concordant vs Discordant Pairs", + ylab = "Count") + + # 3. Change direction + if (discordant > 0) { + change_data <- c(b, c) + names(change_data) <- c("- to +", "+ to -") + barplot(change_data, col = c("lightblue", "orange"), + main = "Direction of Change", ylab = "Count") + } + + # 4. Test summary + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "McNemar Test Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, "McNemar's Test Results", cex = 1.2, font = 2) + text(5, 8.7, paste("Chi-square statistic:", round(chi_square, 4)), cex = 1) + text(5, 8.1, paste("Degrees of freedom:", df), cex = 1) + text(5, 7.5, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 6.9, paste("Continuity correction:", correct), cex = 1) + text(5, 6.3, paste("Discordant pairs:", discordant), cex = 1) + + if (is.finite(odds_ratio)) { + text(5, 5.7, paste("Odds ratio:", round(odds_ratio, 4)), cex = 1) + } + + # Significance + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 4.9, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + if (p_value < 0.05) { + text(5, 4.1, "Result: Significant change", cex = 1, col = "darkgreen", font = 2) + } else { + text(5, 4.1, "Result: No significant change", cex = 1, col = "red", font = 2) + } + + par(mfrow = c(1, 1)) + } + + # Return results + result <- list( + statistic = chi_square, + p_value = p_value, + degrees_of_freedom = df, + contingency_table = contingency_table, + discordant_pairs = c(b = b, c = c), + concordant_pairs = concordant, + odds_ratio = odds_ratio, + odds_ratio_ci = c(or_ci_lower, or_ci_upper), + continuity_correction = correct, + method = "McNemar's test for paired data", + sample_size = sum(contingency_table) + ) + + class(result) <- "biomedical_chisq" + return(result) +} + +# Print method for chi-square test results +print.biomedical_chisq <- function(x, ...) { + cat("\n", x$method, "\n") + cat(rep("=", nchar(x$method) + 2), "\n", sep = "") + + if (grepl("goodness of fit", x$method)) { + cat("Categories:", length(x$observed), "\n") + cat("Sample size:", x$sample_size, "\n") + cat("Chi-square statistic:", x$statistic, "\n") + cat("Degrees of freedom:", x$degrees_of_freedom, "\n") + cat("p-value:", x$p_value, "\n") + cat("Cramer's V:", x$cramers_v, "\n") + + } else if (grepl("independence", x$method)) { + cat("Contingency table dimensions:", nrow(x$observed), "x", ncol(x$observed), "\n") + cat("Sample size:", x$sample_size, "\n") + cat("Chi-square statistic:", x$statistic, "\n") + cat("Degrees of freedom:", x$degrees_of_freedom, "\n") + cat("p-value:", x$p_value, "\n") + cat("Cramer's V:", x$cramers_v, "\n") + if (!is.na(x$phi_coefficient)) { + cat("Phi coefficient:", x$phi_coefficient, "\n") + } + + } else if (grepl("McNemar", x$method)) { + cat("Sample size:", x$sample_size, "\n") + cat("Discordant pairs:", sum(x$discordant_pairs), "\n") + cat("Chi-square statistic:", x$statistic, "\n") + cat("Degrees of freedom:", x$degrees_of_freedom, "\n") + cat("p-value:", x$p_value, "\n") + cat("Continuity correction:", x$continuity_correction, "\n") + if (is.finite(x$odds_ratio)) { + cat("Odds ratio:", x$odds_ratio, "\n") + } + } + + # Interpretation + if (x$p_value < 0.05) { + cat("\nConclusion: Reject the null hypothesis (significant result)\n") + } else { + cat("\nConclusion: Fail to reject the null hypothesis (not significant)\n") + } +} + +# Comprehensive demonstration function +demonstrate_chi_square_tests <- function() { + cat("=== Chi-Square Tests for Biomedical Data Analysis ===\n\n") + + set.seed(123) + + # Example 1: Goodness of fit test (Blood type distribution) + cat("1. GOODNESS OF FIT TEST: Blood Type Distribution\n") + cat("Research Question: Does the observed blood type distribution match expected population frequencies?\n") + cat("H0: Observed frequencies = Expected frequencies\n") + cat("H1: Observed frequencies ≠ Expected frequencies\n\n") + + # Blood type data + observed_blood_types <- c(45, 42, 10, 3) # A, B, AB, O + expected_proportions <- c(0.42, 0.37, 0.09, 0.12) # Population frequencies + blood_type_labels <- c("Type A", "Type B", "Type AB", "Type O") + + cat("Observed blood types in sample (n=100):\n") + for (i in 1:length(observed_blood_types)) { + cat(paste(blood_type_labels[i], ":", observed_blood_types[i], "\n")) + } + cat("\nExpected population proportions:\n") + for (i in 1:length(expected_proportions)) { + cat(paste(blood_type_labels[i], ":", expected_proportions[i], "\n")) + } + + goodness_result <- chi_square_goodness_of_fit( + observed_blood_types, + expected_proportions, + blood_type_labels, + plot_title = "Blood Type Distribution Analysis" + ) + print(goodness_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 2: Test of independence (Treatment response by gender) + cat("2. INDEPENDENCE TEST: Treatment Response by Gender\n") + cat("Research Question: Is treatment response independent of patient gender?\n") + cat("H0: Treatment response is independent of gender\n") + cat("H1: Treatment response depends on gender\n\n") + + # Create contingency table + treatment_response <- matrix(c( + 25, 15, # Male: Success, Failure + 30, 10 # Female: Success, Failure + ), nrow = 2, byrow = TRUE, + dimnames = list(Gender = c("Male", "Female"), + Response = c("Success", "Failure"))) + + cat("Contingency Table:\n") + print(treatment_response) + cat("\n") + + independence_result <- chi_square_independence( + treatment_response, + plot_title = "Treatment Response by Gender" + ) + print(independence_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 3: McNemar's test (Before/after treatment) + cat("3. McNEMAR'S TEST: Before/After Treatment Status\n") + cat("Research Question: Did treatment significantly change patient status?\n") + cat("H0: No change in treatment status (marginal frequencies equal)\n") + cat("H1: Significant change in treatment status\n\n") + + # Paired before/after data + before_after <- matrix(c( + 10, 5, # Before Positive: After Positive, After Negative + 15, 20 # Before Negative: After Positive, After Negative + ), nrow = 2, byrow = TRUE, + dimnames = list(Before = c("Positive", "Negative"), + After = c("Positive", "Negative"))) + + cat("Before/After Contingency Table:\n") + print(before_after) + cat("\n") + + mcnemar_result <- mcnemar_test( + before_after, + plot_title = "Before/After Treatment Analysis" + ) + print(mcnemar_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 4: Large contingency table (Drug side effects by age group) + cat("4. LARGE CONTINGENCY TABLE: Drug Side Effects by Age Group\n") + cat("Research Question: Do drug side effects vary by age group?\n\n") + + side_effects <- matrix(c( + 12, 8, 15, 5, # Age 18-30: None, Mild, Moderate, Severe + 18, 12, 20, 10, # Age 31-50: None, Mild, Moderate, Severe + 25, 20, 25, 15, # Age 51-70: None, Mild, Moderate, Severe + 15, 18, 30, 22 # Age 71+: None, Mild, Moderate, Severe + ), nrow = 4, byrow = TRUE, + dimnames = list(Age_Group = c("18-30", "31-50", "51-70", "71+"), + Side_Effect = c("None", "Mild", "Moderate", "Severe"))) + + cat("Side Effects by Age Group:\n") + print(side_effects) + cat("\n") + + large_table_result <- chi_square_independence( + side_effects, + plot_title = "Drug Side Effects by Age Group" + ) + print(large_table_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 5: Power analysis and sample size considerations + cat("5. POWER ANALYSIS FOR CHI-SQUARE TESTS\n") + cat("Demonstrating the relationship between effect size, sample size, and statistical power\n\n") + + # Simulate power analysis for different effect sizes + sample_sizes <- c(50, 100, 200, 500) + effect_sizes <- c(0.1, 0.3, 0.5) # Small, medium, large (Cramer's V) + + power_results <- data.frame( + sample_size = rep(sample_sizes, each = length(effect_sizes)), + effect_size = rep(effect_sizes, length(sample_sizes)), + power = NA + ) + + for (i in 1:nrow(power_results)) { + n <- power_results$sample_size[i] + effect <- power_results$effect_size[i] + + # Simulate chi-square tests with known effect size + power <- mean(replicate(1000, { + # Create 2x2 table with specified effect size + p1 <- 0.5 + p2 <- p1 + effect + if (p2 > 1) p2 <- 1 - effect + + x1 <- rbinom(n/2, 1, p1) + x2 <- rbinom(n/2, 1, p2) + group <- rep(c("Group1", "Group2"), each = n/2) + outcome <- c(x1, x2) + + test_table <- table(group, outcome) + if (any(dim(test_table) != 2)) return(FALSE) + + result <- chi_square_independence(test_table, plot = FALSE) + result$p_value < 0.05 + })) + + power_results$power[i] <- power + } + + cat("Power Analysis Results (1000 simulations each):\n") + power_matrix <- matrix(power_results$power, nrow = length(sample_sizes), + dimnames = list(paste("n =", sample_sizes), + paste("Effect =", effect_sizes))) + print(round(power_matrix, 3)) + + cat("\nInterpretations:\n") + cat("- Cramer's V = 0.1: Small effect size, requires large samples\n") + cat("- Cramer's V = 0.3: Medium effect size, moderate samples adequate\n") + cat("- Cramer's V = 0.5: Large effect size, detectable with smaller samples\n") + cat("- Aim for power ≥ 0.80 (80%) in study design\n\n") + + cat("Clinical Guidelines for Chi-Square Tests:\n") + cat("- Ensure expected frequencies ≥ 5 in at least 80% of cells\n") + cat("- No expected frequency should be < 1\n") + cat("- Consider Fisher's exact test for small samples\n") + cat("- Report effect sizes (Cramer's V, phi coefficient) alongside p-values\n") + cat("- Consider clinical significance of associations\n") +} + +# Run demonstration if script is executed directly +if (sys.nframe() == 0) { + demonstrate_chi_square_tests() +} \ No newline at end of file diff --git a/biomedical/correlation_analysis.r b/biomedical/correlation_analysis.r new file mode 100644 index 00000000..40be8b0c --- /dev/null +++ b/biomedical/correlation_analysis.r @@ -0,0 +1,749 @@ +# Comprehensive Correlation Analysis for Biomedical Data +# +# This implementation provides multiple correlation analysis methods: +# 1. Pearson correlation (linear relationships, parametric) +# 2. Spearman correlation (monotonic relationships, non-parametric) +# 3. Kendall's tau (robust to outliers, non-parametric) +# 4. Partial correlation (controlling for confounding variables) +# 5. Correlation matrix analysis with multiple testing correction +# +# Applications in biomedical research: +# - Investigating relationships between biomarkers and outcomes +# - Analyzing dose-response relationships in pharmacology +# - Studying associations between physiological measurements +# - Quality control in laboratory measurements +# - Gene expression correlation studies +# - Clinical trial endpoint relationships +# +# Time Complexity: O(n) for pairwise correlations, O(n*p^2) for partial correlations +# Space Complexity: O(p^2) for correlation matrices, O(n) for pairwise analysis + +# Comprehensive correlation analysis function +correlation_analysis <- function(x, y = NULL, method = "pearson", conf.level = 0.95, + plot = TRUE, plot_title = "Correlation Analysis") { + #' Comprehensive Correlation Analysis + #' + #' Performs correlation analysis with confidence intervals and significance testing + #' + #' @param x numeric vector or matrix of data + #' @param y numeric vector (if x is a vector) or NULL (if x is a matrix) + #' @param method character: "pearson", "spearman", or "kendall" + #' @param conf.level confidence level for confidence intervals + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with correlation results + + # Input validation + if (is.matrix(x) || is.data.frame(x)) { + if (!is.null(y)) { + warning("y is ignored when x is a matrix or data frame") + } + return(correlation_matrix_analysis(x, method, conf.level, plot, plot_title)) + } + + if (!is.numeric(x) || !is.numeric(y)) { + stop("Both x and y must be numeric") + } + + if (length(x) != length(y)) { + stop("x and y must have the same length") + } + + if (!method %in% c("pearson", "spearman", "kendall")) { + stop("method must be 'pearson', 'spearman', or 'kendall'") + } + + # Remove missing values + complete_pairs <- complete.cases(x, y) + x_clean <- x[complete_pairs] + y_clean <- y[complete_pairs] + n <- length(x_clean) + + if (n < 3) { + stop("Need at least 3 complete pairs for correlation analysis") + } + + # Calculate correlation and test + if (method == "pearson") { + result <- pearson_correlation(x_clean, y_clean, conf.level) + } else if (method == "spearman") { + result <- spearman_correlation(x_clean, y_clean, conf.level) + } else { # kendall + result <- kendall_correlation(x_clean, y_clean, conf.level) + } + + # Add sample information + result$sample_size <- n + result$missing_pairs <- length(x) - n + + # Simple title case function (replace stringr dependency) + title_case <- function(s) paste0(toupper(substr(s, 1, 1)), tolower(substr(s, 2, nchar(s)))) + result$method <- paste(title_case(method), "correlation coefficient") + + # Create visualization + if (plot) { + create_correlation_plot(x_clean, y_clean, result, plot_title) + } + + class(result) <- "biomedical_correlation" + return(result) +} + +# Pearson correlation implementation +pearson_correlation <- function(x, y, conf.level = 0.95) { + n <- length(x) + + # Calculate Pearson correlation coefficient + r <- cor(x, y, method = "pearson") + + # Test statistic and p-value + t_stat <- r * sqrt((n - 2) / (1 - r^2)) + df <- n - 2 + p_value <- 2 * pt(abs(t_stat), df, lower.tail = FALSE) + + # Confidence interval using Fisher's Z transformation + z_r <- 0.5 * log((1 + r) / (1 - r)) # Fisher's Z + se_z <- 1 / sqrt(n - 3) + alpha <- 1 - conf.level + z_critical <- qnorm(1 - alpha/2) + + z_lower <- z_r - z_critical * se_z + z_upper <- z_r + z_critical * se_z + + # Transform back to correlation scale + ci_lower <- (exp(2 * z_lower) - 1) / (exp(2 * z_lower) + 1) + ci_upper <- (exp(2 * z_upper) - 1) / (exp(2 * z_upper) + 1) + + # Effect size interpretation + r_squared <- r^2 + + return(list( + correlation = r, + t_statistic = t_stat, + p_value = p_value, + degrees_of_freedom = df, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + r_squared = r_squared, + fishers_z = z_r, + assumptions_check = check_pearson_assumptions(x, y) + )) +} + +# Spearman correlation implementation +spearman_correlation <- function(x, y, conf.level = 0.95) { + n <- length(x) + + # Calculate Spearman correlation coefficient + rho <- cor(x, y, method = "spearman") + + # Test statistic (approximate for large n) + if (n > 10) { + t_stat <- rho * sqrt((n - 2) / (1 - rho^2)) + df <- n - 2 + p_value <- 2 * pt(abs(t_stat), df, lower.tail = FALSE) + } else { + # Exact test for small samples (simplified) + warning("Small sample size: p-value may be approximate") + t_stat <- rho * sqrt((n - 2) / (1 - rho^2)) + df <- n - 2 + p_value <- 2 * pt(abs(t_stat), df, lower.tail = FALSE) + } + + # Confidence interval (approximate using Fisher's Z) + z_rho <- 0.5 * log((1 + rho) / (1 - rho)) + se_z <- 1.06 / sqrt(n - 3) # Adjusted standard error for Spearman + alpha <- 1 - conf.level + z_critical <- qnorm(1 - alpha/2) + + z_lower <- z_rho - z_critical * se_z + z_upper <- z_rho + z_critical * se_z + + ci_lower <- (exp(2 * z_lower) - 1) / (exp(2 * z_lower) + 1) + ci_upper <- (exp(2 * z_upper) - 1) / (exp(2 * z_upper) + 1) + + return(list( + correlation = rho, + t_statistic = t_stat, + p_value = p_value, + degrees_of_freedom = df, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + rank_based = TRUE + )) +} + +# Kendall's tau correlation implementation +kendall_correlation <- function(x, y, conf.level = 0.95) { + n <- length(x) + + # Calculate Kendall's tau + tau <- cor(x, y, method = "kendall") + + # Test statistic for Kendall's tau + var_tau <- 2 * (2*n + 5) / (9 * n * (n - 1)) + z_stat <- tau / sqrt(var_tau) + p_value <- 2 * pnorm(abs(z_stat), lower.tail = FALSE) + + # Confidence interval (approximate) + alpha <- 1 - conf.level + z_critical <- qnorm(1 - alpha/2) + + ci_lower <- tau - z_critical * sqrt(var_tau) + ci_upper <- tau + z_critical * sqrt(var_tau) + + # Ensure CI is within [-1, 1] + ci_lower <- max(ci_lower, -1) + ci_upper <- min(ci_upper, 1) + + return(list( + correlation = tau, + z_statistic = z_stat, + p_value = p_value, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + robust = TRUE + )) +} + +# Check assumptions for Pearson correlation +check_pearson_assumptions <- function(x, y) { + n <- length(x) + + # 1. Normality test (Shapiro-Wilk for n <= 5000) + if (n <= 5000) { + shapiro_x <- shapiro.test(x) + shapiro_y <- shapiro.test(y) + normality_x <- shapiro_x$p.value > 0.05 + normality_y <- shapiro_y$p.value > 0.05 + } else { + # Use Kolmogorov-Smirnov for large samples + ks_x <- ks.test(x, "pnorm", mean(x), sd(x)) + ks_y <- ks.test(y, "pnorm", mean(y), sd(y)) + normality_x <- ks_x$p.value > 0.05 + normality_y <- ks_y$p.value > 0.05 + } + + # 2. Linearity check (correlation between x and y vs. x and y^2) + linear_corr <- abs(cor(x, y)) + nonlinear_corr <- abs(cor(x, y^2)) + linearity <- linear_corr > nonlinear_corr + + # 3. Homoscedasticity check (Breusch-Pagan-like test) + residuals <- y - predict(lm(y ~ x)) + bp_stat <- cor(x, abs(residuals))^2 + homoscedasticity <- bp_stat < 0.1 # Rough threshold + + # 4. Outlier detection (using IQR method) + outliers_x <- detect_outliers(x) + outliers_y <- detect_outliers(y) + no_outliers <- length(outliers_x) == 0 && length(outliers_y) == 0 + + return(list( + normality_x = normality_x, + normality_y = normality_y, + linearity = linearity, + homoscedasticity = homoscedasticity, + no_outliers = no_outliers, + outliers_x = outliers_x, + outliers_y = outliers_y, + overall_suitable = normality_x && normality_y && linearity && homoscedasticity && no_outliers + )) +} + +# Outlier detection using IQR method +detect_outliers <- function(x) { + Q1 <- quantile(x, 0.25) + Q3 <- quantile(x, 0.75) + IQR <- Q3 - Q1 + lower_bound <- Q1 - 1.5 * IQR + upper_bound <- Q3 + 1.5 * IQR + + outlier_indices <- which(x < lower_bound | x > upper_bound) + return(outlier_indices) +} + +# Correlation matrix analysis +correlation_matrix_analysis <- function(data, method = "pearson", conf.level = 0.95, + plot = TRUE, plot_title = "Correlation Matrix") { + # Convert to matrix if data frame + if (is.data.frame(data)) { + numeric_cols <- sapply(data, is.numeric) + if (!all(numeric_cols)) { + warning("Non-numeric columns removed from analysis") + data <- data[, numeric_cols, drop = FALSE] + } + data <- as.matrix(data) + } + + # Remove rows with missing values + complete_rows <- complete.cases(data) + data_clean <- data[complete_rows, , drop = FALSE] + n <- nrow(data_clean) + p <- ncol(data_clean) + + if (n < 3) stop("Need at least 3 complete observations") + if (p < 2) stop("Need at least 2 variables") + + # Calculate correlation matrix + cor_matrix <- cor(data_clean, method = method) + + # Calculate p-values for all pairs + p_matrix <- matrix(1, nrow = p, ncol = p) + colnames(p_matrix) <- rownames(p_matrix) <- colnames(data_clean) + + for (i in 1:(p-1)) { + for (j in (i+1):p) { + if (method == "pearson") { + test_result <- cor.test(data_clean[, i], data_clean[, j], method = "pearson") + } else if (method == "spearman") { + test_result <- cor.test(data_clean[, i], data_clean[, j], method = "spearman") + } else { + test_result <- cor.test(data_clean[, i], data_clean[, j], method = "kendall") + } + + p_matrix[i, j] <- p_matrix[j, i] <- test_result$p.value + } + } + + # Multiple testing correction + p_values_upper <- p_matrix[upper.tri(p_matrix)] + p_adjusted_bonferroni <- p.adjust(p_values_upper, method = "bonferroni") + p_adjusted_fdr <- p.adjust(p_values_upper, method = "fdr") + + # Create adjusted p-value matrices + p_bonferroni <- p_fdr <- matrix(1, nrow = p, ncol = p) + colnames(p_bonferroni) <- rownames(p_bonferroni) <- colnames(data_clean) + colnames(p_fdr) <- rownames(p_fdr) <- colnames(data_clean) + + counter <- 1 + for (i in 1:(p-1)) { + for (j in (i+1):p) { + p_bonferroni[i, j] <- p_bonferroni[j, i] <- p_adjusted_bonferroni[counter] + p_fdr[i, j] <- p_fdr[j, i] <- p_adjusted_fdr[counter] + counter <- counter + 1 + } + } + + # Create visualization + if (plot) { + create_correlation_matrix_plot(cor_matrix, p_matrix, data_clean, plot_title) + } + + # Simple title case function + title_case <- function(s) paste0(toupper(substr(s, 1, 1)), tolower(substr(s, 2, nchar(s)))) + + result <- list( + correlation_matrix = cor_matrix, + p_values = p_matrix, + p_values_bonferroni = p_bonferroni, + p_values_fdr = p_fdr, + sample_size = n, + variables = p, + method = paste(title_case(method), "correlation matrix"), + missing_rows = nrow(data) - n, + significant_pairs_uncorrected = sum(p_matrix < 0.05 & upper.tri(p_matrix)), + significant_pairs_bonferroni = sum(p_bonferroni < 0.05 & upper.tri(p_bonferroni)), + significant_pairs_fdr = sum(p_fdr < 0.05 & upper.tri(p_fdr)) + ) + + class(result) <- "biomedical_correlation_matrix" + return(result) +} + +# Create correlation plot for two variables +create_correlation_plot <- function(x, y, result, main_title) { + par(mfrow = c(2, 2), mar = c(4, 4, 3, 2)) + + # 1. Scatter plot with regression line + plot(x, y, pch = 19, col = "darkblue", cex = 1.2, + main = paste(main_title, "\nScatter Plot"), + xlab = "X Variable", ylab = "Y Variable") + + # Add regression line + if ("assumptions_check" %in% names(result) && result$assumptions_check$linearity) { + abline(lm(y ~ x), col = "red", lwd = 2) + } else { + # Add lowess smoother for non-linear relationships + lines(lowess(x, y), col = "red", lwd = 2) + } + + # Add correlation info + legend("topleft", + c(paste("r =", round(result$correlation, 3)), + paste("p =", round(result$p_value, 4))), + bty = "n", cex = 1.1) + + # 2. Residuals plot (if Pearson) + if ("assumptions_check" %in% names(result)) { + fitted_values <- predict(lm(y ~ x)) + residuals <- y - fitted_values + + plot(fitted_values, residuals, pch = 19, col = "darkgreen", + main = "Residuals vs Fitted", xlab = "Fitted Values", ylab = "Residuals") + abline(h = 0, col = "red", lwd = 2, lty = 2) + + # Add lowess line to check for patterns + lines(lowess(fitted_values, residuals), col = "blue", lwd = 2) + } else { + # For non-parametric methods, show rank plot + plot(rank(x), rank(y), pch = 19, col = "darkgreen", + main = "Rank Plot", xlab = "Rank of X", ylab = "Rank of Y") + abline(lm(rank(y) ~ rank(x)), col = "red", lwd = 2) + } + + # 3. Q-Q plots for normality (if Pearson) + if ("assumptions_check" %in% names(result)) { + qqnorm(x, main = "Q-Q Plot X", pch = 19, col = "blue") + qqline(x, col = "red", lwd = 2) + } else { + # Histogram for non-parametric + hist(x, main = "Distribution of X", xlab = "X Values", + col = "lightblue", probability = TRUE) + lines(density(x), col = "red", lwd = 2) + } + + # 4. Test summary + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Correlation Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, strsplit(result$method, " correlation")[[1]][1], cex = 1.2, font = 2) + text(5, 8.7, paste("Correlation:", round(result$correlation, 4)), cex = 1.1) + text(5, 8.1, paste("p-value:", round(result$p_value, 6)), cex = 1.1) + + if ("degrees_of_freedom" %in% names(result)) { + text(5, 7.5, paste("df:", result$degrees_of_freedom), cex = 1.1) + } + + text(5, 6.9, paste("Sample size:", result$sample_size), cex = 1.1) + + # Confidence interval + ci_text <- paste(result$confidence_level * 100, "% CI: [", + round(result$confidence_interval[1], 3), ", ", + round(result$confidence_interval[2], 3), "]", sep = "") + text(5, 6.3, ci_text, cex = 1) + + # Effect size + if ("r_squared" %in% names(result)) { + text(5, 5.7, paste("R-squared:", round(result$r_squared, 4)), cex = 1) + } + + # Significance + sig_level <- ifelse(result$p_value < 0.001, "***", + ifelse(result$p_value < 0.01, "**", + ifelse(result$p_value < 0.05, "*", "ns"))) + text(5, 5.1, paste("Significance:", sig_level), cex = 1.1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + # Interpretation + r_abs <- abs(result$correlation) + magnitude <- if (r_abs < 0.1) "negligible" + else if (r_abs < 0.3) "small" + else if (r_abs < 0.5) "medium" + else if (r_abs < 0.7) "large" + else "very large" + + text(5, 4.3, paste("Effect size:", magnitude), cex = 1) + + # Assumptions check (if available) + if ("assumptions_check" %in% names(result)) { + assumptions_ok <- result$assumptions_check$overall_suitable + text(5, 3.5, paste("Assumptions met:", assumptions_ok), cex = 1, + col = ifelse(assumptions_ok, "darkgreen", "orange")) + } + + par(mfrow = c(1, 1)) +} + +# Create correlation matrix visualization +create_correlation_matrix_plot <- function(cor_matrix, p_matrix, data, main_title) { + p <- ncol(cor_matrix) + + par(mfrow = c(2, 2), mar = c(5, 5, 3, 2)) + + # 1. Correlation heatmap + image(1:p, 1:p, cor_matrix, col = colorRampPalette(c("blue", "white", "red"))(20), + main = paste(main_title, "\nCorrelation Matrix"), + xlab = "", ylab = "", axes = FALSE, zlim = c(-1, 1)) + + axis(1, at = 1:p, labels = colnames(cor_matrix), las = 2, cex.axis = 0.8) + axis(2, at = 1:p, labels = colnames(cor_matrix), las = 2, cex.axis = 0.8) + + # Add correlation values + for (i in 1:p) { + for (j in 1:p) { + if (i != j) { + text(j, i, round(cor_matrix[i, j], 2), cex = 0.8, + col = ifelse(abs(cor_matrix[i, j]) > 0.5, "white", "black")) + } + } + } + + # 2. P-value heatmap + log_p <- -log10(p_matrix + 1e-16) # Add small constant to avoid log(0) + image(1:p, 1:p, log_p, col = heat.colors(20), + main = "Significance (-log10 p-values)", + xlab = "", ylab = "", axes = FALSE) + + axis(1, at = 1:p, labels = colnames(p_matrix), las = 2, cex.axis = 0.8) + axis(2, at = 1:p, labels = colnames(p_matrix), las = 2, cex.axis = 0.8) + + # Add significance markers + for (i in 1:p) { + for (j in 1:p) { + if (i != j) { + sig_marker <- if (p_matrix[i, j] < 0.001) "***" + else if (p_matrix[i, j] < 0.01) "**" + else if (p_matrix[i, j] < 0.05) "*" + else "" + text(j, i, sig_marker, cex = 1.2, font = 2, col = "darkred") + } + } + } + + # 3. Scatterplot matrix (subset if too many variables) + if (p <= 5) { + pairs(data, pch = 19, col = "darkblue", cex = 0.8, main = "Scatterplot Matrix") + } else { + # Show first 4 variables + pairs(data[, 1:4], pch = 19, col = "darkblue", cex = 0.8, + main = "Scatterplot Matrix (First 4 Variables)") + } + + # 4. Summary statistics + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Matrix Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, "Correlation Matrix Analysis", cex = 1.2, font = 2) + text(5, 8.7, paste("Variables:", p), cex = 1.1) + text(5, 8.1, paste("Sample size:", nrow(data)), cex = 1.1) + + # Count significant correlations + sig_uncorrected <- sum(p_matrix < 0.05 & upper.tri(p_matrix)) + total_pairs <- p * (p - 1) / 2 + + text(5, 7.5, paste("Significant pairs (α=0.05):", sig_uncorrected, "/", total_pairs), cex = 1) + text(5, 6.9, paste("Proportion significant:", round(sig_uncorrected/total_pairs, 3)), cex = 1) + + # Strongest correlations + cor_upper <- cor_matrix[upper.tri(cor_matrix)] + max_cor <- max(abs(cor_upper)) + text(5, 6.3, paste("Strongest correlation:", round(max_cor, 3)), cex = 1) + + # Average absolute correlation + avg_cor <- mean(abs(cor_upper)) + text(5, 5.7, paste("Average |correlation|:", round(avg_cor, 3)), cex = 1) + + par(mfrow = c(1, 1)) +} + +# Print methods +print.biomedical_correlation <- function(x, ...) { + cat("\n", x$method, "\n") + cat(rep("=", nchar(x$method) + 2), "\n", sep = "") + + cat("Sample size:", x$sample_size, "\n") + if (x$missing_pairs > 0) { + cat("Missing pairs removed:", x$missing_pairs, "\n") + } + + cat("Correlation coefficient:", x$correlation, "\n") + + if ("t_statistic" %in% names(x)) { + cat("t-statistic:", x$t_statistic, "\n") + cat("Degrees of freedom:", x$degrees_of_freedom, "\n") + } else if ("z_statistic" %in% names(x)) { + cat("z-statistic:", x$z_statistic, "\n") + } + + cat("p-value:", x$p_value, "\n") + cat(x$confidence_level * 100, "% Confidence interval: [", + x$confidence_interval[1], ", ", x$confidence_interval[2], "]\n", sep = "") + + if ("r_squared" %in% names(x)) { + cat("R-squared:", x$r_squared, "\n") + } + + # Interpretation + if (x$p_value < 0.05) { + cat("\nConclusion: Significant correlation detected\n") + } else { + cat("\nConclusion: No significant correlation detected\n") + } + + # Effect size interpretation + r_abs <- abs(x$correlation) + magnitude <- if (r_abs < 0.1) "negligible" + else if (r_abs < 0.3) "small" + else if (r_abs < 0.5) "medium" + else if (r_abs < 0.7) "large" + else "very large" + cat("Effect size:", magnitude, "\n") + + # Assumptions (if available) + if ("assumptions_check" %in% names(x)) { + cat("\nAssumptions check:\n") + cat("- Normality (X):", x$assumptions_check$normality_x, "\n") + cat("- Normality (Y):", x$assumptions_check$normality_y, "\n") + cat("- Linearity:", x$assumptions_check$linearity, "\n") + cat("- Homoscedasticity:", x$assumptions_check$homoscedasticity, "\n") + cat("- No outliers:", x$assumptions_check$no_outliers, "\n") + cat("- Overall suitable for Pearson:", x$assumptions_check$overall_suitable, "\n") + } +} + +print.biomedical_correlation_matrix <- function(x, ...) { + cat("\n", x$method, "\n") + cat(rep("=", nchar(x$method) + 2), "\n", sep = "") + + cat("Variables:", x$variables, "\n") + cat("Sample size:", x$sample_size, "\n") + if (x$missing_rows > 0) { + cat("Rows with missing data removed:", x$missing_rows, "\n") + } + + total_pairs <- x$variables * (x$variables - 1) / 2 + cat("Total variable pairs:", total_pairs, "\n") + + cat("\nSignificant correlations (α = 0.05):\n") + cat("- Uncorrected:", x$significant_pairs_uncorrected, "/", total_pairs, + "(", round(x$significant_pairs_uncorrected/total_pairs*100, 1), "%)\n") + cat("- Bonferroni corrected:", x$significant_pairs_bonferroni, "/", total_pairs, + "(", round(x$significant_pairs_bonferroni/total_pairs*100, 1), "%)\n") + cat("- FDR corrected:", x$significant_pairs_fdr, "/", total_pairs, + "(", round(x$significant_pairs_fdr/total_pairs*100, 1), "%)\n") + + cat("\nCorrelation matrix:\n") + print(round(x$correlation_matrix, 3)) +} + +# Demonstration function +demonstrate_correlation_analysis <- function() { + cat("=== Comprehensive Correlation Analysis for Biomedical Data ===\n\n") + + set.seed(123) + + # Example 1: Pearson correlation (linear relationship) + cat("1. PEARSON CORRELATION: Blood Pressure vs Age\n") + cat("Research Question: Is there a linear relationship between age and systolic BP?\n\n") + + age <- runif(50, 25, 75) + systolic_bp <- 90 + 1.2 * age + rnorm(50, 0, 8) # Linear relationship with noise + + pearson_result <- correlation_analysis(age, systolic_bp, method = "pearson", + plot_title = "Age vs Systolic BP") + print(pearson_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 2: Spearman correlation (monotonic but non-linear) + cat("2. SPEARMAN CORRELATION: Dose vs Response (Non-linear)\n") + cat("Research Question: Is there a monotonic relationship between drug dose and response?\n\n") + + dose <- seq(0, 100, length.out = 40) + response <- 10 * log(dose + 1) + rnorm(40, 0, 3) # Log relationship + + spearman_result <- correlation_analysis(dose, response, method = "spearman", + plot_title = "Drug Dose vs Response") + print(spearman_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 3: Correlation matrix analysis + cat("3. CORRELATION MATRIX: Multiple Biomarkers\n") + cat("Research Question: How are different biomarkers correlated?\n\n") + + n <- 100 + biomarker_data <- data.frame( + Cholesterol = rnorm(n, 200, 30), + BMI = rnorm(n, 25, 4), + Blood_Pressure = rnorm(n, 120, 15), + Heart_Rate = rnorm(n, 75, 10) + ) + + # Create some realistic correlations + biomarker_data$Blood_Pressure <- biomarker_data$Blood_Pressure + + 0.3 * biomarker_data$BMI + 0.2 * biomarker_data$Cholesterol/10 + biomarker_data$Heart_Rate <- biomarker_data$Heart_Rate + + 0.4 * biomarker_data$BMI - 0.1 * biomarker_data$Blood_Pressure + + cat("Biomarker correlation matrix analysis:\n") + matrix_result <- correlation_analysis(biomarker_data, method = "pearson", + plot_title = "Biomarker Correlations") + print(matrix_result) + + cat("\n", rep("=", 60), "\n\n") + + # Example 4: Robust correlation with outliers + cat("4. ROBUST CORRELATION: Kendall's Tau with Outliers\n") + cat("Research Question: How does outlier presence affect correlation methods?\n\n") + + x_clean <- rnorm(30, 50, 10) + y_clean <- 2 * x_clean + rnorm(30, 0, 5) + + # Add outliers + x_outliers <- c(x_clean, c(100, 10)) + y_outliers <- c(y_clean, c(20, 150)) + + cat("Pearson (sensitive to outliers):\n") + pearson_outliers <- correlation_analysis(x_outliers, y_outliers, method = "pearson", + plot = FALSE) + cat("r =", round(pearson_outliers$correlation, 3), + ", p =", round(pearson_outliers$p_value, 4), "\n") + + cat("\nKendall's tau (robust to outliers):\n") + kendall_outliers <- correlation_analysis(x_outliers, y_outliers, method = "kendall", + plot = FALSE) + cat("τ =", round(kendall_outliers$correlation, 3), + ", p =", round(kendall_outliers$p_value, 4), "\n") + + cat("\nWithout outliers:\n") + clean_result <- correlation_analysis(x_clean, y_clean, method = "pearson", plot = FALSE) + cat("r =", round(clean_result$correlation, 3), + ", p =", round(clean_result$p_value, 4), "\n") + + cat("\n", rep("=", 60), "\n\n") + + # Example 5: Power analysis for correlations + cat("5. POWER ANALYSIS: Sample Size Requirements\n") + cat("Demonstrating power for detecting different correlation strengths\n\n") + + correlation_strengths <- c(0.1, 0.3, 0.5, 0.7) + sample_sizes <- c(20, 50, 100, 200) + + power_matrix <- matrix(NA, nrow = length(sample_sizes), ncol = length(correlation_strengths)) + rownames(power_matrix) <- paste("n =", sample_sizes) + colnames(power_matrix) <- paste("ρ =", correlation_strengths) + + for (i in seq_along(sample_sizes)) { + for (j in seq_along(correlation_strengths)) { + n <- sample_sizes[i] + true_r <- correlation_strengths[j] + + # Monte Carlo power simulation + power <- mean(replicate(1000, { + x <- rnorm(n) + y <- true_r * x + sqrt(1 - true_r^2) * rnorm(n) + test_result <- cor.test(x, y) + test_result$p.value < 0.05 + })) + + power_matrix[i, j] <- power + } + } + + cat("Statistical Power Analysis (1000 simulations each):\n") + print(round(power_matrix, 3)) + + cat("\nGuidelines:\n") + cat("- Small correlations (r = 0.1) require very large samples\n") + cat("- Medium correlations (r = 0.3-0.5) need moderate to large samples\n") + cat("- Large correlations (r ≥ 0.7) detectable with smaller samples\n") + cat("- Aim for power ≥ 0.80 (80%) in study planning\n") + cat("- Consider effect size interpretation alongside statistical significance\n") +} + +# Run demonstration if script is executed directly +if (sys.nframe() == 0) { + demonstrate_correlation_analysis() +} \ No newline at end of file diff --git a/biomedical/t_tests_comprehensive.r b/biomedical/t_tests_comprehensive.r new file mode 100644 index 00000000..710e9610 --- /dev/null +++ b/biomedical/t_tests_comprehensive.r @@ -0,0 +1,691 @@ +# Comprehensive T-Tests for Biomedical Data Analysis +# +# This implementation provides multiple types of t-tests commonly used in biomedical research: +# 1. One-sample t-test (compare sample mean to known value) +# 2. Two-sample t-test (compare means of two independent groups) +# 3. Paired t-test (compare means of paired/dependent samples) +# 4. Welch's t-test (two-sample with unequal variances) +# +# Applications in biomedical research: +# - Clinical trials: comparing treatment effects +# - Laboratory studies: comparing measurements between groups +# - Epidemiological studies: comparing health outcomes +# - Pharmaceutical research: drug efficacy analysis +# - Medical device testing: performance comparisons +# +# Time Complexity: O(n) for all tests +# Space Complexity: O(1) for calculations, O(n) for visualizations + +# Helper function to create high-quality plots +create_biomedical_plot <- function(data, title, subtitle = "", save_plot = FALSE, filename = NULL) { + if (save_plot && !is.null(filename)) { + png(filename, width = 800, height = 600, res = 100) + } + + # Set up plotting parameters for professional appearance + par(mfrow = c(1, 1), mar = c(5, 5, 4, 2), cex.main = 1.2, cex.lab = 1.1, cex.axis = 1.0) + + return(invisible()) +} + +# One-sample t-test implementation +one_sample_t_test <- function(x, mu = 0, alternative = "two.sided", conf.level = 0.95, + plot = TRUE, plot_title = "One-Sample T-Test") { + #' One-Sample T-Test + #' + #' Tests whether the mean of a sample differs significantly from a hypothesized value + #' + #' @param x numeric vector of sample data + #' @param mu hypothesized population mean (null hypothesis value) + #' @param alternative character: "two.sided", "less", or "greater" + #' @param conf.level confidence level for confidence interval + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Input validation + if (!is.numeric(x)) stop("x must be numeric") + if (length(x) < 2) stop("Sample size must be at least 2") + if (!alternative %in% c("two.sided", "less", "greater")) { + stop("alternative must be 'two.sided', 'less', or 'greater'") + } + if (conf.level <= 0 || conf.level >= 1) stop("conf.level must be between 0 and 1") + + # Remove missing values + x <- x[!is.na(x)] + n <- length(x) + + if (n < 2) stop("Need at least 2 non-missing observations") + + # Calculate test statistics + sample_mean <- mean(x) + sample_sd <- sd(x) + se <- sample_sd / sqrt(n) + t_statistic <- (sample_mean - mu) / se + df <- n - 1 + + # Calculate p-value based on alternative hypothesis + if (alternative == "two.sided") { + p_value <- 2 * pt(abs(t_statistic), df, lower.tail = FALSE) + } else if (alternative == "less") { + p_value <- pt(t_statistic, df, lower.tail = TRUE) + } else { # greater + p_value <- pt(t_statistic, df, lower.tail = FALSE) + } + + # Calculate confidence interval + alpha <- 1 - conf.level + t_critical <- qt(1 - alpha/2, df) + ci_lower <- sample_mean - t_critical * se + ci_upper <- sample_mean + t_critical * se + + # Effect size (Cohen's d) + cohens_d <- (sample_mean - mu) / sample_sd + + # Create visualization + if (plot) { + # Set up plot layout + par(mfrow = c(2, 2), mar = c(4, 4, 3, 2)) + + # 1. Histogram with normal overlay + hist(x, breaks = max(5, ceiling(sqrt(n))), probability = TRUE, + main = paste(plot_title, "\nSample Distribution"), + xlab = "Values", ylab = "Density", col = "lightblue", border = "black") + + # Add normal curve overlay + x_seq <- seq(min(x), max(x), length.out = 100) + normal_curve <- dnorm(x_seq, mean = sample_mean, sd = sample_sd) + lines(x_seq, normal_curve, col = "red", lwd = 2) + abline(v = sample_mean, col = "blue", lwd = 2, lty = 2) + abline(v = mu, col = "green", lwd = 2, lty = 2) + legend("topright", c("Sample Mean", "Hypothesized Mean", "Normal Fit"), + col = c("blue", "green", "red"), lty = c(2, 2, 1), lwd = 2, cex = 0.8) + + # 2. Box plot + boxplot(x, main = "Box Plot", ylab = "Values", col = "lightgreen") + abline(h = sample_mean, col = "blue", lwd = 2, lty = 2) + abline(h = mu, col = "green", lwd = 2, lty = 2) + + # 3. Q-Q plot for normality check + qqnorm(x, main = "Q-Q Plot (Normality Check)") + qqline(x, col = "red", lwd = 2) + + # 4. Test summary plot + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Test Summary", xlab = "", ylab = "", axes = FALSE) + + # Add test results text + text(5, 9, paste("One-Sample T-Test Results"), cex = 1.2, font = 2) + text(5, 8, paste("Sample Mean:", round(sample_mean, 4)), cex = 1) + text(5, 7.3, paste("Hypothesized Mean:", mu), cex = 1) + text(5, 6.6, paste("t-statistic:", round(t_statistic, 4)), cex = 1) + text(5, 5.9, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 5.2, paste("df:", df), cex = 1) + text(5, 4.5, paste(conf.level*100, "% CI: [", round(ci_lower, 4), ", ", round(ci_upper, 4), "]", sep = ""), cex = 1) + text(5, 3.8, paste("Cohen's d:", round(cohens_d, 4)), cex = 1) + + # Add significance indication + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 3.1, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + # Add interpretation + if (p_value < 0.05) { + text(5, 2.4, "Result: Reject null hypothesis", cex = 1, col = "darkgreen", font = 2) + } else { + text(5, 2.4, "Result: Fail to reject null hypothesis", cex = 1, col = "red", font = 2) + } + + par(mfrow = c(1, 1)) # Reset plot layout + } + + # Return results + result <- list( + statistic = t_statistic, + p_value = p_value, + degrees_of_freedom = df, + sample_mean = sample_mean, + hypothesized_mean = mu, + standard_error = se, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + alternative = alternative, + effect_size_cohens_d = cohens_d, + method = "One-sample t-test", + sample_size = n, + data_summary = list( + mean = sample_mean, + sd = sample_sd, + min = min(x), + max = max(x), + median = median(x) + ) + ) + + class(result) <- "biomedical_ttest" + return(result) +} + +# Two-sample t-test implementation (independent samples) +two_sample_t_test <- function(x, y, alternative = "two.sided", var.equal = TRUE, + conf.level = 0.95, plot = TRUE, plot_title = "Two-Sample T-Test") { + #' Two-Sample T-Test (Independent Samples) + #' + #' Tests whether the means of two independent groups differ significantly + #' + #' @param x numeric vector for group 1 + #' @param y numeric vector for group 2 + #' @param alternative character: "two.sided", "less", or "greater" + #' @param var.equal logical: assume equal variances (Student's t) or not (Welch's t) + #' @param conf.level confidence level for confidence interval + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Input validation + if (!is.numeric(x) || !is.numeric(y)) stop("Both x and y must be numeric") + if (!alternative %in% c("two.sided", "less", "greater")) { + stop("alternative must be 'two.sided', 'less', or 'greater'") + } + + # Remove missing values + x <- x[!is.na(x)] + y <- y[!is.na(y)] + + if (length(x) < 2 || length(y) < 2) stop("Both groups need at least 2 observations") + + n1 <- length(x) + n2 <- length(y) + mean1 <- mean(x) + mean2 <- mean(y) + sd1 <- sd(x) + sd2 <- sd(y) + var1 <- var(x) + var2 <- var(y) + + # Calculate test statistic and degrees of freedom + if (var.equal) { + # Student's t-test (equal variances) + pooled_var <- ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2) + se_diff <- sqrt(pooled_var * (1/n1 + 1/n2)) + df <- n1 + n2 - 2 + method <- "Two-sample t-test (equal variances)" + } else { + # Welch's t-test (unequal variances) + se_diff <- sqrt(var1/n1 + var2/n2) + df <- (var1/n1 + var2/n2)^2 / ((var1/n1)^2/(n1-1) + (var2/n2)^2/(n2-1)) + method <- "Welch's two-sample t-test (unequal variances)" + } + + t_statistic <- (mean1 - mean2) / se_diff + + # Calculate p-value + if (alternative == "two.sided") { + p_value <- 2 * pt(abs(t_statistic), df, lower.tail = FALSE) + } else if (alternative == "less") { + p_value <- pt(t_statistic, df, lower.tail = TRUE) + } else { # greater + p_value <- pt(t_statistic, df, lower.tail = FALSE) + } + + # Confidence interval for difference in means + alpha <- 1 - conf.level + t_critical <- qt(1 - alpha/2, df) + mean_diff <- mean1 - mean2 + ci_lower <- mean_diff - t_critical * se_diff + ci_upper <- mean_diff + t_critical * se_diff + + # Effect size (Cohen's d) + if (var.equal) { + pooled_sd <- sqrt(pooled_var) + } else { + pooled_sd <- sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1 + n2 - 2)) + } + cohens_d <- (mean1 - mean2) / pooled_sd + + # Create visualization + if (plot) { + par(mfrow = c(2, 3), mar = c(4, 4, 3, 2)) + + # 1. Side-by-side histograms + x_range <- range(c(x, y)) + breaks_seq <- seq(x_range[1], x_range[2], length.out = 15) + + hist(x, breaks = breaks_seq, probability = TRUE, main = "Group 1 Distribution", + xlab = "Values", ylab = "Density", col = "lightblue", alpha = 0.7) + abline(v = mean1, col = "blue", lwd = 2, lty = 2) + + hist(y, breaks = breaks_seq, probability = TRUE, main = "Group 2 Distribution", + xlab = "Values", ylab = "Density", col = "lightcoral", alpha = 0.7) + abline(v = mean2, col = "red", lwd = 2, lty = 2) + + # 2. Box plots comparison + boxplot(list("Group 1" = x, "Group 2" = y), + main = "Group Comparison", ylab = "Values", + col = c("lightblue", "lightcoral")) + + # 3. Density plot overlay + plot(density(x), main = "Density Comparison", xlab = "Values", ylab = "Density", + col = "blue", lwd = 2, xlim = x_range) + lines(density(y), col = "red", lwd = 2) + abline(v = mean1, col = "blue", lwd = 2, lty = 2) + abline(v = mean2, col = "red", lwd = 2, lty = 2) + legend("topright", c("Group 1", "Group 2", "Mean 1", "Mean 2"), + col = c("blue", "red", "blue", "red"), lty = c(1, 1, 2, 2), lwd = 2, cex = 0.8) + + # 4. Q-Q plots for normality + qqnorm(x, main = "Q-Q Plot Group 1", col = "blue") + qqline(x, col = "darkblue", lwd = 2) + + qqnorm(y, main = "Q-Q Plot Group 2", col = "red") + qqline(y, col = "darkred", lwd = 2) + + # 5. Test summary + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Test Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, paste("Two-Sample T-Test Results"), cex = 1.2, font = 2) + text(5, 8.8, paste("Group 1 Mean:", round(mean1, 4)), cex = 1) + text(5, 8.3, paste("Group 2 Mean:", round(mean2, 4)), cex = 1) + text(5, 7.8, paste("Mean Difference:", round(mean_diff, 4)), cex = 1) + text(5, 7.3, paste("t-statistic:", round(t_statistic, 4)), cex = 1) + text(5, 6.8, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 6.3, paste("df:", round(df, 2)), cex = 1) + text(5, 5.8, paste(conf.level*100, "% CI: [", round(ci_lower, 4), ", ", round(ci_upper, 4), "]", sep = ""), cex = 0.9) + text(5, 5.3, paste("Cohen's d:", round(cohens_d, 4)), cex = 1) + text(5, 4.8, paste("Method:", ifelse(var.equal, "Student's", "Welch's")), cex = 1) + + # Significance and interpretation + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 4.1, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + if (p_value < 0.05) { + text(5, 3.4, "Result: Significant difference", cex = 1, col = "darkgreen", font = 2) + } else { + text(5, 3.4, "Result: No significant difference", cex = 1, col = "red", font = 2) + } + + par(mfrow = c(1, 1)) + } + + # Return results + result <- list( + statistic = t_statistic, + p_value = p_value, + degrees_of_freedom = df, + mean_group1 = mean1, + mean_group2 = mean2, + mean_difference = mean_diff, + standard_error = se_diff, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + alternative = alternative, + var_equal = var.equal, + effect_size_cohens_d = cohens_d, + method = method, + sample_sizes = c(n1, n2), + group_summaries = list( + group1 = list(mean = mean1, sd = sd1, n = n1, var = var1), + group2 = list(mean = mean2, sd = sd2, n = n2, var = var2) + ) + ) + + class(result) <- "biomedical_ttest" + return(result) +} + +# Paired t-test implementation +paired_t_test <- function(x, y, alternative = "two.sided", conf.level = 0.95, + plot = TRUE, plot_title = "Paired T-Test") { + #' Paired T-Test (Dependent Samples) + #' + #' Tests whether the mean difference between paired observations is significantly different from zero + #' + #' @param x numeric vector for condition 1 (e.g., before treatment) + #' @param y numeric vector for condition 2 (e.g., after treatment) + #' @param alternative character: "two.sided", "less", or "greater" + #' @param conf.level confidence level for confidence interval + #' @param plot logical: whether to create visualization + #' @param plot_title character: title for the plot + #' @return list with test results + + # Input validation + if (!is.numeric(x) || !is.numeric(y)) stop("Both x and y must be numeric") + if (length(x) != length(y)) stop("x and y must have the same length for paired test") + if (!alternative %in% c("two.sided", "less", "greater")) { + stop("alternative must be 'two.sided', 'less', or 'greater'") + } + + # Remove pairs with missing values + complete_pairs <- complete.cases(x, y) + x <- x[complete_pairs] + y <- y[complete_pairs] + + if (length(x) < 2) stop("Need at least 2 complete pairs") + + n <- length(x) + differences <- x - y + mean_diff <- mean(differences) + sd_diff <- sd(differences) + se_diff <- sd_diff / sqrt(n) + + # Calculate test statistic + t_statistic <- mean_diff / se_diff + df <- n - 1 + + # Calculate p-value + if (alternative == "two.sided") { + p_value <- 2 * pt(abs(t_statistic), df, lower.tail = FALSE) + } else if (alternative == "less") { + p_value <- pt(t_statistic, df, lower.tail = TRUE) + } else { # greater + p_value <- pt(t_statistic, df, lower.tail = FALSE) + } + + # Confidence interval for mean difference + alpha <- 1 - conf.level + t_critical <- qt(1 - alpha/2, df) + ci_lower <- mean_diff - t_critical * se_diff + ci_upper <- mean_diff + t_critical * se_diff + + # Effect size (Cohen's d for paired samples) + cohens_d <- mean_diff / sd_diff + + # Correlation between paired observations + correlation <- cor(x, y) + + # Create visualization + if (plot) { + par(mfrow = c(2, 3), mar = c(4, 4, 3, 2)) + + # 1. Before-after scatter plot + plot(x, y, main = "Before vs After", xlab = "Before (x)", ylab = "After (y)", + pch = 19, col = "blue", cex = 1.2) + abline(0, 1, col = "red", lwd = 2, lty = 2) # Line of equality + abline(lm(y ~ x), col = "darkgreen", lwd = 2) # Regression line + legend("topleft", c("Equality Line", "Regression Line"), + col = c("red", "darkgreen"), lty = c(2, 1), lwd = 2, cex = 0.8) + + # 2. Differences histogram + hist(differences, breaks = max(5, ceiling(sqrt(n))), probability = TRUE, + main = "Distribution of Differences", xlab = "Differences (x - y)", + ylab = "Density", col = "lightgreen") + abline(v = mean_diff, col = "red", lwd = 2, lty = 2) + abline(v = 0, col = "black", lwd = 2, lty = 1) + + # Add normal curve overlay + diff_seq <- seq(min(differences), max(differences), length.out = 100) + normal_curve <- dnorm(diff_seq, mean = mean_diff, sd = sd_diff) + lines(diff_seq, normal_curve, col = "blue", lwd = 2) + + # 3. Box plot of differences + boxplot(differences, main = "Differences Box Plot", ylab = "Differences (x - y)", + col = "lightblue") + abline(h = mean_diff, col = "red", lwd = 2, lty = 2) + abline(h = 0, col = "black", lwd = 2, lty = 1) + + # 4. Q-Q plot for differences + qqnorm(differences, main = "Q-Q Plot of Differences") + qqline(differences, col = "red", lwd = 2) + + # 5. Individual trajectories + plot(rep(1, n), x, xlim = c(0.5, 2.5), ylim = range(c(x, y)), + main = "Individual Changes", xlab = "Time Point", ylab = "Values", + pch = 19, col = "blue", cex = 1.2) + points(rep(2, n), y, pch = 19, col = "red", cex = 1.2) + + # Draw lines connecting paired observations + for (i in 1:n) { + lines(c(1, 2), c(x[i], y[i]), col = "gray", lwd = 0.5) + } + + # Add means + points(1, mean(x), pch = 15, col = "darkblue", cex = 2) + points(2, mean(y), pch = 15, col = "darkred", cex = 2) + lines(c(1, 2), c(mean(x), mean(y)), col = "black", lwd = 3) + + axis(1, at = c(1, 2), labels = c("Before", "After")) + legend("topright", c("Individual", "Group Mean"), + pch = c(19, 15), col = c("blue", "black"), cex = 0.8) + + # 6. Test summary + plot(1, 1, type = "n", xlim = c(0, 10), ylim = c(0, 10), + main = "Test Summary", xlab = "", ylab = "", axes = FALSE) + + text(5, 9.5, paste("Paired T-Test Results"), cex = 1.2, font = 2) + text(5, 8.8, paste("Before Mean:", round(mean(x), 4)), cex = 1) + text(5, 8.3, paste("After Mean:", round(mean(y), 4)), cex = 1) + text(5, 7.8, paste("Mean Difference:", round(mean_diff, 4)), cex = 1) + text(5, 7.3, paste("t-statistic:", round(t_statistic, 4)), cex = 1) + text(5, 6.8, paste("p-value:", round(p_value, 6)), cex = 1) + text(5, 6.3, paste("df:", df), cex = 1) + text(5, 5.8, paste(conf.level*100, "% CI: [", round(ci_lower, 4), ", ", round(ci_upper, 4), "]", sep = ""), cex = 0.9) + text(5, 5.3, paste("Cohen's d:", round(cohens_d, 4)), cex = 1) + text(5, 4.8, paste("Correlation:", round(correlation, 4)), cex = 1) + + # Significance and interpretation + sig_level <- ifelse(p_value < 0.001, "***", + ifelse(p_value < 0.01, "**", + ifelse(p_value < 0.05, "*", "ns"))) + text(5, 4.1, paste("Significance:", sig_level), cex = 1, + col = ifelse(sig_level == "ns", "red", "darkgreen")) + + if (p_value < 0.05) { + text(5, 3.4, "Result: Significant change", cex = 1, col = "darkgreen", font = 2) + } else { + text(5, 3.4, "Result: No significant change", cex = 1, col = "red", font = 2) + } + + par(mfrow = c(1, 1)) + } + + # Return results + result <- list( + statistic = t_statistic, + p_value = p_value, + degrees_of_freedom = df, + mean_before = mean(x), + mean_after = mean(y), + mean_difference = mean_diff, + standard_error = se_diff, + confidence_interval = c(ci_lower, ci_upper), + confidence_level = conf.level, + alternative = alternative, + effect_size_cohens_d = cohens_d, + correlation = correlation, + method = "Paired t-test", + sample_size = n, + difference_summary = list( + mean = mean_diff, + sd = sd_diff, + min = min(differences), + max = max(differences), + median = median(differences) + ) + ) + + class(result) <- "biomedical_ttest" + return(result) +} + +# Print method for biomedical t-test results +print.biomedical_ttest <- function(x, ...) { + cat("\n", x$method, "\n") + cat(rep("=", nchar(x$method) + 2), "\n", sep = "") + + if (grepl("One-sample", x$method)) { + cat("Sample mean:", x$sample_mean, "\n") + cat("Hypothesized mean:", x$hypothesized_mean, "\n") + } else if (grepl("Paired", x$method)) { + cat("Mean before:", x$mean_before, "\n") + cat("Mean after:", x$mean_after, "\n") + cat("Mean difference:", x$mean_difference, "\n") + cat("Correlation:", x$correlation, "\n") + } else { # Two-sample + cat("Group 1 mean:", x$mean_group1, "\n") + cat("Group 2 mean:", x$mean_group2, "\n") + cat("Mean difference:", x$mean_difference, "\n") + cat("Equal variances assumed:", x$var_equal, "\n") + } + + cat("t-statistic:", x$statistic, "\n") + cat("Degrees of freedom:", x$degrees_of_freedom, "\n") + cat("p-value:", x$p_value, "\n") + cat("Alternative hypothesis:", x$alternative, "\n") + cat(x$confidence_level * 100, "% Confidence interval: [", + x$confidence_interval[1], ", ", x$confidence_interval[2], "]\n", sep = "") + cat("Effect size (Cohen's d):", x$effect_size_cohens_d, "\n") + + # Interpretation + alpha <- 1 - x$confidence_level + if (x$p_value < alpha) { + cat("\nConclusion: Reject the null hypothesis (significant result)\n") + } else { + cat("\nConclusion: Fail to reject the null hypothesis (not significant)\n") + } + + # Effect size interpretation + d <- abs(x$effect_size_cohens_d) + effect_magnitude <- if (d < 0.2) "negligible" + else if (d < 0.5) "small" + else if (d < 0.8) "medium" + else "large" + cat("Effect size magnitude:", effect_magnitude, "\n") +} + +# Comprehensive demonstration function +demonstrate_t_tests <- function() { + cat("=== Comprehensive T-Tests for Biomedical Data Analysis ===\n\n") + + # Set random seed for reproducibility + set.seed(123) + + # Example 1: One-sample t-test (Drug efficacy study) + cat("1. ONE-SAMPLE T-TEST: Drug Efficacy Study\n") + cat("Research Question: Does a new drug significantly reduce blood pressure below 120 mmHg?\n") + cat("H0: μ = 120 (no effect)\n") + cat("H1: μ < 120 (drug reduces BP)\n\n") + + # Simulate blood pressure data after drug treatment + bp_after_drug <- rnorm(25, mean = 115, sd = 8) # Simulated BP reduction + + cat("Blood pressure measurements after drug treatment (n=25):\n") + cat(paste(round(bp_after_drug, 1), collapse = ", "), "\n\n") + + one_sample_result <- one_sample_t_test(bp_after_drug, mu = 120, + alternative = "less", + plot_title = "Drug Efficacy: BP Reduction") + print(one_sample_result) + + cat("\n" , rep("=", 60), "\n\n") + + # Example 2: Two-sample t-test (Treatment comparison) + cat("2. TWO-SAMPLE T-TEST: Treatment Comparison\n") + cat("Research Question: Is there a difference in recovery time between two treatments?\n") + cat("H0: μ1 = μ2 (no difference)\n") + cat("H1: μ1 ≠ μ2 (treatments differ)\n\n") + + # Simulate recovery times + treatment_a <- rnorm(20, mean = 12, sd = 3) # Treatment A: 12 days average + treatment_b <- rnorm(22, mean = 10, sd = 2.5) # Treatment B: 10 days average (better) + + cat("Treatment A recovery times (days, n=20):\n") + cat(paste(round(treatment_a, 1), collapse = ", "), "\n\n") + cat("Treatment B recovery times (days, n=22):\n") + cat(paste(round(treatment_b, 1), collapse = ", "), "\n\n") + + two_sample_result <- two_sample_t_test(treatment_a, treatment_b, + var.equal = FALSE, # Welch's t-test + plot_title = "Treatment Comparison") + print(two_sample_result) + + cat("\n" , rep("=", 60), "\n\n") + + # Example 3: Paired t-test (Before/After study) + cat("3. PAIRED T-TEST: Before/After Clinical Study\n") + cat("Research Question: Does the intervention significantly reduce cholesterol levels?\n") + cat("H0: μd = 0 (no change)\n") + cat("H1: μd > 0 (cholesterol decreases)\n\n") + + # Simulate paired cholesterol data + n_patients <- 18 + baseline_cholesterol <- rnorm(n_patients, mean = 220, sd = 25) + # Intervention reduces cholesterol by 15-25 points on average + reduction <- rnorm(n_patients, mean = 20, sd = 8) + after_cholesterol <- baseline_cholesterol - reduction + + cat("Patient cholesterol levels (mg/dL):\n") + cat("Before intervention (n=18):\n") + cat(paste(round(baseline_cholesterol, 1), collapse = ", "), "\n\n") + cat("After intervention (n=18):\n") + cat(paste(round(after_cholesterol, 1), collapse = ", "), "\n\n") + + paired_result <- paired_t_test(baseline_cholesterol, after_cholesterol, + alternative = "greater", # Before > After + plot_title = "Cholesterol Reduction Study") + print(paired_result) + + cat("\n" , rep("=", 60), "\n\n") + + # Example 4: Power analysis demonstration + cat("4. STATISTICAL POWER AND SAMPLE SIZE CONSIDERATIONS\n") + cat("Demonstrating the relationship between sample size, effect size, and statistical power\n\n") + + # Different sample sizes for power demonstration + sample_sizes <- c(5, 10, 20, 50) + power_results <- data.frame( + sample_size = sample_sizes, + power_small_effect = NA, + power_medium_effect = NA, + power_large_effect = NA + ) + + for (i in seq_along(sample_sizes)) { + n <- sample_sizes[i] + # Simulate multiple t-tests with different effect sizes + small_effect_power <- mean(replicate(1000, { + x <- rnorm(n, mean = 0.2, sd = 1) # Small effect (d = 0.2) + result <- one_sample_t_test(x, mu = 0, plot = FALSE) + result$p_value < 0.05 + })) + + medium_effect_power <- mean(replicate(1000, { + x <- rnorm(n, mean = 0.5, sd = 1) # Medium effect (d = 0.5) + result <- one_sample_t_test(x, mu = 0, plot = FALSE) + result$p_value < 0.05 + })) + + large_effect_power <- mean(replicate(1000, { + x <- rnorm(n, mean = 0.8, sd = 1) # Large effect (d = 0.8) + result <- one_sample_t_test(x, mu = 0, plot = FALSE) + result$p_value < 0.05 + })) + + power_results[i, 2:4] <- c(small_effect_power, medium_effect_power, large_effect_power) + } + + cat("Statistical Power Analysis (1000 simulations each):\n") + print(round(power_results, 3)) + + cat("\nInterpretation:\n") + cat("- Small effect size (d=0.2) requires large samples for adequate power\n") + cat("- Medium effect size (d=0.5) achievable with moderate samples\n") + cat("- Large effect size (d=0.8) detectable even with small samples\n") + cat("- Aim for power ≥ 0.80 (80%) in study design\n\n") + + cat("Clinical Significance Guidelines:\n") + cat("- Always consider clinical significance alongside statistical significance\n") + cat("- Effect sizes help interpret practical importance of findings\n") + cat("- Confidence intervals provide range of plausible effect sizes\n") + cat("- Consider Type I (false positive) and Type II (false negative) error rates\n") +} + +# Run demonstration if script is executed directly +if (sys.nframe() == 0) { + demonstrate_t_tests() +} \ No newline at end of file