Skip to content

Commit ea6f80d

Browse files
committed
add locusplot
1 parent d875111 commit ea6f80d

File tree

2 files changed

+327
-3
lines changed

2 files changed

+327
-3
lines changed

R/finemap.R

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,3 +376,296 @@ gg_regionplot <- function(df,
376376
# Return the combined plot ====
377377
return(plot_region)
378378
}
379+
380+
#' @title Locus Plot for Genomic Regions
381+
#' @description Creates a locus plot for visualizing genomic regions, highlighting SNPs, p-values, and optionally LD information.
382+
#' @param df Dataframe containing the data to be plotted.
383+
#' @param lead_snp Character, optional. The lead SNP to be highlighted. Defaults to the SNP with the lowest p-value if not provided.
384+
#' @param rsid Column name for SNP IDs in `df`.
385+
#' @param chrom Column name for chromosome information in `df`.
386+
#' @param pos Column name for SNP position in `df`.
387+
#' @param ref Column name for reference allele in `df`.
388+
#' @param alt Column name for alternate allele in `df`.
389+
#' @param effect Column name for effect size in `df`. Used to calculate p-values if `std_err` is provided.
390+
#' @param std_err Column name for standard error in `df`. Used to calculate p-values if `effect` is provided.
391+
#' @param p_value Column name for p-values in `df`. Used if `effect` and `std_err` are not provided.
392+
#' @param trait Column name for traits in `df`, optional.
393+
#' @param plot_pvalue_threshold Numeric, p-value threshold for plotting. Default is 0.1.
394+
#' @param plot_subsample_prop Numeric, proportion of SNPs to subsample for plotting. Default is 0.25.
395+
#' @param plot_distance Numeric, distance in base pairs to include around the lead SNP. Default is 500,000.
396+
#' @param genome_build Character, genome build to use (e.g., "GRCh37"). Default is "GRCh37".
397+
#' @param population Character, population for LD calculations. Default is "ALL".
398+
#' @param plot_genes Logical, whether to include genes in the plot. Default is FALSE.
399+
#' @param plot_recombination Logical, whether to include recombination rates in the plot. Default is FALSE.
400+
#' @param plot_title Character, optional title for the plot.
401+
#' @param plot_subtitle Character, optional subtitle for the plot.
402+
#' @param label Character vector, optional SNP IDs to label in the plot.
403+
#' @return A ggplot2 object representing the locus plot.
404+
#' @examples
405+
#' \dontrun{
406+
#' gg_locusplot(df = my_data, rsid = rsid_col, chrom = chrom_col, pos = pos_col, ref = ref_col, alt = alt_col)
407+
#' }
408+
#' @export
409+
gg_locusplot <- function(df,
410+
lead_snp = NULL,
411+
rsid = rsid,
412+
chrom = chrom,
413+
pos = pos,
414+
ref = ref,
415+
alt = alt,
416+
effect = NULL,
417+
std_err = NULL,
418+
p_value = p_value,
419+
trait = NULL,
420+
plot_pvalue_threshold = 0.1,
421+
plot_subsample_prop = 0.25,
422+
plot_distance = 500000,
423+
genome_build = "GRCh37",
424+
population = "ALL",
425+
plot_genes = FALSE,
426+
plot_recombination = FALSE,
427+
plot_title = NULL,
428+
plot_subtitle = NULL,
429+
label = NULL) {
430+
# Check input arguments to ensure they are of the correct type and within reasonable ranges
431+
checkmate::assert_data_frame(df)
432+
# checkmate::assert_string(lead_snp)
433+
checkmate::assert_numeric(plot_pvalue_threshold, upper = 1)
434+
checkmate::assert_numeric(plot_subsample_prop, lower = 0, upper = 1)
435+
checkmate::assert_numeric(plot_distance, lower = 0)
436+
checkmate::assert_logical(plot_genes)
437+
if (!is.null(label)) {
438+
checkmate::assert_character(label)
439+
}
440+
441+
if(!rlang::quo_is_null(rlang::enquo(effect)) & !rlang::quo_is_null(rlang::enquo(std_err))) {
442+
checkmate::assert_numeric(df %>% pull({{ effect }}))
443+
checkmate::assert_numeric(df %>% pull({{ std_err }}))
444+
445+
df <- df %>%
446+
rename(.effect = {{ effect }},
447+
.std_err = {{ std_err }}) %>%
448+
mutate(log10_pval = abs((pnorm(-abs(.effect/.std_err), log.p=TRUE) + log(2)) / log(10)))
449+
} else {
450+
df <- df %>%
451+
mutate(log10_pval = -log10({{ p_value }}))
452+
}
453+
454+
if (rlang::quo_is_null(rlang::enquo(trait))) {
455+
df <- df %>%
456+
select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval) %>%
457+
mutate_if(is.factor, as.character) %>%
458+
mutate(ref = stringr::str_to_upper(ref), alt = stringr::str_to_upper(alt)) %>%
459+
group_by(rsid) %>%
460+
slice_max(log10_pval) %>%
461+
ungroup() %>%
462+
tidyr::drop_na()
463+
} else {
464+
df <- df %>%
465+
select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval, trait = {{ trait }}) %>%
466+
mutate_if(is.factor, as.character) %>%
467+
mutate(ref = stringr::str_to_upper(ref), alt = stringr::str_to_upper(alt)) %>%
468+
group_by(trait, rsid) %>%
469+
slice_max(log10_pval) %>%
470+
ungroup() %>%
471+
tidyr::drop_na()
472+
}
473+
474+
# Create df containing information about lead SNP (by default, select SNP with lowest p-value, otherwise take user-supplied value)
475+
if (is.null(lead_snp)) {
476+
indep_snps <- df %>%
477+
slice_max(log10_pval, with_ties = FALSE, n = 1) %>%
478+
select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt)
479+
480+
cli::cli_alert_info("No lead_snp supplied. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region")
481+
} else if (!(lead_snp %in% df$rsid)) {
482+
# ensure lead_snp is in the supplied data; if not, use minimum p-value at locus
483+
indep_snps <- df %>%
484+
slice_max(log10_pval, with_ties = FALSE, n = 1) %>%
485+
select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt)
486+
487+
cli::cli_alert_info("Lead snp not present in supplied locus data. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region")
488+
} else {
489+
indep_snps <- df %>%
490+
select(lead_rsid = rsid, lead_chromosome = chromosome, lead_position = position, lead_ref = ref, lead_alt = alt) %>%
491+
filter(lead_rsid == lead_snp) %>%
492+
distinct(lead_rsid, .keep_all = TRUE)
493+
}
494+
495+
# Create dataframe of variants within the region size specified by the user
496+
suppressMessages(locus_snps <- df %>%
497+
filter(rsid %in% indep_snps$lead_rsid) %>%
498+
select(chromosome, position, lead_rsid = rsid) %>%
499+
purrr::pmap_dfr(function(chromosome_filter = first, position_filter = second, lead_rsid = third) {
500+
df %>%
501+
filter(chromosome == chromosome_filter & between(position, position_filter - plot_distance / 2, position_filter + plot_distance / 2)) %>%
502+
mutate(lead_rsid = lead_rsid) %>%
503+
left_join(indep_snps)
504+
}))
505+
506+
# Extract LD and format colors
507+
possibly_ld_extract_locuszoom <- purrr::possibly(locusplotr::ld_extract_locuszoom, otherwise = NULL)
508+
509+
ld_extracted <- possibly_ld_extract_locuszoom(chrom = indep_snps$lead_chromosome, pos = indep_snps$lead_position, ref = indep_snps$lead_ref, alt = indep_snps$lead_alt, start = min(locus_snps$position), stop = max(locus_snps$position), genome_build = genome_build, population = population)
510+
511+
# Create dataframe with variants at locus, LD information, color codes, and labels in preparation for plotting
512+
if (!(is.null(ld_extracted))) {
513+
# Join locus df with LD information
514+
locus_snps_ld <- ld_extracted %>%
515+
select(chromosome = chromosome2, position = position2, variant2, correlation) %>%
516+
mutate(chromosome = as.numeric(chromosome), position = as.numeric(position)) %>%
517+
tidyr::separate(variant2, into = c("chr_pos", "ref_alt"), sep = "_") %>%
518+
tidyr::separate(ref_alt, into = c("ref", "alt"), sep = "/") %>%
519+
right_join(locus_snps, by = c("chromosome" = "chromosome", "position" = "position"), relationship = "many-to-many") %>%
520+
filter((ref.x == ref.y & alt.x == alt.y) | (ref.x == alt.y & alt.x == ref.y)) %>%
521+
select(-ends_with(".y"), -chr_pos) %>%
522+
rename_with(~ stringr::str_replace(.x, ".x", ""), .cols = ends_with(".x"))
523+
524+
# Create color codes and labels
525+
locus_snps_ld <- locus_snps_ld %>%
526+
mutate(color_code = as.character(cut(as.numeric(correlation), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("blue4", "skyblue", "darkgreen", "orange", "red"), include.lowest = TRUE))) %>%
527+
mutate(legend_label = as.character(cut(as.numeric(correlation), breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("0 - 0.2", "0.2 - 0.4", "0.4 - 0.6", "0.6 - 0.8", "0.8 - 1"), include.lowest = TRUE))) %>%
528+
mutate(lead = rsid == lead_rsid) %>%
529+
mutate(label = case_when(
530+
rsid == lead_rsid ~ lead_rsid,
531+
rsid %in% label ~ rsid,
532+
TRUE ~ NA_character_
533+
)) %>%
534+
mutate(color_code = case_when(
535+
rsid == lead_rsid ~ "purple",
536+
TRUE ~ color_code
537+
)) %>%
538+
mutate(color_code = forcats::fct_expand(color_code, "purple", "red", "orange", "darkgreen", "skyblue", "blue4")) %>%
539+
mutate(color_code = forcats::fct_relevel(color_code, "purple", "red", "orange", "darkgreen", "skyblue", "blue4")) %>%
540+
mutate(legend_label = case_when(
541+
rsid == lead_rsid ~ "Ref",
542+
TRUE ~ legend_label
543+
)) %>%
544+
mutate(legend_label = forcats::fct_expand(legend_label, "Ref", "0.8 - 1", "0.6 - 0.8", "0.4 - 0.6", "0.2 - 0.4", "0 - 0.2")) %>%
545+
mutate(legend_label = forcats::fct_relevel(legend_label, "Ref", "0.8 - 1", "0.6 - 0.8", "0.4 - 0.6", "0.2 - 0.4", "0 - 0.2"))
546+
} else {
547+
# Deal with scenario where lead variant is not present in LD database
548+
cli::cli_alert_info("No linkage disequilibrium information found")
549+
locus_snps_ld <- locus_snps %>%
550+
mutate(correlation = NA_integer_) %>%
551+
mutate(lead = rsid == lead_rsid) %>%
552+
mutate(label = case_when(
553+
rsid == lead_rsid ~ lead_rsid,
554+
TRUE ~ NA_character_
555+
)) %>%
556+
mutate(color_code = case_when(
557+
rsid == lead_rsid ~ "purple",
558+
TRUE ~ "grey50"
559+
)) %>%
560+
mutate(legend_label = case_when(
561+
rsid == lead_rsid ~ "Ref",
562+
TRUE ~ "Other"
563+
))
564+
}
565+
566+
# group locus by trait if necessary
567+
if (!rlang::quo_is_null(rlang::enquo(trait))) {
568+
locus_snps_ld <- locus_snps_ld %>%
569+
group_by(.data = ., trait)
570+
}
571+
572+
locus_snps_ld_label <- locus_snps_ld %>% filter(!is.na(label))
573+
574+
# Make plot (sample non-significant p-values to reduce overplotting)
575+
regional_assoc_plot <- locus_snps_ld %>%
576+
distinct(rsid, .keep_all = TRUE) %>%
577+
filter(log10_pval > -log10(plot_pvalue_threshold) | correlation > 0.2 | legend_label == "Ref") %>% # improve overplotting
578+
bind_rows(locus_snps_ld %>%
579+
filter(log10_pval <= -log10(plot_pvalue_threshold) & correlation < 0.2 & legend_label != "Ref") %>%
580+
slice_sample(prop = plot_subsample_prop)) %>%
581+
arrange(color_code, log10_pval) %>%
582+
ggplot(aes(position, log10_pval)) +
583+
geom_point(aes(fill = factor(color_code), size = lead, alpha = lead, shape = lead)) +
584+
ggrepel::geom_label_repel(data = locus_snps_ld_label, aes(label = label),
585+
size = 4,
586+
color = "black",
587+
fontface = "bold",
588+
fill = "white",
589+
min.segment.length = 0,
590+
box.padding = 1,
591+
alpha = 1,
592+
nudge_y = 4
593+
) +
594+
geom_hline(yintercept = -log10(5e-8), linetype = "dashed") +
595+
scale_fill_identity(parse(text = "r^2"), guide = "legend", labels = levels(forcats::fct_drop(locus_snps_ld$legend_label)), na.translate = FALSE) +
596+
scale_size_manual(values = c(3, 5), guide = "none") +
597+
scale_shape_manual(values = c(21, 23), guide = "none") +
598+
scale_alpha_manual(values = c(0.8, 1), guide = "none") +
599+
scale_x_continuous(breaks = scales::extended_breaks(n = 5), labels = scales::label_number(scale = 1 / 1e6)) +
600+
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
601+
guides(fill = guide_legend(override.aes = list(shape = 22, size = 6),
602+
position = "inside")) +
603+
labs(
604+
title = plot_title,
605+
subtitle = plot_subtitle,
606+
x = glue::glue("Position on Chromosome {unique(indep_snps$lead_chromosome)} (Mb)"),
607+
y = "-log<sub>10</sub>(P-value)"
608+
) +
609+
theme_bw(base_size = 16) +
610+
theme(
611+
plot.title = element_text(face = "bold"),
612+
legend.position = "none",
613+
# legend.text = element_text(size = 10),
614+
# legend.title = element_text(size = 10, hjust = 0.5),
615+
# legend.justification.inside = c("right", "top"),
616+
# legend.position.inside = c(0.99, 0.99),
617+
# legend.spacing.y = unit(0, "pt"),
618+
strip.text = element_text(color = "black"),
619+
strip.text.x = element_blank(),
620+
axis.title.y = ggtext::element_markdown()
621+
)
622+
623+
if (plot_recombination) {
624+
cli::cli_alert_info("Extracting recombination rates for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}")
625+
ylim <- max(pull(locus_snps_ld, log10_pval), na.rm = TRUE) +
626+
0.3 * max(pull(locus_snps_ld, log10_pval), na.rm = TRUE)
627+
628+
recomb_df <- recomb_extract_locuszoom(chrom = indep_snps$lead_chromosome, start = indep_snps$lead_position - plot_distance / 2, end = indep_snps$lead_position + plot_distance / 2, genome_build = genome_build) %>%
629+
select(position, recomb_rate)
630+
# return(recomb_df)
631+
suppressMessages(
632+
regional_assoc_plot <- regional_assoc_plot +
633+
geom_line(data = recomb_df, mapping = aes(x = position, y = recomb_rate), color = "lightblue", linewidth = 0.5) +
634+
scale_y_continuous(
635+
name = "-log<sub>10</sub>(P-value)",
636+
limits = c(0, ylim),
637+
sec.axis = sec_axis(
638+
~. * (100 / ylim),
639+
name = "Recombination rate (cM/Mb)"
640+
# labels = scales::percent_format()
641+
)
642+
) +
643+
theme(axis.title.y.right = element_text(vjust = 1.5))
644+
)
645+
646+
regional_assoc_plot <- gginnards::move_layers(regional_assoc_plot, "GeomLine", "bottom")
647+
648+
}
649+
650+
# Add plot of genes if requested by user
651+
if (plot_genes) {
652+
cli::cli_alert_info("Extracting genes for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}")
653+
geneplot <- gg_geneplot(chr = indep_snps$lead_chromosome, start = indep_snps$lead_position - plot_distance / 2, end = indep_snps$lead_position + plot_distance / 2, genome_build = genome_build) +
654+
theme(plot.margin = margin(0, 5.5, 5.5, 5.5))
655+
656+
suppressWarnings(suppressMessages(regional_assoc_plot <- patchwork::wrap_plots(list(
657+
regional_assoc_plot +
658+
labs(x = "") +
659+
xlim(indep_snps$lead_position - plot_distance / 2, indep_snps$lead_position + plot_distance / 2) +
660+
theme(
661+
axis.text.x = element_blank(),
662+
axis.ticks.x = element_blank(),
663+
axis.title.x = element_blank(),
664+
plot.margin = margin(5.5, 5.5, 0, 5.5)
665+
),
666+
geneplot
667+
), nrow = 2, heights = c(3, 1))))
668+
}
669+
670+
return(regional_assoc_plot)
671+
}

man/gg_locusplot.Rd

Lines changed: 34 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)