@@ -376,3 +376,296 @@ gg_regionplot <- function(df,
376376 # Return the combined plot ====
377377 return (plot_region )
378378}
379+
380+ # ' @title Locus Plot for Genomic Regions
381+ # ' @description Creates a locus plot for visualizing genomic regions, highlighting SNPs, p-values, and optionally LD information.
382+ # ' @param df Dataframe containing the data to be plotted.
383+ # ' @param lead_snp Character, optional. The lead SNP to be highlighted. Defaults to the SNP with the lowest p-value if not provided.
384+ # ' @param rsid Column name for SNP IDs in `df`.
385+ # ' @param chrom Column name for chromosome information in `df`.
386+ # ' @param pos Column name for SNP position in `df`.
387+ # ' @param ref Column name for reference allele in `df`.
388+ # ' @param alt Column name for alternate allele in `df`.
389+ # ' @param effect Column name for effect size in `df`. Used to calculate p-values if `std_err` is provided.
390+ # ' @param std_err Column name for standard error in `df`. Used to calculate p-values if `effect` is provided.
391+ # ' @param p_value Column name for p-values in `df`. Used if `effect` and `std_err` are not provided.
392+ # ' @param trait Column name for traits in `df`, optional.
393+ # ' @param plot_pvalue_threshold Numeric, p-value threshold for plotting. Default is 0.1.
394+ # ' @param plot_subsample_prop Numeric, proportion of SNPs to subsample for plotting. Default is 0.25.
395+ # ' @param plot_distance Numeric, distance in base pairs to include around the lead SNP. Default is 500,000.
396+ # ' @param genome_build Character, genome build to use (e.g., "GRCh37"). Default is "GRCh37".
397+ # ' @param population Character, population for LD calculations. Default is "ALL".
398+ # ' @param plot_genes Logical, whether to include genes in the plot. Default is FALSE.
399+ # ' @param plot_recombination Logical, whether to include recombination rates in the plot. Default is FALSE.
400+ # ' @param plot_title Character, optional title for the plot.
401+ # ' @param plot_subtitle Character, optional subtitle for the plot.
402+ # ' @param label Character vector, optional SNP IDs to label in the plot.
403+ # ' @return A ggplot2 object representing the locus plot.
404+ # ' @examples
405+ # ' \dontrun{
406+ # ' gg_locusplot(df = my_data, rsid = rsid_col, chrom = chrom_col, pos = pos_col, ref = ref_col, alt = alt_col)
407+ # ' }
408+ # ' @export
409+ gg_locusplot <- function (df ,
410+ lead_snp = NULL ,
411+ rsid = rsid ,
412+ chrom = chrom ,
413+ pos = pos ,
414+ ref = ref ,
415+ alt = alt ,
416+ effect = NULL ,
417+ std_err = NULL ,
418+ p_value = p_value ,
419+ trait = NULL ,
420+ plot_pvalue_threshold = 0.1 ,
421+ plot_subsample_prop = 0.25 ,
422+ plot_distance = 500000 ,
423+ genome_build = " GRCh37" ,
424+ population = " ALL" ,
425+ plot_genes = FALSE ,
426+ plot_recombination = FALSE ,
427+ plot_title = NULL ,
428+ plot_subtitle = NULL ,
429+ label = NULL ) {
430+ # Check input arguments to ensure they are of the correct type and within reasonable ranges
431+ checkmate :: assert_data_frame(df )
432+ # checkmate::assert_string(lead_snp)
433+ checkmate :: assert_numeric(plot_pvalue_threshold , upper = 1 )
434+ checkmate :: assert_numeric(plot_subsample_prop , lower = 0 , upper = 1 )
435+ checkmate :: assert_numeric(plot_distance , lower = 0 )
436+ checkmate :: assert_logical(plot_genes )
437+ if (! is.null(label )) {
438+ checkmate :: assert_character(label )
439+ }
440+
441+ if (! rlang :: quo_is_null(rlang :: enquo(effect )) & ! rlang :: quo_is_null(rlang :: enquo(std_err ))) {
442+ checkmate :: assert_numeric(df %> % pull({{ effect }}))
443+ checkmate :: assert_numeric(df %> % pull({{ std_err }}))
444+
445+ df <- df %> %
446+ rename(.effect = {{ effect }},
447+ .std_err = {{ std_err }}) %> %
448+ mutate(log10_pval = abs((pnorm(- abs(.effect / .std_err ), log.p = TRUE ) + log(2 )) / log(10 )))
449+ } else {
450+ df <- df %> %
451+ mutate(log10_pval = - log10({{ p_value }}))
452+ }
453+
454+ if (rlang :: quo_is_null(rlang :: enquo(trait ))) {
455+ df <- df %> %
456+ select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval ) %> %
457+ mutate_if(is.factor , as.character ) %> %
458+ mutate(ref = stringr :: str_to_upper(ref ), alt = stringr :: str_to_upper(alt )) %> %
459+ group_by(rsid ) %> %
460+ slice_max(log10_pval ) %> %
461+ ungroup() %> %
462+ tidyr :: drop_na()
463+ } else {
464+ df <- df %> %
465+ select(rsid = {{ rsid }}, chromosome = {{ chrom }}, position = {{ pos }}, ref = {{ ref }}, alt = {{ alt }}, log10_pval , trait = {{ trait }}) %> %
466+ mutate_if(is.factor , as.character ) %> %
467+ mutate(ref = stringr :: str_to_upper(ref ), alt = stringr :: str_to_upper(alt )) %> %
468+ group_by(trait , rsid ) %> %
469+ slice_max(log10_pval ) %> %
470+ ungroup() %> %
471+ tidyr :: drop_na()
472+ }
473+
474+ # Create df containing information about lead SNP (by default, select SNP with lowest p-value, otherwise take user-supplied value)
475+ if (is.null(lead_snp )) {
476+ indep_snps <- df %> %
477+ slice_max(log10_pval , with_ties = FALSE , n = 1 ) %> %
478+ select(lead_rsid = rsid , lead_chromosome = chromosome , lead_position = position , lead_ref = ref , lead_alt = alt )
479+
480+ cli :: cli_alert_info(" No lead_snp supplied. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region" )
481+ } else if (! (lead_snp %in% df $ rsid )) {
482+ # ensure lead_snp is in the supplied data; if not, use minimum p-value at locus
483+ indep_snps <- df %> %
484+ slice_max(log10_pval , with_ties = FALSE , n = 1 ) %> %
485+ select(lead_rsid = rsid , lead_chromosome = chromosome , lead_position = position , lead_ref = ref , lead_alt = alt )
486+
487+ cli :: cli_alert_info(" Lead snp not present in supplied locus data. Defaulting to {indep_snps$lead_rsid} - {indep_snps$lead_chromosome}:{indep_snps$lead_position}, which has the lowest p-value in the region" )
488+ } else {
489+ indep_snps <- df %> %
490+ select(lead_rsid = rsid , lead_chromosome = chromosome , lead_position = position , lead_ref = ref , lead_alt = alt ) %> %
491+ filter(lead_rsid == lead_snp ) %> %
492+ distinct(lead_rsid , .keep_all = TRUE )
493+ }
494+
495+ # Create dataframe of variants within the region size specified by the user
496+ suppressMessages(locus_snps <- df %> %
497+ filter(rsid %in% indep_snps $ lead_rsid ) %> %
498+ select(chromosome , position , lead_rsid = rsid ) %> %
499+ purrr :: pmap_dfr(function (chromosome_filter = first , position_filter = second , lead_rsid = third ) {
500+ df %> %
501+ filter(chromosome == chromosome_filter & between(position , position_filter - plot_distance / 2 , position_filter + plot_distance / 2 )) %> %
502+ mutate(lead_rsid = lead_rsid ) %> %
503+ left_join(indep_snps )
504+ }))
505+
506+ # Extract LD and format colors
507+ possibly_ld_extract_locuszoom <- purrr :: possibly(locusplotr :: ld_extract_locuszoom , otherwise = NULL )
508+
509+ ld_extracted <- possibly_ld_extract_locuszoom(chrom = indep_snps $ lead_chromosome , pos = indep_snps $ lead_position , ref = indep_snps $ lead_ref , alt = indep_snps $ lead_alt , start = min(locus_snps $ position ), stop = max(locus_snps $ position ), genome_build = genome_build , population = population )
510+
511+ # Create dataframe with variants at locus, LD information, color codes, and labels in preparation for plotting
512+ if (! (is.null(ld_extracted ))) {
513+ # Join locus df with LD information
514+ locus_snps_ld <- ld_extracted %> %
515+ select(chromosome = chromosome2 , position = position2 , variant2 , correlation ) %> %
516+ mutate(chromosome = as.numeric(chromosome ), position = as.numeric(position )) %> %
517+ tidyr :: separate(variant2 , into = c(" chr_pos" , " ref_alt" ), sep = " _" ) %> %
518+ tidyr :: separate(ref_alt , into = c(" ref" , " alt" ), sep = " /" ) %> %
519+ right_join(locus_snps , by = c(" chromosome" = " chromosome" , " position" = " position" ), relationship = " many-to-many" ) %> %
520+ filter((ref.x == ref.y & alt.x == alt.y ) | (ref.x == alt.y & alt.x == ref.y )) %> %
521+ select(- ends_with(" .y" ), - chr_pos ) %> %
522+ rename_with(~ stringr :: str_replace(.x , " .x" , " " ), .cols = ends_with(" .x" ))
523+
524+ # Create color codes and labels
525+ locus_snps_ld <- locus_snps_ld %> %
526+ mutate(color_code = as.character(cut(as.numeric(correlation ), breaks = c(0 , 0.2 , 0.4 , 0.6 , 0.8 , 1 ), labels = c(" blue4" , " skyblue" , " darkgreen" , " orange" , " red" ), include.lowest = TRUE ))) %> %
527+ mutate(legend_label = as.character(cut(as.numeric(correlation ), breaks = c(0 , 0.2 , 0.4 , 0.6 , 0.8 , 1 ), labels = c(" 0 - 0.2" , " 0.2 - 0.4" , " 0.4 - 0.6" , " 0.6 - 0.8" , " 0.8 - 1" ), include.lowest = TRUE ))) %> %
528+ mutate(lead = rsid == lead_rsid ) %> %
529+ mutate(label = case_when(
530+ rsid == lead_rsid ~ lead_rsid ,
531+ rsid %in% label ~ rsid ,
532+ TRUE ~ NA_character_
533+ )) %> %
534+ mutate(color_code = case_when(
535+ rsid == lead_rsid ~ " purple" ,
536+ TRUE ~ color_code
537+ )) %> %
538+ mutate(color_code = forcats :: fct_expand(color_code , " purple" , " red" , " orange" , " darkgreen" , " skyblue" , " blue4" )) %> %
539+ mutate(color_code = forcats :: fct_relevel(color_code , " purple" , " red" , " orange" , " darkgreen" , " skyblue" , " blue4" )) %> %
540+ mutate(legend_label = case_when(
541+ rsid == lead_rsid ~ " Ref" ,
542+ TRUE ~ legend_label
543+ )) %> %
544+ mutate(legend_label = forcats :: fct_expand(legend_label , " Ref" , " 0.8 - 1" , " 0.6 - 0.8" , " 0.4 - 0.6" , " 0.2 - 0.4" , " 0 - 0.2" )) %> %
545+ mutate(legend_label = forcats :: fct_relevel(legend_label , " Ref" , " 0.8 - 1" , " 0.6 - 0.8" , " 0.4 - 0.6" , " 0.2 - 0.4" , " 0 - 0.2" ))
546+ } else {
547+ # Deal with scenario where lead variant is not present in LD database
548+ cli :: cli_alert_info(" No linkage disequilibrium information found" )
549+ locus_snps_ld <- locus_snps %> %
550+ mutate(correlation = NA_integer_ ) %> %
551+ mutate(lead = rsid == lead_rsid ) %> %
552+ mutate(label = case_when(
553+ rsid == lead_rsid ~ lead_rsid ,
554+ TRUE ~ NA_character_
555+ )) %> %
556+ mutate(color_code = case_when(
557+ rsid == lead_rsid ~ " purple" ,
558+ TRUE ~ " grey50"
559+ )) %> %
560+ mutate(legend_label = case_when(
561+ rsid == lead_rsid ~ " Ref" ,
562+ TRUE ~ " Other"
563+ ))
564+ }
565+
566+ # group locus by trait if necessary
567+ if (! rlang :: quo_is_null(rlang :: enquo(trait ))) {
568+ locus_snps_ld <- locus_snps_ld %> %
569+ group_by(.data = . , trait )
570+ }
571+
572+ locus_snps_ld_label <- locus_snps_ld %> % filter(! is.na(label ))
573+
574+ # Make plot (sample non-significant p-values to reduce overplotting)
575+ regional_assoc_plot <- locus_snps_ld %> %
576+ distinct(rsid , .keep_all = TRUE ) %> %
577+ filter(log10_pval > - log10(plot_pvalue_threshold ) | correlation > 0.2 | legend_label == " Ref" ) %> % # improve overplotting
578+ bind_rows(locus_snps_ld %> %
579+ filter(log10_pval < = - log10(plot_pvalue_threshold ) & correlation < 0.2 & legend_label != " Ref" ) %> %
580+ slice_sample(prop = plot_subsample_prop )) %> %
581+ arrange(color_code , log10_pval ) %> %
582+ ggplot(aes(position , log10_pval )) +
583+ geom_point(aes(fill = factor (color_code ), size = lead , alpha = lead , shape = lead )) +
584+ ggrepel :: geom_label_repel(data = locus_snps_ld_label , aes(label = label ),
585+ size = 4 ,
586+ color = " black" ,
587+ fontface = " bold" ,
588+ fill = " white" ,
589+ min.segment.length = 0 ,
590+ box.padding = 1 ,
591+ alpha = 1 ,
592+ nudge_y = 4
593+ ) +
594+ geom_hline(yintercept = - log10(5e-8 ), linetype = " dashed" ) +
595+ scale_fill_identity(parse(text = " r^2" ), guide = " legend" , labels = levels(forcats :: fct_drop(locus_snps_ld $ legend_label )), na.translate = FALSE ) +
596+ scale_size_manual(values = c(3 , 5 ), guide = " none" ) +
597+ scale_shape_manual(values = c(21 , 23 ), guide = " none" ) +
598+ scale_alpha_manual(values = c(0.8 , 1 ), guide = " none" ) +
599+ scale_x_continuous(breaks = scales :: extended_breaks(n = 5 ), labels = scales :: label_number(scale = 1 / 1e6 )) +
600+ scale_y_continuous(expand = expansion(mult = c(0 , 0.1 ))) +
601+ guides(fill = guide_legend(override.aes = list (shape = 22 , size = 6 ),
602+ position = " inside" )) +
603+ labs(
604+ title = plot_title ,
605+ subtitle = plot_subtitle ,
606+ x = glue :: glue(" Position on Chromosome {unique(indep_snps$lead_chromosome)} (Mb)" ),
607+ y = " -log<sub>10</sub>(P-value)"
608+ ) +
609+ theme_bw(base_size = 16 ) +
610+ theme(
611+ plot.title = element_text(face = " bold" ),
612+ legend.position = " none" ,
613+ # legend.text = element_text(size = 10),
614+ # legend.title = element_text(size = 10, hjust = 0.5),
615+ # legend.justification.inside = c("right", "top"),
616+ # legend.position.inside = c(0.99, 0.99),
617+ # legend.spacing.y = unit(0, "pt"),
618+ strip.text = element_text(color = " black" ),
619+ strip.text.x = element_blank(),
620+ axis.title.y = ggtext :: element_markdown()
621+ )
622+
623+ if (plot_recombination ) {
624+ cli :: cli_alert_info(" Extracting recombination rates for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}" )
625+ ylim <- max(pull(locus_snps_ld , log10_pval ), na.rm = TRUE ) +
626+ 0.3 * max(pull(locus_snps_ld , log10_pval ), na.rm = TRUE )
627+
628+ recomb_df <- recomb_extract_locuszoom(chrom = indep_snps $ lead_chromosome , start = indep_snps $ lead_position - plot_distance / 2 , end = indep_snps $ lead_position + plot_distance / 2 , genome_build = genome_build ) %> %
629+ select(position , recomb_rate )
630+ # return(recomb_df)
631+ suppressMessages(
632+ regional_assoc_plot <- regional_assoc_plot +
633+ geom_line(data = recomb_df , mapping = aes(x = position , y = recomb_rate ), color = " lightblue" , linewidth = 0.5 ) +
634+ scale_y_continuous(
635+ name = " -log<sub>10</sub>(P-value)" ,
636+ limits = c(0 , ylim ),
637+ sec.axis = sec_axis(
638+ ~ . * (100 / ylim ),
639+ name = " Recombination rate (cM/Mb)"
640+ # labels = scales::percent_format()
641+ )
642+ ) +
643+ theme(axis.title.y.right = element_text(vjust = 1.5 ))
644+ )
645+
646+ regional_assoc_plot <- gginnards :: move_layers(regional_assoc_plot , " GeomLine" , " bottom" )
647+
648+ }
649+
650+ # Add plot of genes if requested by user
651+ if (plot_genes ) {
652+ cli :: cli_alert_info(" Extracting genes for the region {indep_snps$lead_chromosome}:{indep_snps$lead_position - plot_distance/2}-{indep_snps$lead_position + plot_distance/2}" )
653+ geneplot <- gg_geneplot(chr = indep_snps $ lead_chromosome , start = indep_snps $ lead_position - plot_distance / 2 , end = indep_snps $ lead_position + plot_distance / 2 , genome_build = genome_build ) +
654+ theme(plot.margin = margin(0 , 5.5 , 5.5 , 5.5 ))
655+
656+ suppressWarnings(suppressMessages(regional_assoc_plot <- patchwork :: wrap_plots(list (
657+ regional_assoc_plot +
658+ labs(x = " " ) +
659+ xlim(indep_snps $ lead_position - plot_distance / 2 , indep_snps $ lead_position + plot_distance / 2 ) +
660+ theme(
661+ axis.text.x = element_blank(),
662+ axis.ticks.x = element_blank(),
663+ axis.title.x = element_blank(),
664+ plot.margin = margin(5.5 , 5.5 , 0 , 5.5 )
665+ ),
666+ geneplot
667+ ), nrow = 2 , heights = c(3 , 1 ))))
668+ }
669+
670+ return (regional_assoc_plot )
671+ }
0 commit comments