selection-stats/selectionStats_piNpiS.Rmd at main · topfm/selection-stats · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
---
title: "selection statistics and piNpiS"
output:
  pdf_document: default
  html_notebook: default
  html_document:
    df_print: paged
---
```{r, include = FALSE}
library(tidyverse)
library(formattable)
library(stargazer)
```

```{r, data, error=FALSE, echo=FALSE,warning=FALSE}
genes <- c("DPDCJFFM_01065","DPDCJFFM_01063", "dps", "pgl", "pdtaR", "pdtaS", "xfp", "nrdD", "metQ", "luxS", "ulaA")
sum_stats_20 <- read_delim("~/data/gvaginalis_expEvo/selectionStats/2023.03.06_selectionStats_script/20_selectionStats.txt", show_col_types = FALSE) %>%
  rename_with(~"gene", .cols = "Alignment") %>%
  mutate(across(c(Theta, TajimasD, Pi, nseff), as.numeric), TajimasD = case_when(Pi == 0 & Theta == 0 ~ 0, TRUE ~ TajimasD))

```

```{r, echo= FALSE, include=FALSE}
#filter out NAs and count them
num_nas <- sum_stats_20 %>%
  filter(is.na(TajimasD))

filter_out <- nrow(num_nas)
#filter out
sum_stats_20_filtered <- sum_stats_20 %>% filter(!is.na(TajimasD))

#report the number of genes that were filtered out for missing data
print(paste("There are", nrow(num_nas), "genes which were filtered out for >20% missing data"))
print(paste("There are", nrow(sum_stats_20_filtered), "genes that passed filtering."))

#make a list of the genes
TD_genes <- sum_stats_20_filtered$gene
num_genes <- nrow(sum_stats_20) - filter_out

round_and_rank <- function(sum_stats, n) {
  ranked <- sum_stats %>%
    mutate(across(c(TajimasD, Theta, Pi, nseff), ~ round(., digits = 3)),
           TD_rank = round(rank(TajimasD, ties.method = "average") / num_genes * 100),
           Theta_rank = round(rank(Theta,  ties.method = "average") / num_genes * 100),
           Pi_rank = round(rank(Pi,  ties.method = "average") / num_genes * 100))
  ranked$n = n
  return(ranked)
}

ranked_20 <- round_and_rank(sum_stats_20_filtered, 20)
all <- ranked_20


```

## The distribution and summary of piNpiS calculations where N/0 = None, and are discarded from the analysis.

```{r, echo=FALSE}

#read in the piNpiS output file from mean_piNpiS.py
pin_pis_None <- read_csv("~/piNpiS_test/None/selectionStats_piNpiS_average.csv", show_col_types = FALSE,
                    col_names = c("gene", "PiNPiS")) %>%
  mutate(PiNPiS_rank = round(rank(PiNPiS, ties.method = "average") / n() * 100)) %>% filter(gene %in% TD_genes)
stats_None <- merge(all, pin_pis_None)

#print the number of genes that were analyzed
print(paste(nrow(pin_pis_None), "of 1267 genes were analyzed"))

#give the summary statistics of the genes in the genome
print("Summary of selection stats across all genes: ")
summary(stats_None)

#make a list of the genes with the highest Tajima's D
top_TD <- stats_None %>% filter(TajimasD > 2)
#write_csv(top_TD, "~/2023.05.25_GV_Grant/top_TD.csv")

#filter to my particular genes of interest
interest <- filter(stats_None, gene %in% genes) %>%
  mutate(gene = if_else(gene == "DPDCJFFM_01065", "fas", gene), gene= if_else(gene == "DPDCJFFM_01063", "bapA", gene)) %>% select(gene, Pi, Pi_rank, Theta, Theta_rank, TajimasD, TD_rank,  PiNPiS, PiNPiS_rank)

#plot the distribution of genes
distribution <- ggplot(pin_pis_None, aes(PiNPiS)) + geom_histogram(binwidth = .1) + theme_classic() + ggtitle("None") + theme(text = element_text(size=15)) + ggtitle("distribution of piNpiS with N/0 discarded")
distribution
```
# The genes we analyzed were dps, pgl, fas, pdtaR, pdtaS, xfp, and nrdD
```{r, results='asis'}
#make a table in LaTeX
stargazer(interest, summary = FALSE, type= "latex")
```