---
title: Differential abundance analysis with DIA-NN - starting from the protein groups table
---

```{r, echo = FALSE}
source("R/knitr_setup.R")
```

```{r load_libraries}
library("QFeatures")
library("dplyr")
library("tidyr")
library("ggplot2")
library("msqrob2")
library("stringr")
library("ExploreModelMatrix")
library("MsCoreUtils")
library("matrixStats")
library("patchwork")
library("kableExtra")
library("ComplexHeatmap")
library("purrr")
library("tibble")
library("scater")
```

# Data processing

We will now make a QFeatures object starting from the protein level summaries from DIA-NN.
Normally we would start from the protein groups file that is provided by DIA-NN, i.e. the pg_matrix.tsv.


```{r}
# Path (relative to the project root) of the protein-group matrix to analyse.
proteinGroupFile <- "data/spikein248-staesetal2024.pg_matrix.tsv"
# Read the table with data.table::fread(); the result is kept in `proteins`.
proteins <- data.table::fread(proteinGroupFile)
```

We add species information. This is possible for this specific dataset, which is a benchmark dataset with lysate from human UPS proteins and yeast proteins.

```{r}
# Label every protein group by species: groups whose accession contains the
# literal text "UPS" are the spiked-in standards ("ups"), all others "yeast".
# The factor levels are fixed explicitly (yeast first, ups second) so their
# order does not depend on which values happen to occur in the data.
proteins <- proteins |>
  mutate(species = factor(
    grepl(pattern = "UPS", Protein.Group, fixed = TRUE),
    levels = c(FALSE, TRUE),
    labels = c("yeast", "ups")
  ))
# Number of protein groups per species.
proteins |>
  pull(species) |>
  table()
```


The protein group table is a file in wide format.
The columns with the raw file names contain the maxLFQ protein group intensities for the different samples.

```{r}
# Positions of the intensity columns: those whose name contains the literal
# text ".raw". `fixed = TRUE` keeps "." from acting as a regex wildcard.
quantCols <- grep(".raw", names(proteins), fixed = TRUE)
names(proteins)[quantCols]
```

We will rename these columns so that they have shorter file names.

```{r}
# Shorten the intensity column names:
#   - drop the literal ".raw" and "_DIA" pieces (fixed = TRUE: no regex),
#   - keep only the part that follows "UPS2_".
# vapply() guarantees a character vector with one name per column.
names(proteins)[quantCols] <- names(proteins)[quantCols] |>
    gsub(pattern = ".raw", replacement = "", fixed = TRUE) |>
    gsub(pattern = "_DIA", replacement = "", fixed = TRUE) |>
    strsplit(split = "UPS2_", fixed = TRUE) |>
    vapply(`[`, character(1), 2)
names(proteins)
```
We now make the annotation file.

```{r}
# `quantCols` held column positions up to here; from now on it holds the
# (shortened) column names themselves.
quantCols <- names(proteins)[quantCols]

# Drop the "ratio" prefix and split each name on "_":
# piece 1 is the spike-in ratio, piece 2 (when present) the replicate.
sampleParts <- gsub("ratio", "", quantCols, fixed = TRUE) |>
    strsplit(split = "_", fixed = TRUE)

annot_pg <- data.frame(
    quantCols = quantCols,
    # Going through as.double() makes the factor levels sort numerically.
    condition = vapply(sampleParts, `[`, character(1), 1) |>
        as.double() |>
        as.factor(),
    # Names without a replicate piece yield NA and are assigned replicate "1".
    rep = vapply(sampleParts, `[`, character(1), 2) |>
        replace_na(replace = "1") |>
        as.factor()
)
annot_pg
```

```{r}
# Build the QFeatures object from the protein table and the annotation table.
# The set is named "proteins_raw" and the Protein.Group column is passed as
# `fnames`.
qf_pg <- readQFeatures(
  proteins,
  annot_pg,
  name = "proteins_raw",
  fnames = "Protein.Group"
)
qf_pg
```


## log-transformation

We first convert zeros to NA and then perform the log transformation.

```{r}
# Recode zero intensities as NA in every set currently in the object.
qf_pg <- zeroIsNA(qf_pg, i = names(qf_pg))
# Add a log2-transformed version of "proteins_raw" under the name "proteins".
qf_pg <- logTransform(
  qf_pg,
  i = "proteins_raw",
  name = "proteins",
  base = 2
)
qf_pg
```

We plot the marginal intensity distributions to inspect if further normalisation is required.

```{r}
# One density curve of the log2 intensities per sample (`colname`),
# coloured by condition.
qf_pg[, , "proteins"] |> # keep only the "proteins" set
  longForm(colvars = c("rep", "condition")) |> # long table + sample annotation
  data.frame() |>
  filter(!is.na(value)) |> # missing intensities cannot be plotted
  ggplot(aes(x = value, colour = condition, group = colname)) +
  geom_density() +
  theme_minimal()
```

```{r}
# Same per-sample density plot, now split into one panel per species.
qf_pg[, , "proteins"] |> # keep only the "proteins" set
  longForm(
    colvars = c("rep", "condition"), # sample annotation to carry along
    rowvars = "species" # feature annotation needed for the facets
  ) |>
  data.frame() |>
  filter(!is.na(value)) |> # missing intensities cannot be plotted
  ggplot(aes(x = value, colour = condition, group = colname)) +
  geom_density() +
  facet_wrap(~species) +
  theme_minimal()
```

```{r}
# Extract the "proteins" set together with the sample annotation and convert
# it to a SingleCellExperiment so the scater MDS helpers can be applied.
sce_pg <- as(getWithColData(qf_pg, "proteins"), "SingleCellExperiment")
# MDS on the first assay; points coloured by condition, shaped by replicate.
sce_pg |>
  scater::runMDS(exprs_values = 1) |>
  scater::plotMDS(colour_by = "condition", shape_by = "rep")
```


# Data Modeling  and Inference

Setup model and contrasts of interest.

```{r}
# Model formula: protein abundance as a function of condition.
model <- ~ condition

# Visualise the design implied by the formula and the sample annotation.
vd <- ExploreModelMatrix::VisualizeDesign(
    sampleData = colData(qf_pg),
    designFormula = model,
    textSizeFitted = 4
)
vd[["plotlist"]]

# All pairwise comparisons between the levels of `condition`.
allHypotheses <- createPairwiseContrasts(model, colData(qf_pg), "condition")
allHypotheses

# Contrast matrix expressed in the parameters of the design matrix above.
L <- makeContrast(
    allHypotheses,
    parameterNames = colnames(vd[["designmatrix"]])
)
L
```

Fit models and do inference.

```{r warning=FALSE}
  # Fit the model to every feature of the "proteins" set (robust = TRUE).
  qf_pg <- msqrob(qf_pg, i = "proteins", formula = model, robust = TRUE)
  # Test each contrast (column of L) on the fitted models.
  qf_pg <- hypothesisTest(qf_pg, i = "proteins", contrast = L)
```

We extract all results tables

```{r}
# Species label of every feature in the "proteins" set.
speciesPerFeature <- rowData(qf_pg[["proteins"]])$species
# Collect the results of all contrasts in one table and attach the species.
# NOTE(review): repeating the species vector once per contrast assumes that
# msqrobCollect() stacks one complete block of features per contrast, in
# feature order — confirm.
inferences_pg <- msqrobCollect(qf_pg[["proteins"]], L) |>
    mutate(species = rep(speciesPerFeature, times = ncol(L)))
```


# Assess performance

Note, that the evaluations in this section are typically not possible on real experimental data.
Indeed, we are using a spike-in dataset so we know the ground truth: all human UPS proteins are differentially abundant (DA) between the conditions and the yeast proteins are non-DA.

We use a nominal FDR level of 0.05

```{r}
# Nominal FDR level; referenced as `alpha` by the chunks that follow.
alpha <- 0.05
```

## Real Fold changes

As this is a spike-in study with known ground truth, we can also plot the log2 fold change distributions against the expected values, in this case 0 for the yeast proteins and the difference of the log concentration for the spiked-in UPS standards. We first create a small table with the real values.

As this is a spike-in study with known ground truth, we can also plot the log2 fold change distributions against the expected values, in this case 0 for the yeast proteins and the difference of the log concentration for the spiked-in UPS standards. We first create a small table with the real values.

1. We extract the levels from the factor condition.
2. We convert them into numbers, as they refer to the real ratios.
3. We log2-transform them.
4. We calculate all pairwise differences between the log2-transformed ratios to obtain the log2 fold changes.

We use a similar operation to construct the name of the corresponding contrast.

```{r}
# Levels of `condition`; they encode the real spike-in ratios.
conditionLevels <- levels(colData(qf_pg)$condition)

# Ground-truth log2 fold change for every pair of conditions, together with
# the name of the matching contrast. The names are built from the levels
# instead of being hard-coded: a comparison against the first (reference)
# level is named "condition<b>", any other pair "condition<b> - condition<a>".
realLogFC <- data.frame(
    logFC = conditionLevels |>
        as.double() |> # 2.
        log2() |> # 3.
        combn(m = 2, FUN = diff), # 4.
    contrast = combn(conditionLevels, m = 2, FUN = function(pair) {
        if (pair[1] == conditionLevels[1]) {
            paste0("condition", pair[2])
        } else {
            paste0("condition", pair[2], " - condition", pair[1])
        }
    })
)
realLogFC
```

We now evaluate if the estimated fold changes correspond to the real fold changes.

1. We plot the log fold changes as a function of species and colour according to species.
2. We add a boxplot layer.
3. We use custom colors.
4. We add a horizontal line at 0, which corresponds to the known log2 fold change for yeast proteins.
5. We add horizontal lines for the known log2 fold changes of the spiked UPS proteins.

```{r}
# Boxplots of the estimated log2 fold changes per species and contrast,
# with the known fold changes drawn as reference lines.
# Colours are matched to the species by name, so the legend cannot be
# mislabelled if the order of the species levels ever changes.
logFC <- inferences_pg |>
  filter(!is.na(logFC)) |>
  ggplot(aes(x = species, y = logFC, color = species)) + # 1.
  geom_boxplot() + # 2.
  theme_bw() +
  scale_color_manual(
    values = c(yeast = "grey20", ups = "firebrick"), # 3.
    name = ""
  ) +
  geom_hline(yintercept = 0, color = "grey20") + # 4.
  facet_wrap(~contrast) +
  geom_hline(
    aes(yintercept = logFC),
    color = "firebrick",
    data = realLogFC
  ) # 5.
logFC
```

## True and false positives

All human UPS proteins are differentially abundant (DA) between the conditions and the yeast proteins are non-DA.
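For each contrast, we count the true positives (TP) and false positives (FP) and compute the false discovery proportion (FDP) among the proteins that are significant at the nominal FDR level.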

```{r}
# Per contrast, among the proteins with adjPval below alpha:
#   TP  = number of UPS proteins,
#   FP  = number of non-UPS proteins,
#   FDP = fraction of non-UPS proteins.
tpFpTable <- inferences_pg |>
    filter(adjPval < alpha) |>
    group_by(contrast) |>
    summarise(
        TP = sum(species == "ups"),
        FP = sum(species != "ups"),
        FDP = mean(species != "ups")
    )
tpFpTable
```


## TPR - FDP curves

We generate the TPR-FDP curves to assess the performance of the different workflows to prioritise differentially abundant proteins. Again, these curves are built using the ground truth information about which proteins are differentially abundant (spiked in) and which proteins are constant across samples. We create two functions to compute the TPR and the FDP.

```{r}
# Compute the false discovery proportion (FDP) reached at each feature.
#
# pval: numeric vector of p-values.
# tp:   logical vector of the same length; TRUE marks a true positive.
#
# Returns a numeric vector in the input order: element i is the fraction of
# non-true-positives among the features ranked at or before feature i when
# sorting by increasing p-value.
computeFDP <- function(pval, tp) {
    stopifnot(length(pval) == length(tp))
    ord <- order(pval)
    # seq_along() stays correct for zero-length input, unlike 1:length().
    fdp <- cumsum(!tp[ord]) / seq_along(ord)
    # order(ord) maps the rank-ordered values back to the input order.
    fdp[order(ord)]
}
# Compute the true positive rate (TPR) reached at each feature.
#
# pval: numeric vector of p-values.
# tp:   logical vector; TRUE marks a true positive.
# nTP:  total number of true positives used as denominator; when NULL it is
#       taken to be sum(tp).
#
# Returns a numeric vector in the input order: element i is the number of
# true positives ranked at or before feature i (sorting by increasing
# p-value), divided by the denominator.
computeTPR <- function(pval, tp, nTP = NULL) {
    rankIdx <- order(pval)
    cumulativeTP <- cumsum(tp[rankIdx])
    denominator <- if (is.null(nTP)) sum(tp) else nTP
    # order(rankIdx) maps the rank-ordered values back to the input order.
    (cumulativeTP / denominator)[order(rankIdx)]
}
```

We apply these functions and compute the corresponding metric using the statistical inference results and the ground truth information.

```{r}
# Within each contrast, compute the TPR and FDP reached at every protein when
# ranking by p-value; UPS proteins count as the true positives.
performance <- group_by(inferences_pg, contrast) |>
    na.exclude() |> # drop rows that contain an NA in any column
    mutate(
        tpr = computeTPR(pval, species == "ups"),
        fdp = computeFDP(pval, species == "ups")
    ) |>
    arrange(fdp)
```

We also highlight the observed FDP at the $\alpha = 5\%$ FDR threshold.

```{r}
# Working point per contrast: among the proteins significant at the nominal
# FDR level `alpha`, keep the one(s) with the largest p-value, i.e. the last
# protein(s) still called significant.
workPoints <- performance |>
    group_by(contrast) |>
    filter(adjPval < alpha) |>
    slice_max(pval)
```

```{r}
# TPR-FDP curve per contrast. The working points are overlaid as large dots
# and the dashed line marks the nominal FDR level `alpha`.
# coord_flip() swaps the axes for display and restricts the FDP range shown
# to [0, 0.2].
ggplot(performance, aes(x = tpr, y = fdp)) +
    geom_line() +
    geom_point(data = workPoints, size = 3) +
    geom_hline(yintercept = alpha, linetype = 2) +
    facet_wrap(~contrast) +
    coord_flip(ylim = c(0, 0.2)) +
    theme_minimal()
```


## volcano-plots

We make a separate volcano plot per contrast.

```{r}
  # Volcano plot per contrast: colour marks the species, shape marks whether
  # the protein is significant at the nominal FDR level `alpha`.
  # Colours are matched to the species by name, and the shape legend title is
  # derived from `alpha`, so neither can drift from the data.
  inferences_pg |>
  plot_volcano() +
  facet_wrap(~contrast) +
  aes(color = species, shape = adjPval < alpha) +
  scale_color_manual(
    values = c(yeast = "grey20", ups = "firebrick"),
    name = "species"
  ) +
  labs(shape = paste("FDR <", alpha)) +
  # Known log2 fold changes of the spiked-in proteins.
  geom_vline(aes(xintercept = logFC), data = realLogFC, col = "firebrick") +
  # Per-contrast horizontal line at -log10(alpha * fraction of significant
  # tests).
  geom_hline(
    aes(yintercept = -log10(adjAlpha)),
    data = inferences_pg |>
      group_by(contrast) |>
      summarize(
        adjAlpha = alpha * mean(adjPval < alpha, na.rm = TRUE),
        .groups = "drop"
      )
  )
```