Skip to content

Commit 1b91c70

Browse files
committed
adding data var
1 parent 1cc19de commit 1b91c70

16 files changed

+412
-0
lines changed

dataVariance/DESCRIPTION

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
Package: dataVariance
2+
Type: Package
3+
Title: A package that complements Kallisto for quick, informative *seq analysis
4+
Version: 0.0.1
5+
Date: 2015-09-16
6+
Author: Anthony Colombo, Tim Triche Jr.
7+
Depends: parallel, jsonlite, GenomeInfoDb, limma, TxDbLite, SummarizedExperiment, tools, arkas
8+
Imports: rhdf5, matrixStats, GenomicRanges, GenomicFeatures, Matrix
9+
Suggests: roxygen2, knitr
10+
VignetteBuilder: VignetteBuilder: knitr
11+
Maintainer: Anthony R. Colombo <[email protected]>
12+
Description: The package can compare data variance of two Kallito Experiments used with differring versions of Kallisto.
13+
License: GPL (>= 2)
14+
RoxygenNote: 6.0.1

dataVariance/NAMESPACE

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(findOutliers)
4+
export(findStandardizedMeanDifference)
5+
export(findVariation)
6+
exportMethods(SetTxDbLiteMetaClass)
7+
import(GenomicRanges)
8+
import(arkas)
9+
import(artemis)
16 KB
Binary file not shown.
16 KB
Binary file not shown.

dataVariance/R/Methods.R

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#new generics for KallistoDifference Eobjectperiments
2+
setGeneric("SetTxDbLiteMetaClass", function(object) standardGeneric("SetTxDbLiteMetaClass"))
3+
4+
5+
#'
6+
#'
7+
#' Sets meta columns type to match TxDbLite classes
8+
#'@import GenomicRanges
9+
#'@export
10+
setMethod("SetTxDbLiteMetaClass",signature="KallistoExperiment",
11+
function(object){
12+
class(mcols(object)$tx_length)<-"integer"
13+
class(mcols(object)$gc_content)<-"numeric"
14+
class(mcols(object)$tx_id)<-"character"
15+
class(mcols(object)$gene_id)<-"character"
16+
class(mcols(object)$gene_name)<-"character"
17+
class(mcols(object)$entrezid)<-"character"
18+
class(mcols(object)$tx_biotype)<-"character"
19+
class(mcols(object)$gene_biotype)<-"character"
20+
class(mcols(object)$biotype_class)<-"character"
21+
return(object)
22+
23+
})

dataVariance/R/findOutliers.R

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#' this finds the outliers of two covarites in a SE4 object
2+
#'
3+
#'
4+
#' @param diff object from findPercentDifferenceAssays
5+
#'
6+
#' @return a kallisto Experiment of differences
7+
#'
8+
#' @import artemis
9+
#'@export
10+
#'
11+
#'
12+
findOutliers<-function(diff){
13+
14+
data(UnannotatedTranscripts)
15+
rowdat <- rep(UnannotatedTranscripts, nrow(diff$est_counts))
16+
results<-KallistoExperiment(est_counts=diff$est_counts,
17+
eff_length=diff$eff_length,
18+
est_counts_mad=diff$est_counts_mad,
19+
transcriptomes=diff$transcriptomes,
20+
kallistoVersion=diff$newkallistoVersion,
21+
covariates=diff$covariates,
22+
features=rowdat)
23+
24+
rownames(results)<-diff$rownames
25+
results<-SetTxDbLiteMetaClass(results)
26+
outs<-diff[grepl("_Outlrs",names(diff))]
27+
outs<-outs[which(as.numeric(summary(outs)[,1])>0)]
28+
29+
for(i in 1:length(outs)){
30+
foundTxs<-intersect(rownames(results),names(outs[[i]]))
31+
feats<-features(results)
32+
feats[foundTxs]<-outs[[i]]
33+
features(results)[foundTxs]<-feats[foundTxs]
34+
}
35+
36+
#add meta columns
37+
mcols(results)$deltaTx_length<-diff[[5]]
38+
mcols(results)$deltaGc_content<-diff[[6]]
39+
40+
return(results)
41+
42+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#' this finds the percent difference of two covarites in a SE4 object
2+
#'
3+
#' @param kexpNew a kallisto experiment from artemis
4+
#' @param kexpOrigin a kallisto experiment considered 'origin' to be compare to
5+
#'
6+
#' @return a list with a bunch of difference data
7+
#'
8+
#'
9+
#' @import arkas
10+
#'
11+
#'@export
12+
#'
13+
#'
14+
15+
16+
findStandardizedMeanDifference<-function(kexpNew,kexpOrigin){
17+
message(paste0("finding the difference between : ",kallistoVersion(kexpNew)," and ", kallistoVersion(kexpOrigin)))
18+
#check the parameters for types
19+
stopifnot((class(kexpNew)[[1]]=="KallistoExperiment"))
20+
message("checking assays names ...")
21+
stopifnot(names(assays(kexpNew))==names(assays(kexpOrigin)))
22+
message("assays names O.K. ...")
23+
message("checking transcriptomes...")
24+
stopifnot(identical(transcriptomes(kexpNew),transcriptomes(kexpOrigin)))
25+
message("checking covariates ... ")
26+
stopifnot(identical(colData(kexpNew),colData(kexpOrigin)))
27+
message("checking row names. ..")
28+
stopifnot(identical(rownames(kexpNew),rownames(kexpOrigin)))
29+
covariatesList<-as.list(names(assays(kexpNew)))
30+
#need to add GC_content and tx_length as target covariates
31+
message("checking meta colnames ... ")
32+
stopifnot(identical(colnames(mcols(rowRanges(kexpNew))),colnames(mcols(rowRanges(kexpOrigin)))))
33+
34+
35+
covariatesList<-append(covariatesList,list(colnames(mcols(rowRanges(kexpNew)))[grepl("tx_length",colnames(mcols(rowRanges(kexpNew))))]))
36+
covariatesList<-append(covariatesList,list(colnames(mcols(rowRanges(kexpNew)))[grepl("gc_content",colnames(mcols(rowRanges(kexpNew))))]))
37+
38+
names(covariatesList)<-covariatesList
39+
resNew<-lapply(covariatesList,function(x) .kexpAssayAccessor(x,kexpNew))
40+
resOrigin<-lapply(covariatesList,function(x) .kexpAssayAccessor(x,kexpOrigin))
41+
message("checking list lengths ... ")
42+
stopifnot(length(resNew)==length(resOrigin))
43+
resultsDiffList<-.calculateDifferenceFromList(kexpNew,resNew,resOrigin)
44+
resultsDiffList$transcriptomes<-transcriptomes(kexpNew)
45+
resultsDiffList$newkallistoVersion<-kallistoVersion(kexpNew)
46+
resultsDiffList$covariates<-colData(kexpNew)$ID
47+
resultsDiffList$rownames<-rownames(kexpNew)
48+
return(resultsDiffList)
49+
} #top
50+
51+
52+
.kexpAssayAccessor<-function(type,kexp){
53+
if(type=="est_counts"){
54+
results<-assays(kexp)$est_counts
55+
56+
}
57+
else if(type=="tpm"){
58+
results<-assays(kexp)$tpm
59+
}
60+
61+
else if(type=="eff_length"){
62+
results<-assays(kexp)$eff_length
63+
}
64+
65+
else if(type=="est_counts_mad"){
66+
results<-assays(kexp)$est_counts_mad
67+
}
68+
69+
else if(type=="tx_length"){
70+
results<-as.matrix(mcols(rowRanges(kexp))$tx_length)
71+
rownames(results)<-rownames(kexp)
72+
colnames(results)<-"tx_length"
73+
}
74+
else if(type=="gc_content"){
75+
results<-as.matrix(mcols(rowRanges(kexp))$gc_content)
76+
rownames(results)<-rownames(kexp)
77+
colnames(results)<-"gc_content"
78+
}
79+
80+
else{
81+
82+
message("I'm afraid, I don't know how to access ...")
83+
}
84+
return(results)
85+
86+
}
87+
88+
89+
.calculateDifferenceFromList<-function(kexp,resNew,resOrigin){
90+
diffList<-list()
91+
outliers<-NULL
92+
outLiars<-list()
93+
94+
for(k in 1:length(resNew)){
95+
dataN<-resNew[[k]]
96+
dataO<-resOrigin[[k]]
97+
diffMatrix<-matrix(data=NA,nrow=nrow(dataN),ncol=ncol(dataN))
98+
stopifnot(rownames(dataN)==rownames(dataO)) #this must be true!!!
99+
rownames(diffMatrix)<-rownames(dataN)
100+
colnames(diffMatrix)<-colnames(dataN)
101+
# diffMatrix<-(dataN-dataO)/(dataO + 1)
102+
##standardized mean difference
103+
x<-dataN-dataO
104+
X<-x-mean(x)
105+
X<-X/(sd(X)/sqrt(ncol(X)))
106+
diffMatrix<-X
107+
outliers<-rownames(diffMatrix)[ which(abs(diffMatrix)>1.01)]
108+
outliers<-outliers[(!is.na(outliers))]
109+
diffList[[k]]<-diffMatrix
110+
names(diffList)[k]<-names(resNew)[k]
111+
outLiars[[k]]<-rowRanges(kexp)[outliers]
112+
names(outLiars)[k]<-paste0(names(resNew)[k],"_Outlrs")
113+
114+
}#for top res
115+
116+
for(m in 1:length(outLiars)){
117+
diffList<-append(diffList,list(outLiars[[m]]))
118+
names(diffList)[which(names(diffList)=="")]<-names(outLiars)[m]
119+
}
120+
121+
return(diffList)
122+
123+
}#top of function
124+
125+

dataVariance/R/findVariation.R

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#' this finds the percent difference of two covarites in a SE4 object
2+
#'
3+
#' @param kexpNew a kallisto experiment from artemis
4+
#' @param kexpOrigin a kallisto experiment considered 'origin' to be compare to
5+
#'
6+
#' @return a list with a bunch of difference data
7+
#'
8+
#'
9+
#' @import arkas
10+
#'
11+
#'@export
12+
#'
13+
#'
14+
15+
16+
findVariation<-function(kexpNew,kexpOrigin){
17+
message(paste0("finding the difference between : ",kallistoVersion(kexpNew)," and ", kallistoVersion(kexpOrigin)))
18+
#check the parameters for types
19+
stopifnot((class(kexpNew)[[1]]=="KallistoExperiment"))
20+
message("checking assays names ...")
21+
stopifnot(names(assays(kexpNew))==names(assays(kexpOrigin)))
22+
message("assays names O.K. ...")
23+
message("checking transcriptomes...")
24+
stopifnot(identical(transcriptomes(kexpNew),transcriptomes(kexpOrigin)))
25+
message("checking covariates ... ")
26+
stopifnot(identical(colData(kexpNew),colData(kexpOrigin)))
27+
message("checking row names. ..")
28+
stopifnot(identical(rownames(kexpNew),rownames(kexpOrigin)))
29+
covariatesList<-as.list(names(assays(kexpNew)))
30+
#need to add GC_content and tx_length as target covariates
31+
message("checking meta colnames ... ")
32+
stopifnot(identical(colnames(mcols(rowRanges(kexpNew))),colnames(mcols(rowRanges(kexpOrigin)))))
33+
34+
35+
covariatesList<-append(covariatesList,list(colnames(mcols(rowRanges(kexpNew)))[grepl("tx_length",colnames(mcols(rowRanges(kexpNew))))]))
36+
covariatesList<-append(covariatesList,list(colnames(mcols(rowRanges(kexpNew)))[grepl("gc_content",colnames(mcols(rowRanges(kexpNew))))]))
37+
38+
names(covariatesList)<-covariatesList
39+
resNew<-lapply(covariatesList,function(x) .kexpAssayAccessor(x,kexpNew))
40+
resOrigin<-lapply(covariatesList,function(x) .kexpAssayAccessor(x,kexpOrigin))
41+
message("checking list lengths ... ")
42+
stopifnot(length(resNew)==length(resOrigin))
43+
resultsDiffList<-.calculateVariation(kexpNew,resNew,resOrigin)
44+
resultsDiffList$transcriptomes<-transcriptomes(kexpNew)
45+
resultsDiffList$newkallistoVersion<-kallistoVersion(kexpNew)
46+
resultsDiffList$covariates<-colData(kexpNew)$ID
47+
resultsDiffList$rownames<-rownames(kexpNew)
48+
return(resultsDiffList)
49+
} #top
50+
51+
52+
.kexpAssayAccessor<-function(type,kexp){
53+
if(type=="est_counts"){
54+
results<-assays(kexp)$est_counts
55+
56+
}
57+
else if(type=="tpm"){
58+
results<-assays(kexp)$tpm
59+
}
60+
61+
else if(type=="eff_length"){
62+
results<-assays(kexp)$eff_length
63+
}
64+
65+
else if(type=="est_counts_mad"){
66+
results<-assays(kexp)$est_counts_mad
67+
}
68+
69+
else if(type=="tx_length"){
70+
results<-as.matrix(mcols(rowRanges(kexp))$tx_length)
71+
rownames(results)<-rownames(kexp)
72+
colnames(results)<-"tx_length"
73+
}
74+
else if(type=="gc_content"){
75+
results<-as.matrix(mcols(rowRanges(kexp))$gc_content)
76+
rownames(results)<-rownames(kexp)
77+
colnames(results)<-"gc_content"
78+
}
79+
80+
else{
81+
82+
message("I'm afraid, I don't know how to access ...")
83+
}
84+
return(results)
85+
86+
}
87+
88+
89+
.calculateVariation<-function(kexp,resNew,resOrigin){
90+
diffList<-list()
91+
outliers<-NULL
92+
outLiars<-list()
93+
94+
for(k in 1:length(resNew)){
95+
dataN<-resNew[[k]]
96+
dataO<-resOrigin[[k]]
97+
diffMatrix<-matrix(data=NA,nrow=nrow(dataN),ncol=ncol(dataN))
98+
stopifnot(rownames(dataN)==rownames(dataO)) #this must be true!!!
99+
rownames(diffMatrix)<-rownames(dataN)
100+
colnames(diffMatrix)<-colnames(dataN)
101+
#diffMatrix<-(dataN-dataO)/(dataO + 1)
102+
###variation of the difference i.e. the spread of error
103+
diffMatrix<-var(dataN-dataO)
104+
# outliers<-rownames(diffMatrix)[ which(abs(diffMatrix)>1.01)]
105+
# outliers<-outliers[(!is.na(outliers))]
106+
diffList[[k]]<-diffMatrix
107+
names(diffList)[k]<-names(resNew)[k]
108+
# outLiars[[k]]<-features(kexp)[outliers]
109+
# names(outLiars)[k]<-paste0(names(resNew)[k],"_Outlrs")
110+
111+
}#for top res
112+
113+
#for(m in 1:length(outLiars)){
114+
#diffList<-append(diffList,list(outLiars[[m]]))
115+
#names(diffList)[which(names(diffList)=="")]<-names(outLiars)[m]
116+
#}
117+
118+
return(diffList)
119+
120+
}#top of function
121+
122+

dataVariance/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# dataVariance
2+
This is a package for measuring data variance amidst a computation S4 object.
3+
4+
For an example of measuring the difference between two kallisto Experiments, please see the package artemis, and artemisData. From artemisData, we can load two Kallisto Experiments and compare their differences.
5+
6+
>library(artemisVariance)
7+
>data(kexpNew)
8+
>data(kexpOriginal)
9+
>diff<-findPercentDifferenceAssays(kexpNew,kexpOriginal)
10+
>kde<-findOutliers(diff)
565 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)