merged dev

Tallulah Andrews · Tallulah Andrews · commit 71036b9da46b · 2016-04-17T18:34:23.000+01:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: M3Drop
-Version: 0.99.1
-Date: 2016-02-10
+Version: 0.99.2
+Date: 2016-04-17
 Title: Michaelis-Menten Modelling of Dropouts
 Author: Tallulah Andrews <tallulandrews@gmail.com>
 Maintainer: Tallulah Andrews <tallulandrews@gmail.com>
diff --git a/R/Brennecke_implementation.R b/R/Brennecke_implementation.R
@@ -2,7 +2,7 @@
 Brennecke_getVariableGenes <- function(expr_mat, spikes=NA, suppress.plot=FALSE, fdr=0.1, minBiolDisp=0.5) {
         #require(statmod)
 
-        rowVars <- function(x) { unlist(apply(x,1,var))}
+        rowVars <- function(x) { unlist(apply(x,1,var, na.rm=T))}
 
         colGenes = "black"
         colSp = "blue"
@@ -22,23 +22,23 @@ Brennecke_getVariableGenes <- function(expr_mat, spikes=NA, suppress.plot=FALSE,
                 countsGenes = fullCountTable;
         }
 
-        meansSp = rowMeans(countsSp)
+        meansSp = rowMeans(countsSp, na.rm=T)
         varsSp = rowVars(countsSp)
         cv2Sp = varsSp/meansSp^2
-        meansGenes = rowMeans(countsGenes)
+        meansGenes = rowMeans(countsGenes, na.rm=T)
         varsGenes = rowVars(countsGenes)
         cv2Genes = varsGenes/meansGenes^2
         # Fit Model
         minMeanForFit <- unname( quantile( meansSp[ which( cv2Sp > 0.3 ) ], 0.80))
         useForFit <- meansSp >= minMeanForFit
-        if (sum(useForFit) < 50) {
+        if (sum(useForFit, na.rm=T) < 20) {
                 warning("Too few spike-ins exceed minMeanForFit, recomputing using all genes.")
                 meansAll = c(meansGenes, meansSp)
                 cv2All = c(cv2Genes,cv2Sp)
                 minMeanForFit <- unname( quantile( meansAll[ which( cv2All > 0.3 ) ], 0.80))
                 useForFit <- meansSp >= minMeanForFit
         }
-        if (sum(useForFit) < 50) {warning(paste("Only", sum(useForFit), "spike-ins to be used in fitting, may result in poor fit."))}
+        if (sum(useForFit, na.rm=T) < 30) {warning(paste("Only", sum(useForFit), "spike-ins to be used in fitting, may result in poor fit."))}
         fit <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansSp[useForFit] ), cv2Sp[useForFit] )
         a0 <- unname( fit$coefficients["a0"] )
         a1 <- unname( fit$coefficients["a1tilde"])
diff --git a/R/Curve_fitting.R b/R/Curve_fitting.R
@@ -6,7 +6,7 @@ bg__fit_MM <- function (p,s) {
 #	Kerr = exp(Kcoeff+Kerr)-exp(Kcoeff)
 #        predicted = fitted(fit)
 #        krt=summary(fit)$parameters[1,1]
-#	return(list(K=krt,Kerr=Kerr,predictions=predicted, model=c("MMenton",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals(fit))^2)),SAr=round(sum(abs(residuals(fit))))))
+#	return(list(K=krt,Kerr=Kerr,predictions=predicted, model=c("MMenten",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals(fit))^2)),SAr=round(sum(abs(residuals(fit))))))
 	if (length(p) != length(s)) {
 		stop(print("Error: p and s not same length. Cannot fit Michaelis-Menten."))
 	}
@@ -30,7 +30,7 @@ bg__fit_MM <- function (p,s) {
 	Kerr = fit@coef[2]
 	predicted = 1-(s/(krt+s))
 	residuals = p-predicted
-	return(list(K=krt,Kerr=Kerr,fitted_err = res_err,predictions=predicted, model=c("MMenton",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals)^2)),SAr=round(sum(abs(residuals)))))
+	return(list(K=krt,Kerr=Kerr,fitted_err = res_err,predictions=predicted, model=c("MMenten",paste("K =",round(krt,digits=3))),SSr=round(sum((residuals)^2)),SAr=round(sum(abs(residuals)))))
 
 }
 bg__fit_logistic <- function(p,s) {
@@ -98,6 +98,6 @@ M3Drop_Dropout_Models <- function(expr_mat, xlim=NA, suppress.plot=FALSE) {
 		sizeloc = bg__add_model_to_plot(SCDE, BasePlot, lty=2, lwd=2.5, col="magenta3",legend_loc = c(sizeloc$rect$left+sizeloc$rect$w,sizeloc$rect$top-sizeloc$rect$h-0.05));
 		sizeloc = bg__add_model_to_plot(ZIFA, BasePlot, lty=3, lwd=2.5, col="red",legend_loc = c(sizeloc$rect$left+sizeloc$rect$w,sizeloc$rect$top-sizeloc$rect$h-0.05));
 	}
-	invisible(list(MMfit = MM, LogiFit = SCDE, ExpoFit = ZIFA));
+	invisible(list(MMFit = MM, LogiFit = SCDE, ExpoFit = ZIFA));
 }
 
diff --git a/R/Extremes.R b/R/Extremes.R
@@ -22,7 +22,7 @@ bg__test_DE_K_equiv <- function (expr_mat, fit=NA) {
 # Use the fact that errors of proportions are well define by converting S to proportion detected equivalents?
 hidden__test_DE_P_equiv <- function (expr_mat,  fit=NA) {
 	gene_info = bg__calc_variables(expr_mat);
-	if (is.na(fit)) {
+	if (is.na(fit)[1]) {
 		fit = bg__fit_MM(gene_info$p, gene_info$s);
 	}
 	p_obs = gene_info$p;
@@ -78,7 +78,7 @@ hidden__test_DE_S_equiv <- function (expr_mat, fit=NA, method="propagate") {
 
 bg__get_extreme_residuals <- function (expr_mat, fit=NA, v_threshold=c(0.05,0.95), percent = NA, fdr_threshold = 0.1, direction="right", suppress.plot = FALSE) {
 	gene_info = bg__calc_variables(expr_mat);
-	if (is.na(fit)) {
+	if (is.na(fit)[1]) {
 		fit = bg__fit_MM(gene_info$p, gene_info$s);
 	}
 	res = bg__horizontal_residuals_MM_log10(fit$K, gene_info$p, gene_info$s)
diff --git a/R/Normalization.R b/R/Normalization.R
@@ -2,11 +2,11 @@
 hidden__UQ <- function(x){quantile(x[x>0],0.75)};
 
 bg__filter_cells <- function(expr_mat,labels=NA, suppress.plot=FALSE, min_detected_genes=NA) {
-	num_detected =  colSums(expr_mat > 0);
+	num_detected =  colSums(expr_mat > 0, na.rm=T);
 	if (!is.na(min_detected_genes)) {
 		low_quality = num_detected < min_detected_genes;
 	} else {
-		num_zero = colSums(expr_mat == 0);
+		num_zero = colSums(expr_mat == 0, na.rm=T);
 		cell_zero = num_zero;
 		mu = mean(cell_zero);
 		sigma = sd(cell_zero);
diff --git a/R/Plotting_fxns.R b/R/Plotting_fxns.R
@@ -17,7 +17,7 @@ bg__dropout_plot_base <- function (expr_mat, xlim = NA, suppress.plot=FALSE) {
 	if (!suppress.plot) {
         	par(fg="black")
 		if (!(sum(is.na(xlim)))) {
-	        	plot(xes,gene_info$p, main="", ylab="Dropout Proportion", xlab="log(expression)", col = dens.col,pch=16, xlim=xlim, ylim=c(0,1))
+	        	plot(xes,gene_info$p, main="", ylab="Dropout Proportion", xlab="log10(expression)", col = dens.col,pch=16, xlim=xlim, ylim=c(0,1))
 		} else {
 	        	plot(xes,gene_info$p, main="", ylab="Dropout Proportion", xlab="log(expression)", col = dens.col,pch=16)
 		}
@@ -55,10 +55,10 @@ bg__expression_heatmap <- function (genes, expr_mat, cell_labels=NA, gene_labels
 	if(!is.numeric(genes)) {
 		new_genes = match(genes, rownames(expr_mat));
 		nomatch = sum(is.na(new_genes));
-		if (nomatch > 0) {warning(paste(nomatch, " genes could not be matched to data, they will not be included in the heatmap."));}
+		if (nomatch > 0) {warning(paste("Warning: ",nomatch, " genes could not be matched to data, they will not be included in the heatmap."));}
 		genes = new_genes[!is.na(new_genes)];
 	}
-	if (length(genes) < 1) {warning("No genes for heatmap.");return();}
+	if (length(genes) < 1) {stop("Error: No genes for heatmap.");return();}
 	# Plot heatmap of expression
 	heatcolours <- rev(brewer.pal(11,"RdBu"))
 	col_breaks = c(-100,seq(-2,2,length=10),100)
@@ -71,7 +71,7 @@ bg__expression_heatmap <- function (genes, expr_mat, cell_labels=NA, gene_labels
 	if (!is.na(key_genes[1])) {
 		rownames(heat_data)[rownames(expr_mat[genes,]) %in% key_genes] = rownames(expr_mat[genes,])[rownames(expr_mat[genes,]) %in% key_genes]; 
 	}
-	colnames(heat_data) = rep("", length(heat_data[1,]));
+	colnames(heat_data) = 1:length(colnames(heat_data));
 	if (!is.na(key_cells[1])) {
 		colnames(heat_data)[colnames(expr_mat[genes,]) %in% key_cells] = colnames(expr_mat[genes,])[colnames(expr_mat[genes,]) %in% key_cells]; 
 	}
@@ -98,7 +98,11 @@ bg__expression_heatmap <- function (genes, expr_mat, cell_labels=NA, gene_labels
 	lmat=rbind(c(6,0,5),c(0,0,2),c(4,1,3))
 
 
-	heatmap_output = suppressWarnings(heatmap.2(heat_data, ColSideColors = ColColors, RowSideColors = RowColors, col=heatcolours, breaks=col_breaks, scale="row",symbreaks=T, trace="none", dendrogram="column", key=FALSE, Rowv=TRUE, Colv=TRUE,lwid=lwid, lhei=lhei,lmat=lmat, hclustfun=function(x){hclust(x,method="ward.D2")}))
+	if (dim(heat_data)[1] < 10000) {
+		heatmap_output = suppressWarnings(heatmap.2(heat_data, ColSideColors = ColColors, RowSideColors = RowColors, col=heatcolours, breaks=col_breaks, scale="row",symbreaks=T, trace="none", dendrogram="column", key=FALSE, Rowv=TRUE, Colv=TRUE,lwid=lwid, lhei=lhei,lmat=lmat, hclustfun=function(x){hclust(x,method="ward.D2")}))
+	} else {
+		heatmap_output = suppressWarnings(heatmap.2(heat_data, ColSideColors = ColColors, RowSideColors = RowColors, col=heatcolours, breaks=col_breaks, scale="row",symbreaks=T, trace="none", dendrogram="column", key=FALSE, Rowv=FALSE, Colv=TRUE,lwid=lwid, lhei=lhei,lmat=lmat, hclustfun=function(x){hclust(x,method="ward.D2")}))
+	}
 	# Custom key
 	par(fig = c(0, 1/(5.2),4/(5.2), 1), mar=c(4,1,1,1), new=TRUE)
 	scale01 <- function(x, low = min(x), high = max(x)) {
@@ -149,8 +153,54 @@ M3Drop_Expression_Heatmap <- function(genes, expr_mat, cell_labels=NA, interesti
 	if (is.numeric(key_cells) | is.logical(key_cells)) {
 		key_cells = rownames(expr_mat)[key_cells];
 	}
+	if (is.factor(genes)) {
+		genes = as.character(genes);
+	}
+	if (!is.vector(genes)) {
+		stop("Error: genes must be a vector.")
+	}
 	heatmap_output = bg__expression_heatmap(genes, expr_mat, cell_labels=cell_labels, gene_labels=as.numeric(gene_labels), key_genes=as.character(key_genes), key_cells=key_cells);
 	invisible(heatmap_output);
 }
 
-M3Drop_Get_Heatmap_Cell_Clusters <- function (heatmap_output, k) {cutree(as.hclust(heatmap_output$colDendrogram), k=k)}
+M3Drop_Get_Heatmap_Cell_Clusters <- function (heatmap_output, k) {
+	# Cuts heatmap_output$colDendrogram into k groups of cells. The value of
+	# tryCatch() is returned because assignments inside a handler are local.
+	tryCatch(
+		cutree(as.hclust(heatmap_output$colDendrogram), k=k),
+		error=function(e){
+			print(e);
+			print("Dendrogram may have flat branches, trying again");
+			hidden_get_clusters(heatmap_output,k)
+		}
+	)
+}
+
+hidden_get_clusters<- function(heatout, k){
+        # Fallback for cutree(): splits the tallest subtree of heatout$colDendrogram
+        # until k subtrees exist, then returns one group number per column label.
+        dendro <- heatout$colDendrogram
+        curr_k <- 1
+        dendro_list <- list(dendro)
+        dendro_heights <- attr(dendro, "height")
+        while (curr_k < k) {
+                # Only internal nodes can be split; which.max() picks one node on ties.
+                splittable <- which(!vapply(dendro_list, is.leaf, logical(1)))
+                if (length(splittable) == 0) {stop("Error: k is larger than the number of leaves in the dendrogram.")}
+                to_split <- splittable[which.max(dendro_heights[splittable])]
+                children <- as.list(dendro_list[[to_split]])
+                for (i in seq_along(children)) {
+                        dendro_heights <- c(dendro_heights, attr(children[[i]], "height"))
+                        dendro_list[[length(dendro_list)+1]] <- children[[i]]
+                }
+                # Remove the subtree that has just been replaced by its children.
+                dendro_list[to_split] <- NULL
+                dendro_heights <- dendro_heights[-to_split]
+                curr_k <- curr_k-1+length(children)
+        }
+        # Make group vector (presumably colInd is the plotted column order -- TODO confirm).
+        names_orig_order <- labels(dendro)[order(heatout$colInd)]
+        groups <- rep(0, times=length(names_orig_order))
+        for (i in seq_along(dendro_list)) {groups[names_orig_order %in% labels(dendro_list[[i]])] <- i}
+        return(groups)
+}
diff --git a/R/basics.R b/R/basics.R
@@ -1,5 +1,6 @@
 bg__calc_variables <- function(expr_mat) {
         # Calc variables
+	if (sum(expr_mat < 0) >0) {stop("Expression matrix contains negative values! M3Drop requires an expression matrix that is not log-transformed.")}
 	p = apply(expr_mat,1,function(x){y = x[!is.na(x)]; sum(y==0)/length(y)});
 	s = rowMeans(expr_mat, na.rm=T);
 	s_stderr = unlist(apply(expr_mat,1,sd))/sqrt(length(expr_mat[1,]));
@@ -15,7 +16,6 @@ hidden__invert_MM <- function (K, p) {K*(1-p)/(p)}
 bg__horizontal_residuals_MM_log10 <- function (K, p, s) {log(s)/log(10) - log(hidden__invert_MM(K,p))/log(10)}
 
 hidden_getAUC <- function(gene, labels) {
-        require("ROCR")
         ranked=rank(gene);
         ms = aggregate(ranked~unlist(labels),FUN=mean); #Get average score for each cluster
         posgroup = as.character(unlist(ms[which(ms[,2]==max(ms[,2])),1])); #Get cluster with highest average score
@@ -25,8 +25,8 @@ hidden_getAUC <- function(gene, labels) {
         truth = labels == posgroup
 
         #Make predictions & get auc using RCOR package.
-        pred=prediction(ranked,as.numeric(truth))
-        val = unlist(performance(pred,"auc")@y.values)
+        pred=ROCR::prediction(ranked,as.numeric(truth))
+        val = unlist(ROCR::performance(pred,"auc")@y.values)
         pval = wilcox.test(gene[truth],gene[!truth])$p.value
         if (!exists("pval")) {pval=NA}
 
@@ -44,6 +44,6 @@ M3Drop_getmarkers <- function(expr_mat, labels) {
         auc_df[,1] = as.numeric(as.character(auc_df[,1]))
         auc_df[,3] = as.numeric(as.character(auc_df[,3]))
         auc_df = auc_df[auc_df[,1] > 0,]
+	auc_df = auc_df[order(-auc_df$AUC),]
         return(auc_df);
-
 }
diff --git a/man/M3D_Test_Shift.Rd b/man/M3D_Test_Shift.Rd
@@ -11,13 +11,16 @@
   \item{expr_mat}{a numeric matrix of expression values, columns = samples, rows = genes.}
   \item{genes_to_test}{vector of gene names to test.}
   \item{name}{string used to title the plot.}
-  \item{background}{vector of gene names to test against.}
+  \item{background}{vector of gene names to test against. (default = all genes)}
   \item{suppress.plot}{logical, whether to the fitted Michaelis-Menten curve and highlight the given set of genes to test.}
 }
-\details{Fits a Michaelis-Menten function to the dropout-rate of the provided data, then tests whether a given set of genes is significantly shifted left or right of the curve. Horizontal residuals are calculated as : \deqn{\log_{10} S - \log_{10} \frac{K*(1-P)}{P}}{log_10(S) - log_10( (K * (1 - P)) / P )}. Uses a Wilcox rank-sum test/Mann-Whitney U test to compare the residuals for the given genes to the residuals for all genes.
+\details{Fits a Michaelis-Menten function to the dropout-rate of the provided data, then tests whether a given set of genes (e.g. pseudogenes) is significantly shifted left or right of the curve. Horizontal residuals are calculated as: \deqn{\log_{10} S - \log_{10} \frac{K*(1-P)}{P}}{log_10(S) - log_10( (K * (1 - P)) / P )}. Uses a Wilcoxon rank-sum test/Mann-Whitney U test to compare the residuals for the given genes to the residuals for all genes.
 }
 \value{
-A one row dataframe with columns: sample (sample median horizontal residual), pop (population median horizontal residual), p.value
+A one row dataframe with columns: 
+    sample (median horizontal residual of genes in the test set), 
+    pop (median horizontal residual of genes in the background set), 
+    p.value
 }
 \examples{
   library(M3DExampleData)
diff --git a/man/bg__horizontal_residuals_MM_log10.Rd b/man/bg__horizontal_residuals_MM_log10.Rd
@@ -23,7 +23,7 @@
   residuals = bg__horizontal_residuals_MM_log10(K=9, p=gene_info$p, s=gene_info$s)
 }
 \seealso{
-  \code{\link{M3D_Test_Shift}}
-  \code{\link{M3D_Get_Extremes}}
+  \code{\link{M3Drop_Test_Shift}}
+  \code{\link{M3Drop_Get_Extremes}}
 }
 \keyword{residuals}
diff --git a/vignette/M3Drop_Vignette.Rmd b/vignette/M3Drop_Vignette.Rmd

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@ bg__fit_MM <- function (p,s) {`
`6`	`6`	`# Kerr = exp(Kcoeff+Kerr)-exp(Kcoeff)`
`7`	`7`	`# predicted = fitted(fit)`
`8`	`8`	`# krt=summary(fit)$parameters[1,1]`
`9`		`-# return(list(K=krt,Kerr=Kerr,predictions=predicted, model=c("MMenton",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals(fit))^2)),SAr=round(sum(abs(residuals(fit))))))`
	`9`	`+# return(list(K=krt,Kerr=Kerr,predictions=predicted, model=c("MMenten",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals(fit))^2)),SAr=round(sum(abs(residuals(fit))))))`
`10`	`10`	`if (length(p) != length(s)) {`
`11`	`11`	`stop(print("Error: p and s not same length. Cannot fit Michaelis-Menten."))`
`12`	`12`	`}`
`@@ -30,7 +30,7 @@ bg__fit_MM <- function (p,s) {`
`30`	`30`	`Kerr = fit@coef[2]`
`31`	`31`	`predicted = 1-(s/(krt+s))`
`32`	`32`	`residuals = p-predicted`
`33`		`- return(list(K=krt,Kerr=Kerr,fitted_err = res_err,predictions=predicted, model=c("MMenton",paste("Krt =",round(krt,digits=3))),SSr=round(sum((residuals)^2)),SAr=round(sum(abs(residuals)))))`
	`33`	`+ return(list(K=krt,Kerr=Kerr,fitted_err = res_err,predictions=predicted, model=c("MMenten",paste("K =",round(krt,digits=3))),SSr=round(sum((residuals)^2)),SAr=round(sum(abs(residuals)))))`
`34`	`34`
`35`	`35`	`}`
`36`	`36`	`bg__fit_logistic <- function(p,s) {`
`@@ -98,6 +98,6 @@ M3Drop_Dropout_Models <- function(expr_mat, xlim=NA, suppress.plot=FALSE) {`
`98`	`98`	`sizeloc = bg__add_model_to_plot(SCDE, BasePlot, lty=2, lwd=2.5, col="magenta3",legend_loc = c(sizeloc$rect$left+sizeloc$rect$w,sizeloc$rect$top-sizeloc$rect$h-0.05));`
`99`	`99`	`sizeloc = bg__add_model_to_plot(ZIFA, BasePlot, lty=3, lwd=2.5, col="red",legend_loc = c(sizeloc$rect$left+sizeloc$rect$w,sizeloc$rect$top-sizeloc$rect$h-0.05));`
`100`	`100`	`}`
`101`		`- invisible(list(MMfit = MM, LogiFit = SCDE, ExpoFit = ZIFA));`
	`101`	`+ invisible(list(MMFit = MM, LogiFit = SCDE, ExpoFit = ZIFA));`
`102`	`102`	`}`
`103`	`103`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`residuals = bg__horizontal_residuals_MM_log10(K=9, p=gene_info$p, s=gene_info$s)`
`24`	`24`	`}`
`25`	`25`	`\seealso{`
`26`		`- \code{\link{M3D_Test_Shift}}`
`27`		`- \code{\link{M3D_Get_Extremes}}`
	`26`	`+ \code{\link{M3Drop_Test_Shift}}`
	`27`	`+ \code{\link{M3Drop_Get_Extremes}}`
`28`	`28`	`}`
`29`	`29`	`\keyword{residuals}`