Updated links. R code formatted by LSP.

V-Z · V-Z · commit 02d4ff93bdb3 · 2025-06-09T17:12:47.000+02:00
diff --git a/presentation/hybseq_course.tex b/presentation/hybseq_course.tex
@@ -110,7 +110,7 @@ \section{Introduction}
 	\begin{itemize}
 		\item Course \href{https://github.com/V-Z/hybseq-course}{Git (slides with all links)} and \href{https://is.cuni.cz/studium/eng/predmety/index.php?do=predmet&kod=MB120C117}{information in SIS} (\href{https://is.cuni.cz/studium/predmety/index.php?do=predmet&kod=MB120C117}{česky})
 		\item \alert{Scripts} \url{https://github.com/V-Z/hybseq-scripts} \textbf{shown in the course} --- clone the Git repository and adapt to your needs
-		\item Download presentation from \url{https://trapa.cz/en/hybseq-course-2024}
+		\item Download presentation from \url{https://trapa.cz/en/hybseq-course-2025}
 		\item Most of the work is done in Linux/UNIX (macOS,~\ldots) command line, so that good knowledge of work in command line is essential, good starting point can be my Linux and MetaCentrum course \url{https://soubory.trapa.cz/linuxcourse/}
 		\item Many tasks are done in R, so that at least basic knowledge of R is needed, good starting point can be my R course \url{https://soubory.trapa.cz/rcourse/}
 		\item Processing HybSeq data is computationally demanding (it requires plenty of resources), during the course we use \href{https://www.metacentrum.cz/en/Sluzby/Grid/}{MetaCentrum, Czech National Grid Infrastructure} (\href{https://www.metacentrum.cz/cs/Sluzby/Grid/}{česky}) (slide~\ref{CESNET}), but any computing cluster or powerful desktop (for patient users;-) can be used
@@ -216,7 +216,7 @@ \subsection{Software needed}
 		\item \href{https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html\#downloadblastdata}{BLAST+} (used by \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper})
 		\item \href{https://github.com/lh3/bwa}{BWA} (used by \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper})
 		\item \href{https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/dendroscope/}{Dendroscope} --- visualize outputs of \href{https://phylogenomics.rice.edu/html/phylonet.html}{PhyloNet}
-		\item \href{https://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate}{Exonerate} (used by \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper})
+		\item \href{https://github.com/nathanweeks/exonerate}{Exonerate} (used by \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper})
 		\item \href{https://www.gnu.org/software/parallel/}{GNU Parallel} (used by \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper} and in BASH scripts)
 		\item \href{https://github.com/mossmatters/HybPiper/wiki}{HybPiper} --- recovering genes from targeted sequence capture data
 		\item \href{http://www.iqtree.org/}{IQ-TREE} --- gene trees
@@ -252,10 +252,7 @@ \subsection{MetaCentrum computing environment}
 		\item Information about data storage \url{https://du.cesnet.cz/en/start} (\href{https://du.cesnet.cz/cs/start}{česky}) contains detailed usage instructions
 		\item Information about MetaCentrum \url{https://www.metacentrum.cz/en/} (\href{https://www.metacentrum.cz/cs/}{česky})
 		\item Most of practical information for users are at \url{https://docs.metacentrum.cz/}
-		\begin{itemize}
-			\item Old \href{https://wiki.metacentrum.cz/}{wiki} is deprecated
-		\end{itemize}
-		\item To start work see at least \href{https://docs.metacentrum.cz/access/log-in/}{access}, \href{https://docs.metacentrum.cz/computing/}{computing}, \href{https://docs.metacentrum.cz/data/data-within/}{work with data} and some \href{https://docs.metacentrum.cz/tutorials/}{tutorial}
+		\item To start work see at least \href{https://docs.metacentrum.cz/en/docs/access/log-in}{access}, \href{https://docs.metacentrum.cz/en/docs/computing/concepts}{computing}, \href{https://docs.metacentrum.cz/en/docs/data/large-data}{work with data} and some \href{https://docs.metacentrum.cz/en/docs/tutorials}{tutorial}
 		\item Of course, good knowledge of work in Linux command line (BASH) is needed\ldots
 	\end{itemize}
 	\vfill
@@ -275,8 +272,8 @@ \subsection{MetaCentrum computing environment}
 		\item Current state and usage as available at \url{https://metavo.metacentrum.cz/}
 		\item Manage your user account at \url{http://metavo.metacentrum.cz/en/myaccount/} (\href{https://metavo.metacentrum.cz/cs/myaccount/}{česky})
 		\item Personal view on actual resources and running tasks is at \url{https://metavo.metacentrum.cz/pbsmon2/person}
-		\item Listing of available applications \url{https://docs.metacentrum.cz/software/search-soft/}
-		\item It has several \href{https://docs.metacentrum.cz/access/log-in/}{front ends} where users log, various \href{https://docs.metacentrum.cz/data/data-within/}{storages}, and thousands of computers (nodes) doing the calculations --- they are not accessed directly to run task
+		\item Listing of available applications \url{https://docs.metacentrum.cz/en/docs/software/alphabet}
+		\item It has several \href{https://docs.metacentrum.cz/en/docs/access/log-in}{front ends} where users log, various \href{https://docs.metacentrum.cz/en/docs/data/large-data}{storages}, and thousands of computers (nodes) doing the calculations --- they are not accessed directly to run task
 		\begin{itemize}
 			\item Distributed nature and number of front ends and storages may be confusing for beginners
 		\end{itemize}
@@ -292,7 +289,7 @@ \subsection{MetaCentrum computing environment}
 
 \begin{frame}[fragile]{Launching of tasks}
 	\begin{itemize}
-		\item \url{https://docs.metacentrum.cz/computing/run-basic-job/}
+		\item \url{https://docs.metacentrum.cz/en/docs/computing/run-basic-job}
 		\item Personal view \url{https://metavo.metacentrum.cz/pbsmon2/person} has nice overview of available resources and tasks and allows comfortable construction of submission command
 	\end{itemize}
 	\vfill
@@ -313,10 +310,10 @@ \subsection{MetaCentrum computing environment}
 \begin{frame}[fragile]{Key MetaCentrum commands}
 	\begin{itemize}
 		\item MetaCentrum is \enquote{just} normal Linux server --- work as usually
-		\item Command \texttt{module} loads/unloads selected \href{https://docs.metacentrum.cz/software/search-soft/}{application} (\texttt{module add r})
+		\item Command \texttt{module} loads/unloads selected \href{https://docs.metacentrum.cz/en/docs/software/alphabet}{application} (\texttt{module add r})
 		\item Tasks (BASH scripts) are submitted for computing by \texttt{qsub} --- the script must copy the data into \texttt{\$SCRATCHDIR} and do all calculations there
 		\begin{itemize}
-			\item It has plenty of options how to specify requirements (see \href{https://docs.metacentrum.cz/computing/run-basic-job/}{help})
+			\item It has plenty of options how to specify requirements (see \href{https://docs.metacentrum.cz/en/docs/computing/run-basic-job}{help})
 		\end{itemize}
 		\item Queued and running jobs can be seen by \texttt{qstat -u \$USER} (\texttt{qstat} has much more options) and any job can be terminated by \texttt{qdel 123456789} (number from \texttt{qstat})
 	\end{itemize}
@@ -419,7 +416,7 @@ \subsection{Data download and start}
 \begin{frame}[fragile]{Data download}
 	\label{datadownload}
 	\begin{itemize}
-		\item MetaCentrum storages have sometimes too limited quota for such a large data --- see your \href{https://metavo.metacentrum.cz/en/myaccount/kvoty}{quotas} (\href{http://metavo.metacentrum.cz/cs/myaccount/kvoty}{česky}), see \href{https://docs.metacentrum.cz/data/quotas/}{details}
+		\item MetaCentrum storages have sometimes too limited quota for such a large data --- see your \href{https://metavo.metacentrum.cz/en/myaccount/kvoty}{quotas} (\href{http://metavo.metacentrum.cz/cs/myaccount/kvoty}{česky}), see \href{https://docs.metacentrum.cz/en/docs/data/quotas}{details}
 		\item The \href{https://github.com/V-Z/hybseq-scripts}{pipeline} produce a lot of data (especially \href{https://github.com/mossmatters/HybPiper/}{HybPiper}; gene trees can be also large) --- ensure to have enough space to store everything
 		\begin{itemize}
 			\item Especially \textbf{HybPiper produces a lot of files} --- \alert{user may reach quota for number of files}, not necessarily (only) amount of data
@@ -1173,7 +1170,7 @@ \subsection{Filtering trees}
 	\begin{spluscode}
     trees # See original trees
     # Remove trees identified in the PCoA plot
-    trees[c("Assembly_12866", "Assembly_14143", "Assembly_1500")] <- NULL
+    trees[c("Assembly_1033", "Assembly_10103", "Assembly_10222")] <- NULL
     trees # See new object
     # Possibly remove trees with too few tips
     print(trees, details=TRUE)
@@ -1539,7 +1536,7 @@ \subsection{Comparing trees}
     # Pie chart: concordance (blue) top conflict (green), other conflict
     # (red), no signal (gray). Run phypartspiecharts.py to get the graphics:
     python phypartspiecharts.py --svg_name trees_good_res.svg \
-      parsimony_sp_tree.nwk trees_good_res 219
+      parsimony_sp_tree.nwk trees_good_res 20
 	\end{bashcode}
 \end{frame}
 
diff --git a/trees_filtration.r b/trees_filtration.r
@@ -1,10 +1,12 @@
 # Install needed packages
-install.packages(pkgs=c("ape", "ade4", "distory", "gplots", "ggplot2", "phangorn", "phytools"), repos="https://mirrors.nic.cz/R/", dependencies="Imports")
+install.packages(pkgs = c("ape", "ade4", "distory", "gplots", "ggplot2", "phangorn", "phytools"), repos = "https://mirrors.nic.cz/R/", dependencies = "Imports")
 # Install kdetrees package (removed from CRAN)
 # Ensure package 'devtools' is installed
-if( ! 'devtools' %in% installed.packages() ) { install.packages('devtools') }
+if (!"devtools" %in% installed.packages()) {
+  install.packages("devtools")
+}
 # Install 'kdetrees' from https://github.com/V-Z/kdetrees Git repository
-devtools::install_github('V-Z/kdetrees')
+devtools::install_github("V-Z/kdetrees")
 
 # Load libraries
 library(ape)
@@ -20,77 +22,77 @@ library(phytools)
 setwd("~/dokumenty/vyuka/hybseq/")
 
 # Load the list of trees
-trees <- read.tree(file="trees_ml_exons.nwk")
+trees <- read.tree(file = "trees_ml_exons.nwk")
 trees
-print(trees, details=TRUE)
+print(trees, details = TRUE)
 
 # Compute distance of topological similarities
-trees.d <- dist.topo(x=trees, method="score", mc.cores=4) # Set number of cores according to your computer
+trees.d <- dist.topo(x = trees, method = "score", mc.cores = 4) # Set number of cores according to your computer
 
 # Plot the heatmap (package gplots)
-png(filename="trees_dist.png", width=10000, height=10000)
-	heatmap.2(x=as.matrix(trees.d), Rowv=FALSE, Colv="Rowv", dendrogram="none", symm=TRUE, scale="none", na.rm=TRUE, revC=FALSE, col=rainbow(15), cellnote=as.matrix(trees.d), notecex=1, notecol="white", trace="none", labRow=rownames(as.matrix(trees.d)), labCol=colnames(as.matrix(trees.d)), key=FALSE, main="Correlation matrix of topographical distances")
-	dev.off() # Saves the image
+png(filename = "trees_dist.png", width = 10000, height = 10000)
+heatmap.2(x = as.matrix(trees.d), Rowv = FALSE, Colv = "Rowv", dendrogram = "none", symm = TRUE, scale = "none", na.rm = TRUE, revC = FALSE, col = rainbow(15), cellnote = as.matrix(trees.d), notecex = 1, notecol = "white", trace = "none", labRow = rownames(as.matrix(trees.d)), labCol = colnames(as.matrix(trees.d)), key = FALSE, main = "Correlation matrix of topographical distances")
+dev.off() # Saves the image
 
 # Test if the distance matrix is Euclidean
-is.euclid(distmat=as.dist(trees.d), plot=TRUE, tol=1e-05)
+is.euclid(distmat = as.dist(trees.d), plot = TRUE, tol = 1e-05)
 
 # PCoA
-trees.pcoa <- dudi.pco(d=trees.d, scannf=FALSE, nf=5)
+trees.pcoa <- dudi.pco(d = trees.d, scannf = FALSE, nf = 5)
 trees.pcoa
 
 # Plot PCoA
-s.label(dfxy=trees.pcoa$li)
-s.kde2d(dfxy=trees.pcoa$li, cpoint=0, add.plot=TRUE)
-add.scatter.eig(trees.pcoa[["eig"]], 3,1,2, posi="topright")
+s.label(dfxy = trees.pcoa$li)
+s.kde2d(dfxy = trees.pcoa$li, cpoint = 0, add.plot = TRUE)
+add.scatter.eig(trees.pcoa[["eig"]], 3, 1, 2, posi = "topright")
 title("PCoA of matrix of pairwise trees distances")
 
 # Remove outlying trees
 trees
-trees[c("Assembly_12866", "Assembly_14143", "Assembly_1500")] <- NULL
+trees[c("Assembly_1033", "Assembly_10103", "Assembly_10222")] <- NULL
 trees
 
 # Now you can repeat recalculation of distance matrix and PCoA and possibly remove more trees...
 
 # # Possibly remove trees with too few tips
-# print(trees, details=TRUE)
+# print(trees, details = TRUE)
 # trees[c(1, 2, 3, 4)] <- NULL
 # trees
 
 # # Possibly remove rare tips
-# trees <- lapply(X=trees, FUN=drop.tip, tip=c("Amomum-sp7_S308_L001", "Amomum-trilobum_S12_L001"))
+# trees <- lapply(X = trees, FUN = drop.tip, tip = c("Amomum-sp7_S308_L001", "Amomum-trilobum_S12_L001"))
 # class(trees) <- "multiPhylo" # Use after usage of lapply to multiPhylo
 
 # Run kdetrees to detect outliers - play with k
 ?kdetrees # See options for kdetrees
-trees.kde <- kdetrees(trees=trees, k=0.9, distance="dissimilarity", topo.only=FALSE, greedy=TRUE)
+trees.kde <- kdetrees(trees = trees, k = 0.9, distance = "dissimilarity", topo.only = FALSE, greedy = TRUE)
 # See text results with list of outlying trees
 trees.kde
 # See graphical results
-plot(x=trees.kde)
-hist(x=trees.kde)
+plot(x = trees.kde)
+hist(x = trees.kde)
 # See removed trees
 plot.multiPhylo(trees.kde[["outliers"]])
 # Save removed trees
-write.tree(phy=trees.kde[["outliers"]], file="trees_outliers.nwk")
+write.tree(phy = trees.kde[["outliers"]], file = "trees_outliers.nwk")
 # Save kdetrees report
-write.table(x=as.data.frame(x=trees.kde), file="trees_scores.tsv", quote=FALSE, sep="\t")
+write.table(x = as.data.frame(x = trees.kde), file = "trees_scores.tsv", quote = FALSE, sep = "\t")
 # Extract passing trees
 trees.good <- trees[names(trees) %in% names(trees.kde[["outliers"]]) == FALSE]
 trees.good
 # Save passing trees
-write.tree(phy=trees.good, file="trees_good.nwk")
+write.tree(phy = trees.good, file = "trees_good.nwk")
 
 # Compute parsimony super tree
 ?superTree # See help first...
-tree.sp <- superTree(tree=trees.good, method="NNI", rooted=TRUE, trace=2, start=NULL, multicore=TRUE)
+tree.sp <- superTree(tree = trees.good, method = "NNI", rooted = TRUE, trace = 2, start = NULL, multicore = TRUE)
 tree.sp # See details
 # Root it
-tree.sp <- root(phy=tree.sp, outgroup=c("Riedelia-arfakensis_S49_L001", "Zingiber-officinale_S242_L001"), resolve.root=TRUE)
+tree.sp <- root(phy = tree.sp, outgroup = c("Riedelia-arfakensis_S49_L001", "Zingiber-officinale_S242_L001"), resolve.root = TRUE)
 # Save parsimony super tree
-write.tree(phy=tree.sp, file="parsimony_sp_tree.nwk")
+write.tree(phy = tree.sp, file = "parsimony_sp_tree.nwk")
 # Plot parsimony super tree
-plot.phylo(x=tree.sp, type="phylogram", edge.width=2, label.offset=0.01, cex=1.2)
+plot.phylo(x = tree.sp, type = "phylogram", edge.width = 2, label.offset = 0.01, cex = 1.2)
 add.scale.bar()
 # Tune display of the tree...
 
@@ -99,34 +101,35 @@ add.scale.bar()
 ?phytools::mrp.supertree
 ?phangorn::coalSpeciesTree
 # All trees must be ultrametric - chronos scale them
-trees.ultra <- lapply(X=trees.good, FUN=chronos, model="correlated")
+trees.ultra <- lapply(X = trees.good, FUN = chronos, model = "correlated")
 class(trees.ultra) <- "multiPhylo"
 # Calculate the species tree
 # tree.sp.mean <- speciesTree(x=trees.ultra, FUN=mean)
-tree.sp2 <- mrp.supertree(tree=trees.good, method="optim.parsimony", rooted=TRUE)
-tree.sp2 <- root(phy=tree.sp2, outgroup=c("Riedelia-arfakensis_S49_L001", "Zingiber-officinale_S242_L001"), resolve.root=TRUE)
-plot.phylo(x=tree.sp2, type="phylogram", edge.width=2, label.offset=0.01, cex=1.2)
+tree.sp2 <- mrp.supertree(tree = trees.good, method = "optim.parsimony", rooted = TRUE)
+tree.sp2 <- root(phy = tree.sp2, outgroup = c("Riedelia-arfakensis_S49_L001", "Zingiber-officinale_S242_L001"), resolve.root = TRUE)
+plot.phylo(x = tree.sp2, type = "phylogram", edge.width = 2, label.offset = 0.01, cex = 1.2)
 
 # # Consensus networks
-# ?consensusNet
+# # Requires all trees to have the same set of tips (no missing data)
+# # ?consensusNet
 # # Compute consensus network
-# tree.net <- consensusNet(obj=trees.good, prob=0.25)
+# # tree.net <- consensusNet(obj = trees.good, prob = 0.25)
 # # Plot 2D or 3D
-# plot(x=tree.net, planar=FALSE, type="2D", use.edge.length=TRUE, show.tip.label=TRUE, show.edge.label=TRUE, show.node.label=TRUE, show.nodes=TRUE, edge.color="black", tip.color="blue") # 2D
-# plot(x=tree.net, planar=FALSE, type="3D", use.edge.length=TRUE, show.tip.label=TRUE, show.edge.label=TRUE, show.node.label=TRUE, show.nodes=TRUE, edge.color="black", tip.color="blue") # 3D
+# # plot(x = tree.net, planar = FALSE, type = "2D", use.edge.length = TRUE, show.tip.label = TRUE, show.edge.label = TRUE, show.node.label = TRUE, show.nodes = TRUE, edge.color = "black", tip.color = "blue") # 2D
+# # plot(x = tree.net, planar = FALSE, type = "3D", use.edge.length = TRUE, show.tip.label = TRUE, show.edge.label = TRUE, show.node.label = TRUE, show.nodes = TRUE, edge.color = "black", tip.color = "blue") # 3D
 
 # Save trees.good in NEXUS for PhyloNet
-write.nexus(trees.good, file="trees_good.nex", translate=FALSE)
+write.nexus(trees.good, file = "trees_good.nex", translate = FALSE)
 
 # Cophyloplots - comparing 2 phylogenetic trees
 # We need 2 column matrix with tip labels
-tips.labels <- matrix(data=c(sort(tree.sp[["tip.label"]]), sort(tree.sp2[["tip.label"]])), nrow=length(tree.sp[["tip.label"]]), ncol=2)
+tips.labels <- matrix(data = c(sort(tree.sp[["tip.label"]]), sort(tree.sp2[["tip.label"]])), nrow = length(tree.sp[["tip.label"]]), ncol = 2)
 # Draw the tree, play with graphical parameters
 # Click to nodes to rotate them to get better display
-cophyloplot(x=tree.sp, y=tree.sp2, assoc=tips.labels, use.edge.length=FALSE, space=60, length.line=1, gap=2, type="phylogram", rotate=TRUE, col="red", lwd=1.5, lty=2)
+cophyloplot(x = tree.sp, y = tree.sp2, assoc = tips.labels, use.edge.length = FALSE, space = 60, length.line = 1, gap = 2, type = "phylogram", rotate = TRUE, col = "red", lwd = 1.5, lty = 2)
 # Slihtly better display in phytools::cophylo
-trees.cophylo <- cophylo(tr1=tree.sp, tr2=tree.sp2, assoc=tips.labels, rotate=TRUE)
-plot.cophylo(x=trees.cophylo, lwd=2, link.type="curved")
+trees.cophylo <- cophylo(tr1 = tree.sp, tr2 = tree.sp2, assoc = tips.labels, rotate = TRUE)
+plot.cophylo(x = trees.cophylo, lwd = 2, link.type = "curved")
 
 # Density trees
 is.rooted.multiPhylo(trees.ultra) # rooted
@@ -135,15 +138,14 @@ is.binary.multiPhylo(trees.ultra) # binary bifurcating
 # See help page
 ?phangorn::densiTree
 # Plotting density trees
-densiTree(x=trees.ultra, scaleX=TRUE, col=rainbow(6), width=5, cex=1.5)
-densiTree(x=trees.ultra, direction="upwards", scaleX=TRUE, width=5)
-densiTree(x=trees.ultra, scaleX=TRUE, width=5, cex=1.5)
-densiTree(x=trees.ultra[1:10], scaleX=TRUE, width=5, cex=1.25)
+densiTree(x = trees.ultra, scaleX = TRUE, col = rainbow(6), width = 5, cex = 1.5)
+densiTree(x = trees.ultra, direction = "upwards", scaleX = TRUE, width = 5)
+densiTree(x = trees.ultra, scaleX = TRUE, width = 5, cex = 1.5)
+densiTree(x = trees.ultra[1:10], scaleX = TRUE, width = 5, cex = 1.25)
 # See help page
 ?phytools::densityTree
 # Plotting density trees
-densityTree(trees=c(tree.sp, tree.sp2), fix.depth=TRUE, lwd=4)
-# densityTree(trees=trees.ultra, fix.depth=TRUE, use.gradient=TRUE, alpha=0.5, lwd=4)
-# densityTree(trees=trees.ultra[1:3], fix.depth=TRUE, use.gradient=TRUE, alpha=0.5, lwd=4)
-# densityTree(trees=trees.ultra[c(2, 4, 6)], fix.depth=TRUE, use.gradient=TRUE, alpha=0.5, lwd=4)
-
+densityTree(trees = c(tree.sp, tree.sp2), fix.depth = TRUE, lwd = 4)
+densityTree(trees = trees.ultra, fix.depth = TRUE, use.gradient = TRUE, alpha = 0.5, lwd = 4)
+# densityTree(trees = trees.ultra[1:3], fix.depth = TRUE, use.gradient = TRUE, alpha = 0.5, lwd = 4)
+# densityTree(trees = trees.ultra[c(2, 4, 6)], fix.depth = TRUE, use.gradient = TRUE, alpha = 0.5, lwd = 4)