|
// FILTER_SAMPLES: keep only the samples that occur in BOTH the metadata sheet
// and the abundance table.
//
// Inputs (both staged below input/ so they cannot collide with the outputs):
//   metadata - tab-separated sheet, first column holds the sample id
//   table    - tab-separated abundance table, first column holds the ASV id,
//              all remaining column names are sample ids
// Outputs:
//   metadata.tsv - metadata rows whose sample id is a column of the table
//   table.tsv    - ASV id column plus the sample columns listed in the metadata
//   *.log        - optional, written only when samples were dropped on either side
//   versions.yml - R version used
// The task fails with an explicit error when the two files share no sample id.
process FILTER_SAMPLES {
    label 'process_single'

    conda "conda-forge::r-base=4.2.1"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/r-base:4.2.1' :
        'biocontainers/r-base:4.2.1' }"

    input:
    path(metadata, stageAs: 'input/*')
    path(table, stageAs: 'input/*')

    output:
    path("metadata.tsv"), emit: metadata
    path("table.tsv")   , emit: abundances
    path("*.log")       , emit: log, optional: true
    path "versions.yml" , emit: versions

    script:
    """
    #!/usr/bin/env Rscript

    # check.names = FALSE keeps headers exactly as supplied. With the default
    # (TRUE) R rewrites column names that are not syntactic (leading digit,
    # dash, space, ...), so sample ids in the abundance table header would no
    # longer match the ids in the first metadata column.

    # first column in meta has sample id
    meta <- read.table("$metadata", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)
    # column names are sample ids, but first column is asv id
    abund <- read.table("$table", header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE)

    # sample ids of the abundance table; [-1] is also safe for a one-column
    # table, where 2:length(...) would count backwards and pick the asv id
    sample_ids <- colnames(abund)[-1]

    # samples that are not in both files are dropped
    # drop = FALSE keeps data frames even if a single column remains, so the
    # nrow()/ncol() checks below always get a number
    meta_filtered <- meta[meta[, 1] %in% sample_ids, , drop = FALSE]
    keep_columns <- c(TRUE, sample_ids %in% meta[, 1])
    abund_filtered <- abund[, keep_columns, drop = FALSE]

    # error in case all samples were removed, before any output is written
    if (nrow(meta_filtered) == 0) {
        stop("All samples were removed. That means no overlap between the metadata sample IDs and the abundance table sample IDs was found. Make sure that sample IDs match.")
    }

    # write filtered data
    write.table(meta_filtered, file = "metadata.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, na = '', sep = "\t")
    write.table(abund_filtered, file = "table.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, na = '', sep = "\t")

    # NOTE(review): each log file is named after its own message, as in the
    # original code; the output declaration only requires the .log suffix.

    # this is in case some samples were lost during preprocessing, i.e. samples in metadata but not in abundance table
    if (nrow(meta) > nrow(meta_filtered)) {
        log_message <- paste("The metadata file rows were reduced from", nrow(meta), "to", nrow(meta_filtered),", because some samples were missing in the abundance table")
        writeLines(log_message, paste0(log_message, ".log"))
    }
    # this is in case some samples were not in metadata, i.e. only a subset of samples is entering downstream analysis
    if (ncol(abund) > ncol(abund_filtered)) {
        log_message <- paste("Samples in the abundance file were reduced from", ncol(abund)-1, "to", ncol(abund_filtered)-1,", because the metadata did not contain all samples in the abundance table")
        writeLines(log_message, paste0(log_message, ".log"))
    }

    # versions
    writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")) ), "versions.yml")
    """
}
0 commit comments