|
| 1 | +package nextflow.bactopia.inputs |
| 2 | + |
| 3 | +import groovy.util.logging.Slf4j |
| 4 | +import java.nio.file.Path |
| 5 | + |
| 6 | +import static nextflow.bactopia.BactopiaUtils.fileExists |
| 7 | + |
@Slf4j
class BactopiaTools {

    // Entries directly under the Bactopia directory that are never treated as samples
    private static final List IGNORE_LIST = [
        '.nextflow', 'bactopia-info', 'bactopia-tools', 'work', 'bactopia-runs', 'pipeline_info'
    ]

    // Sub-directory (under <sample>/main) in which each kind of file is looked up
    private static final Map PATHS = [
        blastdb: 'annotator',
        fastq  : 'qc',
        fna    : 'assembler',
        faa    : 'annotator',
        gbk    : 'annotator',
        gff    : 'annotator',
        meta   : 'gather'
    ]


    //
    // Collect the input samples from the Bactopia directory to be used by a given Bactopia Tool
    //
    // bactopiaDir - directory to scan; logs an error and exits (status 1) when not set
    // extension   - kind of input to collect (see _collectInputs for the handled values)
    // includeFile - optional FOFN; when set, only the samples listed in it are considered
    // excludeFile - optional FOFN; samples listed in it are always skipped
    //
    // Returns one entry per usable sample, each shaped as described in _collectInputs.
    // Samples lacking the required files are left out and reported with a warning.
    //
    public static List collectInputs(String bactopiaDir, String extension, String includeFile, String excludeFile) {
        Set inclusions = processFOFN(includeFile, true) as Set
        Set exclusions = processFOFN(excludeFile, false) as Set

        // Only fall back to "every sample" when no include file was requested. Previously this
        // flag was always true, so the include list was read but never applied.
        Boolean includeAll = !includeFile

        List samples = []
        List missing = []
        if (bactopiaDir) {
            Path bactopiaPath = Path.of(bactopiaDir)
            if (bactopiaPath.exists()) {
                // Loop through the Bactopia directory and collect the samples.
                // A 'return' inside the closure only skips the current entry.
                bactopiaPath.eachFile { item ->
                    if (!item.isDirectory()) {
                        return
                    }

                    String sample = item.getName()
                    if (IGNORE_LIST.contains(sample)) {
                        return
                    }
                    if (!(includeAll || inclusions.contains(sample))) {
                        return
                    }
                    if (exclusions.contains(sample)) {
                        return
                    }
                    if (!_isSampleDir(sample, bactopiaDir)) {
                        log.info("${sample} does not appear to be a Bactopia sample, skipping...")
                        return
                    }

                    List inputs = _collectInputs(sample, bactopiaDir, extension)
                    if (inputs[0] instanceof String) {
                        // _collectInputs returns [sample] when required files are missing;
                        // keep just the name so the warning below prints it cleanly
                        missing << inputs[0]
                    } else {
                        samples << inputs
                    }
                }
            } else {
                // NOTE(review): unlike the "not set" case below this does not exit, so the caller
                // receives an empty list after the pause - confirm this is intended
                log.error("The Bactopia directory ${bactopiaDir} (--bactopia) does not exist.")
            }
        } else {
            log.error("--bactopia is not set.")
            System.exit(1)
        }

        log.info("Found ${samples.size()} samples to process")
        if (missing.size() > 0) {
            log.warn("${missing.size()} samples were excluded due to missing files. They are:")
            for (sample in missing) {
                log.warn(" ${sample}")
            }
        }
        log.info("\nIf this looks wrong, now's your chance to back out (CTRL+C 3 times).")
        log.info("Sleeping for 5 seconds...")
        log.info("--------------------------------------------------------------------")
        sleep(5000)
        return samples
    }


    //
    // Process the include/exclude FOFN files
    //
    // fofn      - path of the file to read; may be null/empty, in which case nothing is read
    // isInclude - true for the include list, false for the exclude list (only affects logging)
    //
    // Returns the first tab-separated column of every non-blank line.
    //
    private static List processFOFN(String fofn, Boolean isInclude) {
        List samples = []
        if (!fofn) {
            return samples
        }

        if (fileExists(fofn)) {
            new File(fofn).eachLine { line ->
                String sample = line.trim().split('\t')[0]
                if (sample) {
                    samples << sample
                }
            }
        } else {
            // Make it visible that a requested filter could not be read
            log.warn("The ${isInclude ? 'include' : 'exclude'} file ${fofn} could not be found, no samples were read from it")
        }

        // If samples were found, log the number of samples
        if (samples.size() > 0) {
            if (isInclude) {
                log.info "Including ${samples.size()} samples for analysis"
            } else {
                log.info "Excluding ${samples.size()} samples from the analysis"
            }
        }
        return samples
    }


    //
    // Test if the sample directory is likely to contain Bactopia results, judged by the
    // presence of <dir>/<sample>/main/gather/<sample>-meta.tsv
    //
    private static Boolean _isSampleDir(String sample, String dir) {
        return fileExists("${dir}/${sample}/main/gather/${sample}-meta.tsv")
    }


    //
    // Pick the annotation file for a sample: the Bakta file when it exists (plain or with a
    // '.gz' suffix), otherwise the Prokka path. The returned path never carries the '.gz' suffix.
    //
    private static String _preferBakta(String annotatorDir, String sample, String baktaExt, String prokkaExt) {
        String bakta = "${annotatorDir}/bakta/${sample}.${baktaExt}"
        if (fileExists(bakta) || fileExists("${bakta}.gz")) {
            return bakta
        }
        String prokka = "${annotatorDir}/prokka/${sample}.${prokkaExt}"
        return prokka
    }


    //
    // Navigate the Bactopia output directory and collect the inputs for a given Bactopia Tool
    //
    // Handled values of 'extension': illumina_fastq, fastq, fna_fastq, fna_faa_gff, fna_faa,
    // fna_meta, blastdb; anything else is treated as a 1 to 1 mapping through PATHS.
    //
    // On success the returned List looks like:
    //     [ [id:sample, ...], [file1], [file2], [file3] ]
    //     0 - meta map
    //     1 - input files
    //     2 - extra files
    //     3 - extra files
    // When the required files are missing, [sample] is returned instead.
    //
    private static List _collectInputs(String sample, String dir, String extension) {
        // Set up the paths for each extension. No trailing slash here, the pieces below add
        // their own separator (avoids '//' in the resulting paths).
        String baseDir = "${dir}/${sample}/main"
        String fastqDir = "${baseDir}/${PATHS['fastq']}"
        String se = "${fastqDir}/${sample}.fastq.gz"
        String pe1 = "${fastqDir}/${sample}_R1.fastq.gz"
        String pe2 = "${fastqDir}/${sample}_R2.fastq.gz"
        String fna = "${baseDir}/${PATHS['fna']}/${sample}.fna"
        String meta = "${baseDir}/${PATHS['meta']}/${sample}-meta.tsv"

        // A NanoPlot report in the QC summary marks the SE reads as ONT rather than Illumina
        Boolean ont = fileExists("${fastqDir}/summary/${sample}-final_NanoPlot-report.html")

        // Determine the inputs files required for the given extension
        // NOTE: Remote files will be assumed to exist
        if (extension == "illumina_fastq") {
            // Prioritize PE reads first
            if (fileExists(pe1) && fileExists(pe2)) {
                return [[id:sample, single_end:false, runtype:'illumina'], [pe1, pe2], [], []]
            } else if (fileExists(se) && !ont) {
                return [[id:sample, single_end:true, runtype:'illumina'], [se], [], []]
            }
        } else if (extension == 'fastq') {
            if (fileExists(se)) {
                // Both returns carry four entries, matching every other branch
                if (ont) {
                    return [[id:sample, single_end:true, runtype:'ont'], [se], [], []]
                } else {
                    return [[id:sample, single_end:true, runtype:'illumina'], [se], [], []]
                }
            } else if (fileExists(pe1) && fileExists(pe2)) {
                return [[id:sample, single_end:false, runtype:'illumina'], [pe1, pe2], [], []]
            }
        } else if (extension == 'fna_fastq') {
            if (fileExists(se)) {
                String runtype = ont ? "ont" : "illumina"

                if (fileExists("${fna}.gz")) {
                    return [[id:sample, single_end:true, is_compressed:true, runtype:runtype], ["${fna}.gz"], [se], []]
                } else if (fileExists(fna)) {
                    return [[id:sample, single_end:true, is_compressed:false, runtype:runtype], [fna], [se], []]
                }
            } else if (fileExists(pe1) && fileExists(pe2)) {
                if (fileExists("${fna}.gz")) {
                    return [[id:sample, single_end:false, is_compressed:true, runtype:'illumina'], ["${fna}.gz"], [pe1, pe2], []]
                } else if (fileExists(fna)) {
                    return [[id:sample, single_end:false, is_compressed:false, runtype:'illumina'], [fna], [pe1, pe2], []]
                }
            }
        } else if (extension == 'fna_faa_gff') {
            // Default to Bakta; the presence of its faa decides for all three files
            String annotatorDir = "${baseDir}/${PATHS['faa']}"
            fna = "${annotatorDir}/bakta/${sample}.fna"
            String faa = "${annotatorDir}/bakta/${sample}.faa"
            String gff = "${annotatorDir}/bakta/${sample}.gff3"
            if (!fileExists(faa) && !fileExists("${faa}.gz")) {
                // Fall back on Prokka
                fna = "${annotatorDir}/prokka/${sample}.fna"
                faa = "${annotatorDir}/prokka/${sample}.faa"
                gff = "${annotatorDir}/prokka/${sample}.gff"
            }

            if (fileExists("${fna}.gz") && fileExists("${faa}.gz") && fileExists("${gff}.gz")) {
                return [[id:sample, is_compressed:true], ["${fna}.gz"], ["${faa}.gz"], ["${gff}.gz"]]
            } else if (fileExists(fna) && fileExists(faa) && fileExists(gff)) {
                return [[id:sample, is_compressed:false], [fna], [faa], [gff]]
            }
        } else if (extension == 'fna_faa') {
            // Assembly fna plus the Bakta faa (Prokka as the fall back)
            String faa = _preferBakta("${baseDir}/${PATHS['faa']}", sample, 'faa', 'faa')

            if (fileExists("${fna}.gz") && fileExists("${faa}.gz")) {
                return [[id:sample, is_compressed:true], ["${fna}.gz"], ["${faa}.gz"], []]
            } else if (fileExists(fna) && fileExists(faa)) {
                return [[id:sample, is_compressed:false], [fna], [faa], []]
            }
        } else if (extension == 'fna_meta') {
            // include the meta file
            if (fileExists("${fna}.gz") && fileExists(meta)) {
                return [[id:sample, is_compressed:true], ["${fna}.gz"], [meta], []]
            } else if (fileExists(fna) && fileExists(meta)) {
                return [[id:sample, is_compressed:false], [fna], [meta], []]
            }
        } else if (extension == 'blastdb') {
            // Default to Bakta blastdb
            String input = "${baseDir}/${PATHS[extension]}/bakta/${sample}-${extension}.tar.gz"
            if (!fileExists(input)) {
                // Fall back on Prokka
                input = "${baseDir}/${PATHS[extension]}/prokka/${sample}-${extension}.tar.gz"
            }

            if (fileExists(input)) {
                return [[id:sample], [input], [], []]
            }
        } else {
            // The remaining are generic 1 to 1 mappings
            String typeDir = "${baseDir}/${PATHS[extension]}"
            String input
            if (extension == "gbk") {
                // Bakta writes gbff, Prokka writes gbk
                input = _preferBakta(typeDir, sample, 'gbff', 'gbk')
            } else if (extension == "gff") {
                // Bakta writes gff3, Prokka writes gff
                input = _preferBakta(typeDir, sample, 'gff3', 'gff')
            } else if (extension == "faa") {
                input = _preferBakta(typeDir, sample, 'faa', 'faa')
            } else {
                input = "${typeDir}/${sample}.${extension}"
            }

            if (fileExists("${input}.gz")) {
                return [[id:sample, is_compressed:true], ["${input}.gz"], [], []]
            } else if (fileExists(input)) {
                return [[id:sample, is_compressed:false], [input], [], []]
            }
        }

        // If we get here, the sample is missing the required files
        return [sample]
    }
}
0 commit comments