Feature/gzip output 20220317 (#20)

cschu · web-flow · commit 7ee86f902532 · 2022-03-17T14:41:19.000+01:00
* outputs are now gzipped and logs are no longer published
* version -> 1.5.0
* minimum nextflow version is now 21.10.4
* updated license dates
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 Christian Schudoma
+Copyright (c) 2020-2022 Christian Schudoma
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/config/run.config b/config/run.config
@@ -23,8 +23,7 @@ process {
 		errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}
 		cpus = 1
 		memory = {16.GB * task.attempt}
-		//queue = 'htc'
-		time = '24h'
+		time = {2.d * task.attempt}
 		maxRetries = 4
 		
 	}
@@ -34,7 +33,7 @@ process {
         errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}
         cpus = 1
         memory = {8.GB * task.attempt}
-        time = '2d'
+        time = {2.d * task.attempt}
         maxRetries = 4
     }
 }
diff --git a/environment.yml b/environment.yml
@@ -5,4 +5,4 @@ channels:
   - defaults
 dependencies:
   - python>=3.7
-  - nextflow>=20.10
+  - nextflow>=21.10.4
diff --git a/gffquant/__init__.py b/gffquant/__init__.py
@@ -1 +1 @@
-__version__ = "1.4.0"
+__version__ = "1.5.0"
diff --git a/gffquant/bin/collate_counts.py b/gffquant/bin/collate_counts.py
@@ -1,37 +1,39 @@
 import argparse
-import pathlib
+import gzip
 import os
+import pathlib
 
 import pandas as pd
 
 
 class FeatureCountCollator:
-    def __init__(self, count_dir, prefix, column, recursive=False):
+    def __init__(self, count_dir, prefix, column, recursive=False, suffix=".txt.gz"):
         self.count_dir = count_dir
         self.prefix = prefix
+        self.suffix = suffix
         self.column = column
         self.categories = {}
         self._collect_count_files(recursive=recursive)
 
     @staticmethod
-    def is_valid_file(f):
+    def is_valid_file(f, suffix):
         return all((
-            f.endswith(".txt"),
-            not f.endswith(".seqname.dist1.txt"),
-            not f.endswith(".seqname.uniq.txt"),
-            not f.endswith(".gene_counts.txt"),
-            not f.endswith(".ambig_tmp.txt"),
+            f.endswith(suffix),
+            not f.endswith(f".seqname.dist1{suffix}"),
+            not f.endswith(f".seqname.uniq{suffix}"),
+            not f.endswith(f".gene_counts{suffix}"),
+            not f.endswith(f".ambig_tmp{suffix}"),
         ))
 
     def _collect_count_files(self, recursive=False):
         all_files = []
         for pwd, _, files in os.walk(self.count_dir):
-            all_files += (os.path.join(pwd, f) for f in files if FeatureCountCollator.is_valid_file(f))
+            all_files += (os.path.join(pwd, f) for f in files if FeatureCountCollator.is_valid_file(f, self.suffix))
             if not recursive:
                 break
 
         for f in all_files:
-            sample, category = os.path.splitext(os.path.basename(f).replace(".txt", ""))
+            sample, category = os.path.splitext(os.path.basename(f).replace(self.suffix, ""))
             self.categories.setdefault(category[1:], []).append((sample, f))
 
     def collate(self):
@@ -40,18 +42,18 @@ def collate(self):
             self._collate_category(category, sorted(files))
 
     def _collate_category(self, category, files):
-        with open(f"{self.prefix}.{category}.{self.column}.txt", "wt") as table_out:
-            index = set()
-            for _, fn in files:
-                with open(fn) as _in:
-                    index.update(row.strip().split("\t")[0] for row in _in if row.strip())
-            merged_tab = pd.DataFrame(index=['unannotated'] + sorted(index.difference({'feature', 'unannotated'})))
-            for sample, fn in files:
-                src_tab = pd.read_csv(fn, sep="\t", index_col=0)
-                merged_tab = merged_tab.merge(src_tab[self.column], left_index=True, right_index=True, how="outer")
-                merged_tab.rename(columns={self.column: sample}, inplace=True)
-                merged_tab[sample]["unannotated"] = src_tab["uniq_raw"]["unannotated"]
-            merged_tab.to_csv(table_out, sep="\t", na_rep="NA", index_label="feature")
+        table_file = f"{self.prefix}.{category}.{self.column}.txt.gz"
+        index = set()
+        for _, fn in files:
+            with gzip.open(fn, "rt") as _in:
+                index.update(row.strip().split("\t")[0] for row in _in if row.strip())
+        merged_tab = pd.DataFrame(index=['unannotated'] + sorted(index.difference({'feature', 'unannotated'})))
+        for sample, fn in files:
+            src_tab = pd.read_csv(fn, sep="\t", index_col=0)
+            merged_tab = merged_tab.merge(src_tab[self.column], left_index=True, right_index=True, how="outer")
+            merged_tab.rename(columns={self.column: sample}, inplace=True)
+            merged_tab[sample]["unannotated"] = src_tab["uniq_raw"]["unannotated"]
+        merged_tab.to_csv(table_file, sep="\t", na_rep="NA", index_label="feature")
 
 
 def main():
diff --git a/gffquant/overlap_counter.py b/gffquant/overlap_counter.py
@@ -1,3 +1,4 @@
+import gzip
 import time
 from collections import Counter
 
@@ -355,7 +356,7 @@ def get_header(self):
 
 	def _dump_feature_counts(self):
 		for ftype, counts in sorted(self.featcounts.items()):
-			with open(f"{self.out_prefix}.{ftype}.txt", "w") as feat_out:
+			with gzip.open(f"{self.out_prefix}.{ftype}.txt.gz", "wt") as feat_out:
 				print("feature", *self.get_header(), sep="\t", file=feat_out, flush=True)
 				print("unannotated", self.unannotated_reads, sep="\t", file=feat_out, flush=True)
 				scaling_factor, ambig_scaling_factor = self.feature_scaling_factors[ftype]
@@ -382,7 +383,7 @@ def _dump_seq_counts(self, bam):
 					_seqcounts[rid] += count
 				self.ambig_seqcounts = _seqcounts
 
-		with open(f"{self.out_prefix}.seqname.uniq.txt", "w") as seq_out:
+		with gzip.open(f"{self.out_prefix}.seqname.uniq.txt.gz", "wt") as seq_out:
 			print(*SEQ_COUNT_HEADER, sep="\t", flush=True, file=seq_out)
 			if sum(self.seqcounts.values()):
 				seqcount_scaling_factor = OverlapCounter.calculate_seqcount_scaling_factor(self.seqcounts, bam)
@@ -395,7 +396,7 @@ def _dump_seq_counts(self, bam):
 					)
 
 		if self.ambig_seqcounts:
-			with open(f"{self.out_prefix}.seqname.dist1.txt", "w") as seq_out:
+			with gzip.open(f"{self.out_prefix}.seqname.dist1.txt.gz", "wt") as seq_out:
 				print(*SEQ_COUNT_HEADER, sep="\t", flush=True, file=seq_out)
 				self.seqcounts.update(self.ambig_seqcounts)
 				seqcount_scaling_factor = OverlapCounter.calculate_seqcount_scaling_factor(self.seqcounts, bam)
@@ -408,7 +409,7 @@ def _dump_seq_counts(self, bam):
 					)
 
 	def _dump_gene_counts(self, bam=None):
-		with open(f"{self.out_prefix}.gene_counts.txt", "w") as gene_out:
+		with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
 			print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
 			if self.do_overlap_detection:
 				gene_counts = self.gene_counts
@@ -511,7 +512,7 @@ def summarise_coverage(self, bam):
 					domain_cov[domtype]["cov_ambig"].append(sum(1 for v in ambig_cov.values() if v) / len(ambig_cov.values()))
 
 
-		with open(self.out_prefix + ".covsum.txt", "wt") as cov_out:
+		with gzip.open(self.out_prefix + ".covsum.txt.gz", "wt") as cov_out:
 			print("#domain", "depth_unique", "depth_combined", "coverage_unique", "coverage_combined", sep="\t", file=cov_out)
 			for domtype, counts in sorted(domain_cov.items()):
 
diff --git a/main.nf b/main.nf
@@ -68,9 +68,8 @@ process run_gffquant {
 	path(db)
 
 	output:
-	tuple val(sample), path("${sample}/*.txt"), emit: results
-	tuple val(sample), path("logs/${sample}.*"), emit: logs
-	
+	tuple val(sample), path("${sample}/*.txt.gz"), emit: results
+
 	script:
 	def emapper_version = (params.emapper_version) ? "--emapper_version ${params.emapper_version}" : ""
 	"""
@@ -87,7 +86,7 @@ process collate_feature_counts {
 	tuple val(sample), path(count_tables)
 
 	output:
-	path("collated/*.txt"), emit: collated, optional: true
+	path("collated/*.txt.gz"), emit: collated, optional: true
 
 	script:
 	"""

Original file line number	Diff line number	Diff line change
`@@ -23,8 +23,7 @@ process {`
`23`	`23`	`errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}`
`24`	`24`	`cpus = 1`
`25`	`25`	`memory = {16.GB * task.attempt}`
`26`		`- //queue = 'htc'`
`27`		`- time = '24h'`
	`26`	`+ time = {2.d * task.attempt}`
`28`	`27`	`maxRetries = 4`
`29`	`28`
`30`	`29`	`}`
`@@ -34,7 +33,7 @@ process {`
`34`	`33`	`errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}`
`35`	`34`	`cpus = 1`
`36`	`35`	`memory = {8.GB * task.attempt}`
`37`		`- time = '2d'`
	`36`	`+ time = {2.d * task.attempt}`
`38`	`37`	`maxRetries = 4`
`39`	`38`	`}`
`40`	`39`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "1.4.0"`
	`1`	`+__version__ = "1.5.0"`