Skip to content

Commit 7ee86f9

Browse files
authored
Feature/gzip output 20220317 (#20)
* outputs are now gzipped and logs are no longer published
* version -> 1.5.0
* minimum nextflow version is now 21.10.4
* updated license dates
1 parent adf97d1 commit 7ee86f9

File tree

7 files changed: 38 additions (+38) and 37 deletions (-37)

7 files changed

+38
-37
lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2020 Christian Schudoma
3+
Copyright (c) 2020-2022 Christian Schudoma
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

config/run.config

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@ process {
2323
errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}
2424
cpus = 1
2525
memory = {16.GB * task.attempt}
26-
//queue = 'htc'
27-
time = '24h'
26+
time = {2.d * task.attempt}
2827
maxRetries = 4
2928

3029
}
@@ -34,7 +33,7 @@ process {
3433
errorStrategy = {task.attempt <= 4 ? "retry" : "ignore"}
3534
cpus = 1
3635
memory = {8.GB * task.attempt}
37-
time = '2d'
36+
time = {2.d * task.attempt}
3837
maxRetries = 4
3938
}
4039
}

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ channels:
55
- defaults
66
dependencies:
77
- python>=3.7
8-
- nextflow>=20.10
8+
- nextflow>=21.10.4

gffquant/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.4.0"
1+
__version__ = "1.5.0"

gffquant/bin/collate_counts.py

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,39 @@
11
import argparse
2-
import pathlib
2+
import gzip
33
import os
4+
import pathlib
45

56
import pandas as pd
67

78

89
class FeatureCountCollator:
9-
def __init__(self, count_dir, prefix, column, recursive=False):
10+
def __init__(self, count_dir, prefix, column, recursive=False, suffix=".txt.gz"):
1011
self.count_dir = count_dir
1112
self.prefix = prefix
13+
self.suffix = suffix
1214
self.column = column
1315
self.categories = {}
1416
self._collect_count_files(recursive=recursive)
1517

1618
@staticmethod
17-
def is_valid_file(f):
19+
def is_valid_file(f, suffix):
1820
return all((
19-
f.endswith(".txt"),
20-
not f.endswith(".seqname.dist1.txt"),
21-
not f.endswith(".seqname.uniq.txt"),
22-
not f.endswith(".gene_counts.txt"),
23-
not f.endswith(".ambig_tmp.txt"),
21+
f.endswith(suffix),
22+
not f.endswith(f".seqname.dist1{suffix}"),
23+
not f.endswith(f".seqname.uniq{suffix}"),
24+
not f.endswith(f".gene_counts{suffix}"),
25+
not f.endswith(f".ambig_tmp{suffix}"),
2426
))
2527

2628
def _collect_count_files(self, recursive=False):
2729
all_files = []
2830
for pwd, _, files in os.walk(self.count_dir):
29-
all_files += (os.path.join(pwd, f) for f in files if FeatureCountCollator.is_valid_file(f))
31+
all_files += (os.path.join(pwd, f) for f in files if FeatureCountCollator.is_valid_file(f, self.suffix))
3032
if not recursive:
3133
break
3234

3335
for f in all_files:
34-
sample, category = os.path.splitext(os.path.basename(f).replace(".txt", ""))
36+
sample, category = os.path.splitext(os.path.basename(f).replace(self.suffix, ""))
3537
self.categories.setdefault(category[1:], []).append((sample, f))
3638

3739
def collate(self):
@@ -40,18 +42,18 @@ def collate(self):
4042
self._collate_category(category, sorted(files))
4143

4244
def _collate_category(self, category, files):
43-
with open(f"{self.prefix}.{category}.{self.column}.txt", "wt") as table_out:
44-
index = set()
45-
for _, fn in files:
46-
with open(fn) as _in:
47-
index.update(row.strip().split("\t")[0] for row in _in if row.strip())
48-
merged_tab = pd.DataFrame(index=['unannotated'] + sorted(index.difference({'feature', 'unannotated'})))
49-
for sample, fn in files:
50-
src_tab = pd.read_csv(fn, sep="\t", index_col=0)
51-
merged_tab = merged_tab.merge(src_tab[self.column], left_index=True, right_index=True, how="outer")
52-
merged_tab.rename(columns={self.column: sample}, inplace=True)
53-
merged_tab[sample]["unannotated"] = src_tab["uniq_raw"]["unannotated"]
54-
merged_tab.to_csv(table_out, sep="\t", na_rep="NA", index_label="feature")
45+
table_file = f"{self.prefix}.{category}.{self.column}.txt.gz"
46+
index = set()
47+
for _, fn in files:
48+
with gzip.open(fn, "rt") as _in:
49+
index.update(row.strip().split("\t")[0] for row in _in if row.strip())
50+
merged_tab = pd.DataFrame(index=['unannotated'] + sorted(index.difference({'feature', 'unannotated'})))
51+
for sample, fn in files:
52+
src_tab = pd.read_csv(fn, sep="\t", index_col=0)
53+
merged_tab = merged_tab.merge(src_tab[self.column], left_index=True, right_index=True, how="outer")
54+
merged_tab.rename(columns={self.column: sample}, inplace=True)
55+
merged_tab[sample]["unannotated"] = src_tab["uniq_raw"]["unannotated"]
56+
merged_tab.to_csv(table_file, sep="\t", na_rep="NA", index_label="feature")
5557

5658

5759
def main():

gffquant/overlap_counter.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import gzip
12
import time
23
from collections import Counter
34

@@ -355,7 +356,7 @@ def get_header(self):
355356

356357
def _dump_feature_counts(self):
357358
for ftype, counts in sorted(self.featcounts.items()):
358-
with open(f"{self.out_prefix}.{ftype}.txt", "w") as feat_out:
359+
with gzip.open(f"{self.out_prefix}.{ftype}.txt.gz", "wt") as feat_out:
359360
print("feature", *self.get_header(), sep="\t", file=feat_out, flush=True)
360361
print("unannotated", self.unannotated_reads, sep="\t", file=feat_out, flush=True)
361362
scaling_factor, ambig_scaling_factor = self.feature_scaling_factors[ftype]
@@ -382,7 +383,7 @@ def _dump_seq_counts(self, bam):
382383
_seqcounts[rid] += count
383384
self.ambig_seqcounts = _seqcounts
384385

385-
with open(f"{self.out_prefix}.seqname.uniq.txt", "w") as seq_out:
386+
with gzip.open(f"{self.out_prefix}.seqname.uniq.txt.gz", "wt") as seq_out:
386387
print(*SEQ_COUNT_HEADER, sep="\t", flush=True, file=seq_out)
387388
if sum(self.seqcounts.values()):
388389
seqcount_scaling_factor = OverlapCounter.calculate_seqcount_scaling_factor(self.seqcounts, bam)
@@ -395,7 +396,7 @@ def _dump_seq_counts(self, bam):
395396
)
396397

397398
if self.ambig_seqcounts:
398-
with open(f"{self.out_prefix}.seqname.dist1.txt", "w") as seq_out:
399+
with gzip.open(f"{self.out_prefix}.seqname.dist1.txt.gz", "wt") as seq_out:
399400
print(*SEQ_COUNT_HEADER, sep="\t", flush=True, file=seq_out)
400401
self.seqcounts.update(self.ambig_seqcounts)
401402
seqcount_scaling_factor = OverlapCounter.calculate_seqcount_scaling_factor(self.seqcounts, bam)
@@ -408,7 +409,7 @@ def _dump_seq_counts(self, bam):
408409
)
409410

410411
def _dump_gene_counts(self, bam=None):
411-
with open(f"{self.out_prefix}.gene_counts.txt", "w") as gene_out:
412+
with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
412413
print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
413414
if self.do_overlap_detection:
414415
gene_counts = self.gene_counts
@@ -511,7 +512,7 @@ def summarise_coverage(self, bam):
511512
domain_cov[domtype]["cov_ambig"].append(sum(1 for v in ambig_cov.values() if v) / len(ambig_cov.values()))
512513

513514

514-
with open(self.out_prefix + ".covsum.txt", "wt") as cov_out:
515+
with gzip.open(self.out_prefix + ".covsum.txt.gz", "wt") as cov_out:
515516
print("#domain", "depth_unique", "depth_combined", "coverage_unique", "coverage_combined", sep="\t", file=cov_out)
516517
for domtype, counts in sorted(domain_cov.items()):
517518

main.nf

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@ process run_gffquant {
6868
path(db)
6969

7070
output:
71-
tuple val(sample), path("${sample}/*.txt"), emit: results
72-
tuple val(sample), path("logs/${sample}.*"), emit: logs
73-
71+
tuple val(sample), path("${sample}/*.txt.gz"), emit: results
72+
7473
script:
7574
def emapper_version = (params.emapper_version) ? "--emapper_version ${params.emapper_version}" : ""
7675
"""
@@ -87,7 +86,7 @@ process collate_feature_counts {
8786
tuple val(sample), path(count_tables)
8887

8988
output:
90-
path("collated/*.txt"), emit: collated, optional: true
89+
path("collated/*.txt.gz"), emit: collated, optional: true
9190

9291
script:
9392
"""

0 commit comments

Comments
 (0)