Skip to content

Commit d1214de

Browse files
committed
tab -> spaces
1 parent e78173f commit d1214de

File tree

5 files changed

+594
-594
lines changed

5 files changed

+594
-594
lines changed

gffquant/__main__.py

Lines changed: 70 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -8,84 +8,84 @@
88
from . import __version__
99

1010
def main():
11-
ap = argparse.ArgumentParser(prog="gffquant", formatter_class=argparse.RawTextHelpFormatter)
12-
ap.add_argument(
13-
"annotation_db", type=str,
14-
help=textwrap.dedent("""\
15-
Path to a text file containing the reference annotation.
16-
The required type of file is determined by the --mode argument (gff3 or tsv)."""
17-
)
18-
)
19-
ap.add_argument(
20-
"bam_file", type=str,
21-
help=textwrap.dedent("""\
22-
Path to a position-sorted bam file. Ambiguous alignments need to be flagged as secondary
23-
alignments with the same read id as their primary alignment.
24-
(e.g. output from BWA mem -a). All alignments of an ambiguous group need to have MAPQ=0."""
25-
)
26-
)
27-
ap.add_argument(
28-
"--mode", "-m", type=str, default="genome", choices=("genome", "genes", "gene", "domain"),
29-
help=textwrap.dedent("""\
30-
Run mode:
31-
- 'genome' counts reads aligned against contigs, which are annotated with a gff3 file.
32-
The gff3 needs to have been indexed with gffindex prior to the run.
33-
- 'gene' counts reads aligned against gene sequences, which are annotated with a tab-separated file.
34-
- 'genes' is an alias for the 'gene' mode
35-
- 'domain' counts reads against domain annotations within gene sequences, which are annotated with a bed4 file."""
36-
)
37-
)
38-
ap.add_argument(
39-
"--out_prefix", "-o", type=str, default="gffquant",
40-
help="Prefix for output files."
41-
)
42-
ap.add_argument(
43-
"--ambig_mode", type=str, choices=("unique_only", "all1", "primary_only", "1overN"), default="unique_only",
44-
help=textwrap.dedent("""\
45-
Setting how ambiguous alignments should be treated. This setting mimics NGLess' behaviour.
46-
- 'unique_only' ignores any alignment flagged as ambiguous (MAPQ=0). This is the default setting.
47-
- 'all1' treats each alignment as unique (each ambiguous alignment contributes 1 count to features it aligns to.)
48-
- 'primary_only' takes the unique alignments and the primary alignment of each ambiguous read group.
49-
- '1overN' each alignment contributes 1/(n=number of ambiguous alignments of the same read) counts to features it aligns to."""
50-
)
51-
)
52-
ap.add_argument(
53-
"--strand_specific", action="store_true",
54-
help="Perform strand-specific counting for RNAseq reads. This currently only works for single-end data. This flag is ignored for paired-end data."
55-
)
56-
ap.add_argument("--version", "-v", action="version", version="%(prog)s " + __version__)
11+
ap = argparse.ArgumentParser(prog="gffquant", formatter_class=argparse.RawTextHelpFormatter)
12+
ap.add_argument(
13+
"annotation_db", type=str,
14+
help=textwrap.dedent("""\
15+
Path to a text file containing the reference annotation.
16+
The required type of file is determined by the --mode argument (gff3 or tsv)."""
17+
)
18+
)
19+
ap.add_argument(
20+
"bam_file", type=str,
21+
help=textwrap.dedent("""\
22+
Path to a position-sorted bam file. Ambiguous alignments need to be flagged as secondary
23+
alignments with the same read id as their primary alignment.
24+
(e.g. output from BWA mem -a). All alignments of an ambiguous group need to have MAPQ=0."""
25+
)
26+
)
27+
ap.add_argument(
28+
"--mode", "-m", type=str, default="genome", choices=("genome", "genes", "gene", "domain"),
29+
help=textwrap.dedent("""\
30+
Run mode:
31+
- 'genome' counts reads aligned against contigs, which are annotated with a gff3 file.
32+
The gff3 needs to have been indexed with gffindex prior to the run.
33+
- 'gene' counts reads aligned against gene sequences, which are annotated with a tab-separated file.
34+
- 'genes' is an alias for the 'gene' mode
35+
- 'domain' counts reads against domain annotations within gene sequences, which are annotated with a bed4 file."""
36+
)
37+
)
38+
ap.add_argument(
39+
"--out_prefix", "-o", type=str, default="gffquant",
40+
help="Prefix for output files."
41+
)
42+
ap.add_argument(
43+
"--ambig_mode", type=str, choices=("unique_only", "all1", "primary_only", "1overN"), default="unique_only",
44+
help=textwrap.dedent("""\
45+
Setting how ambiguous alignments should be treated. This setting mimics NGLess' behaviour.
46+
- 'unique_only' ignores any alignment flagged as ambiguous (MAPQ=0). This is the default setting.
47+
- 'all1' treats each alignment as unique (each ambiguous alignment contributes 1 count to features it aligns to.)
48+
- 'primary_only' takes the unique alignments and the primary alignment of each ambiguous read group.
49+
- '1overN' each alignment contributes 1/(n=number of ambiguous alignments of the same read) counts to features it aligns to."""
50+
)
51+
)
52+
ap.add_argument(
53+
"--strand_specific", action="store_true",
54+
help="Perform strand-specific counting for RNAseq reads. This currently only works for single-end data. This flag is ignored for paired-end data."
55+
)
56+
ap.add_argument("--version", "-v", action="version", version="%(prog)s " + __version__)
5757

5858

59-
args = ap.parse_args()
59+
args = ap.parse_args()
6060

61-
print("Version:", __version__)
62-
print("Command:", os.path.basename(sys.argv[0]), *sys.argv[1:])
61+
print("Version:", __version__)
62+
print("Command:", os.path.basename(sys.argv[0]), *sys.argv[1:])
6363

64-
if not os.path.exists(args.bam_file):
65-
raise ValueError("bam file does not exist", args.bam_file)
66-
if not os.path.exists(args.annotation_db):
67-
raise ValueError("annotation database does not exist", args.annotation_db)
64+
if not os.path.exists(args.bam_file):
65+
raise ValueError("bam file does not exist", args.bam_file)
66+
if not os.path.exists(args.annotation_db):
67+
raise ValueError("annotation database does not exist", args.annotation_db)
6868

69-
db_index = None
70-
if args.mode == "genome":
71-
db_index = args.annotation_db + ".index"
72-
if not os.path.exists(db_index):
73-
raise ValueError("gff index '{}' does not exist (please generate index with 'gffindex {}')".format(db_index, args.annotation_db))
69+
db_index = None
70+
if args.mode == "genome":
71+
db_index = args.annotation_db + ".index"
72+
if not os.path.exists(db_index):
73+
raise ValueError("gff index '{}' does not exist (please generate index with 'gffindex {}')".format(db_index, args.annotation_db))
7474

75-
if os.path.dirname(args.out_prefix):
76-
pathlib.Path(os.path.dirname(args.out_prefix)).mkdir(exist_ok=True, parents=True)
75+
if os.path.dirname(args.out_prefix):
76+
pathlib.Path(os.path.dirname(args.out_prefix)).mkdir(exist_ok=True, parents=True)
7777

78-
fq = FeatureQuantifier(
79-
db=args.annotation_db,
80-
db_index=db_index,
81-
out_prefix=args.out_prefix,
82-
ambig_mode=args.ambig_mode,
83-
do_overlap_detection=args.mode in ("genome", "domain"),
84-
strand_specific=args.strand_specific
85-
)
78+
fq = FeatureQuantifier(
79+
db=args.annotation_db,
80+
db_index=db_index,
81+
out_prefix=args.out_prefix,
82+
ambig_mode=args.ambig_mode,
83+
do_overlap_detection=args.mode in ("genome", "domain"),
84+
strand_specific=args.strand_specific
85+
)
8686

87-
fq.process_bamfile(args.bam_file)
87+
fq.process_bamfile(args.bam_file)
8888

8989

9090
if __name__ == "__main__":
91-
main()
91+
main()

gffquant/gff_dbm.py

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -7,72 +7,72 @@
77

88
class GffDatabaseManager:
99

10-
def iterate(self):
11-
header = None
12-
with self.db as db_stream:
13-
for line in db_stream:
14-
line = line.strip()
15-
if line.startswith("#"):
16-
header = line.strip("#").split("\t")
17-
else:
18-
line = line.split("\t")
19-
features = list()
20-
for feat_cat, subfeatures in zip(header[6:], line[6:]):
21-
subfeatures = tuple(sf for sf in subfeatures.strip().split(",") if sf)
22-
if subfeatures:
23-
features.append((feat_cat, subfeatures))
24-
yield line[0], (("strand", None),) + tuple(features)
10+
def iterate(self):
11+
header = None
12+
with self.db as db_stream:
13+
for line in db_stream:
14+
line = line.strip()
15+
if line.startswith("#"):
16+
header = line.strip("#").split("\t")
17+
else:
18+
line = line.split("\t")
19+
features = list()
20+
for feat_cat, subfeatures in zip(header[6:], line[6:]):
21+
subfeatures = tuple(sf for sf in subfeatures.strip().split(",") if sf)
22+
if subfeatures:
23+
features.append((feat_cat, subfeatures))
24+
yield line[0], (("strand", None),) + tuple(features)
2525

26-
def _read_index(self, f):
27-
db_index = dict()
28-
for line in open(f, "rt"):
29-
line = line.strip().split("\t")
30-
db_index.setdefault(line[0], list()).append(list(map(int, line[1:3])))
31-
return db_index
26+
def _read_index(self, f):
27+
db_index = dict()
28+
for line in open(f, "rt"):
29+
line = line.strip().split("\t")
30+
db_index.setdefault(line[0], list()).append(list(map(int, line[1:3])))
31+
return db_index
3232

33-
def __init__(self, db, db_index=None):
34-
gz_magic = b"\x1f\x8b\x08"
35-
gzipped = open(db, "rb").read(3).startswith(gz_magic)
36-
if db_index:
37-
if gzipped:
38-
raise ValueError(f"Database {db} is gzipped. This doesn't work together with an index. Please unzip and re-index.")
39-
_open = open
40-
self.db_index = self._read_index(db_index)
41-
else:
42-
_open = gzip.open if gzipped else open
43-
self.db_index = None
44-
self.db = _open(db, "rt")
33+
def __init__(self, db, db_index=None):
34+
gz_magic = b"\x1f\x8b\x08"
35+
gzipped = open(db, "rb").read(3).startswith(gz_magic)
36+
if db_index:
37+
if gzipped:
38+
raise ValueError(f"Database {db} is gzipped. This doesn't work together with an index. Please unzip and re-index.")
39+
_open = open
40+
self.db_index = self._read_index(db_index)
41+
else:
42+
_open = gzip.open if gzipped else open
43+
self.db_index = None
44+
self.db = _open(db, "rt")
4545

46-
@lru_cache(maxsize=4096)
47-
def _read_data(self, ref_id, include_payload=False):
48-
gff_annotation = dict()
49-
for offset, size in self.db_index.get(ref_id, list()):
50-
self.db.seek(offset)
51-
for line in self.db.read(size).strip("\n").split("\n"):
52-
if not line.startswith("#"):
53-
line = line.strip().split("\t")
54-
features = dict()
55-
if include_payload:
56-
features = (("strand", line[6]),)
57-
features += tuple((item.split("=")[0], tuple(sorted(item.split("=")[1].split(",")))) for item in line[8].strip().split(";"))
58-
key = (line[0], int(line[3]), int(line[4]) + 1)
59-
gff_annotation[key] = features
60-
if not gff_annotation and not include_payload:
61-
print("WARNING: contig {contig} does not have an annotation in the index.".format(contig=ref_id), file=sys.stderr, flush=True)
62-
return gff_annotation
46+
@lru_cache(maxsize=4096)
47+
def _read_data(self, ref_id, include_payload=False):
48+
gff_annotation = dict()
49+
for offset, size in self.db_index.get(ref_id, list()):
50+
self.db.seek(offset)
51+
for line in self.db.read(size).strip("\n").split("\n"):
52+
if not line.startswith("#"):
53+
line = line.strip().split("\t")
54+
features = dict()
55+
if include_payload:
56+
features = (("strand", line[6]),)
57+
features += tuple((item.split("=")[0], tuple(sorted(item.split("=")[1].split(",")))) for item in line[8].strip().split(";"))
58+
key = (line[0], int(line[3]), int(line[4]) + 1)
59+
gff_annotation[key] = features
60+
if not gff_annotation and not include_payload:
61+
print("WARNING: contig {contig} does not have an annotation in the index.".format(contig=ref_id), file=sys.stderr, flush=True)
62+
return gff_annotation
6363

64-
@lru_cache(maxsize=4096)
65-
def _get_tree(self, ref, cache_data=False):
66-
return IntervalTree.from_tuples(sorted([key[1:] for key in self._read_data(ref, include_payload=cache_data)]))
64+
@lru_cache(maxsize=4096)
65+
def _get_tree(self, ref, cache_data=False):
66+
return IntervalTree.from_tuples(sorted([key[1:] for key in self._read_data(ref, include_payload=cache_data)]))
6767

68-
def get_data(self, ref, start, end):
69-
return self._read_data(ref, include_payload=True).get((ref, start, end), dict())
68+
def get_data(self, ref, start, end):
69+
return self._read_data(ref, include_payload=True).get((ref, start, end), dict())
7070

71-
def get_overlaps(self, ref, start, end, cache_data=False):
72-
return self._get_tree(ref, cache_data=cache_data)[start:end]
71+
def get_overlaps(self, ref, start, end, cache_data=False):
72+
return self._get_tree(ref, cache_data=cache_data)[start:end]
7373

74-
def clear_caches(self):
75-
print(self._read_data.cache_info(), flush=True)
76-
self._read_data.cache_clear()
77-
print(self._get_tree.cache_info(), flush=True)
78-
self._get_tree.cache_clear()
74+
def clear_caches(self):
75+
print(self._read_data.cache_info(), flush=True)
76+
self._read_data.cache_clear()
77+
print(self._get_tree.cache_info(), flush=True)
78+
self._get_tree.cache_clear()

gffquant/gff_indexer.py

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,45 +4,45 @@
44
import os
55

66
class GffIndexer:
7-
def __init__(self, gff, overwrite=False):
8-
self._refs = dict()
9-
offset = 0
10-
cur_ref = None
11-
index_fn = gff + ".index"
12-
if os.path.exists(index_fn):
13-
if not overwrite:
14-
raise FileExistsError("Index {fn} already exists. Please use -f/--force option to overwrite.".format(fn=index_fn))
15-
print("--force parameter is set: overwriting existing index {fn}.".format(fn=index_fn))
16-
17-
with open(gff, "rt") as f, open(index_fn, "wt") as index_out:
18-
for line in f:
19-
if not line.startswith("#"):
20-
ref = line.split("\t")[0]
21-
if ref != cur_ref:
22-
#print(self._refs)
23-
if cur_ref is not None:
24-
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
25-
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
26-
cur_ref = ref
27-
self._refs.setdefault(cur_ref, list()).append([offset, 0])
28-
offset += len(line)
29-
30-
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
31-
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
32-
33-
def get_index(self, out=sys.stdout):
34-
for r, p in self._refs.items():
35-
for pp in p:
36-
print(r, *pp, file=out, sep="\t", flush=True)
7+
def __init__(self, gff, overwrite=False):
8+
self._refs = dict()
9+
offset = 0
10+
cur_ref = None
11+
index_fn = gff + ".index"
12+
if os.path.exists(index_fn):
13+
if not overwrite:
14+
raise FileExistsError("Index {fn} already exists. Please use -f/--force option to overwrite.".format(fn=index_fn))
15+
print("--force parameter is set: overwriting existing index {fn}.".format(fn=index_fn))
16+
17+
with open(gff, "rt") as f, open(index_fn, "wt") as index_out:
18+
for line in f:
19+
if not line.startswith("#"):
20+
ref = line.split("\t")[0]
21+
if ref != cur_ref:
22+
#print(self._refs)
23+
if cur_ref is not None:
24+
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
25+
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
26+
cur_ref = ref
27+
self._refs.setdefault(cur_ref, list()).append([offset, 0])
28+
offset += len(line)
29+
30+
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
31+
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
32+
33+
def get_index(self, out=sys.stdout):
34+
for r, p in self._refs.items():
35+
for pp in p:
36+
print(r, *pp, file=out, sep="\t", flush=True)
3737

3838

3939
def main():
40-
ap = argparse.ArgumentParser()
41-
ap.add_argument("gff_file", type=str)
42-
ap.add_argument("--force", "-f", action="store_true", help="Force overwrite of existing index file (input_file.index)")
43-
args = ap.parse_args()
40+
ap = argparse.ArgumentParser()
41+
ap.add_argument("gff_file", type=str)
42+
ap.add_argument("--force", "-f", action="store_true", help="Force overwrite of existing index file (input_file.index)")
43+
args = ap.parse_args()
4444

45-
GffIndexer(args.gff_file, overwrite=args.force)
45+
GffIndexer(args.gff_file, overwrite=args.force)
4646

4747
if __name__ == "__main__":
48-
main()
48+
main()

0 commit comments

Comments
 (0)