Skip to content

Commit d1214de

Browse files
committed
tab -> spaces
1 parent e78173f commit d1214de

File tree

5 files changed

+594
-594
lines changed

5 files changed

+594
-594
lines changed

gffquant/__main__.py

Lines changed: 70 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -8,84 +8,84 @@
88
from . import __version__
99

1010
def main():
11-
ap = argparse.ArgumentParser(prog="gffquant", formatter_class=argparse.RawTextHelpFormatter)
12-
ap.add_argument(
13-
"annotation_db", type=str,
14-
help=textwrap.dedent("""\
15-
Path to a text file containing the reference annotation.
16-
The required type of file is determined by the --mode argument (gff3 or tsv)."""
17-
)
18-
)
19-
ap.add_argument(
20-
"bam_file", type=str,
21-
help=textwrap.dedent("""\
22-
Path to a position-sorted bam file. Ambiguous alignments need to be flagged as secondary
23-
alignments with the same read id as their primary alignment.
24-
(e.g. output from BWA mem -a). All alignments of an ambiguous group need to have MAPQ=0."""
25-
)
26-
)
27-
ap.add_argument(
28-
"--mode", "-m", type=str, default="genome", choices=("genome", "genes", "gene", "domain"),
29-
help=textwrap.dedent("""\
30-
Run mode:
31-
- 'genome' counts reads aligned against contigs, which are annotated with a gff3 file.
32-
The gff3 needs to have been indexed with gffindex prior to the run.
33-
- 'gene' counts reads aligned against gene sequences, which are annotated with a tab-separated file.
34-
- 'genes' is an alias for the 'gene' mode
35-
- 'domain' counts reads against domain annotations within gene sequences, which are annotated with a bed4 file."""
36-
)
37-
)
38-
ap.add_argument(
39-
"--out_prefix", "-o", type=str, default="gffquant",
40-
help="Prefix for output files."
41-
)
42-
ap.add_argument(
43-
"--ambig_mode", type=str, choices=("unique_only", "all1", "primary_only", "1overN"), default="unique_only",
44-
help=textwrap.dedent("""\
45-
Setting how ambiguous alignments should be treated. This setting mimics NGLess' behaviour.
46-
- 'unique_only' ignores any alignment flagged as ambiguous (MAPQ=0). This is the default setting.
47-
- 'all1' treats each alignment as unique (each ambiguous alignment contributes 1 count to features it aligns to.)
48-
- 'primary_only' takes the unique alignments and the primary alignment of each ambiguous read group.
49-
- '1overN' each alignment contributes 1/(n=number of ambiguous alignments of the same read) counts to features it aligns to."""
50-
)
51-
)
52-
ap.add_argument(
53-
"--strand_specific", action="store_true",
54-
help="Perform strand-specific counting for RNAseq reads. This currently only works for single-end data. This flag is ignored for paired-end data."
55-
)
56-
ap.add_argument("--version", "-v", action="version", version="%(prog)s " + __version__)
11+
ap = argparse.ArgumentParser(prog="gffquant", formatter_class=argparse.RawTextHelpFormatter)
12+
ap.add_argument(
13+
"annotation_db", type=str,
14+
help=textwrap.dedent("""\
15+
Path to a text file containing the reference annotation.
16+
The required type of file is determined by the --mode argument (gff3 or tsv)."""
17+
)
18+
)
19+
ap.add_argument(
20+
"bam_file", type=str,
21+
help=textwrap.dedent("""\
22+
Path to a position-sorted bam file. Ambiguous alignments need to be flagged as secondary
23+
alignments with the same read id as their primary alignment.
24+
(e.g. output from BWA mem -a). All alignments of an ambiguous group need to have MAPQ=0."""
25+
)
26+
)
27+
ap.add_argument(
28+
"--mode", "-m", type=str, default="genome", choices=("genome", "genes", "gene", "domain"),
29+
help=textwrap.dedent("""\
30+
Run mode:
31+
- 'genome' counts reads aligned against contigs, which are annotated with a gff3 file.
32+
The gff3 needs to have been indexed with gffindex prior to the run.
33+
- 'gene' counts reads aligned against gene sequences, which are annotated with a tab-separated file.
34+
- 'genes' is an alias for the 'gene' mode
35+
- 'domain' counts reads against domain annotations within gene sequences, which are annotated with a bed4 file."""
36+
)
37+
)
38+
ap.add_argument(
39+
"--out_prefix", "-o", type=str, default="gffquant",
40+
help="Prefix for output files."
41+
)
42+
ap.add_argument(
43+
"--ambig_mode", type=str, choices=("unique_only", "all1", "primary_only", "1overN"), default="unique_only",
44+
help=textwrap.dedent("""\
45+
Setting how ambiguous alignments should be treated. This setting mimics NGLess' behaviour.
46+
- 'unique_only' ignores any alignment flagged as ambiguous (MAPQ=0). This is the default setting.
47+
- 'all1' treats each alignment as unique (each ambiguous alignment contributes 1 count to features it aligns to.)
48+
- 'primary_only' takes the unique alignments and the primary alignment of each ambiguous read group.
49+
- '1overN' each alignment contributes 1/(n=number of ambiguous alignments of the same read) counts to features it aligns to."""
50+
)
51+
)
52+
ap.add_argument(
53+
"--strand_specific", action="store_true",
54+
help="Perform strand-specific counting for RNAseq reads. This currently only works for single-end data. This flag is ignored for paired-end data."
55+
)
56+
ap.add_argument("--version", "-v", action="version", version="%(prog)s " + __version__)
5757

5858

59-
args = ap.parse_args()
59+
args = ap.parse_args()
6060

61-
print("Version:", __version__)
62-
print("Command:", os.path.basename(sys.argv[0]), *sys.argv[1:])
61+
print("Version:", __version__)
62+
print("Command:", os.path.basename(sys.argv[0]), *sys.argv[1:])
6363

64-
if not os.path.exists(args.bam_file):
65-
raise ValueError("bam file does not exist", args.bam_file)
66-
if not os.path.exists(args.annotation_db):
67-
raise ValueError("annotation database does not exist", args.annotation_db)
64+
if not os.path.exists(args.bam_file):
65+
raise ValueError("bam file does not exist", args.bam_file)
66+
if not os.path.exists(args.annotation_db):
67+
raise ValueError("annotation database does not exist", args.annotation_db)
6868

69-
db_index = None
70-
if args.mode == "genome":
71-
db_index = args.annotation_db + ".index"
72-
if not os.path.exists(db_index):
73-
raise ValueError("gff index '{}' does not exist (please generate index with 'gffindex {}')".format(db_index, args.annotation_db))
69+
db_index = None
70+
if args.mode == "genome":
71+
db_index = args.annotation_db + ".index"
72+
if not os.path.exists(db_index):
73+
raise ValueError("gff index '{}' does not exist (please generate index with 'gffindex {}')".format(db_index, args.annotation_db))
7474

75-
if os.path.dirname(args.out_prefix):
76-
pathlib.Path(os.path.dirname(args.out_prefix)).mkdir(exist_ok=True, parents=True)
75+
if os.path.dirname(args.out_prefix):
76+
pathlib.Path(os.path.dirname(args.out_prefix)).mkdir(exist_ok=True, parents=True)
7777

78-
fq = FeatureQuantifier(
79-
db=args.annotation_db,
80-
db_index=db_index,
81-
out_prefix=args.out_prefix,
82-
ambig_mode=args.ambig_mode,
83-
do_overlap_detection=args.mode in ("genome", "domain"),
84-
strand_specific=args.strand_specific
85-
)
78+
fq = FeatureQuantifier(
79+
db=args.annotation_db,
80+
db_index=db_index,
81+
out_prefix=args.out_prefix,
82+
ambig_mode=args.ambig_mode,
83+
do_overlap_detection=args.mode in ("genome", "domain"),
84+
strand_specific=args.strand_specific
85+
)
8686

87-
fq.process_bamfile(args.bam_file)
87+
fq.process_bamfile(args.bam_file)
8888

8989

9090
if __name__ == "__main__":
91-
main()
91+
main()

gffquant/gff_dbm.py

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -7,72 +7,72 @@
77

88
class GffDatabaseManager:
99

10-
def iterate(self):
11-
header = None
12-
with self.db as db_stream:
13-
for line in db_stream:
14-
line = line.strip()
15-
if line.startswith("#"):
16-
header = line.strip("#").split("\t")
17-
else:
18-
line = line.split("\t")
19-
features = list()
20-
for feat_cat, subfeatures in zip(header[6:], line[6:]):
21-
subfeatures = tuple(sf for sf in subfeatures.strip().split(",") if sf)
22-
if subfeatures:
23-
features.append((feat_cat, subfeatures))
24-
yield line[0], (("strand", None),) + tuple(features)
10+
def iterate(self):
11+
header = None
12+
with self.db as db_stream:
13+
for line in db_stream:
14+
line = line.strip()
15+
if line.startswith("#"):
16+
header = line.strip("#").split("\t")
17+
else:
18+
line = line.split("\t")
19+
features = list()
20+
for feat_cat, subfeatures in zip(header[6:], line[6:]):
21+
subfeatures = tuple(sf for sf in subfeatures.strip().split(",") if sf)
22+
if subfeatures:
23+
features.append((feat_cat, subfeatures))
24+
yield line[0], (("strand", None),) + tuple(features)
2525

26-
def _read_index(self, f):
27-
db_index = dict()
28-
for line in open(f, "rt"):
29-
line = line.strip().split("\t")
30-
db_index.setdefault(line[0], list()).append(list(map(int, line[1:3])))
31-
return db_index
26+
def _read_index(self, f):
27+
db_index = dict()
28+
for line in open(f, "rt"):
29+
line = line.strip().split("\t")
30+
db_index.setdefault(line[0], list()).append(list(map(int, line[1:3])))
31+
return db_index
3232

33-
def __init__(self, db, db_index=None):
34-
gz_magic = b"\x1f\x8b\x08"
35-
gzipped = open(db, "rb").read(3).startswith(gz_magic)
36-
if db_index:
37-
if gzipped:
38-
raise ValueError(f"Database {db} is gzipped. This doesn't work together with an index. Please unzip and re-index.")
39-
_open = open
40-
self.db_index = self._read_index(db_index)
41-
else:
42-
_open = gzip.open if gzipped else open
43-
self.db_index = None
44-
self.db = _open(db, "rt")
33+
def __init__(self, db, db_index=None):
34+
gz_magic = b"\x1f\x8b\x08"
35+
gzipped = open(db, "rb").read(3).startswith(gz_magic)
36+
if db_index:
37+
if gzipped:
38+
raise ValueError(f"Database {db} is gzipped. This doesn't work together with an index. Please unzip and re-index.")
39+
_open = open
40+
self.db_index = self._read_index(db_index)
41+
else:
42+
_open = gzip.open if gzipped else open
43+
self.db_index = None
44+
self.db = _open(db, "rt")
4545

46-
@lru_cache(maxsize=4096)
47-
def _read_data(self, ref_id, include_payload=False):
48-
gff_annotation = dict()
49-
for offset, size in self.db_index.get(ref_id, list()):
50-
self.db.seek(offset)
51-
for line in self.db.read(size).strip("\n").split("\n"):
52-
if not line.startswith("#"):
53-
line = line.strip().split("\t")
54-
features = dict()
55-
if include_payload:
56-
features = (("strand", line[6]),)
57-
features += tuple((item.split("=")[0], tuple(sorted(item.split("=")[1].split(",")))) for item in line[8].strip().split(";"))
58-
key = (line[0], int(line[3]), int(line[4]) + 1)
59-
gff_annotation[key] = features
60-
if not gff_annotation and not include_payload:
61-
print("WARNING: contig {contig} does not have an annotation in the index.".format(contig=ref_id), file=sys.stderr, flush=True)
62-
return gff_annotation
46+
@lru_cache(maxsize=4096)
47+
def _read_data(self, ref_id, include_payload=False):
48+
gff_annotation = dict()
49+
for offset, size in self.db_index.get(ref_id, list()):
50+
self.db.seek(offset)
51+
for line in self.db.read(size).strip("\n").split("\n"):
52+
if not line.startswith("#"):
53+
line = line.strip().split("\t")
54+
features = dict()
55+
if include_payload:
56+
features = (("strand", line[6]),)
57+
features += tuple((item.split("=")[0], tuple(sorted(item.split("=")[1].split(",")))) for item in line[8].strip().split(";"))
58+
key = (line[0], int(line[3]), int(line[4]) + 1)
59+
gff_annotation[key] = features
60+
if not gff_annotation and not include_payload:
61+
print("WARNING: contig {contig} does not have an annotation in the index.".format(contig=ref_id), file=sys.stderr, flush=True)
62+
return gff_annotation
6363

64-
@lru_cache(maxsize=4096)
65-
def _get_tree(self, ref, cache_data=False):
66-
return IntervalTree.from_tuples(sorted([key[1:] for key in self._read_data(ref, include_payload=cache_data)]))
64+
@lru_cache(maxsize=4096)
65+
def _get_tree(self, ref, cache_data=False):
66+
return IntervalTree.from_tuples(sorted([key[1:] for key in self._read_data(ref, include_payload=cache_data)]))
6767

68-
def get_data(self, ref, start, end):
69-
return self._read_data(ref, include_payload=True).get((ref, start, end), dict())
68+
def get_data(self, ref, start, end):
69+
return self._read_data(ref, include_payload=True).get((ref, start, end), dict())
7070

71-
def get_overlaps(self, ref, start, end, cache_data=False):
72-
return self._get_tree(ref, cache_data=cache_data)[start:end]
71+
def get_overlaps(self, ref, start, end, cache_data=False):
72+
return self._get_tree(ref, cache_data=cache_data)[start:end]
7373

74-
def clear_caches(self):
75-
print(self._read_data.cache_info(), flush=True)
76-
self._read_data.cache_clear()
77-
print(self._get_tree.cache_info(), flush=True)
78-
self._get_tree.cache_clear()
74+
def clear_caches(self):
75+
print(self._read_data.cache_info(), flush=True)
76+
self._read_data.cache_clear()
77+
print(self._get_tree.cache_info(), flush=True)
78+
self._get_tree.cache_clear()

gffquant/gff_indexer.py

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,45 +4,45 @@
44
import os
55

66
class GffIndexer:
7-
def __init__(self, gff, overwrite=False):
8-
self._refs = dict()
9-
offset = 0
10-
cur_ref = None
11-
index_fn = gff + ".index"
12-
if os.path.exists(index_fn):
13-
if not overwrite:
14-
raise FileExistsError("Index {fn} already exists. Please use -f/--force option to overwrite.".format(fn=index_fn))
15-
print("--force parameter is set: overwriting existing index {fn}.".format(fn=index_fn))
16-
17-
with open(gff, "rt") as f, open(index_fn, "wt") as index_out:
18-
for line in f:
19-
if not line.startswith("#"):
20-
ref = line.split("\t")[0]
21-
if ref != cur_ref:
22-
#print(self._refs)
23-
if cur_ref is not None:
24-
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
25-
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
26-
cur_ref = ref
27-
self._refs.setdefault(cur_ref, list()).append([offset, 0])
28-
offset += len(line)
29-
30-
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
31-
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
32-
33-
def get_index(self, out=sys.stdout):
34-
for r, p in self._refs.items():
35-
for pp in p:
36-
print(r, *pp, file=out, sep="\t", flush=True)
7+
def __init__(self, gff, overwrite=False):
8+
self._refs = dict()
9+
offset = 0
10+
cur_ref = None
11+
index_fn = gff + ".index"
12+
if os.path.exists(index_fn):
13+
if not overwrite:
14+
raise FileExistsError("Index {fn} already exists. Please use -f/--force option to overwrite.".format(fn=index_fn))
15+
print("--force parameter is set: overwriting existing index {fn}.".format(fn=index_fn))
16+
17+
with open(gff, "rt") as f, open(index_fn, "wt") as index_out:
18+
for line in f:
19+
if not line.startswith("#"):
20+
ref = line.split("\t")[0]
21+
if ref != cur_ref:
22+
#print(self._refs)
23+
if cur_ref is not None:
24+
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
25+
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
26+
cur_ref = ref
27+
self._refs.setdefault(cur_ref, list()).append([offset, 0])
28+
offset += len(line)
29+
30+
self._refs[cur_ref][-1][1] = offset - self._refs[cur_ref][-1][0]
31+
print(cur_ref, *self._refs[cur_ref][-1], sep="\t", flush=True, file=index_out)
32+
33+
def get_index(self, out=sys.stdout):
34+
for r, p in self._refs.items():
35+
for pp in p:
36+
print(r, *pp, file=out, sep="\t", flush=True)
3737

3838

3939
def main():
40-
ap = argparse.ArgumentParser()
41-
ap.add_argument("gff_file", type=str)
42-
ap.add_argument("--force", "-f", action="store_true", help="Force overwrite of existing index file (input_file.index)")
43-
args = ap.parse_args()
40+
ap = argparse.ArgumentParser()
41+
ap.add_argument("gff_file", type=str)
42+
ap.add_argument("--force", "-f", action="store_true", help="Force overwrite of existing index file (input_file.index)")
43+
args = ap.parse_args()
4444

45-
GffIndexer(args.gff_file, overwrite=args.force)
45+
GffIndexer(args.gff_file, overwrite=args.force)
4646

4747
if __name__ == "__main__":
48-
main()
48+
main()

0 commit comments

Comments
 (0)