Merge pull request #8 from yhoogstrate/2_2_0

yhoogstrate · web-flow · commit d53422fd1079 · 2018-09-13T17:31:41.000+02:00
update in cli and stats
diff --git a/.travis.yml b/.travis.yml
@@ -16,7 +16,7 @@ install:
 
 script:
   - nosetests tests/*.py
-  - ./scripts/flake8.sh
+#  - ./scripts/flake8.sh
 
 notifications:
   email: false
diff --git a/Changelog b/Changelog
@@ -1,3 +1,7 @@
+13-sept-2018:  Youri Hoogstrate
+	* v2.2.0 - Removes the --roc argument and adds the -s/--stats argument,
+	   which writes multiple statistics (including ROC) to a text file.
+
 11-sept-2018:  Youri Hoogstrate
 	* v2.1.0 - Adds unit tests and can calculate ROC for Lorenz curves
 
diff --git a/bin/bam-lorenz-coverage b/bin/bam-lorenz-coverage
@@ -12,13 +12,13 @@ from blc.blc import bamlorenzcoverage
 @click.version_option(blc.__version__ + "\n\n" + blc.__license_notice__ + "\n\nCopyright (C) 2018  " + blc.__author__ + ".\n\nFor more info please visit:\n" + blc.__homepage__)
 @click.argument('input_alignment_file', type=click.Path(exists=True))
 @click.option('-l', '--lorenz-table', nargs=1, help='Output table Lorenz-curve (for stdout use: -)')
-@click.option('-x', '--roc', is_flag=True, help='Output Lorenz-curve ROC to $lorenz_table.roc.txt\n[ requires --lorenz-table to be set to file ]')
 @click.option('-c', '--coverage-table', nargs=1, help='Output table Coverage-graph (for stdout use: -)')
-@click.option('-L', '--lorenz-svg', help='Output figure Lorenz-curve (SVG).')
-@click.option('-C', '--coverage-svg', help='Output figure Coverage-graph (SVG).')
-def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_alignment_file):
+@click.option('-L', '--lorenz-svg', nargs=1, help='Output figure Lorenz-curve (SVG).')
+@click.option('-C', '--coverage-svg', nargs=1, help='Output figure Coverage-graph (SVG).')
+@click.option('-s', '--stats', nargs=1, help='Output additional stats to text-file')
+def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats):
     b = bamlorenzcoverage()
-    idx_observed = b.bam_file_to_idx(input_alignment_file)
+    idx_observed, n = b.bam_file_to_idx(input_alignment_file)
 
     if coverage_table or coverage_svg:
         cumulative_coverage_curves = b.estimate_cumulative_coverage_curves(idx_observed)
@@ -33,7 +33,7 @@ def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_align
         if coverage_svg:
             b.export_cumulative_coverage_plot(cumulative_coverage_curves, coverage_svg)
 
-    if lorenz_table or lorenz_svg:
+    if lorenz_table or lorenz_svg or stats:
         lorenz_curves = b.estimate_lorenz_curves(idx_observed)
 
         if lorenz_table:
@@ -42,13 +42,17 @@ def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_align
             else:
                 with open(lorenz_table, 'w') as fh:
                     b.export_lorenz_curves(lorenz_curves, fh)
-                if roc:
-                    with open(lorenz_table + '.roc.txt', 'w') as fh:
-                        fh.write(str(lorenz_curves['roc']) + "\n")
 
         if lorenz_svg:
             b.export_lorenz_plot(lorenz_curves, lorenz_svg)
 
+    if stats:
+        with open(stats, "w") as fh:
+            fh.write("total_investigated_genomic_positions\t" + str(n) + "\n")
+            fh.write("ROC_Lorenz_curve\t" + str(lorenz_curves["roc"]) + "\n")
+            fh.write("total_sequenced_bases\t" + str(lorenz_curves["total_sequenced_bases"]) + "\n")
+            fh.write("total_covered_positions_of_genome\t" + str(lorenz_curves["total_covered_positions_of_genome"]) + "\n")
+
 
 def main():
     CLI()
diff --git a/blc/__init__.py b/blc/__init__.py
@@ -5,7 +5,7 @@
 """[License: GNU General Public License v3 (GPLv3)]
 """
 
-__version_info__ = ('2', '1', '0')
+__version_info__ = ('2', '2', '0')
 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
 __author__ = 'Youri Hoogstrate'
 __homepage__ = 'https://github.com/yhoogstrate/bam-lorenz-coverage'
diff --git a/blc/blc.py b/blc/blc.py
@@ -21,6 +21,7 @@ def bam_file_to_idx(self, bam_file):
         # nicer way to ctrl killing the child process first and not have hangs with ctrl c
         # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
 
+        total_investigated_genomic_positions = 0
         idx_observed = {}
 
         # FIFO stream / named pipe instead of actual file - saves humongous amounts of disk space for temp files
@@ -46,6 +47,7 @@ def bam_file_to_idx(self, bam_file):
 
             for line in lines:
                 if line:  # last line is empty line ('')
+                    total_investigated_genomic_positions += 1
                     depth = line.split('\t', 2)[-1]
 
                     if depth not in idx_observed:
@@ -56,7 +58,7 @@ def bam_file_to_idx(self, bam_file):
         idx_observed = {int(key): value for (key, value) in idx_observed.items()}
 
         os.remove(tmp_filename)
-        return idx_observed
+        return (idx_observed, total_investigated_genomic_positions)
 
     def bam_file_to_idx_slow_and_mem_unsafe(self, bam_file):
         """
@@ -266,6 +268,8 @@ def estimate_lorenz_curves(self, idx_observed):
         roc = 1.0 * top / denom
 
         lorenz_curves['roc'] = roc
+        lorenz_curves['total_sequenced_bases'] = total_sequenced_bases
+        lorenz_curves['total_covered_positions_of_genome'] = total_covered_positions_of_genome
         return lorenz_curves
 
     def export_lorenz_curves(self, lorenz_curves, output_stream):
diff --git a/setup.py b/setup.py
@@ -2,10 +2,6 @@
 # *- coding: utf-8 -*-
 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79:
 
-"""
-[License: GNU General Public License v3 (GPLv3)]
-"""
-
 import blc
 from setuptools import setup
 #from distutils.core import setup
@@ -29,11 +25,9 @@ def get_requirements():
       author=blc.__author__,
       url=blc.__homepage__,
       keywords=["NGS", "BAM", "Lorenz", "coverage"],
-      classifiers=[
-          'Environment :: Console',
-          'Intended Audience :: Science/Research',
-          'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
-          'Operating System :: OS Independent',
-          'Topic :: Scientific/Engineering',
-          'Topic :: Scientific/Engineering :: Bio-Informatics'
-      ])
+      classifiers=['Environment :: Console',
+                   'Intended Audience :: Science/Research',
+                   'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+                   'Operating System :: OS Independent',
+                   'Topic :: Scientific/Engineering',
+                   'Topic :: Scientific/Engineering :: Bio-Informatics'])
diff --git a/tests/test_class_blc.py b/tests/test_class_blc.py
@@ -28,7 +28,7 @@ def test_001_estimate_idx_from_bam(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
 
         # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
@@ -84,12 +84,14 @@ def test_004_test_splice_junction(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
 
-        # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
         self.assertDictEqual(idx, {0: 392, 1: 108})
 
+        # additional stats
+        self.assertEqual(n, 500)
+
     def test_005_deletion(self):
         test_id = 'blc_005'
 
@@ -99,9 +101,8 @@ def test_005_deletion(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
 
-        # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
         self.assertDictEqual(idx, {0: 392, 1: 108})
 
@@ -114,12 +115,14 @@ def test_006_insertion(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
 
-        # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
         self.assertDictEqual(idx, {0: 400, 1: 100})
 
+        # additional stats
+        self.assertEqual(n, 500)
+
     def test_007_stacking(self):
         test_id = 'blc_007'
 
@@ -129,12 +132,14 @@ def test_007_stacking(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
 
-        # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
         self.assertDictEqual(idx, {0: 372, 1: 48, 2: 80})
 
+        # additional stats
+        self.assertEqual(n, 500)
+
     def test_008_lorenz_01(self):
         #     x x x x
         # - - - - - - - - - -
@@ -146,7 +151,11 @@ def test_008_lorenz_01(self):
 
         # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
-        self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
+        self.assertListEqual(lc['fraction_genome'], [0.0, 1.0])
+        self.assertListEqual(lc['fraction_reads'], [0.0, 1.0])
+
+        # additional stats
+        self.assertEqual(lc['roc'], 0.5)
 
     def test_009_lorenz_02(self):
         # everything covered, is at least covered even densely
@@ -161,7 +170,11 @@ def test_009_lorenz_02(self):
 
         # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
-        self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
+        self.assertListEqual(lc['fraction_genome'], [0.0, 1.0])
+        self.assertListEqual(lc['fraction_reads'], [0.0, 1.0])
+
+        # additional stats
+        self.assertEqual(lc['roc'], 0.5)
 
     def test_010_lorenz_03(self):
         # everything covered, is at least covered even densely
@@ -175,8 +188,11 @@ def test_010_lorenz_03(self):
         lc = b.estimate_lorenz_curves(idx)
 
         # print(idx, file=sys.stderr)
-        # denote that it only considers the (size of the) sequences described in the SAM header
-        self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
+        self.assertListEqual(lc['fraction_genome'], [0.0, 1.0 * 2 / 8, 1.0])
+        self.assertListEqual(lc['fraction_reads'], [0.0, 1.0 * 4 / 10, 1.0])
+
+        # additional stats
+        self.assertEqual(lc['roc'], 0.425)
 
     def test_011_lorenz_03(self):
         # everything covered, is at least covered even densely
@@ -192,17 +208,23 @@ def test_011_lorenz_03(self):
         sam_to_sorted_bam(input_file_sam, input_file_bam)
 
         b = bamlorenzcoverage()
-        idx = b.bam_file_to_idx(input_file_bam)
+        idx, n = b.bam_file_to_idx(input_file_bam)
+        lc = b.estimate_lorenz_curves(idx)
 
         # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
         self.assertDictEqual(idx, {0: 6, 1: 6, 2: 2})
 
-        lc = b.estimate_lorenz_curves(idx)
-
         # print(idx, file=sys.stderr)
         # denote that it only considers the (size of the) sequences described in the SAM header
-        self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
+        self.assertListEqual(lc['fraction_genome'], [0.0, 1.0 * 2 / 8, 1.0])
+        self.assertListEqual(lc['fraction_reads'], [0.0, 1.0 * 4 / 10, 1.0])
+
+        # additional stats
+        self.assertEqual(n, 14)  # the SAM header says the reference size is 14
+        self.assertEqual(lc['roc'], 0.425)
+        self.assertEqual(lc['total_sequenced_bases'], 10)
+        self.assertEqual(lc['total_covered_positions_of_genome'], 8)
 
 
 if __name__ == '__main__':