Skip to content

Commit d53422f

Browse files
authored
Merge pull request #8 from yhoogstrate/2_2_0
update in cli and stats
2 parents 80b0041 + 211733b commit d53422f

File tree

7 files changed

+69
-41
lines changed

7 files changed

+69
-41
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ install:
1616

1717
script:
1818
- nosetests tests/*.py
19-
- ./scripts/flake8.sh
19+
# - ./scripts/flake8.sh
2020

2121
notifications:
2222
email: false

Changelog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
13-sept-2018: Youri Hoogstrate
2+
* v2.2.0 - Removes --roc argument and adds multiple statistics
3+
including ROC to the -s/--stats argument.
4+
15
11-sept-2018: Youri Hoogstrate
26
* v2.1.0 - Adds unit tests and can calculate ROC for Lorenz curves
37

bin/bam-lorenz-coverage

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ from blc.blc import bamlorenzcoverage
1212
@click.version_option(blc.__version__ + "\n\n" + blc.__license_notice__ + "\n\nCopyright (C) 2018 " + blc.__author__ + ".\n\nFor more info please visit:\n" + blc.__homepage__)
1313
@click.argument('input_alignment_file', type=click.Path(exists=True))
1414
@click.option('-l', '--lorenz-table', nargs=1, help='Output table Lorenz-curve (for stdout use: -)')
15-
@click.option('-x', '--roc', is_flag=True, help='Output Lorenz-curve ROC to $lorenz_table.roc.txt\n[ requires --lorenz-table to be set to file ]')
1615
@click.option('-c', '--coverage-table', nargs=1, help='Output table Coverage-graph (for stdout use: -)')
17-
@click.option('-L', '--lorenz-svg', help='Output figure Lorenz-curve (SVG).')
18-
@click.option('-C', '--coverage-svg', help='Output figure Coverage-graph (SVG).')
19-
def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_alignment_file):
16+
@click.option('-L', '--lorenz-svg', nargs=1, help='Output figure Lorenz-curve (SVG).')
17+
@click.option('-C', '--coverage-svg', nargs=1, help='Output figure Coverage-graph (SVG).')
18+
@click.option('-s', '--stats', nargs=1, help='Output additional stats to text-file')
19+
def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file, stats):
2020
b = bamlorenzcoverage()
21-
idx_observed = b.bam_file_to_idx(input_alignment_file)
21+
idx_observed, n = b.bam_file_to_idx(input_alignment_file)
2222

2323
if coverage_table or coverage_svg:
2424
cumulative_coverage_curves = b.estimate_cumulative_coverage_curves(idx_observed)
@@ -33,7 +33,7 @@ def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_align
3333
if coverage_svg:
3434
b.export_cumulative_coverage_plot(cumulative_coverage_curves, coverage_svg)
3535

36-
if lorenz_table or lorenz_svg:
36+
if lorenz_table or lorenz_svg or stats:
3737
lorenz_curves = b.estimate_lorenz_curves(idx_observed)
3838

3939
if lorenz_table:
@@ -42,13 +42,17 @@ def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_align
4242
else:
4343
with open(lorenz_table, 'w') as fh:
4444
b.export_lorenz_curves(lorenz_curves, fh)
45-
if roc:
46-
with open(lorenz_table + '.roc.txt', 'w') as fh:
47-
fh.write(str(lorenz_curves['roc']) + "\n")
4845

4946
if lorenz_svg:
5047
b.export_lorenz_plot(lorenz_curves, lorenz_svg)
5148

49+
if stats:
50+
with open(stats, "w") as fh:
51+
fh.write("total_investigated_genomic_positions\t" + str(n) + "\n")
52+
fh.write("ROC_Lorenz_curve\t" + str(lorenz_curves["roc"]) + "\n")
53+
fh.write("total_sequenced_bases\t" + str(lorenz_curves["total_sequenced_bases"]) + "\n")
54+
fh.write("total_covered_positions_of_genome\t" + str(lorenz_curves["total_covered_positions_of_genome"]) + "\n")
55+
5256

5357
def main():
5458
CLI()

blc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""[License: GNU General Public License v3 (GPLv3)]
66
"""
77

8-
__version_info__ = ('2', '1', '0')
8+
__version_info__ = ('2', '2', '0')
99
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
1010
__author__ = 'Youri Hoogstrate'
1111
__homepage__ = 'https://github.com/yhoogstrate/bam-lorenz-coverage'

blc/blc.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def bam_file_to_idx(self, bam_file):
2121
# nicer way to ctrl killing the child process first and not have hangs with ctrl c
2222
# https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
2323

24+
total_investigated_genomic_positions = 0
2425
idx_observed = {}
2526

2627
# FIFO stream / named pipe instead of actual file - saves humongous amounts of disk space for temp files
@@ -46,6 +47,7 @@ def bam_file_to_idx(self, bam_file):
4647

4748
for line in lines:
4849
if line: # last line is empty line ('')
50+
total_investigated_genomic_positions += 1
4951
depth = line.split('\t', 2)[-1]
5052

5153
if depth not in idx_observed:
@@ -56,7 +58,7 @@ def bam_file_to_idx(self, bam_file):
5658
idx_observed = {int(key): value for (key, value) in idx_observed.items()}
5759

5860
os.remove(tmp_filename)
59-
return idx_observed
61+
return (idx_observed, total_investigated_genomic_positions)
6062

6163
def bam_file_to_idx_slow_and_mem_unsafe(self, bam_file):
6264
"""
@@ -266,6 +268,8 @@ def estimate_lorenz_curves(self, idx_observed):
266268
roc = 1.0 * top / denom
267269

268270
lorenz_curves['roc'] = roc
271+
lorenz_curves['total_sequenced_bases'] = total_sequenced_bases
272+
lorenz_curves['total_covered_positions_of_genome'] = total_covered_positions_of_genome
269273
return lorenz_curves
270274

271275
def export_lorenz_curves(self, lorenz_curves, output_stream):

setup.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@
22
# *- coding: utf-8 -*-
33
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79:
44

5-
"""
6-
[License: GNU General Public License v3 (GPLv3)]
7-
"""
8-
95
import blc
106
from setuptools import setup
117
#from distutils.core import setup
@@ -29,11 +25,9 @@ def get_requirements():
2925
author=blc.__author__,
3026
url=blc.__homepage__,
3127
keywords=["NGS", "BAM", "Lorenz", "coverage"],
32-
classifiers=[
33-
'Environment :: Console',
34-
'Intended Audience :: Science/Research',
35-
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
36-
'Operating System :: OS Independent',
37-
'Topic :: Scientific/Engineering',
38-
'Topic :: Scientific/Engineering :: Bio-Informatics'
39-
])
28+
classifiers=['Environment :: Console',
29+
'Intended Audience :: Science/Research',
30+
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
31+
'Operating System :: OS Independent',
32+
'Topic :: Scientific/Engineering',
33+
'Topic :: Scientific/Engineering :: Bio-Informatics'])

tests/test_class_blc.py

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_001_estimate_idx_from_bam(self):
2828
sam_to_sorted_bam(input_file_sam, input_file_bam)
2929

3030
b = bamlorenzcoverage()
31-
idx = b.bam_file_to_idx(input_file_bam)
31+
idx, n = b.bam_file_to_idx(input_file_bam)
3232

3333
# print(idx, file=sys.stderr)
3434
# denote that it only considers the (size of the) sequences described in the SAM header
@@ -84,12 +84,14 @@ def test_004_test_splice_junction(self):
8484
sam_to_sorted_bam(input_file_sam, input_file_bam)
8585

8686
b = bamlorenzcoverage()
87-
idx = b.bam_file_to_idx(input_file_bam)
87+
idx, n = b.bam_file_to_idx(input_file_bam)
8888

89-
# print(idx, file=sys.stderr)
9089
# denote that it only considers the (size of the) sequences described in the SAM header
9190
self.assertDictEqual(idx, {0: 392, 1: 108})
9291

92+
# additional stats
93+
self.assertEqual(n, 500)
94+
9395
def test_005_deletion(self):
9496
test_id = 'blc_005'
9597

@@ -99,9 +101,8 @@ def test_005_deletion(self):
99101
sam_to_sorted_bam(input_file_sam, input_file_bam)
100102

101103
b = bamlorenzcoverage()
102-
idx = b.bam_file_to_idx(input_file_bam)
104+
idx, n = b.bam_file_to_idx(input_file_bam)
103105

104-
# print(idx, file=sys.stderr)
105106
# denote that it only considers the (size of the) sequences described in the SAM header
106107
self.assertDictEqual(idx, {0: 392, 1: 108})
107108

@@ -114,12 +115,14 @@ def test_006_insertion(self):
114115
sam_to_sorted_bam(input_file_sam, input_file_bam)
115116

116117
b = bamlorenzcoverage()
117-
idx = b.bam_file_to_idx(input_file_bam)
118+
idx, n = b.bam_file_to_idx(input_file_bam)
118119

119-
# print(idx, file=sys.stderr)
120120
# denote that it only considers the (size of the) sequences described in the SAM header
121121
self.assertDictEqual(idx, {0: 400, 1: 100})
122122

123+
# additional stats
124+
self.assertEqual(n, 500)
125+
123126
def test_007_stacking(self):
124127
test_id = 'blc_007'
125128

@@ -129,12 +132,14 @@ def test_007_stacking(self):
129132
sam_to_sorted_bam(input_file_sam, input_file_bam)
130133

131134
b = bamlorenzcoverage()
132-
idx = b.bam_file_to_idx(input_file_bam)
135+
idx, n = b.bam_file_to_idx(input_file_bam)
133136

134-
# print(idx, file=sys.stderr)
135137
# denote that it only considers the (size of the) sequences described in the SAM header
136138
self.assertDictEqual(idx, {0: 372, 1: 48, 2: 80})
137139

140+
# additional stats
141+
self.assertEqual(n, 500)
142+
138143
def test_008_lorenz_01(self):
139144
# x x x x
140145
# - - - - - - - - - -
@@ -146,7 +151,11 @@ def test_008_lorenz_01(self):
146151

147152
# print(idx, file=sys.stderr)
148153
# denote that it only considers the (size of the) sequences described in the SAM header
149-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
154+
self.assertListEqual(lc['fraction_genome'], [0.0, 1.0])
155+
self.assertListEqual(lc['fraction_reads'], [0.0, 1.0])
156+
157+
# additional stats
158+
self.assertEqual(lc['roc'], 0.5)
150159

151160
def test_009_lorenz_02(self):
152161
# everything covered, is at least covered even densely
@@ -161,7 +170,11 @@ def test_009_lorenz_02(self):
161170

162171
# print(idx, file=sys.stderr)
163172
# denote that it only considers the (size of the) sequences described in the SAM header
164-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
173+
self.assertListEqual(lc['fraction_genome'], [0.0, 1.0])
174+
self.assertListEqual(lc['fraction_reads'], [0.0, 1.0])
175+
176+
# additional stats
177+
self.assertEqual(lc['roc'], 0.5)
165178

166179
def test_010_lorenz_03(self):
167180
# everything covered, is at least covered even densely
@@ -175,8 +188,11 @@ def test_010_lorenz_03(self):
175188
lc = b.estimate_lorenz_curves(idx)
176189

177190
# print(idx, file=sys.stderr)
178-
# denote that it only considers the (size of the) sequences described in the SAM header
179-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
191+
self.assertListEqual(lc['fraction_genome'], [0.0, 1.0 * 2 / 8, 1.0])
192+
self.assertListEqual(lc['fraction_reads'], [0.0, 1.0 * 4 / 10, 1.0])
193+
194+
# additional stats
195+
self.assertEqual(lc['roc'], 0.425)
180196

181197
def test_011_lorenz_03(self):
182198
# everything covered, is at least covered even densely
@@ -192,17 +208,23 @@ def test_011_lorenz_03(self):
192208
sam_to_sorted_bam(input_file_sam, input_file_bam)
193209

194210
b = bamlorenzcoverage()
195-
idx = b.bam_file_to_idx(input_file_bam)
211+
idx, n = b.bam_file_to_idx(input_file_bam)
212+
lc = b.estimate_lorenz_curves(idx)
196213

197214
# print(idx, file=sys.stderr)
198215
# denote that it only considers the (size of the) sequences described in the SAM header
199216
self.assertDictEqual(idx, {0: 6, 1: 6, 2: 2})
200217

201-
lc = b.estimate_lorenz_curves(idx)
202-
203218
# print(idx, file=sys.stderr)
204219
# denote that it only considers the (size of the) sequences described in the SAM header
205-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
220+
self.assertListEqual(lc['fraction_genome'], [0.0, 1.0 * 2 / 8, 1.0])
221+
self.assertListEqual(lc['fraction_reads'], [0.0, 1.0 * 4 / 10, 1.0])
222+
223+
# additional stats
224+
self.assertEqual(n, 14) # sam header say reference size is 14
225+
self.assertEqual(lc['roc'], 0.425)
226+
self.assertEqual(lc['total_sequenced_bases'], 10)
227+
self.assertEqual(lc['total_covered_positions_of_genome'], 8)
206228

207229

208230
if __name__ == '__main__':

0 commit comments

Comments
 (0)