Skip to content

Commit 9f59e21

Browse files
authored
Merge pull request #126 from Aratz/DATAOPS-1021_small_fixes
Small fixes to new parsers and checkers
2 parents 5fe1aa9 + 70954d4 commit 9f59e21

File tree

8 files changed

+248
-80
lines changed

8 files changed

+248
-80
lines changed

checkQC/parsers/illumina.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,7 @@ def from_bclconvert(cls, runfolder_path, parser_config):
2222
/ parser_config["reports_location"]
2323
/ "Top_Unknown_Barcodes.csv"
2424
)
25-
samplesheet = _read_samplesheet(runfolder_path)["BCLConvert_Data"]
26-
for row in samplesheet:
27-
row["Index"] = row["Index"].replace(" ", "")
28-
row["Index2"] = row["Index2"].replace(" ", "")
25+
samplesheet = _read_samplesheet(runfolder_path)
2926

3027
instrument, read_length = _read_run_metadata(runfolder_path)
3128

@@ -87,6 +84,11 @@ def from_bclconvert(cls, runfolder_path, parser_config):
8784

8885

8986
def _read_interop_summary(runfolder_path):
87+
"""
88+
Read interop files and return interop objects for run_summary and index
89+
summary.
90+
"""
91+
9092
runfolder_path = str(runfolder_path) # interop does not handle Path objects
9193

9294
run_info = interop.py_interop_run.info()
@@ -105,25 +107,53 @@ def _read_interop_summary(runfolder_path):
105107

106108

107109
def _read_quality_metrics(quality_metrics_path):
110+
"""
111+
Read quality metrics file
112+
"""
108113
with open(quality_metrics_path, encoding="utf-8") as csvfile:
109114
return list(csv.DictReader(csvfile))
110115

111116

112117
def _read_top_unknown_barcodes(top_unknown_barcodes_path):
118+
"""
119+
Read top unknown barcodes file
120+
"""
113121
with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
114-
return list(csv.DictReader(csvfile))
122+
return list(csv.DictReader(csvfile))
115123

116124

117-
# TODO add docs
118125
def _read_run_metadata(runfolder_path):
126+
"""
127+
Read instrument, reagent and read_length
128+
"""
119129
run_type_recognizer = RunTypeRecognizer(runfolder_path)
120130

121131
return (
122132
run_type_recognizer.instrument_and_reagent_version(),
123-
int(run_type_recognizer.read_length()),
133+
# NOTE: read length can be either "151" or "151-151" in case of paired
134+
# reads. For now, only symmetric read length is supported
135+
# see checkQC/app.py#L159
136+
int(run_type_recognizer.read_length().split("-")[0]),
124137
)
125138

126139

127-
# TODO reorder helper functions
128140
def _read_samplesheet(runfolder_path):
129-
return read_sectionedsheet(runfolder_path / "SampleSheet.csv")
141+
"""
142+
Parse `BCLConvert_Data` section of samplesheet
143+
144+
NOTE: column names can sometimes start with an uppercase letter, so all keys are lowercased.
145+
"""
146+
samplesheet = read_sectionedsheet(
147+
runfolder_path / "SampleSheet.csv")["BCLConvert_Data"]
148+
149+
for i, row in enumerate(samplesheet):
150+
samplesheet[i] = {
151+
key.lower(): value
152+
for key, value in row.items()
153+
}
154+
155+
for row in samplesheet:
156+
row["index"] = row["index"].replace(" ", "")
157+
row["index2"] = row["index2"].replace(" ", "")
158+
159+
return samplesheet

checkQC/qc_checkers/unidentified_index.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,16 @@ def __init__(self, samplesheet):
9898
self.samplesheet_single_indices = {}
9999
self.samplesheet_dual_indices = {}
100100
for row in samplesheet:
101-
if row.get("Index2"):
101+
if row.get("index2"):
102102
self.samplesheet_dual_indices.setdefault(
103-
row["Index"], []).append(row)
103+
row["index"], []).append(row)
104104
self.samplesheet_dual_indices.setdefault(
105-
row["Index2"], []).append(row)
105+
row["index2"], []).append(row)
106106
self.samplesheet_dual_indices.setdefault(
107-
f"{row['Index']}+{row['Index2']}", []).append(row)
107+
f"{row['index']}+{row['index2']}", []).append(row)
108108
else:
109109
self.samplesheet_single_indices.setdefault(
110-
row["Index"], []).append(row)
110+
row["index"], []).append(row)
111111

112112
def list_causes(self, barcode_data):
113113
"""

checkQC/qc_reporter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def _select_configs(
7878

7979
def _select_read_len(self, qc_data, use_closest_read_len):
8080
def dist(read_len):
81-
if mtch := re.match(r"(\d+)-(\d+)", read_len):
81+
if mtch := re.match(r"(\d+)-(\d+)", str(read_len)):
8282
low, high = (int(n) for n in mtch.groups())
8383
return (
8484
0
@@ -96,7 +96,7 @@ def dist(read_len):
9696
if not use_closest_read_len and dist(best_match_read_len) > 0:
9797
raise KeyError(
9898
f"No config entry matching read length {qc_data.read_length}"
99-
f"found for instrument {qc_data.instrument}."
99+
f" found for instrument {qc_data.instrument}."
100100
)
101101

102102
return best_match_read_len
@@ -109,7 +109,7 @@ def _get_checker_configs(
109109
):
110110
checker_configs = {
111111
checker_config["name"]: {
112-
f"{k}_threshold" if k in ["error", "warning"] else k: v
112+
(f"{k}_threshold" if k in ["error", "warning"] else k): v
113113
for k, v in checker_config.items()
114114
if k != "name"
115115
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
from pathlib import Path
2+
3+
import pytest
4+
5+
from checkQC.parsers.illumina import (
6+
_read_interop_summary,
7+
_read_quality_metrics,
8+
_read_top_unknown_barcodes,
9+
_read_run_metadata,
10+
_read_samplesheet,
11+
)
12+
13+
14+
@pytest.fixture
15+
def runfolder_path():
16+
return (
17+
Path(__file__).parent.parent
18+
/ "resources/bclconvert/200624_A00834_0183_BHMTFYTINY"
19+
)
20+
21+
22+
def test_read_interop_summary(runfolder_path):
23+
run_summary, index_summary = _read_interop_summary(runfolder_path)
24+
25+
total_cluster_pf = run_summary.at(0).at(0).reads_pf()
26+
assert total_cluster_pf == 532464327
27+
28+
sample_id = index_summary.at(0).at(0).sample_id()
29+
assert sample_id == "Sample_14574-Qiagen-IndexSet1-SP-Lane1"
30+
31+
32+
def test_read_quality_metrics(runfolder_path):
33+
quality_metrics = _read_quality_metrics(
34+
runfolder_path / "Reports/Quality_Metrics.csv")
35+
36+
assert len(quality_metrics) == 6
37+
assert quality_metrics[0] == {
38+
'% Q30': '0.96',
39+
'Lane': '1',
40+
'Mean Quality Score (PF)': '36.37',
41+
'QualityScoreSum': '12987964',
42+
'ReadNumber': '1',
43+
'SampleID': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1',
44+
'Sample_Project': 'AB-1234',
45+
'Yield': '357120',
46+
'YieldQ30': '342989',
47+
'index': 'GAACTGAGCG',
48+
'index2': 'TCGTGGAGCG',
49+
}
50+
51+
52+
def test_read_to_unknown_barcodes(runfolder_path):
53+
top_unknown_barcodes = _read_top_unknown_barcodes(
54+
runfolder_path / "Reports/Top_Unknown_Barcodes.csv")
55+
56+
assert len(top_unknown_barcodes) == 2084
57+
assert top_unknown_barcodes[:3] == [
58+
{
59+
'# Reads': '12857',
60+
'% of All Reads': '0.003775',
61+
'% of Unknown Barcodes': '0.003796',
62+
'Lane': '1',
63+
'index': 'ATATCTGCTT',
64+
'index2': 'TAGACAATCT',
65+
},
66+
{
67+
'# Reads': '12406',
68+
'% of All Reads': '0.003643',
69+
'% of Unknown Barcodes': '0.003663',
70+
'Lane': '1',
71+
'index': 'CACCTCTCTT',
72+
'index2': 'CTCGACTCCT',
73+
},
74+
{
75+
'# Reads': '12177',
76+
'% of All Reads': '0.003575',
77+
'% of Unknown Barcodes': '0.003595',
78+
'Lane': '1',
79+
'index': 'ATGTAACGTT',
80+
'index2': 'ACGATTGCTG',
81+
},
82+
]
83+
84+
85+
# TODO add tests with paired end reads
86+
def test_read_run_metadata(runfolder_path):
87+
instrument, read_length = _read_run_metadata(runfolder_path)
88+
assert instrument == "novaseq_SP"
89+
assert read_length == 36
90+
91+
92+
def test_read_samplesheet(runfolder_path):
93+
samplesheet = _read_samplesheet(runfolder_path)
94+
95+
assert len(samplesheet) == 4
96+
assert all(
97+
key.lower() == key
98+
for row in samplesheet
99+
for key in row
100+
)
101+
assert all(
102+
" " not in row["index"] and " " not in row["index2"]
103+
for row in samplesheet
104+
)
105+
assert samplesheet[0] == {
106+
'index': 'GAACTGAGCG',
107+
'index2': 'TCGTGGAGCG',
108+
'lane': 1,
109+
'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1',
110+
'sample_project': 'AB-1234',
111+
}

tests/qc_checkers/test_unidentified_index.py

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77
@pytest.fixture
88
def samplesheet_matcher():
99
return SamplesheetMatcher([
10-
{"Index": "CCAA", "Index2": "AGCA", "Lane": 1, "Sample_ID": "dual reverse"},
11-
{"Index": "GGTT", "Index2": "TCGT", "Lane": 2, "Sample_ID": "dual reverse complement"},
12-
{"Index": "TGCT", "Index2": "TTGG", "Lane": 3, "Sample_ID": "dual complement"},
13-
{"Index": "AAAT", "Index2": "ATAT", "Lane": 1, "Sample_ID": "sample_id"},
14-
{"Index": "CAAT", "Index2": "CTAT", "Lane": 1, "Sample_ID": "sample_id"},
15-
{"Index": "TCCA", "Index2": "", "Lane": 1, "Sample_ID": "reverse"},
16-
{"Index": "AGGT", "Index2": "", "Lane": 1, "Sample_ID": "reverse complement"},
17-
{"Index": "TGGA", "Index2": "", "Lane": 1, "Sample_ID": "complement"},
18-
{"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"},
10+
{"index": "CCAA", "index2": "AGCA", "Lane": 1, "Sample_ID": "dual reverse"},
11+
{"index": "GGTT", "index2": "TCGT", "Lane": 2, "Sample_ID": "dual reverse complement"},
12+
{"index": "TGCT", "index2": "TTGG", "Lane": 3, "Sample_ID": "dual complement"},
13+
{"index": "AAAT", "index2": "ATAT", "Lane": 1, "Sample_ID": "sample_id"},
14+
{"index": "CAAT", "index2": "CTAT", "Lane": 1, "Sample_ID": "sample_id"},
15+
{"index": "TCCA", "index2": "", "Lane": 1, "Sample_ID": "reverse"},
16+
{"index": "AGGT", "index2": "", "Lane": 1, "Sample_ID": "reverse complement"},
17+
{"index": "TGGA", "index2": "", "Lane": 1, "Sample_ID": "complement"},
18+
{"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"},
1919
])
2020

2121

@@ -35,7 +35,7 @@ def test_check_complement(samplesheet_matcher):
3535
assert msg == "complement index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
3636
assert data == (
3737
"complement",
38-
{"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
38+
{"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
3939
)
4040

4141

@@ -53,7 +53,7 @@ def test_check_reverse(samplesheet_matcher):
5353
assert msg == "reverse index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
5454
assert data == (
5555
"reverse",
56-
{"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
56+
{"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
5757
)
5858

5959

@@ -71,7 +71,7 @@ def test_check_reverse_complement(samplesheet_matcher):
7171
assert msg == "reverse complement index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
7272
assert data == (
7373
"reverse complement",
74-
{"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
74+
{"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
7575
)
7676

7777

@@ -132,8 +132,8 @@ def test_lane_swap(samplesheet_matcher):
132132
assert data == (
133133
"lane swap",
134134
{
135-
"Index": "CCAA",
136-
"Index2": "AGCA",
135+
"index": "CCAA",
136+
"index2": "AGCA",
137137
"Lane": 1,
138138
"Sample_ID": "dual reverse",
139139
}
@@ -152,8 +152,8 @@ def test_dual_index_swap(samplesheet_matcher):
152152
assert data == (
153153
"dual index swap",
154154
{
155-
"Index": "CCAA",
156-
"Index2": "AGCA",
155+
"index": "CCAA",
156+
"index2": "AGCA",
157157
"Lane": 1,
158158
"Sample_ID": "dual reverse",
159159
}
@@ -173,8 +173,8 @@ def qc_data():
173173
}
174174
},
175175
[
176-
{"Index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"},
177-
{"Index": "TCCA", "Lane": 1, "Sample_ID": "reverse"},
176+
{"index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"},
177+
{"index": "TCCA", "Lane": 1, "Sample_ID": "reverse"},
178178
]
179179
)
180180

@@ -189,6 +189,21 @@ def test_unidentified_index(qc_data):
189189
- reverse index swap: "TCCA" found in samplesheet for sample "reverse", lane 1
190190
- lane swap: index "ACCT" found in samplesheet for sample "lane swap", lane 2"""
191191
assert reports[0].type() == "error"
192+
assert reports[0].data == {
193+
"barcode": {
194+
"count": 10,
195+
"index": "ACCT",
196+
"lane": 1,
197+
},
198+
"causes": [
199+
("reverse", {"index": "TCCA", "Lane": 1, "Sample_ID": "reverse"}),
200+
("lane swap", {"index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"}),
201+
],
202+
"is_white_listed": False,
203+
"lane": 1,
204+
"significance": 10.0,
205+
"threshold": 5.0,
206+
}
192207
assert str(reports[1]) == "Fatal QC error: Overrepresented unknown barcode \"AC\" on lane 1 (50.0% > 5.0%)."
193208
assert reports[1].type() == "error"
194209

0 commit comments

Comments
 (0)