Molmed
diff --git a/‎checkQC/parsers/illumina.py‎
Lines changed: 39 additions & 9 deletions b/‎checkQC/parsers/illumina.py‎
Lines changed: 39 additions & 9 deletions
diff --git a/‎checkQC/qc_checkers/unidentified_index.py‎
Lines changed: 5 additions & 5 deletions b/‎checkQC/qc_checkers/unidentified_index.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎checkQC/qc_reporter.py‎
Lines changed: 3 additions & 3 deletions b/‎checkQC/qc_reporter.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎tests/parsers/test_illumina_parser.py‎
Lines changed: 111 additions & 0 deletions b/‎tests/parsers/test_illumina_parser.py‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎tests/qc_checkers/test_unidentified_index.py‎
Lines changed: 33 additions & 18 deletions b/‎tests/qc_checkers/test_unidentified_index.py‎
Lines changed: 33 additions & 18 deletions
@@ -22,10 +22,7 @@ def from_bclconvert(cls, runfolder_path, parser_config):
         / parser_config["reports_location"]
         / "Top_Unknown_Barcodes.csv"
     )
-    samplesheet = _read_samplesheet(runfolder_path)["BCLConvert_Data"]
-    for row in samplesheet:
-        row["Index"] = row["Index"].replace(" ", "")
-        row["Index2"] = row["Index2"].replace(" ", "")
+    samplesheet = _read_samplesheet(runfolder_path)
 
     instrument, read_length = _read_run_metadata(runfolder_path)
 
@@ -87,6 +84,11 @@ def from_bclconvert(cls, runfolder_path, parser_config):
 
 
 def _read_interop_summary(runfolder_path):
+    """
+    Read interop files and return interop objects for run_summary and index
+    summary.
+    """
+
     runfolder_path = str(runfolder_path)  # interop does not handle Path objects
 
     run_info = interop.py_interop_run.info()
@@ -105,25 +107,53 @@ def _read_interop_summary(runfolder_path):
 
 
 def _read_quality_metrics(quality_metrics_path):
+    """
+    Read the quality metrics CSV file and return its rows as a list of dicts
+    """
     with open(quality_metrics_path, encoding="utf-8") as csvfile:
         return list(csv.DictReader(csvfile))
 
 
 def _read_top_unknown_barcodes(top_unknown_barcodes_path):
+    """
+    Read the top unknown barcodes CSV file and return its rows as a list of dicts
+    """
     with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
-         return list(csv.DictReader(csvfile))
+        return list(csv.DictReader(csvfile))
 
 
-# TODO add docs
 def _read_run_metadata(runfolder_path):
+    """
+    Read instrument and reagent version, and read length
+    """
     run_type_recognizer = RunTypeRecognizer(runfolder_path)
 
     return (
         run_type_recognizer.instrument_and_reagent_version(),
-        int(run_type_recognizer.read_length()),
+        # NOTE: read length can be either "151" or "151-151" in case of paired
+        # reads. For now, only symmetric read lengths are supported
+        # see checkQC/app.py#L159
+        int(run_type_recognizer.read_length().split("-")[0]),
     )
 
 
-# TODO reorder helper functions
 def _read_samplesheet(runfolder_path):
-    return read_sectionedsheet(runfolder_path / "SampleSheet.csv")
+    """
+    Parse `BCLConvert_Data` section of samplesheet
+
+    NOTE: column names may be capitalized, so all keys are lowercased.
+    """
+    samplesheet = read_sectionedsheet(
+        runfolder_path / "SampleSheet.csv")["BCLConvert_Data"]
+
+    for i, row in enumerate(samplesheet):
+        samplesheet[i] = {
+            key.lower(): value
+            for key, value in row.items()
+        }
+
+    for row in samplesheet:
+        row["index"] = row["index"].replace(" ", "")
+        row["index2"] = row["index2"].replace(" ", "")
+
+    return samplesheet
@@ -98,16 +98,16 @@ def __init__(self, samplesheet):
         self.samplesheet_single_indices = {}
         self.samplesheet_dual_indices = {}
         for row in samplesheet:
-            if row.get("Index2"):
+            if row.get("index2"):
                 self.samplesheet_dual_indices.setdefault(
-                    row["Index"], []).append(row)
+                    row["index"], []).append(row)
                 self.samplesheet_dual_indices.setdefault(
-                    row["Index2"], []).append(row)
+                    row["index2"], []).append(row)
                 self.samplesheet_dual_indices.setdefault(
-                    f"{row['Index']}+{row['Index2']}", []).append(row)
+                    f"{row['index']}+{row['index2']}", []).append(row)
             else:
                 self.samplesheet_single_indices.setdefault(
-                    row["Index"], []).append(row)
+                    row["index"], []).append(row)
 
     def list_causes(self, barcode_data):
         """
 
@@ -78,7 +78,7 @@ def _select_configs(
 
     def _select_read_len(self, qc_data, use_closest_read_len):
         def dist(read_len):
-            if mtch := re.match(r"(\d+)-(\d+)", read_len):
+            if mtch := re.match(r"(\d+)-(\d+)", str(read_len)):
                 low, high = (int(n) for n in mtch.groups())
                 return (
                     0
@@ -96,7 +96,7 @@ def dist(read_len):
         if not use_closest_read_len and dist(best_match_read_len) > 0:
             raise KeyError(
                 f"No config entry matching read length {qc_data.read_length}"
-                f"found for instrument {qc_data.instrument}."
+                f" found for instrument {qc_data.instrument}."
             )
 
         return best_match_read_len
@@ -109,7 +109,7 @@ def _get_checker_configs(
     ):
         checker_configs = {
             checker_config["name"]: {
-                f"{k}_threshold" if k in ["error", "warning"] else k: v
+                (f"{k}_threshold" if k in ["error", "warning"] else k): v
                 for k, v in checker_config.items()
                 if k != "name"
             }
 
@@ -0,0 +1,111 @@
+from pathlib import Path
+
+import pytest
+
+from checkQC.parsers.illumina import (
+    _read_interop_summary,
+    _read_quality_metrics,
+    _read_top_unknown_barcodes,
+    _read_run_metadata,
+    _read_samplesheet,
+)
+
+
+@pytest.fixture
+def runfolder_path():
+    return (
+        Path(__file__).parent.parent
+        / "resources/bclconvert/200624_A00834_0183_BHMTFYTINY"
+    )
+
+
+def test_read_interop_summary(runfolder_path):
+    run_summary, index_summary = _read_interop_summary(runfolder_path)
+
+    total_cluster_pf = run_summary.at(0).at(0).reads_pf()
+    assert total_cluster_pf == 532464327
+
+    sample_id = index_summary.at(0).at(0).sample_id()
+    assert sample_id == "Sample_14574-Qiagen-IndexSet1-SP-Lane1"
+
+
+def test_read_quality_metrics(runfolder_path):
+    quality_metrics = _read_quality_metrics(
+            runfolder_path / "Reports/Quality_Metrics.csv")
+
+    assert len(quality_metrics) == 6
+    assert quality_metrics[0] == {
+        '% Q30': '0.96',
+        'Lane': '1',
+        'Mean Quality Score (PF)': '36.37',
+        'QualityScoreSum': '12987964',
+        'ReadNumber': '1',
+        'SampleID': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1',
+        'Sample_Project': 'AB-1234',
+        'Yield': '357120',
+        'YieldQ30': '342989',
+        'index': 'GAACTGAGCG',
+        'index2': 'TCGTGGAGCG',
+    }
+
+
+def test_read_to_unknown_barcodes(runfolder_path):
+    top_unknown_barcodes = _read_top_unknown_barcodes(
+            runfolder_path / "Reports/Top_Unknown_Barcodes.csv")
+
+    assert len(top_unknown_barcodes) == 2084
+    assert top_unknown_barcodes[:3] == [
+        {
+            '# Reads': '12857',
+            '% of All Reads': '0.003775',
+            '% of Unknown Barcodes': '0.003796',
+            'Lane': '1',
+            'index': 'ATATCTGCTT',
+            'index2': 'TAGACAATCT',
+        },
+        {
+            '# Reads': '12406',
+            '% of All Reads': '0.003643',
+            '% of Unknown Barcodes': '0.003663',
+            'Lane': '1',
+            'index': 'CACCTCTCTT',
+            'index2': 'CTCGACTCCT',
+        },
+        {
+            '# Reads': '12177',
+            '% of All Reads': '0.003575',
+            '% of Unknown Barcodes': '0.003595',
+            'Lane': '1',
+            'index': 'ATGTAACGTT',
+            'index2': 'ACGATTGCTG',
+        },
+    ]
+
+
+# TODO add tests with paired-end reads
+def test_read_run_metadata(runfolder_path):
+    instrument, read_length = _read_run_metadata(runfolder_path)
+    assert instrument == "novaseq_SP"
+    assert read_length == 36
+
+
+def test_read_samplesheet(runfolder_path):
+    samplesheet = _read_samplesheet(runfolder_path)
+
+    assert len(samplesheet) == 4
+    assert all(
+        key.lower() == key
+        for row in samplesheet
+        for key in row
+    )
+    assert all(
+        " " not in row["index"] and " " not in row["index2"]
+        for row in samplesheet
+    )
+    assert samplesheet[0] == {
+        'index': 'GAACTGAGCG',
+        'index2': 'TCGTGGAGCG',
+        'lane': 1,
+        'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1',
+        'sample_project': 'AB-1234',
+    }
@@ -7,15 +7,15 @@
 @pytest.fixture
 def samplesheet_matcher():
     return SamplesheetMatcher([
-        {"Index": "CCAA", "Index2": "AGCA", "Lane": 1, "Sample_ID": "dual reverse"},
-        {"Index": "GGTT", "Index2": "TCGT", "Lane": 2, "Sample_ID": "dual reverse complement"},
-        {"Index": "TGCT", "Index2": "TTGG", "Lane": 3, "Sample_ID": "dual complement"},
-        {"Index": "AAAT", "Index2": "ATAT", "Lane": 1, "Sample_ID": "sample_id"},
-        {"Index": "CAAT", "Index2": "CTAT", "Lane": 1, "Sample_ID": "sample_id"},
-        {"Index": "TCCA", "Index2": "", "Lane": 1, "Sample_ID": "reverse"},
-        {"Index": "AGGT", "Index2": "", "Lane": 1, "Sample_ID": "reverse complement"},
-        {"Index": "TGGA", "Index2": "", "Lane": 1, "Sample_ID": "complement"},
-        {"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"},
+        {"index": "CCAA", "index2": "AGCA", "Lane": 1, "Sample_ID": "dual reverse"},
+        {"index": "GGTT", "index2": "TCGT", "Lane": 2, "Sample_ID": "dual reverse complement"},
+        {"index": "TGCT", "index2": "TTGG", "Lane": 3, "Sample_ID": "dual complement"},
+        {"index": "AAAT", "index2": "ATAT", "Lane": 1, "Sample_ID": "sample_id"},
+        {"index": "CAAT", "index2": "CTAT", "Lane": 1, "Sample_ID": "sample_id"},
+        {"index": "TCCA", "index2": "", "Lane": 1, "Sample_ID": "reverse"},
+        {"index": "AGGT", "index2": "", "Lane": 1, "Sample_ID": "reverse complement"},
+        {"index": "TGGA", "index2": "", "Lane": 1, "Sample_ID": "complement"},
+        {"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"},
     ])
 
 
@@ -35,7 +35,7 @@ def test_check_complement(samplesheet_matcher):
     assert msg == "complement index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
     assert data == (
         "complement",
-        {"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
+        {"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
     )
 
 
@@ -53,7 +53,7 @@ def test_check_reverse(samplesheet_matcher):
     assert msg == "reverse index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
     assert data == (
         "reverse",
-        {"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
+        {"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
     )
 
 
@@ -71,7 +71,7 @@ def test_check_reverse_complement(samplesheet_matcher):
     assert msg == "reverse complement index swap: \"AAGG\" found in samplesheet for sample \"test\", lane 1"
     assert data == (
         "reverse complement",
-        {"Index": "AAGG", "Index2": "", "Lane": 1, "Sample_ID": "test"}
+        {"index": "AAGG", "index2": "", "Lane": 1, "Sample_ID": "test"}
     )
 
 
@@ -132,8 +132,8 @@ def test_lane_swap(samplesheet_matcher):
     assert data == (
         "lane swap",
         {
-            "Index": "CCAA",
-            "Index2": "AGCA",
+            "index": "CCAA",
+            "index2": "AGCA",
             "Lane": 1,
             "Sample_ID": "dual reverse",
         }
@@ -152,8 +152,8 @@ def test_dual_index_swap(samplesheet_matcher):
     assert data == (
         "dual index swap",
         {
-            "Index": "CCAA",
-            "Index2": "AGCA",
+            "index": "CCAA",
+            "index2": "AGCA",
             "Lane": 1,
             "Sample_ID": "dual reverse",
         }
@@ -173,8 +173,8 @@ def qc_data():
             }
         },
         [
-            {"Index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"},
-            {"Index": "TCCA", "Lane": 1, "Sample_ID": "reverse"},
+            {"index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"},
+            {"index": "TCCA", "Lane": 1, "Sample_ID": "reverse"},
         ]
     )
 
@@ -189,6 +189,21 @@ def test_unidentified_index(qc_data):
 - reverse index swap: "TCCA" found in samplesheet for sample "reverse", lane 1
 - lane swap: index "ACCT" found in samplesheet for sample "lane swap", lane 2"""
     assert reports[0].type() == "error"
+    assert reports[0].data == {
+        "barcode": {
+            "count": 10,
+            "index": "ACCT",
+            "lane": 1,
+        },
+        "causes": [
+            ("reverse", {"index": "TCCA", "Lane": 1, "Sample_ID": "reverse"}),
+            ("lane swap", {"index": "ACCT", "Lane": 2, "Sample_ID": "lane swap"}),
+        ],
+        "is_white_listed": False,
+        "lane": 1,
+        "significance": 10.0,
+        "threshold": 5.0,
+    }
     assert str(reports[1]) == "Fatal QC error: Overrepresented unknown barcode \"AC\" on lane 1 (50.0% > 5.0%)."
     assert reports[1].type() == "error"