Skip to content

Commit 189f1b7

Browse files
authored
Merge pull request #50 from murphycj/bug/fix-fusioninspector-parser
Fix Fusioninspector parser
2 parents 749c814 + cbe6215 commit 189f1b7

File tree

5 files changed

+70
-31
lines changed

5 files changed

+70
-31
lines changed

.github/workflows/test.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
python-version: ["3.7", "3.8", "3.9", "3.10"]
1212
steps:
1313
- name: Checkout source code
14-
uses: actions/checkout@v1
14+
uses: actions/checkout@v3
1515
- name: Install dependencies
1616
run: |
1717
python -m pip install --upgrade pip

agfusion/parsers.py

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -701,30 +701,22 @@ class FusionInspector(_Parser):
701701
def __init__(self, infile, logger):
702702
super().__init__(logger)
703703

704-
fin = open(infile, "r")
705-
for line in fin.readlines():
706-
if re.findall(r"^#", line):
707-
line = line.rstrip().split("\t")
708-
if line[0] != "#FusionName" and line[0] != "#fusion_name":
709-
raise AssertionError(
710-
"Unrecognized FusionInspector input for first column"
711-
+ " in header. Should be #FusionName or #fusion_name."
712-
)
704+
data = pd.read_csv(infile, delimiter="\t")
705+
data.columns = [i.replace("#", "") for i in data.columns]
713706

714-
assert line[3] == "LeftGene", "Unrecognized " + "FusionInspector input"
715-
assert line[5] == "LeftBreakpoint", "Unrecognized " + "FusionInspector input"
716-
assert line[6] == "RightGene", "Unrecognized " + "FusionInspector input"
717-
assert line[8] == "RightBreakpoint", "Unrecognized " + "FusionInspector input"
718-
continue
707+
cols = ["LeftGene", "LeftBreakpoint", "RightGene", "RightBreakpoint"]
708+
assert all(
709+
i in data.columns for i in cols
710+
), "Unrecognized FusionInspector input. Could not find all columns: " + ",".join(cols)
719711

720-
line = line.strip().split("\t")
712+
for i in data.index:
721713

722-
gene_5prime = line[3].split("^")[1].split(".")[0]
723-
gene_5prime_name = line[3].split("^")[0]
724-
gene_5prime_junction = int(line[5].split(":")[1])
725-
gene_3prime = line[6].split("^")[1].split(".")[0]
726-
gene_3prime_name = line[6].split("^")[0]
727-
gene_3prime_junction = int(line[8].split(":")[1])
714+
gene_5prime = data.at[i, "LeftGene"].split("^")[1].split(".")[0]
715+
gene_5prime_name = data.at[i, "LeftGene"].split("^")[0]
716+
gene_5prime_junction = int(data.at[i, "LeftBreakpoint"].split(":")[1])
717+
gene_3prime = data.at[i, "RightGene"].split("^")[1].split(".")[0]
718+
gene_3prime_name = data.at[i, "RightGene"].split("^")[0]
719+
gene_3prime_junction = int(data.at[i, "RightBreakpoint"].split(":")[1])
728720
self.fusions.append(
729721
{
730722
"gene5prime": gene_5prime,
@@ -735,7 +727,6 @@ def __init__(self, infile, logger):
735727
"gene3prime_junction": gene_3prime_junction,
736728
}
737729
)
738-
fin.close()
739730

740731
self._check_data()
741732

test/data/FusionsFindingAlgorithms/FusionInspector/test.FusionInspector.fusions.abridged.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#FusionName JunctionReadCount SpanningFragCount est_J est_S LeftGene LeftLocalBreakpoint LeftBreakpoint RightGene RightLocalBreakpoint RightBreakpoint SpliceType LargeAnchorSupport NumCounterFusionLeft NumCounterFusionRight FAR_left FAR_right LeftBreakDinuc LeftBreakEntropy RightBreakDinuc RightBreakEntropy FFPM microh_brkpt_dist num_microh_near_brkpt
2+
AL627171.2--TPM3 1551 3 1538.54 3.00 AL627171.2^ENSG00000282885.2 2641 chr14:49862686:- TPM3^ENSG00000143549.21 23194 chr1:154166382:- INCL_NON_REF_SPLICE NO 152 89 10.16 17.28 GT 1.5058 AG 1.8892 26.6109 1 30
3+
STAT3--AL627171.2 955 1 934.46 1.00 STAT3^ENSG00000168610.16 21538 chr17:42321234:- AL627171.2^ENSG00000282885.2 32742 chr14:49862799:- INCL_NON_REF_SPLICE YES 0 152 957.00 6.25 GT 1.8295 AG 1.7968 16.1485 1 25

test/data/FusionsFindingAlgorithms/FusionInspector/test.FusionInspector.fusions.txt

Lines changed: 3 additions & 0 deletions
Large diffs are not rendered by default.

test/test_parsers.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
db_human95.build = "homo_sapiens_95"
2121

2222

23+
BASEDIR = "./data/FusionsFindingAlgorithms"
24+
25+
2326
class TestFusionCatcher(unittest.TestCase):
2427
"""Test parse FusionCatcher parse."""
2528

@@ -37,7 +40,7 @@ def test_parse(self):
3740
"C920009B18Rik_H60b",
3841
]
3942
for fusion in parsers.parsers["fusioncatcher"](
40-
"./data/FusionsFindingAlgorithms/FusionCatcher/final-list_candidate-fusion-genes.txt",
43+
f"{BASEDIR}/FusionCatcher/final-list_candidate-fusion-genes.txt",
4144
db_mouse.logger,
4245
):
4346
fusion = model.Fusion(
@@ -63,7 +66,7 @@ def test_parse(self):
6366
"BCR_ABL1",
6467
]
6568
for fusion in parsers.parsers["arriba"](
66-
"./data/FusionsFindingAlgorithms/Arriba/fusions.tsv",
69+
f"{BASEDIR}/Arriba/fusions.tsv",
6770
db_human.logger,
6871
):
6972
fusion = model.Fusion(
@@ -87,8 +90,7 @@ def test_basic(self):
8790

8891
all_fusions = ["ACACA_STAC2", "RPS6KB1_SNF8"]
8992
for fusion in parsers.parsers["starfusion"](
90-
"./data/FusionsFindingAlgorithms/STARFusion/"
91-
+ "star-fusion.fusion_candidates.final.abridged",
93+
f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_candidates.final.abridged",
9294
db_human.logger,
9395
):
9496
fusion = model.Fusion(
@@ -108,8 +110,7 @@ def test_with_coding_effect(self):
108110

109111
all_fusions = ["ARID3B_MYCNUT", "ARID3B_MYCN", "TVP23C_CDRT4"]
110112
for fusion in parsers.parsers["starfusion"](
111-
"./data/FusionsFindingAlgorithms/STARFusion/"
112-
+ "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
113+
f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
113114
db_human95.logger,
114115
):
115116
fusion = model.Fusion(
@@ -133,7 +134,7 @@ def test_parse_mouse(self):
133134

134135
all_fusions = ["Mocos_Rprd1a", "Ubc_Ubb", "Ubc_Gm11808", "Gm21887_Gm47283"]
135136
for fusion in parsers.parsers["longgf"](
136-
"./data/FusionsFindingAlgorithms/LongGF/fusions_mouse.log",
137+
f"{BASEDIR}/LongGF/fusions_mouse.log",
137138
db_mouse.logger,
138139
):
139140
fusion = model.Fusion(
@@ -153,7 +154,48 @@ def test_parse_human(self):
153154

154155
all_fusions = ["BCAS4_BCAS3", "HNRNPC_ACIN1"]
155156
for fusion in parsers.parsers["longgf"](
156-
"./data/FusionsFindingAlgorithms/LongGF/fusions_hg38.log",
157+
f"{BASEDIR}/LongGF/fusions_hg38.log",
158+
db_human95.logger,
159+
):
160+
fusion = model.Fusion(
161+
gene5prime=fusion["gene5prime"],
162+
gene5primejunction=fusion["gene5prime_junction"],
163+
gene3prime=fusion["gene3prime"],
164+
gene3primejunction=fusion["gene3prime_junction"],
165+
db=db_human95,
166+
pyensembl_data=data_human95,
167+
protein_databases=["pfam"],
168+
noncanonical=False,
169+
)
170+
assert fusion.name in all_fusions, f"{fusion.name} not in list!"
171+
172+
173+
class TestFusionInspector(unittest.TestCase):
174+
"""Test parse FusionInspector"""
175+
176+
def test_parse_human(self):
177+
"""Test basic parsing."""
178+
179+
all_fusions = ["AL627171.2_TPM3", "STAT3_AL627171.2"]
180+
181+
for fusion in parsers.parsers["fusioninspector"](
182+
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.abridged.txt",
183+
db_human95.logger,
184+
):
185+
fusion = model.Fusion(
186+
gene5prime=fusion["gene5prime"],
187+
gene5primejunction=fusion["gene5prime_junction"],
188+
gene3prime=fusion["gene3prime"],
189+
gene3primejunction=fusion["gene3prime_junction"],
190+
db=db_human95,
191+
pyensembl_data=data_human95,
192+
protein_databases=["pfam"],
193+
noncanonical=False,
194+
)
195+
assert fusion.name in all_fusions, f"{fusion.name} not in list!"
196+
197+
for fusion in parsers.parsers["fusioninspector"](
198+
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.txt",
157199
db_human95.logger,
158200
):
159201
fusion = model.Fusion(

0 commit comments

Comments
 (0)