Skip to content

Commit 093bb2c

Browse files
authored
Improve DNA match parser (#610)
* Refactor DNA match parser into separate module * Fix Python 3.9 * Fix typing issues on 3.9 * Start improving parser * New DNA match parser * Small changes, more tests * Add Geneanet format * Small fix * Update doc string * More tests
1 parent dab5b82 commit 093bb2c

File tree

4 files changed

+575
-73
lines changed

4 files changed

+575
-73
lines changed

gramps_webapi/api/dna.py

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
"""Parser for raw DNA match data."""
2+
3+
from __future__ import annotations
4+
5+
import itertools
6+
import re
7+
from collections.abc import Callable
8+
from dataclasses import dataclass
9+
from typing import Literal, Sequence, overload
10+
11+
from gramps_webapi.types import MatchSegment
12+
13+
# One-letter codes for the parental side a matching segment is assigned to.
SIDE_UNKNOWN = "U"  # also used when a row carries no recognised side value
SIDE_MATERNAL = "M"
SIDE_PATERNAL = "P"
16+
17+
18+
@dataclass
class SegmentColumnOrder:
    """Zero-based column indices of the fields of a DNA match table.

    The first four fields are mandatory. The remaining ones default to
    ``None``, which means the table has no such column.
    """

    # mandatory columns
    chromosome: int
    start_position: int
    end_position: int
    centimorgans: int

    # optional columns
    num_snps: int | None = None
    side: int | None = None
    comment: int | None = None
29+
30+
31+
def get_delimiter(rows: list[str]) -> str:
    """Guess the delimiter of a CSV-like table given as a list of rows.

    Only the first row is inspected. Tab, comma and semicolon are tried in
    that order; the first one occurring at least three times (i.e. separating
    at least four columns) is returned.

    Raises:
        ValueError: If ``rows`` is empty or no candidate delimiter qualifies.
    """
    if not rows:
        # Report empty input like any other undetectable delimiter instead of
        # letting ``rows[0]`` fail with an IndexError.
        raise ValueError("Could not determine delimiter.")
    first_row = rows[0]
    for candidate in ("\t", ",", ";"):
        if first_row.count(candidate) >= 3:
            return candidate
    raise ValueError("Could not determine delimiter.")
44+
45+
46+
def is_numeric(value: str) -> bool:
    """Return whether ``value`` looks like a number.

    A string qualifies when ``float`` can parse it, or when it starts with a
    digit and otherwise consists only of digits, dots and commas (e.g. numbers
    written with thousands separators or a decimal comma). The empty string
    never qualifies.
    """
    if value == "":
        return False
    try:
        float(value)
    except ValueError:
        # not a plain float literal: fall back to the separator-tolerant pattern
        return re.match(r"^\d[\d\.,]*$", value) is not None
    return True
58+
59+
60+
def cast_int(value: str) -> int:
    """Convert a string to an integer, ignoring thousands separators.

    Every comma and dot is removed before conversion. If what remains is not
    a valid integer, 0 is returned instead of raising.
    """
    try:
        digits = value
        for separator in (",", "."):
            digits = digits.replace(separator, "")
        return int(digits)
    except (ValueError, TypeError):
        return 0
66+
67+
68+
def cast_float(value: str) -> float:
    """Convert a string to a float, tolerating common number formats.

    Spaces are removed. A separator (dot or comma) that occurs more than once
    is treated as a thousands separator and dropped. If exactly one comma and
    one dot remain, the rightmost of the two is taken as the decimal mark and
    the other is dropped (``"1,234.5"`` and ``"1.234,5"`` both give 1234.5).
    A single remaining comma is read as a decimal comma.

    Returns 0.0 when the result still cannot be parsed as a float.
    """
    value = value.replace(" ", "")
    # a separator that repeats can only be a thousands separator
    if value.count(".") > 1:
        value = value.replace(".", "")
    if value.count(",") > 1:
        value = value.replace(",", "")
    if "," in value:
        if "." in value:
            # one of each: whichever comes last is the decimal mark
            if value.rfind(",") > value.rfind("."):
                value = value.replace(".", "")
            else:
                value = value.replace(",", "")
        # a comma still present at this point is a decimal comma
        value = value.replace(",", ".")
    try:
        return float(value)
    except ValueError:
        return 0.0
81+
82+
83+
def has_header(rows: list[str], delimiter: str) -> bool:
    """Tell whether the first row of the table is a header row.

    The first row counts as a header when the table has at least one further
    row, the first row is at least four characters long, and none of its
    cells is number-like.
    """
    if len(rows) < 2:
        return False
    first_row = rows[0]
    if len(first_row) < 4:
        return False
    return not any(is_numeric(cell) for cell in first_row.split(delimiter))
94+
95+
96+
@overload
def find_column_position(
    column_names: list[str],
    condition: Callable[[str], bool],
    exclude_indices: Sequence[int],
    allow_missing: Literal[False] = ...,
) -> int: ...


@overload
def find_column_position(
    column_names: list[str],
    condition: Callable[[str], bool],
    exclude_indices: Sequence[int],
    allow_missing: Literal[True],
) -> int | None: ...


def find_column_position(
    column_names: list[str],
    condition: Callable[[str], bool],
    exclude_indices: Sequence[int],
    allow_missing: bool = False,
) -> int | None:
    """Find the index of the first column whose name satisfies ``condition``.

    Args:
        column_names: The column names to search, in table order.
        condition: Predicate called with each candidate name after it has
            been lower-cased and stripped of surrounding whitespace.
        exclude_indices: Indices that are skipped without being tested.
        allow_missing: If true, return ``None`` when nothing matches instead
            of raising.

    Returns:
        The index of the first matching column, or ``None`` if there is no
        match and ``allow_missing`` is true.

    Raises:
        ValueError: If there is no match and ``allow_missing`` is false.
    """
    for index, name in enumerate(column_names):
        if index not in exclude_indices and condition(name.lower().strip()):
            return index
    if allow_missing:
        return None
    raise ValueError("Column not found.")
129+
130+
131+
def get_order(
    header: list[str] | None, data_columns: Sequence[Sequence[str | None]]
) -> SegmentColumnOrder:
    """Determine which column of the table holds which field.

    Without a header, the default ordering of the DNASegmentMap Gramplet is
    assumed: chromosome, start, end, centimorgans, SNPs, then either
    side + comment or just comment. The sixth column is taken to be the side
    column only if every cell in it is empty or one of the side codes,
    compared after stripping whitespace and upper-casing (the same
    normalisation ``process_row`` applies).

    With a header, columns are located by name, each column being used at
    most once, in this order: chromosome (starts with "chr"), start
    (contains "start"), end (contains "end" or "stop", or "length" without
    "morgan"), centimorgans (starts with "cm", or contains "centimorgan" or
    "length"), and optionally SNPs (contains "snp") and side (starts with
    "side"). The first column left over, if any, becomes the comment column.

    Args:
        header: The header cells, or ``None`` if the table has no header.
        data_columns: The table data in column-major order; only consulted
            when ``header`` is ``None``.

    Raises:
        ValueError: If a header is given but one of the four mandatory
            columns cannot be found in it.
    """
    if header is None:
        # use the default ordering of the DNASegmentMap Gramplet
        # https://gramps-project.org/wiki/index.php/Addon:DNASegmentMapGramplet
        side_or_blank = {"", SIDE_MATERNAL, SIDE_PATERNAL, SIDE_UNKNOWN}
        # Normalise cells before comparing so that lower-case codes, padding
        # and a stray "\r" from Windows line endings do not hide a side column.
        sixth_column_is_side = len(data_columns) >= 6 and all(
            (value or "").strip().upper() in side_or_blank
            for value in data_columns[5]
        )
        if sixth_column_is_side:
            return SegmentColumnOrder(
                chromosome=0,
                start_position=1,
                end_position=2,
                centimorgans=3,
                num_snps=4,
                side=5,
                comment=6,
            )
        return SegmentColumnOrder(
            chromosome=0,
            start_position=1,
            end_position=2,
            centimorgans=3,
            num_snps=4,
            comment=5,
        )

    column_names: list[str] = header
    taken: list[int] = []  # indices already assigned to a field

    def required(condition: Callable[[str], bool]) -> int:
        """Locate a mandatory column and reserve it."""
        position = find_column_position(
            column_names,
            condition,
            exclude_indices=taken,
            allow_missing=False,
        )
        taken.append(position)
        return position

    def optional(condition: Callable[[str], bool]) -> int | None:
        """Locate an optional column and reserve it if present."""
        position = find_column_position(
            column_names,
            condition,
            exclude_indices=taken,
            allow_missing=True,
        )
        if position is not None:
            taken.append(position)
        return position

    # The lookup order matters: a column claimed by an earlier field is no
    # longer available to later, more permissive conditions.
    chromosome = required(lambda col: col.startswith("chr"))
    start_position = required(lambda col: "start" in col)
    end_position = required(
        lambda col: "end" in col
        or "stop" in col
        or ("length" in col and "morgan" not in col)
    )
    centimorgans = required(
        lambda col: col.startswith("cm") or "centimorgan" in col or "length" in col
    )
    num_snps = optional(lambda col: "snp" in col)
    side = optional(lambda col: col.startswith("side"))
    # take the first column that has not been matched yet
    comment = optional(lambda _: True)
    return SegmentColumnOrder(
        chromosome=chromosome,
        start_position=start_position,
        end_position=end_position,
        centimorgans=centimorgans,
        num_snps=num_snps,
        side=side,
        comment=comment,
    )
223+
224+
225+
def transpose_jagged_nested_list(
    data: Sequence[Sequence[str | None]],
) -> list[list[str | None]]:
    """Turn a list of rows into a list of columns.

    Rows may differ in length; positions missing from shorter rows are
    filled with ``None`` so that every column has one entry per row.
    """
    padded_columns = itertools.zip_longest(*data, fillvalue=None)
    return [list(column) for column in padded_columns]
230+
231+
232+
def parse_raw_dna_match_string(raw_string: str) -> list[MatchSegment]:
    """Parse a raw, CSV-like DNA match string into a list of segments.

    The delimiter and the presence of a header row are detected from the
    first line, the column order is derived from the header (or from the
    data if there is none), and every non-blank data row is converted to a
    segment. Rows that cannot be converted are skipped.

    Returns:
        The parsed segments, or an empty list if the delimiter or the
        mandatory columns cannot be determined.
    """
    # Split on "\n" and drop the "\r" that Windows line endings leave behind,
    # so that it does not end up inside the last cell of every row.
    rows = [row.rstrip("\r") for row in raw_string.strip().split("\n")]
    try:
        delimiter = get_delimiter(rows)
    except ValueError:
        return []
    header: list[str] | None = None
    if has_header(rows, delimiter):
        header = rows[0].split(delimiter)
        rows = rows[1:]
    # split each non-blank row once and reuse the cells below
    data = [row.split(delimiter) for row in rows if row.strip() != ""]
    data_columns = transpose_jagged_nested_list(data)
    try:
        order = get_order(header, data_columns=data_columns)
    except ValueError:
        return []
    segments: list[MatchSegment] = []
    for fields in data:
        try:
            match_segment = process_row(fields=fields, order=order)
        except (ValueError, TypeError, IndexError):
            # a malformed or too-short row must not abort the whole parse
            continue
        if match_segment is not None:
            segments.append(match_segment)
    return segments
262+
263+
264+
def process_row(fields: list[str], order: SegmentColumnOrder) -> MatchSegment | None:
    """Convert the cells of one table row into a match segment.

    Args:
        fields: The cells of the row, in table order.
        order: The column index of each field.

    Returns:
        The segment, or ``None`` if the row has fewer than four cells or is
        too short to contain every mandatory column. Optional columns that
        are absent from the row yield 0 SNPs, an unknown side and an empty
        comment; a side value other than maternal or paternal is reported as
        unknown.
    """
    if len(fields) < 4:
        return None
    mandatory = (
        order.chromosome,
        order.start_position,
        order.end_position,
        order.centimorgans,
    )
    if max(mandatory) >= len(fields):
        # The row is shorter than the detected layout requires; skip it
        # rather than fail with an IndexError on the lookups below.
        return None

    def optional_field(index: int | None) -> str | None:
        """Return the stripped cell at ``index``, or None if not available."""
        if index is None or index >= len(fields):
            return None
        return fields[index].strip()

    try:
        chromo = fields[order.chromosome].strip()
        start = cast_int(fields[order.start_position].strip())
        stop = cast_int(fields[order.end_position].strip())
        cms = cast_float(fields[order.centimorgans].strip())

        raw_snps = optional_field(order.num_snps)
        snp = cast_int(raw_snps) if raw_snps is not None else 0

        raw_side = optional_field(order.side)
        side = raw_side.upper() if raw_side is not None else SIDE_UNKNOWN
        if side not in {SIDE_MATERNAL, SIDE_PATERNAL}:
            side = SIDE_UNKNOWN

        raw_comment = optional_field(order.comment)
        comment = raw_comment if raw_comment is not None else ""
    except (ValueError, TypeError):
        return None
    return {
        "chromosome": chromo,
        "start": start,
        "stop": stop,
        "side": side,
        "cM": cms,
        "SNPs": snp,
        "comment": comment,
    }

0 commit comments

Comments
 (0)