Skip to content

Commit b8e9e5c

Browse files
authored
fix: Update alignment and annotation functionality (#79)
1 parent ab81645 commit b8e9e5c

File tree

3 files changed

+18
-11
lines changed

3 files changed

+18
-11
lines changed

src/dcd_mapping/align.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,6 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
165165
"""
166166
with tempfile.NamedTemporaryFile() as query_file:
167167
query_file = _build_query_file(metadata, Path(query_file.name))
168-
if len(metadata.target_sequence) > 25000:
169-
msg = f"Target sequence for {metadata.urn} must have a length <= 25000 to run BLAT"
170-
raise AlignmentError(msg)
171-
172168
if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
173169
target_args = "-q=prot -t=dnax"
174170
elif (
@@ -240,7 +236,10 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
240236

241237

242238
def _get_best_hsp(
243-
hit: Hit, urn: str, gene_location: GeneLocation | None, output: QueryResult
239+
hit: Hit,
240+
metadata: ScoresetMetadata,
241+
gene_location: GeneLocation | None,
242+
output: QueryResult,
244243
) -> HSP:
245244
"""Retrieve preferred HSP from BLAT Hit object.
246245
@@ -262,13 +261,15 @@ def _get_best_hsp(
262261
hit, key=lambda hsp: (hsp.query_end - hsp.query_start) / output.seq_len
263262
)
264263
best_hsp = hsp_list[0]
265-
else: # Sort by distance from gene start, than by coverage
264+
else: # Sort by distance from gene start with respect to strandedness, then by coverage
266265
for hsp in hit:
267266
BlatOutput = namedtuple("BlatOutput", ["hsp", "distance", "coverage"])
268267
hsp_list.append(
269268
BlatOutput(
270269
hsp,
271-
abs(hsp.hit_start - gene_location.start),
270+
abs(hsp.hit_start - gene_location.start)
271+
if hsp[0].query_strand == 1
272+
else abs(hsp.hit_end - gene_location.end),
272273
(hsp.query_end - hsp.query_start) / output.seq_len,
273274
)
274275
)
@@ -283,7 +284,7 @@ def _get_best_hsp(
283284
if best_hsp is None:
284285
_logger.error(
285286
"Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s",
286-
urn,
287+
metadata.urn,
287288
hit,
288289
)
289290
raise AlignmentError
@@ -300,7 +301,7 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen
300301
location = get_gene_location(metadata)
301302
chromosome = location.chromosome if location else None
302303
best_hit = _get_best_hit(output, metadata.urn, chromosome)
303-
best_hsp = _get_best_hsp(best_hit, metadata.urn, location, output)
304+
best_hsp = _get_best_hsp(best_hit, metadata, location, output)
304305

305306
strand = None
306307
if metadata.target_gene_category == TargetType.PROTEIN_CODING:

src/dcd_mapping/annotate.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,8 +337,10 @@ def _annotate_cpb_mapping(
337337
post_mapped.extensions = [
338338
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped)),
339339
]
340-
else:
340+
elif len(post_mapped.members) == 1:
341341
post_mapped = post_mapped.members[0]
342+
else:
343+
post_mapped = None
342344

343345
namespace = metadata.urn
344346
val = mapping.accession_id.split("#")[1]

src/dcd_mapping/vrs_map.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,11 @@ def _get_variation(
441441
allele.location.end = allele.location.start + diff2
442442
if "dup" in hgvs_string:
443443
allele.state.sequence = SequenceString(
444-
2 * _get_allele_sequence(allele)
444+
str(
445+
Seq(
446+
2 * _get_allele_sequence(allele)
447+
).reverse_complement()
448+
)
445449
)
446450
temp_str = str(
447451
Seq(str(allele.state.sequence.root)).reverse_complement()

0 commit comments

Comments
 (0)