@@ -165,10 +165,6 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
165165 """
166166 with tempfile .NamedTemporaryFile () as query_file :
167167 query_file = _build_query_file (metadata , Path (query_file .name ))
168- if len (metadata .target_sequence ) > 25000 :
169- msg = f"Target sequence for { metadata .urn } must have a length <= 25000 to run BLAT"
170- raise AlignmentError (msg )
171-
172168 if metadata .target_sequence_type == TargetSequenceType .PROTEIN :
173169 target_args = "-q=prot -t=dnax"
174170 elif (
@@ -240,7 +236,10 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
240236
241237
242238def _get_best_hsp (
243- hit : Hit , urn : str , gene_location : GeneLocation | None , output : QueryResult
239+ hit : Hit ,
240+ metadata : ScoresetMetadata ,
241+ gene_location : GeneLocation | None ,
242+ output : QueryResult ,
244243) -> HSP :
245244 """Retrieve preferred HSP from BLAT Hit object.
246245
@@ -262,13 +261,15 @@ def _get_best_hsp(
262261 hit , key = lambda hsp : (hsp .query_end - hsp .query_start ) / output .seq_len
263262 )
264263 best_hsp = hsp_list [0 ]
265- else : # Sort by distance from gene start, than by coverage
264+ else : # Sort by distance from gene start with respect to strandedness, then by coverage
266265 for hsp in hit :
267266 BlatOutput = namedtuple ("BlatOutput" , ["hsp" , "distance" , "coverage" ])
268267 hsp_list .append (
269268 BlatOutput (
270269 hsp ,
271- abs (hsp .hit_start - gene_location .start ),
270+ abs (hsp .hit_start - gene_location .start )
271+ if hsp [0 ].query_strand == 1
272+ else abs (hsp .hit_end - gene_location .end ),
272273 (hsp .query_end - hsp .query_start ) / output .seq_len ,
273274 )
274275 )
@@ -283,7 +284,7 @@ def _get_best_hsp(
283284 if best_hsp is None :
284285 _logger .error (
285286 "Unable to get best HSP from hit -- this should be impossible? urn: %s, hit: %s" ,
286- urn ,
287+ metadata . urn ,
287288 hit ,
288289 )
289290 raise AlignmentError
@@ -300,7 +301,7 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen
300301 location = get_gene_location (metadata )
301302 chromosome = location .chromosome if location else None
302303 best_hit = _get_best_hit (output , metadata .urn , chromosome )
303- best_hsp = _get_best_hsp (best_hit , metadata . urn , location , output )
304+ best_hsp = _get_best_hsp (best_hit , metadata , location , output )
304305
305306 strand = None
306307 if metadata .target_gene_category == TargetType .PROTEIN_CODING :
0 commit comments