44import subprocess
55import tempfile
66from pathlib import Path
7+ from typing import Any
78from urllib .parse import urlparse
89
910import requests
1011from Bio .SearchIO import HSP
11- from Bio .SearchIO import read as read_blat
12+ from Bio .SearchIO import parse as parse_blat
1213from Bio .SearchIO ._model import Hit , QueryResult
1314from cool_seq_tool .schemas import Strand
1415
2526 GeneLocation ,
2627 ScoresetMetadata ,
2728 SequenceRange ,
29+ TargetGene ,
2830 TargetSequenceType ,
2931)
3032
@@ -61,7 +63,10 @@ def _build_query_file(scoreset_metadata: ScoresetMetadata, query_file: Path) ->
6163 :return: Yielded Path to constructed file. Deletes file once complete.
6264 """
6365 _logger .debug ("Writing BLAT query to %s" , query_file )
64- lines = [">query" , scoreset_metadata .target_sequence ]
66+ lines = []
67+ for target_gene in scoreset_metadata .target_genes :
68+ lines .append (f">{ target_gene } " )
69+ lines .append (scoreset_metadata .target_genes [target_gene ].target_sequence )
6570 _write_query_file (query_file , lines )
6671 return query_file
6772
@@ -143,50 +148,77 @@ def _write_blat_output_tempfile(result: subprocess.CompletedProcess) -> str:
143148 return tmp .name
144149
145150
146- def _get_blat_output (metadata : ScoresetMetadata , silent : bool ) -> QueryResult :
def _get_target_sequence_type(metadata: ScoresetMetadata) -> TargetSequenceType | str:
    """Get the overall target sequence type for a score set's target genes.

    The result is the shared sequence type when every target gene declares the same
    ``target_sequence_type``, and the string ``"mixed"`` when more than one distinct
    type is present within the score set.

    :param metadata: object containing score set attributes
    :return: the single sequence type shared by all target genes, or the string
        ``"mixed"``
    :raise ValueError: if the score set has no target genes to take a type from
    """
    # Collapse to the distinct types; the number of distinct values decides the result.
    target_sequence_types = {
        target_gene.target_sequence_type
        for target_gene in metadata.target_genes.values()
    }
    if not target_sequence_types:
        msg = f"Target sequence types not available for score set {metadata.urn}"
        raise ValueError(msg)
    if len(target_sequence_types) > 1:
        return "mixed"
    return target_sequence_types.pop()
170+
171+
def _run_and_parse_blat(
    target_args: str, query_file: Path, silent: bool
) -> list[QueryResult]:
    """Run BLAT once with the given arguments and eagerly parse its PSL output.

    :param target_args: query/database type arguments passed through to BLAT
    :param query_file: path to the FASTA query file
    :param silent: suppress BLAT command output
    :return: non-empty list of parsed BLAT query results
    :raise ValueError: if the output can't be parsed or contains no query results
    """
    process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
    out_file = _write_blat_output_tempfile(process_result)
    # ``SearchIO.parse`` is a lazy generator: nothing is read until it is iterated.
    # Consume it here so that parsing problems surface to the caller of this helper
    # instead of at some later point where the results are first looped over.
    results = list(parse_blat(out_file, "blat-psl"))
    if not results:
        msg = f"No BLAT query results found in {out_file}"
        raise ValueError(msg)
    return results


def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> list[QueryResult]:
    """Run a BLAT query for a score set's target sequences and parse the results.

    If unable to produce a valid query the first time, then try a query using ``dnax``
    bases.

    :param metadata: object containing scoreset attributes
    :param silent: suppress BLAT command output
    :return: parsed BLAT query results, one ``QueryResult`` per query sequence that
        BLAT reported on
    :raise NotImplementedError: if the score set mixes nucleotide and protein target
        sequences
    :raise AlignmentError: if BLAT subprocess returns error code, or if no usable
        BLAT output could be obtained even after the ``dnax`` retry
    """
    with tempfile.NamedTemporaryFile() as tmp_file:
        query_file = _build_query_file(metadata, Path(tmp_file.name))
        target_sequence_type = _get_target_sequence_type(metadata)
        if target_sequence_type == TargetSequenceType.PROTEIN:
            target_args = "-q=prot -t=dnax"
        elif target_sequence_type == TargetSequenceType.DNA:
            target_args = ""
        else:
            # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
            msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
            raise NotImplementedError(msg)

        # The retry below stays inside the ``with`` block so the query file still
        # exists when BLAT is invoked a second time.
        try:
            output = _run_and_parse_blat(target_args, query_file, silent)
        # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied?
        except ValueError:
            try:
                output = _run_and_parse_blat("-q=dnax -t=dnax", query_file, silent)
            except ValueError as e:
                msg = f"Unable to run successful BLAT on {metadata.urn}"
                raise AlignmentError(msg) from e

    return output
179212
180213
181- def _get_best_hit (output : QueryResult , urn : str , chromosome : str | None ) -> Hit :
214+ def _get_best_hit (output : QueryResult , chromosome : str | None ) -> Hit :
182215 """Get best hit from BLAT output.
183216
184217 First, try to return hit corresponding to expected chromosome taken from scoreset
185218 metadata. If chromosome doesn't match any of the outputs or is unavailable, take
186219 the hit with the single highest-scoring HSP.
187220
188221 :param output: BLAT output
189- :param urn: scoreset URN to use in error messages
190222 :param chromosome: refseq chromosome ID, e.g. ``"NC_000001.11"``
191223 :return: best Hit
192224 :raise AlignmentError: if unable to get hits from output
@@ -207,8 +239,8 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
207239 hit_chrs = [h .id for h in output ]
208240 # TODO should this be an error rather than a warning? it seems like a problem if we can't find a hit on the expected chromosome
209241 _logger .warning (
210- "Failed to match hit chromosomes during alignment. URN: %s, expected chromosome: %s, hit chromosomes: %s" ,
211- urn ,
242+ "Failed to match hit chromosomes during alignment for target %s. Expected chromosome: %s, hit chromosomes: %s" ,
243+ output . id ,
212244 chromosome ,
213245 hit_chrs ,
214246 )
@@ -222,21 +254,20 @@ def _get_best_hit(output: QueryResult, urn: str, chromosome: str | None) -> Hit:
222254 best_score_hit = hit
223255
224256 if best_score_hit is None :
225- msg = f"Couldn't get BLAT hits from { urn } "
257+ msg = f"Couldn't get BLAT hits for target { output . id } . "
226258 raise AlignmentError (msg )
227259
228260 return best_score_hit
229261
230262
231- def _get_best_hsp (hit : Hit , urn : str , gene_location : GeneLocation | None ) -> HSP :
263+ def _get_best_hsp (hit : Hit , gene_location : GeneLocation | None ) -> HSP :
232264 """Retrieve preferred HSP from BLAT Hit object.
233265
234266 If gene location data is available, prefer the HSP with the least distance
235267 between the start of the hit and the start coordinate of the gene. Otherwise,
236268 take the HSP with the highest score value.
237269
238270 :param hit: hit object from BLAT result
239- :param urn: scoreset identifier for use in error messages
240271 :param gene_location: location data acquired by normalizing scoreset metadata
241272 :return: Preferred HSP object
242273 :raise AlignmentError: if hit object appears to be empty (should be impossible)
@@ -252,17 +283,17 @@ def _get_best_hsp(hit: Hit, urn: str, gene_location: GeneLocation | None) -> HSP
252283 return best_hsp
253284
254285
255- def _get_best_match (output : QueryResult , metadata : ScoresetMetadata ) -> AlignmentResult :
286+ def _get_best_match (output : QueryResult , target_gene : TargetGene ) -> AlignmentResult :
256287 """Obtain best high-scoring pairs (HSP) object for query sequence.
257288
258289 :param metadata: scoreset metadata
259290 :param output: BLAT result object
260291 :return: alignment result ??
261292 """
262- location = get_gene_location (metadata )
293+ location = get_gene_location (target_gene )
263294 chromosome = location .chromosome if location else None
264- best_hit = _get_best_hit (output , metadata . urn , chromosome )
265- best_hsp = _get_best_hsp (best_hit , metadata . urn , location )
295+ best_hit = _get_best_hit (output , chromosome )
296+ best_hsp = _get_best_hsp (best_hit , location )
266297
267298 strand = Strand .POSITIVE if best_hsp [0 ].query_strand == 1 else Strand .NEGATIVE
268299 coverage = 100 * (best_hsp .query_end - best_hsp .query_start ) / output .seq_len
@@ -291,12 +322,19 @@ def _get_best_match(output: QueryResult, metadata: ScoresetMetadata) -> Alignmen
291322 )
292323
293324
294- def align (scoreset_metadata : ScoresetMetadata , silent : bool = True ) -> AlignmentResult :
def align(
    scoreset_metadata: ScoresetMetadata, silent: bool = True
) -> dict[str, AlignmentResult]:
    """Align each target sequence of a score set to a reference genome.

    Only targets that appear in the BLAT output receive an entry in the result.

    :param scoreset_metadata: object containing scoreset metadata
    :param silent: suppress BLAT process output if true
    :return: dictionary where keys are target gene identifiers and values are alignment result objects
    """
    # Each BLAT result's ``id`` is used both as the output key and as the lookup key
    # into the score set's target genes.
    return {
        query_result.id: _get_best_match(
            query_result, scoreset_metadata.target_genes[query_result.id]
        )
        for query_result in _get_blat_output(scoreset_metadata, silent)
    }
0 commit comments