|
14 | 14 | from cool_seq_tool.schemas import Strand |
15 | 15 |
|
16 | 16 | from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location |
17 | | -from dcd_mapping.mavedb_data import ( |
18 | | - LOCAL_STORE_PATH, |
19 | | -) |
| 17 | +from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError |
20 | 18 | from dcd_mapping.resource_utils import ( |
21 | 19 | ResourceAcquisitionError, |
22 | 20 | http_download, |
@@ -180,36 +178,35 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> Any: # noqa: |
180 | 178 | :return: dict where keys are target gene identifiers and values are BLAT query result objects |
181 | 179 | :raise AlignmentError: if BLAT subprocess returns error code |
182 | 180 | """ |
183 | | - return parse_blat(f"{metadata.urn}_blat.psl", "blat-psl") |
184 | | - # with tempfile.NamedTemporaryFile() as tmp_file: |
185 | | - # query_file = _build_query_file(metadata, Path(tmp_file.name)) |
186 | | - # target_sequence_type = _get_target_sequence_type(metadata) |
187 | | - # if target_sequence_type == TargetSequenceType.PROTEIN: |
188 | | - # target_args = "-q=prot -t=dnax" |
189 | | - # elif target_sequence_type == TargetSequenceType.DNA: |
190 | | - # target_args = "" |
191 | | - # else: |
192 | | - # # TODO implement support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. |
193 | | - # msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." |
194 | | - # raise NotImplementedError(msg) |
195 | | - # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) |
196 | | - # out_file = _write_blat_output_tempfile(process_result) |
197 | | - |
198 | | - # try: |
199 | | - # output = parse_blat(out_file, "blat-psl") |
200 | | - |
201 | | - # # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? |
202 | | - # except ValueError: |
203 | | - # target_args = "-q=dnax -t=dnax" |
204 | | - # process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) |
205 | | - # out_file = _write_blat_output_tempfile(process_result) |
206 | | - # try: |
207 | | - # output = parse_blat(out_file, "blat-psl") |
208 | | - # except ValueError as e: |
209 | | - # msg = f"Unable to run successful BLAT on {metadata.urn}" |
210 | | - # raise AlignmentError(msg) from e |
211 | | - |
212 | | - # return output |
| 181 | + with tempfile.NamedTemporaryFile() as tmp_file: |
| 182 | + query_file = _build_query_file(metadata, Path(tmp_file.name)) |
| 183 | + target_sequence_type = _get_target_sequence_type(metadata) |
| 184 | + if target_sequence_type == TargetSequenceType.PROTEIN: |
| 185 | + target_args = "-q=prot -t=dnax" |
| 186 | + elif target_sequence_type == TargetSequenceType.DNA: |
| 187 | + target_args = "" |
| 188 | + else: |
| 189 | + # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. |
| 190 | + msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." |
| 191 | + raise NotImplementedError(msg) |
| 192 | + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) |
| 193 | + out_file = _write_blat_output_tempfile(process_result) |
| 194 | + |
| 195 | + try: |
| 196 | + output = parse_blat(out_file, "blat-psl") |
| 197 | + |
| 198 | + # TODO reevaluate this code block - are there cases in mavedb where target sequence type is incorrectly supplied? |
| 199 | + except ValueError: |
| 200 | + target_args = "-q=dnax -t=dnax" |
| 201 | + process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) |
| 202 | + out_file = _write_blat_output_tempfile(process_result) |
| 203 | + try: |
| 204 | + output = parse_blat(out_file, "blat-psl") |
| 205 | + except ValueError as e: |
| 206 | + msg = f"Unable to run successful BLAT on {metadata.urn}" |
| 207 | + raise AlignmentError(msg) from e |
| 208 | + |
| 209 | + return output |
213 | 210 |
|
214 | 211 |
|
215 | 212 | def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: |
@@ -342,3 +339,106 @@ def align( |
342 | 339 | target_gene = scoreset_metadata.target_genes[target_label] |
343 | 340 | alignment_results[target_label] = _get_best_match(blat_result, target_gene) |
344 | 341 | return alignment_results |
| 342 | + |
| 343 | + |
def fetch_alignment(
    metadata: ScoresetMetadata, silent: bool
) -> dict[str, AlignmentResult | None]:
    """Fetch transcript-to-genome alignments from cdot for accession-based target genes.

    Protein (``NP``/``ENSP``) and contig/chromosome (``NC_``) accessions do not need
    to be aligned to the genome, so their entry in the result is ``None``. Every other
    accession is looked up at ``https://cdot.cc/transcript/<accession>`` and the
    response is converted with :func:`parse_cdot_mapping`.

    :param metadata: score set metadata; every target gene is expected to carry a
        ``target_accession_id``
    :param silent: passed through to :func:`parse_cdot_mapping`
    :return: dict keyed by target accession id (not by target gene label), with the
        alignment result for that accession or ``None`` when no alignment is needed
    :raise ResourceAcquisitionError: if the cdot request fails, whether through an
        HTTP error status or through a connection problem or timeout
    """
    alignment_results: dict[str, AlignmentResult | None] = {}
    for target_gene in metadata.target_genes:
        accession_id = metadata.target_genes[target_gene].target_accession_id
        # protein and contig/chromosome accession ids do not need to be aligned to the genome
        if accession_id.startswith(("NP", "ENSP", "NC_")):
            alignment_results[accession_id] = None
            continue

        url = f"https://cdot.cc/transcript/{accession_id}"
        try:
            # The request itself sits inside the try block so that connection
            # failures and timeouts are reported the same way as HTTP errors,
            # rather than escaping as raw ``requests`` exceptions.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.HTTPError as e:
            msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
            _logger.error(msg)
            raise ResourceAcquisitionError(msg) from e
        except requests.RequestException as e:
            msg = f"Unable to complete request to {url} for scoreset {metadata.urn}"
            _logger.error(msg)
            raise ResourceAcquisitionError(msg) from e

        alignment_results[accession_id] = parse_cdot_mapping(response.json(), silent)
    return alignment_results
| 367 | + |
| 368 | + |
def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
    """Convert a cdot transcript record into an ``AlignmentResult``.

    The GRCh38 build is used when the record provides one; otherwise GRCh37 is used.

    Coordinate handling: BLAT PSL output and ``AlignmentResult`` use 0-based,
    start-inclusive, end-exclusive ranges. This function treats cdot exon coordinates
    as 1-based and inclusive at both ends. Moving from 1-based to 0-based subtracts 1
    from both start and end, and moving from an inclusive to an exclusive end adds 1
    back to the end. The net effect is to subtract 1 from the start and leave the end
    untouched.

    :param cdot_mapping: JSON-decoded transcript record returned by cdot
    :param silent: not used by this function; kept so the signature stays stable
    :return: alignment of the transcript (query) against the genome (hit)
    :raise AlignmentError: if the record has neither a GRCh38 nor a GRCh37 mapping,
        or if the selected mapping lists no exons
    """
    transcript_id = cdot_mapping.get("id")

    # ``or {}`` also covers a record where "genome_builds" is present but null.
    genome_builds = cdot_mapping.get("genome_builds") or {}
    mapping = genome_builds.get("GRCh38") or genome_builds.get("GRCh37")
    # A truthiness check (not ``is None``) so that an empty build entry is reported
    # as a missing mapping instead of failing later with a KeyError.
    if not mapping:
        msg = f"Cdot transcript results for transcript {transcript_id} do not include GRCh37 or GRCh38 mapping"
        raise AlignmentError(msg)

    exons = mapping.get("exons")
    if not exons:
        # Without exons there is nothing to index into when building the ranges below.
        msg = f"Cdot transcript results for transcript {transcript_id} do not include any exons"
        raise AlignmentError(msg)

    chrom = mapping["contig"]
    strand = Strand.POSITIVE if mapping["strand"] == "+" else Strand.NEGATIVE

    # Exon layout as indexed here: [0]/[1] are the genomic start/end and [3]/[4] are
    # the transcript start/end.
    # NOTE(review): confirm the 1-based assumption for both coordinate pairs against
    # the cdot data format.
    query_subranges = [SequenceRange(start=exon[3] - 1, end=exon[4]) for exon in exons]
    hit_subranges = [SequenceRange(start=exon[0] - 1, end=exon[1]) for exon in exons]

    # The overall ranges run from the first exon's start to the last exon's end on the
    # positive strand, and from the last exon's start to the first exon's end otherwise.
    # NOTE(review): verify that the negative-strand choice matches the order in which
    # cdot lists exons, for both the query and the hit coordinates.
    if strand == Strand.POSITIVE:
        start_idx, end_idx = 0, -1
    else:
        start_idx, end_idx = -1, 0

    query_range = SequenceRange(
        start=query_subranges[start_idx].start, end=query_subranges[end_idx].end
    )
    hit_range = SequenceRange(
        start=hit_subranges[start_idx].start, end=hit_subranges[end_idx].end
    )

    return AlignmentResult(
        chrom=chrom,
        strand=strand,
        query_range=query_range,
        query_subranges=query_subranges,
        hit_range=hit_range,
        hit_subranges=hit_subranges,
    )
| 415 | + |
| 416 | + |
def build_alignment_result(
    metadata: ScoresetMetadata, silent: bool
) -> dict[str, AlignmentResult | None]:
    """Produce alignment results for a score set, by BLAT or by cdot lookup.

    A score set must consist entirely of accession-based target genes or entirely of
    sequence-based target genes. It is most efficient to run BLAT over all targets
    together, so alignment operates on the whole score set rather than per target
    gene. Mixed score sets could be supported later if the need arises.

    :param metadata: score set metadata whose target genes are inspected
    :param silent: passed through to the chosen alignment routine
    :return: alignment results from ``align`` when the targets are sequence-based,
        otherwise from ``fetch_alignment``
    :raise ScoresetNotSupportedError: if the score set mixes accession-based and
        sequence-based target genes
    """
    # Classify each target gene, and stop as soon as two of them disagree.
    detected_type = None
    for label in metadata.target_genes:
        has_accession = metadata.target_genes[label].target_accession_id
        current_type = "accession" if has_accession else "sequence"
        if detected_type is not None and detected_type != current_type:
            msg = "Score set contains both accession-based and sequence-based target genes. This is not currently supported."
            raise ScoresetNotSupportedError(msg)
        detected_type = current_type

    if detected_type == "sequence":
        return align(metadata, silent)
    # Accession-based score sets, and score sets with no target genes, go to cdot.
    return fetch_alignment(metadata, silent)
0 commit comments