Skip to content

Commit 9a03bda

Browse files
committed
feat!: add chromosome lookup to sequence reference utils (#67)
I just realized that you need to also know the chromosome name for your refget accession ID to do liftover, so I just moved stuff around in that module so both assembly and chromosome are returned.
1 parent 1a22487 commit 9a03bda

File tree

7 files changed

+255
-204
lines changed

7 files changed

+255
-204
lines changed

src/agct/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
"""Provide fast liftover in Python via the ``chainfile`` crate."""
22

3-
from agct.assembly_registry import Assembly, get_assembly_from_refget_id
43
from agct.converter import Converter, LiftoverResult, Strand, get_converter
4+
from agct.seqref_registry import (
5+
Assembly,
6+
get_refget_id_from_seqinfo,
7+
get_seqinfo_from_refget_id,
8+
)
59

610
__all__ = [
711
"Assembly",
812
"Converter",
913
"LiftoverResult",
1014
"Strand",
11-
"get_assembly_from_refget_id",
1215
"get_converter",
16+
"get_refget_id_from_seqinfo",
17+
"get_seqinfo_from_refget_id",
1318
]

src/agct/assembly_registry.py

Lines changed: 0 additions & 160 deletions
This file was deleted.

src/agct/converter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from wags_tails.utils.storage import get_data_dir
1313

1414
import agct._core as _core
15-
from agct.assembly_registry import Assembly
15+
from agct.seqref_registry import Assembly
1616

1717
_logger = logging.getLogger(__name__)
1818

@@ -73,7 +73,7 @@ def __init__(
7373
try:
7474
file_prefix = f"chainfile_{from_assembly.value}_to_{to_assembly.value}"
7575
except AttributeError as e:
76-
msg = f"Assembly args must be instance of `agct.assembly_registry.Genome`, instead got from_assembly={from_assembly} and to_assembly={to_assembly}"
76+
msg = f"Assembly args must be instance of `agct.seqref_registry.Genome`, instead got from_assembly={from_assembly} and to_assembly={to_assembly}"
7777
_logger.exception(msg)
7878
raise ValueError(msg) from e
7979

src/agct/seqref_registry.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""Sequence reference registry.
2+
3+
Maps refget accessions (``SQ.*``) to a tuple of (:class:`Assembly`, :class:`Chromosome`)
4+
and exposes helpers to look up the assembly/chromosome for a given ID. The registry is
5+
curated for internally-used builds (currently ``hg19``/``hg38``); extend as needed.
6+
"""
7+
8+
import logging
9+
import re
10+
from enum import StrEnum
11+
12+
_logger = logging.getLogger(__name__)
13+
14+
15+
class Assembly(StrEnum):
16+
"""Constrain reference genome assembly values.
17+
18+
We could conceivably support every UCSC chainfile offering, but for now, we'll
19+
stick with internal use cases only.
20+
"""
21+
22+
HG38 = "hg38"
23+
HG19 = "hg19"
24+
25+
26+
class Chromosome(StrEnum):
27+
"""Constrain chromosome values to UCSC-style names.
28+
29+
This class should NOT be used to type-constrain input in the converter
30+
module, because in practice, chainfiles can use any name for an accession. In practice,
31+
though, we're mostly interested in UCSC chainfiles, so this class is provided as a
32+
utility for likely-relevant chromosome names.
33+
"""
34+
35+
CHR1 = "chr1"
36+
CHR2 = "chr2"
37+
CHR3 = "chr3"
38+
CHR4 = "chr4"
39+
CHR5 = "chr5"
40+
CHR6 = "chr6"
41+
CHR7 = "chr7"
42+
CHR8 = "chr8"
43+
CHR9 = "chr9"
44+
CHR10 = "chr10"
45+
CHR11 = "chr11"
46+
CHR12 = "chr12"
47+
CHR13 = "chr13"
48+
CHR14 = "chr14"
49+
CHR15 = "chr15"
50+
CHR16 = "chr16"
51+
CHR17 = "chr17"
52+
CHR18 = "chr18"
53+
CHR19 = "chr19"
54+
CHR20 = "chr20"
55+
CHR21 = "chr21"
56+
CHR22 = "chr22"
57+
CHRX = "chrX"
58+
CHRY = "chrY"
59+
60+
61+
REFGET_ID_INFO = {
62+
"SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO": (Assembly.HG38, Chromosome.CHR1),
63+
"SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g": (Assembly.HG38, Chromosome.CHR2),
64+
"SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX": (Assembly.HG38, Chromosome.CHR3),
65+
"SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc": (Assembly.HG38, Chromosome.CHR4),
66+
"SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI": (Assembly.HG38, Chromosome.CHR5),
67+
"SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV": (Assembly.HG38, Chromosome.CHR6),
68+
"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul": (Assembly.HG38, Chromosome.CHR7),
69+
"SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs": (Assembly.HG38, Chromosome.CHR8),
70+
"SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI": (Assembly.HG38, Chromosome.CHR9),
71+
"SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB": (Assembly.HG38, Chromosome.CHR10),
72+
"SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1": (Assembly.HG38, Chromosome.CHR11),
73+
"SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl": (Assembly.HG38, Chromosome.CHR12),
74+
"SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT": (Assembly.HG38, Chromosome.CHR13),
75+
"SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm": (Assembly.HG38, Chromosome.CHR14),
76+
"SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6": (Assembly.HG38, Chromosome.CHR15),
77+
"SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0": (Assembly.HG38, Chromosome.CHR16),
78+
"SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7": (Assembly.HG38, Chromosome.CHR17),
79+
"SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV": (Assembly.HG38, Chromosome.CHR18),
80+
"SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl": (Assembly.HG38, Chromosome.CHR19),
81+
"SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo": (Assembly.HG38, Chromosome.CHR20),
82+
"SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8": (Assembly.HG38, Chromosome.CHR21),
83+
"SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ": (Assembly.HG38, Chromosome.CHR22),
84+
"SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP": (Assembly.HG38, Chromosome.CHRX),
85+
"SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5": (Assembly.HG38, Chromosome.CHRY),
86+
"SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU": (Assembly.HG19, Chromosome.CHR1),
87+
"SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO": (Assembly.HG19, Chromosome.CHR2),
88+
"SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS": (Assembly.HG19, Chromosome.CHR3),
89+
"SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V": (Assembly.HG19, Chromosome.CHR4),
90+
"SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX": (Assembly.HG19, Chromosome.CHR5),
91+
"SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek": (Assembly.HG19, Chromosome.CHR6),
92+
"SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86": (Assembly.HG19, Chromosome.CHR7),
93+
"SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6": (Assembly.HG19, Chromosome.CHR8),
94+
"SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt": (Assembly.HG19, Chromosome.CHR9),
95+
"SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX": (Assembly.HG19, Chromosome.CHR10),
96+
"SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG": (Assembly.HG19, Chromosome.CHR11),
97+
"SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH": (Assembly.HG19, Chromosome.CHR12),
98+
"SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH": (Assembly.HG19, Chromosome.CHR13),
99+
"SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf": (Assembly.HG19, Chromosome.CHR14),
100+
"SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt": (Assembly.HG19, Chromosome.CHR15),
101+
"SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb": (Assembly.HG19, Chromosome.CHR16),
102+
"SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz": (Assembly.HG19, Chromosome.CHR17),
103+
"SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD": (Assembly.HG19, Chromosome.CHR18),
104+
"SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX": (Assembly.HG19, Chromosome.CHR19),
105+
"SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf": (Assembly.HG19, Chromosome.CHR20),
106+
"SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg": (Assembly.HG19, Chromosome.CHR21),
107+
"SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8": (Assembly.HG19, Chromosome.CHR22),
108+
"SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm": (Assembly.HG19, Chromosome.CHRX),
109+
"SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-": (Assembly.HG19, Chromosome.CHRY),
110+
}
111+
112+
REFGET_ID_LOOKUP = {v: k for k, v in REFGET_ID_INFO.items()}
113+
114+
115+
_REFGET_AC_PATTERN = re.compile(r"^SQ\.[0-9A-Za-z_\\-]{32}$")
116+
117+
118+
def get_seqinfo_from_refget_id(
119+
refget_accession: str,
120+
) -> tuple[Assembly, Chromosome] | None:
121+
"""Given a GA4GH SequenceReference refget accession ID, get back its reference genome and chromosome name
122+
123+
.. code-block:: pycon
124+
125+
>>> from agct.assembly_registry import get_assembly_from_refget_id
126+
>>> get_assembly_from_refget_id("SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g")
127+
(<Assembly.HG38: 'hg38'>, <Chromosome.CHR2: 'chr2'>)
128+
129+
Use for acquiring a converter instance and calling liftover on a referenced GA4GH
130+
variation object.
131+
132+
:param refget_accession: sequence reference (must start with `"SQ."`)
133+
:return: a reference assembly and chromosome, if successful
134+
:raise ValueError: if input appears to be in an invalid format for a refget accession ID
135+
"""
136+
if not re.match(_REFGET_AC_PATTERN, refget_accession):
137+
msg = f"refget accession ID must be in format 'SQ.ABCDEFGHIJKLMNOPQRSTUVWXYZ123456'; got {refget_accession}"
138+
_logger.error(msg)
139+
raise ValueError(msg)
140+
return REFGET_ID_INFO.get(refget_accession)
141+
142+
143+
def get_refget_id_from_seqinfo(
144+
assembly: Assembly, chromosome: Chromosome
145+
) -> str | None:
146+
"""Given an assembly/chromosome pairing, get a refget accession ID, if known
147+
148+
:param assembly: reference assembly for sequence
149+
:param chromosome: chromosome name
150+
:return: a refget sequence accession ID, if known
151+
"""
152+
return REFGET_ID_LOOKUP.get((assembly, chromosome))

tests/test_assembly_registry.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

0 commit comments

Comments
 (0)