|
| 1 | +"""Sequence reference registry. |
| 2 | +
|
| 3 | +Maps refget accessions (``SQ.*``) to a tuple of (:class:`Assembly`, :class:`Chromosome`) |
| 4 | +and exposes helpers to look up the assembly/chromosome for a given ID. The registry is |
| 5 | +curated for internally-used builds (currently ``hg19``/``hg38``); extend as needed. |
| 6 | +""" |
| 7 | + |
| 8 | +import logging |
| 9 | +import re |
| 10 | +from enum import StrEnum |
| 11 | + |
| 12 | +_logger = logging.getLogger(__name__) |
| 13 | + |
| 14 | + |
| 15 | +class Assembly(StrEnum): |
| 16 | + """Constrain reference genome assembly values. |
| 17 | +
|
| 18 | + We could conceivably support every UCSC chainfile offering, but for now, we'll |
| 19 | + stick with internal use cases only. |
| 20 | + """ |
| 21 | + |
| 22 | + HG38 = "hg38" |
| 23 | + HG19 = "hg19" |
| 24 | + |
| 25 | + |
| 26 | +class Chromosome(StrEnum): |
| 27 | + """Constrain chromosome values to UCSC-style names. |
| 28 | +
|
| 29 | + This class should NOT be used to type-constrain input in the converter |
| 30 | + module, because in practice, chainfiles can use any name for an accession. In practice, |
| 31 | + though, we're mostly interested in UCSC chainfiles, so this class is provided as a |
| 32 | + utility for likely-relevant chromosome names. |
| 33 | + """ |
| 34 | + |
| 35 | + CHR1 = "chr1" |
| 36 | + CHR2 = "chr2" |
| 37 | + CHR3 = "chr3" |
| 38 | + CHR4 = "chr4" |
| 39 | + CHR5 = "chr5" |
| 40 | + CHR6 = "chr6" |
| 41 | + CHR7 = "chr7" |
| 42 | + CHR8 = "chr8" |
| 43 | + CHR9 = "chr9" |
| 44 | + CHR10 = "chr10" |
| 45 | + CHR11 = "chr11" |
| 46 | + CHR12 = "chr12" |
| 47 | + CHR13 = "chr13" |
| 48 | + CHR14 = "chr14" |
| 49 | + CHR15 = "chr15" |
| 50 | + CHR16 = "chr16" |
| 51 | + CHR17 = "chr17" |
| 52 | + CHR18 = "chr18" |
| 53 | + CHR19 = "chr19" |
| 54 | + CHR20 = "chr20" |
| 55 | + CHR21 = "chr21" |
| 56 | + CHR22 = "chr22" |
| 57 | + CHRX = "chrX" |
| 58 | + CHRY = "chrY" |
| 59 | + |
| 60 | + |
| 61 | +REFGET_ID_INFO = { |
| 62 | + "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO": (Assembly.HG38, Chromosome.CHR1), |
| 63 | + "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g": (Assembly.HG38, Chromosome.CHR2), |
| 64 | + "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX": (Assembly.HG38, Chromosome.CHR3), |
| 65 | + "SQ.HxuclGHh0XCDuF8x6yQrpHUBL7ZntAHc": (Assembly.HG38, Chromosome.CHR4), |
| 66 | + "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI": (Assembly.HG38, Chromosome.CHR5), |
| 67 | + "SQ.0iKlIQk2oZLoeOG9P1riRU6hvL5Ux8TV": (Assembly.HG38, Chromosome.CHR6), |
| 68 | + "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul": (Assembly.HG38, Chromosome.CHR7), |
| 69 | + "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs": (Assembly.HG38, Chromosome.CHR8), |
| 70 | + "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI": (Assembly.HG38, Chromosome.CHR9), |
| 71 | + "SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB": (Assembly.HG38, Chromosome.CHR10), |
| 72 | + "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1": (Assembly.HG38, Chromosome.CHR11), |
| 73 | + "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl": (Assembly.HG38, Chromosome.CHR12), |
| 74 | + "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT": (Assembly.HG38, Chromosome.CHR13), |
| 75 | + "SQ.eK4D2MosgK_ivBkgi6FVPg5UXs1bYESm": (Assembly.HG38, Chromosome.CHR14), |
| 76 | + "SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6": (Assembly.HG38, Chromosome.CHR15), |
| 77 | + "SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0": (Assembly.HG38, Chromosome.CHR16), |
| 78 | + "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7": (Assembly.HG38, Chromosome.CHR17), |
| 79 | + "SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV": (Assembly.HG38, Chromosome.CHR18), |
| 80 | + "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl": (Assembly.HG38, Chromosome.CHR19), |
| 81 | + "SQ.-A1QmD_MatoqxvgVxBLZTONHz9-c7nQo": (Assembly.HG38, Chromosome.CHR20), |
| 82 | + "SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8": (Assembly.HG38, Chromosome.CHR21), |
| 83 | + "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ": (Assembly.HG38, Chromosome.CHR22), |
| 84 | + "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP": (Assembly.HG38, Chromosome.CHRX), |
| 85 | + "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5": (Assembly.HG38, Chromosome.CHRY), |
| 86 | + "SQ.S_KjnFVz-FE7M0W6yoaUDgYxLPc1jyWU": (Assembly.HG19, Chromosome.CHR1), |
| 87 | + "SQ.9KdcA9ZpY1Cpvxvg8bMSLYDUpsX6GDLO": (Assembly.HG19, Chromosome.CHR2), |
| 88 | + "SQ.VNBualIltAyi2AI_uXcKU7M9XUOuA7MS": (Assembly.HG19, Chromosome.CHR3), |
| 89 | + "SQ.iy7Zfceb5_VGtTQzJ-v5JpPbpeifHD_V": (Assembly.HG19, Chromosome.CHR4), |
| 90 | + "SQ.vbjOdMfHJvTjK_nqvFvpaSKhZillW0SX": (Assembly.HG19, Chromosome.CHR5), |
| 91 | + "SQ.KqaUhJMW3CDjhoVtBetdEKT1n6hM-7Ek": (Assembly.HG19, Chromosome.CHR6), |
| 92 | + "SQ.IW78mgV5Cqf6M24hy52hPjyyo5tCCd86": (Assembly.HG19, Chromosome.CHR7), |
| 93 | + "SQ.tTm7wmhz0G4lpt8wPspcNkAD_qiminj6": (Assembly.HG19, Chromosome.CHR8), |
| 94 | + "SQ.HBckYGQ4wYG9APHLpjoQ9UUe9v7NxExt": (Assembly.HG19, Chromosome.CHR9), |
| 95 | + "SQ.-BOZ8Esn8J88qDwNiSEwUr5425UXdiGX": (Assembly.HG19, Chromosome.CHR10), |
| 96 | + "SQ.XXi2_O1ly-CCOi3HP5TypAw7LtC6niFG": (Assembly.HG19, Chromosome.CHR11), |
| 97 | + "SQ.105bBysLoDFQHhajooTAUyUkNiZ8LJEH": (Assembly.HG19, Chromosome.CHR12), |
| 98 | + "SQ.Ewb9qlgTqN6e_XQiRVYpoUfZJHXeiUfH": (Assembly.HG19, Chromosome.CHR13), |
| 99 | + "SQ.5Ji6FGEKfejK1U6BMScqrdKJK8GqmIGf": (Assembly.HG19, Chromosome.CHR14), |
| 100 | + "SQ.zIMZb3Ft7RdWa5XYq0PxIlezLY2ccCgt": (Assembly.HG19, Chromosome.CHR15), |
| 101 | + "SQ.W6wLoIFOn4G7cjopxPxYNk2lcEqhLQFb": (Assembly.HG19, Chromosome.CHR16), |
| 102 | + "SQ.AjWXsI7AkTK35XW9pgd3UbjpC3MAevlz": (Assembly.HG19, Chromosome.CHR17), |
| 103 | + "SQ.BTj4BDaaHYoPhD3oY2GdwC_l0uqZ92UD": (Assembly.HG19, Chromosome.CHR18), |
| 104 | + "SQ.ItRDD47aMoioDCNW_occY5fWKZBKlxCX": (Assembly.HG19, Chromosome.CHR19), |
| 105 | + "SQ.iy_UbUrvECxFRX5LPTH_KPojdlT7BKsf": (Assembly.HG19, Chromosome.CHR20), |
| 106 | + "SQ.LpTaNW-hwuY_yARP0rtarCnpCQLkgVCg": (Assembly.HG19, Chromosome.CHR21), |
| 107 | + "SQ.XOgHwwR3Upfp5sZYk6ZKzvV25a4RBVu8": (Assembly.HG19, Chromosome.CHR22), |
| 108 | + "SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm": (Assembly.HG19, Chromosome.CHRX), |
| 109 | + "SQ.BT7QyW5iXaX_1PSX-msSGYsqRdMKqkj-": (Assembly.HG19, Chromosome.CHRY), |
| 110 | +} |
| 111 | + |
| 112 | +REFGET_ID_LOOKUP = {v: k for k, v in REFGET_ID_INFO.items()} |
| 113 | + |
| 114 | + |
| 115 | +_REFGET_AC_PATTERN = re.compile(r"^SQ\.[0-9A-Za-z_\\-]{32}$") |
| 116 | + |
| 117 | + |
| 118 | +def get_seqinfo_from_refget_id( |
| 119 | + refget_accession: str, |
| 120 | +) -> tuple[Assembly, Chromosome] | None: |
| 121 | + """Given a GA4GH SequenceReference refget accession ID, get back its reference genome and chromosome name |
| 122 | +
|
| 123 | + .. code-block:: pycon |
| 124 | +
|
| 125 | + >>> from agct.assembly_registry import get_assembly_from_refget_id |
| 126 | + >>> get_assembly_from_refget_id("SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g") |
| 127 | + (<Assembly.HG38: 'hg38'>, <Chromosome.CHR2: 'chr2'>) |
| 128 | +
|
| 129 | + Use for acquiring a converter instance and calling liftover on a referenced GA4GH |
| 130 | + variation object. |
| 131 | +
|
| 132 | + :param refget_accession: sequence reference (must start with `"SQ."`) |
| 133 | + :return: a reference assembly and chromosome, if successful |
| 134 | + :raise ValueError: if input appears to be in an invalid format for a refget accession ID |
| 135 | + """ |
| 136 | + if not re.match(_REFGET_AC_PATTERN, refget_accession): |
| 137 | + msg = f"refget accession ID must be in format 'SQ.ABCDEFGHIJKLMNOPQRSTUVWXYZ123456'; got {refget_accession}" |
| 138 | + _logger.error(msg) |
| 139 | + raise ValueError(msg) |
| 140 | + return REFGET_ID_INFO.get(refget_accession) |
| 141 | + |
| 142 | + |
| 143 | +def get_refget_id_from_seqinfo( |
| 144 | + assembly: Assembly, chromosome: Chromosome |
| 145 | +) -> str | None: |
| 146 | + """Given an assembly/chromosome pairing, get a refget accession ID, if known |
| 147 | +
|
| 148 | + :param assembly: reference assembly for sequence |
| 149 | + :param chromosome: chromosome name |
| 150 | + :return: a refget sequence accession ID, if known |
| 151 | + """ |
| 152 | + return REFGET_ID_LOOKUP.get((assembly, chromosome)) |
0 commit comments