diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a4b042..471c631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,52 @@ + +# [1.0.6 Validation of allele specific MAC codes](https://github.com/nmdp-bioinformatics/py-ard/releases/tag/1.0.6) - 30 Oct 2023 + +- Use allele specific antigen code rules when validating MACs that cross antigen groups, similar to the [MAC Service](https://hml.nmdp.org/macui/) +- Returns the original `InvalidAlleleError` instead of wrapping it in `InvalidTypingError` when an allele is not valid. + +[Changes][1.0.6] + + + +# [1.0.5 Non strict mode](https://github.com/nmdp-bioinformatics/py-ard/releases/tag/1.0.5) - 04 Oct 2023 + +Supports a non-strict mode that makes alleles valid by adding expression characters to otherwise invalid alleles. + +## Use non `strict` mode in config to reduce alleles that may be valid with expression characters. + +```python +>>> my_configs = {'strict': False, 'verbose_log': True} +>>> import pyard +>>> ard = pyard.init(config=my_configs, load_mac=False) + +>>> ard.redux('A*24:329', 'lgx') +A*24:329 is not valid. Using A*24:329Q +'A*24:329Q' + +>>> ard.redux('DQB1*03:276', 'lgx') +DQB1*03:276 is not valid. Using DQB1*03:276N +'DQB1*03:01' +``` + +## Add non-strict and verbose modes to pyard CLI. + +```bash +❯ pyard --gl "DQB1*03:276" -r lgx +Typing Error: DQB1*03:276 is not valid GL String. + DQB1*03:276 is not a valid Allele + +❯ pyard --non-strict --gl "DQB1*03:276" -r lgx +DQB1*03:01 + +❯ pyard --non-strict --verbose --gl "DQB1*03:276" -r lgx +DQB1*03:276 is not valid. 
Using DQB1*03:276N +DQB1*03:01 +``` + + +[Changes][1.0.5] + + # [Fixes when used without login user (1.0.4)](https://github.com/nmdp-bioinformatics/py-ard/releases/tag/1.0.4) - 19 Sep 2023 @@ -563,6 +612,8 @@ yes [Changes][0.0.14] +[1.0.6]: https://github.com/nmdp-bioinformatics/py-ard/compare/1.0.5...1.0.6 +[1.0.5]: https://github.com/nmdp-bioinformatics/py-ard/compare/1.0.4...1.0.5 [1.0.4]: https://github.com/nmdp-bioinformatics/py-ard/compare/1.0.3...1.0.4 [1.0.3]: https://github.com/nmdp-bioinformatics/py-ard/compare/1.0.2...1.0.3 [1.0.2]: https://github.com/nmdp-bioinformatics/py-ard/compare/1.0.1...1.0.2 diff --git a/Dockerfile b/Dockerfile index ac47d19..8b68907 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL MAINTAINER="Pradeep Bashyal" WORKDIR /app -ARG PY_ARD_VERSION=1.0.5 +ARG PY_ARD_VERSION=1.0.6 COPY requirements.txt /app RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/api-spec.yaml b/api-spec.yaml index 0218e7d..e6eeb34 100644 --- a/api-spec.yaml +++ b/api-spec.yaml @@ -2,7 +2,7 @@ openapi: 3.0.3 info: title: ARD Reduction description: Reduce to ARD Level - version: "1.0.5" + version: "1.0.6" servers: - url: 'http://localhost:8080' tags: diff --git a/pyard/__init__.py b/pyard/__init__.py index ab12b6c..05f96f0 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -27,7 +27,7 @@ from .misc import get_imgt_db_versions as db_versions __author__ = """NMDP Bioinformatics""" -__version__ = "1.0.5" +__version__ = "1.0.6" def init( diff --git a/pyard/ard.py b/pyard/ard.py index fb8d618..6881393 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -25,6 +25,7 @@ import re import sqlite3 import sys +from collections import Counter from typing import Iterable, List from . 
import broad_splits, smart_sort @@ -408,8 +409,8 @@ def redux(self, glstring: str, redux_type: VALID_REDUCTION_TYPES) -> str: ) # Handle MAC - if self._config["reduce_MAC"] and self.is_mac(glstring): - if db.is_valid_mac_code(self.db_connection, code): + if self._config["reduce_MAC"] and code.isalpha(): + if self.is_mac(glstring): # Make sure it's a valid MAC if HLA_regex.search(glstring): # Remove HLA- prefix allele_name = glstring.split("-")[1] @@ -436,12 +437,7 @@ def validate(self, glstring): :param glstring: GL String to validate :return: boolean indicating success """ - try: - return self._is_valid_gl(glstring) - except InvalidAlleleError as e: - raise InvalidTypingError( - f"{glstring} is not valid GL String. \n {e.message}", e - ) from None + return self._is_valid_gl(glstring) def is_XX(self, glstring: str, loc_antigen: str = None, code: str = None) -> bool: if loc_antigen is None or code is None: @@ -484,12 +480,39 @@ def is_mac(self, allele: str) -> bool: :return: True if MAC """ if ":" in allele: - code = allele.split(":")[1] - try: + allele_split = allele.split(":") + if len(allele_split) == 2: # MACs have only single : + locus_antigen, code = allele_split if code.isalpha(): - return db.is_valid_mac_code(self.db_connection, code) - except sqlite3.OperationalError as e: - print("Error: ", e) + try: + alleles = db.mac_code_to_alleles(self.db_connection, code) + if alleles: + if any(map(lambda a: ":" in a, alleles)): + # allele specific antigen codes have ':' in the MAC mapping + # e.g. 
CFWRN -> 15:01/15:98/15:157/15:202/ + # 15:239/15:280/15:340/35:43/35:67/35:79/35:102/35:118/35:185/51:220 + # Extract the antigens from the mapped alleles + antigen_groups = map(lambda a: a.split(":")[0], alleles) + # Rule 1: The 1st field with the most allele designations in the request is + # the 1st field of the allele code designation + # Rule 2: If there is a tie in the number of alleles designations sharing the 1st field, + # the 1st field with the lowest numeric value is selected. + antigen_counts = Counter(antigen_groups) + # Create a table of antigen to its counts + # '15': 7 + # '35': 6 + # '51': 1 + # Valid antigen is the first most common one. + # most_common() keeps first-seen order on ties; as it's presorted in db, also satisfies Rule 2. + valid_antigen = antigen_counts.most_common(1).pop()[0] + # Get antigen value 15 from 'DRB1*15' + provided_antigen = locus_antigen.split("*").pop() + # The MAC is only valid if the given antigen satisfies the antigen matching Rule 1 and 2 + return provided_antigen == valid_antigen + # Valid when antigen group codes + return True + except sqlite3.OperationalError as e: + print("Error: ", e) return False def is_v2(self, allele: str) -> bool: @@ -719,8 +742,8 @@ def expand_mac(self, mac_code: str): :return: GL String of expanded alleles :rtype: str """ - locus_antigen, code = mac_code.split(":") - if db.is_valid_mac_code(self.db_connection, code): + if self.is_mac(mac_code): # Validate MAC first + locus_antigen, code = mac_code.split(":") if HLA_regex.search(mac_code): locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix return "/".join( diff --git a/pyard/db.py b/pyard/db.py index e08b506..9973997 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -172,21 +172,6 @@ def alleles_to_mac_code( return None -def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool: - """ - Check db if the MAC code exists. - - :param connection: db connection of type sqlite.Connection - :param code: MAC code - :return: code is MAC code ? 
- """ - mac_query = "SELECT count(alleles) from mac_codes where code = ?" - cursor = connection.execute(mac_query, (code,)) - result = cursor.fetchone() - cursor.close() - return result[0] > 0 - - def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[str]: """ Look up Serology in the database and return corresponding list of alleles. @@ -371,16 +356,16 @@ def load_dict( return table_as_dict -def similar_alleles(connection: sqlite3.Connection, allele_name: str) -> Set[str]: +def similar_alleles(connection: sqlite3.Connection, allele_prefix: str) -> Set[str]: """ - Find similar alleles starting with the provided allele_name. + Find similar alleles starting with the provided prefix. :param connection: db connection of type sqlite.Connection - :param allele_name: Allele name to use as a prefix to find similar alleles + :param allele_prefix: Allele name to use as a prefix to find similar alleles :return: list of similar alleles """ query = "SELECT allele FROM alleles WHERE allele LIKE ?" - cursor = connection.execute(query, (f"{allele_name}%",)) + cursor = connection.execute(query, (f"{allele_prefix}%",)) result = cursor.fetchall() # fetchall() returns a list of tuples of results # e.g. [('C*04:09N',)] @@ -389,6 +374,24 @@ def similar_alleles(connection: sqlite3.Connection, allele_name: str) -> Set[str return alleles +def similar_mac(connection: sqlite3.Connection, mac_prefix: str) -> Set[str]: + """ + Find similar MAC codes starting with the provided prefix. + + :param connection: db connection of type sqlite.Connection + :param mac_prefix: MAC fragment to use as a prefix to find similar MACs + :return: set of similar MAC codes + """ + query = "SELECT code FROM mac_codes WHERE code LIKE ?" + cursor = connection.execute(query, (f"{mac_prefix}%",)) + result = cursor.fetchall() + # fetchall() returns a list of tuples of results + # e.g. 
[('DJZUP',)] + # Get out the first value of the tuple from the result list + codes = set(map(lambda t: t[0], result)) + return codes + + def find_serology_for_allele( connection: sqlite3.Connection, allele_name: str ) -> Dict[str, str]: diff --git a/scripts/pyard b/scripts/pyard index 035c9ee..8f182d6 100755 --- a/scripts/pyard +++ b/scripts/pyard @@ -22,13 +22,92 @@ # > http://www.opensource.org/licenses/lgpl-license.php # import argparse +import functools import sys +from pyard import smart_sort from pyard.constants import VALID_REDUCTION_TYPES import pyard.misc +from pyard.db import similar_alleles, similar_mac from pyard.exceptions import InvalidAlleleError, InvalidTypingError, InvalidMACError from pyard.misc import get_data_dir, get_imgt_version + +def find_similar_alleles(ard, prefix): + if "*" in prefix: # Only for those that have locus + locus, fields = prefix.split("*") + if fields: # Only if at least a field is specified after * + if len(fields.split(":")) == 2: # Check for MACs + first_field, mac_prefix = fields.split(":") + if mac_prefix.isalpha(): + similar_mac_names = similar_mac(ard.db_connection, mac_prefix) + if similar_mac_names: + locus_prefix = f"{locus}*{first_field}" + # TODO: validate all the mac codes with the prefix + # show only the valid macs + for code in sorted(similar_mac_names): + print(f"{locus_prefix}:{code}") + else: + # Nothing after * + sys.exit(2) + else: + # No * + sys.exit(1) + + # find similar alleles + similar_allele_names = similar_alleles(ard.db_connection, prefix) + if similar_allele_names: + for allele in sorted( + similar_allele_names, + key=functools.cmp_to_key(smart_sort.smart_sort_comparator), + ): + print(allele) + sys.exit(0) + + +def lookup_mac_codes(): + global e + try: + mac = ard.lookup_mac(args.lookup_mac) + print(mac) + except InvalidMACError as e: + print(e.message, file=sys.stderr) + sys.exit(0) + + +def expand_mac_code(): + global allele_list, e + try: + allele_list = ard.expand_mac(args.expand_mac) + 
print(allele_list) + except InvalidMACError as e: + print(e.message, file=sys.stderr) + sys.exit(0) + + +def find_broad_splits(): + mapping = pyard.find_broad_splits(args.splits) + if mapping: + print(f"{mapping[0]} = {'/'.join(mapping[1])}") + sys.exit(0) + + +def show_version(): + version = ard.get_db_version() + print(f"IPD-IMGT/HLA version:", version) + print(f"py-ard version:", pyard.__version__) + sys.exit(0) + + +def perform_cwd_redux(): + global cwd_redux + if args.validate: + ard.validate(args.cwd) + cwd_redux = ard.cwd_redux(args.cwd) + print(cwd_redux) + sys.exit(0) + + if __name__ == "__main__": parser = argparse.ArgumentParser( description=""" @@ -76,6 +155,11 @@ if __name__ == "__main__": parser.add_argument( "--lookup-mac", dest="lookup_mac", help="Lookup MAC for an Allele List" ) + parser.add_argument( + "--similar", + dest="similar_allele", + help="Find Similar Alleles with given prefix", + ) parser.add_argument( "--non-strict", dest="non_strict", @@ -100,45 +184,32 @@ if __name__ == "__main__": ard = pyard.init(imgt_version=imgt_version, data_dir=data_dir, config=new_config) + # Handle --version option if args.version: - version = ard.get_db_version() - print(f"IPD-IMGT/HLA version:", version) - print(f"py-ard version:", pyard.__version__) - sys.exit(0) + show_version() + # Handle --splits option if args.splits: - mapping = pyard.find_broad_splits(args.splits) - if mapping: - print(f"{mapping[0]} = {'/'.join(mapping[1])}") - sys.exit(0) + find_broad_splits() # Handle --expand-mac option if args.expand_mac: - try: - allele_list = ard.expand_mac(args.expand_mac) - print(allele_list) - except InvalidMACError as e: - print(e.message, file=sys.stderr) - sys.exit(0) + expand_mac_code() # Handle --lookup-mac option if args.lookup_mac: - try: - mac = ard.lookup_mac(args.lookup_mac) - print(mac) - except InvalidMACError as e: - print(e.message, file=sys.stderr) - sys.exit(0) + lookup_mac_codes() + + # Handle --similar option + if args.similar_allele: + 
find_similar_alleles(ard, args.similar_allele) try: + if args.cwd: + perform_cwd_redux() + if args.validate and args.gl_string: ard.validate(args.gl_string) - if args.cwd: - if args.validate: - ard.validate(args.cwd) - cwd_redux = ard.cwd_redux(args.cwd) - print(cwd_redux) - sys.exit(0) if args.redux_type: print(ard.redux(args.gl_string, args.redux_type)) @@ -155,6 +226,9 @@ if __name__ == "__main__": except InvalidTypingError as e: print("Typing Error:", e.message, file=sys.stderr) sys.exit(2) + except InvalidMACError as e: + print("MAC Error:", e.message, file=sys.stderr) + sys.exit(3) else: # Remove ard and close db connection del ard diff --git a/setup.cfg b/setup.cfg index 6978158..d0b4c39 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.5 +current_version = 1.0.6 commit = True tag = True diff --git a/setup.py b/setup.py index 0ecce82..46f7de5 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ setup( name="py-ard", - version="1.0.5", + version="1.0.6", description="ARD reduction for HLA with Python", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/features/mac.feature b/tests/features/mac.feature index 4cc3160..f517a73 100644 --- a/tests/features/mac.feature +++ b/tests/features/mac.feature @@ -48,3 +48,26 @@ Feature: MAC (Multiple Allele Code) | A*01:01/A*01:02 | A*01:AB | | HLA-A*25:01/HLA-A*26:01 | HLA-A*25:BYHR | | HLA-A*02:01/HLA-A*02:09/HLA-A*02:43N | HLA-A*02:GNF | + + + Scenario Outline: Validate allele specific antigen MACs + + MAC validation rules for allele specific antigen codes: + - The 1st field with the most allele designations in the request is + the 1st field of the allele code designation + - If there is a tie in the number of alleles designations sharing the 1st field, + the 1st field with the lowest numeric value is selected. 
+ + Given the MAC code is <MAC> + When checking for validity of the MAC + Then the validness is <Validity> + + Examples: + | MAC | Validity | + | DRB1*07:DFJR | Invalid | + | DRB1*15:DFJR | Valid | + | DPB1*08:BHHE | Invalid | + | DPB1*19:BHHE | Valid | + | A*31:CMZEY | Invalid | + | A*02:CMZEY | Valid | diff --git a/tests/steps/mac.py b/tests/steps/mac.py index 91f1919..64069d5 100644 --- a/tests/steps/mac.py +++ b/tests/steps/mac.py @@ -1,6 +1,8 @@ from behave import * from hamcrest import assert_that, is_ +from pyard.exceptions import InvalidAlleleError + @given("the MAC code is {mac_code}") def step_impl(context, mac_code): @@ -30,3 +32,17 @@ def step_impl(context): @then("the decoded MAC is {mac_code}") def step_impl(context, mac_code): assert_that(context.mac_code, is_(mac_code)) + + +@when("checking for validity of the MAC") +def step_impl(context): + try: + context.is_valid = context.ard.validate(context.mac_code) + except InvalidAlleleError: + context.is_valid = False + + +@then("the validness is {validity}") +def step_impl(context, validity): + valid = validity == "Valid" + assert_that(context.is_valid, is_(valid)) diff --git a/tests/test_pyard.py b/tests/test_pyard.py index 0dcf87a..39dbd60 100644 --- a/tests/test_pyard.py +++ b/tests/test_pyard.py @@ -34,7 +34,7 @@ import pyard from pyard.constants import DEFAULT_CACHE_SIZE -from pyard.exceptions import InvalidAlleleError, InvalidMACError, InvalidTypingError +from pyard.exceptions import InvalidAlleleError from pyard.misc import validate_reduction_type @@ -135,15 +135,15 @@ def test_redux_types(self): validate_reduction_type("XX") def test_empty_allele(self): - with self.assertRaises(InvalidTypingError): + with self.assertRaises(InvalidAlleleError): self.ard.redux("A*", "lgx") def test_fp_allele(self): - with self.assertRaises(InvalidTypingError): + with self.assertRaises(InvalidAlleleError): self.ard.redux("A*0.123", "lgx") def test_empty_fields(self): - with self.assertRaises(InvalidTypingError): + with 
self.assertRaises(InvalidAlleleError): # : without any data self.ard.redux("DQA1*01:01:01:G", "lgx") @@ -152,7 +152,7 @@ def test_invalid_serology(self): serology_a10 = self.ard.redux("A10", "lgx") self.assertEqual(serology_a10.split("/")[0], "A*25:01") # And A100 isn't a valid typing - with self.assertRaises(InvalidTypingError): + with self.assertRaises(InvalidAlleleError): self.ard.redux("A100", "lgx") def test_allele_duplicated(self):