Skip to content

Commit b7fa29c

Browse files
committed
Merged from upstream
2 parents 935122d + fbd7cd7 commit b7fa29c

File tree

11 files changed

+358
-15
lines changed

11 files changed

+358
-15
lines changed

README.rst

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,8 @@ Example
8989
# 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'
9090
9191
# py-ard can also reduce serology based typings
92-
ard.redux_gl('HLA-A*10^HLA-A*9', 'lg')
93-
# >>> ard_gl
94-
# 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'
92+
ard.redux_gl('B14', 'lg')
93+
# >>> 'B*14:01g/B*14:02g/B*14:03g/B*14:04g/B*14:05g/B*14:06g/B*14:08g/B*14:09g/B*14:10g/B*14:11g/B*14:12g/B*14:13g/B*14:14g/B*14:15g/B*14:16g/B*14:17g/B*14:18g/B*14:19g/B*14:20g/B*14:21g/B*14:22g/B*14:23g/B*14:24g/B*14:25g/B*14:26g/B*14:27g/B*14:28g/B*14:29g/B*14:30g/B*14:31g/B*14:32g/B*14:33g/B*14:34g/B*14:35g/B*14:36g/B*14:37g/B*14:38g/B*14:39g/B*14:40g/B*14:42g/B*14:43g/B*14:44g/B*14:45g/B*14:46g/B*14:47g/B*14:48g/B*14:49g/B*14:50g/B*14:51g/B*14:52g/B*14:53g/B*14:54g/B*14:55g/B*14:56g/B*14:57g/B*14:58g/B*14:59g/B*14:60g/B*14:62g/B*14:63g/B*14:65g/B*14:66g/B*14:68g/B*14:70Qg/B*14:71g/B*14:73g/B*14:74g/B*14:75g/B*14:77g/B*14:82g/B*14:83g/B*14:86g/B*14:87g/B*14:88g/B*14:90g/B*14:93g/B*14:94g/B*14:95g/B*14:96g/B*14:97g/B*14:99g/B*14:102g'
9594
9695
9796

extras/README.md

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Extras
2+
3+
# Batch Script for CSV File
4+
5+
**Example Scripts to batch reduce HLA typings from a CSV File**
6+
7+
`reduce_csv.py` and `conf.py` scripts can be used to take a CSV file with HLA
8+
typing data and reduce certain columns and produce a new CSV and Excel file.
9+
10+
For most use cases, installing `py-ard`, specifying the changes in the `conf.py` file
11+
and running `python reduce_csv.py` will produce results based on the configuration
12+
in the `conf.py`.
13+
14+
15+
```python
16+
#
17+
# configurations for processing CSV files
18+
#
19+
20+
# The column names that are in CSV
21+
# The output file will have these columns
22+
all_columns_in_csv = [
23+
"nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
24+
"r_dpb1_typ1", "r_dpb1_typ2"
25+
]
26+
27+
#
28+
# List of columns which have typing information and need to be reduced.
29+
# The locus is the 2nd term in the column name
30+
# Eg: For column r_drb1_typ1, DRB1 is the locus name
31+
#
32+
columns_to_reduce_in_csv = [
33+
"r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
34+
"r_dpb1_typ2"
35+
]
36+
37+
#
38+
# Configuration options to ARD reduction of a CSV file
39+
#
40+
ard_config = {
41+
# All Columns in the CSV file
42+
"csv_in_column_names": all_columns_in_csv,
43+
44+
# Columns to check for typings
45+
"columns_to_check": columns_to_reduce_in_csv,
46+
47+
# How should the typings be reduced
48+
# Valid Options:
49+
# - G
50+
# - lg
51+
# - lgx
52+
"redux_type": "lgx",
53+
54+
# Input CSV filename
55+
"in_csv_filename": "sample.csv",
56+
57+
# Output CSV filename
58+
"out_csv_filename": 'clean_sample.csv',
59+
60+
# Use compression
61+
# Valid options
62+
# - 'gzip'
63+
# - 'zip'
64+
# - None
65+
"apply_compression": 'gzip',
66+
67+
# Show verbose log
68+
# Valid options:
69+
# - True
70+
# - False
71+
"verbose_log": True,
72+
73+
# What to reduce ?
74+
"reduce_serology": False,
75+
"reduce_v2": True,
76+
"reduce_3field": True,
77+
"reduce_P": True,
78+
"reduce_XX": False,
79+
"reduce_MAC": True,
80+
81+
# Is locus name present in allele
82+
# Eg. A*01:01 vs 01:01
83+
"locus_in_allele_name": False,
84+
85+
# Format
86+
# Valid options:
87+
# - csv
88+
# - xlsx
89+
"output_file_format": 'csv',
90+
91+
# Add a separate column for processed column
92+
"new_column_for_redux": False,
93+
}
94+
```
95+
96+
The included sample CSV file `sample.csv` can be processed using the script.
97+
98+
```shell
python reduce_csv.py
```

extras/conf.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#
# Configuration for processing CSV files
#

# The column names that are in the CSV.
# The output file will have these columns.
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# Eg: For column r_drb1_typ1, DRB1 is the locus name.
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All Columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How should the typings be reduced
    # Valid Options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # Eg. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}

extras/reduce_csv.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#
2+
#
3+
# Quick script to reduce alleles from a CSV file
4+
#
5+
# Use `conf.py` to setup configurations that's used here
6+
# For Excel output, openpyxl library needs to be installed.
7+
# pip install openpyxl
8+
#
9+
10+
import pandas as pd
11+
import pyard
12+
import re
13+
14+
from conf import ard_config
15+
16+
# Module-level verbosity flag, read by reduce() and the __main__ driver.
verbose = ard_config["verbose_log"]
# Pre-compiled pattern used to strip ALL whitespace from a typing value.
white_space_regex = re.compile(r"\s+")
19+
20+
def is_serology(allele: str) -> bool:
    """Return True if *allele* looks like a serology typing (no ':' fields).

    A serology typing is a single field, e.g. 'B14'; molecular typings have
    colon-separated fields, e.g. 'B*14:01'.
    """
    # The original fell off the end and returned None for molecular typings;
    # make the declared bool contract explicit.
    return len(allele.split(':')) == 1
23+
24+
25+
def is_3field(allele: str) -> bool:
    """Return True when the typing has three or more fields (two+ colons)."""
    return allele.count(':') >= 2
27+
28+
29+
def is_P(allele: str) -> bool:
    """Return True if *allele* is a P-group typing, e.g. '02:01P'.

    A P-group name is exactly two numeric fields with a trailing 'P'.
    """
    if allele.endswith('P'):
        fields = allele.split(':')
        if len(fields) == 2:  # Ps are 2 fields
            # Both fields must be numeric; strip the trailing 'P' from the
            # second field before checking. The original tested fields[0]
            # twice, so e.g. '14:ABP' was wrongly accepted.
            return fields[0].isdigit() and fields[1][:-1].isdigit()
    return False
35+
36+
37+
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Reduce one CSV cell, deriving the locus from the column name.

    The locus is the 2nd '_'-separated term of *column_name*
    (e.g. 'r_drb1_typ1' -> 'DRB1'). Empty cells pass through unchanged.
    """
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, reduce every member with
        # the SAME locus. The original used map(reduce, parts, locus), which
        # zipped the parts with the *characters* of the locus string — wrong
        # locus per part and silent truncation to len(locus) items.
        if '/' in allele:
            return "/".join(reduce(a, locus) for a in allele.split('/'))
        else:
            return reduce(allele, locus)
    return allele
48+
49+
50+
def should_be_reduced(allele, locus_allele):
    """Decide from ard_config whether this typing should be reduced.

    *allele* is the raw cell value; *locus_allele* is the locus-qualified
    form used for the py-ard classification checks.
    """
    # Serology is decided solely by its config switch.
    if is_serology(allele):
        return ard_config["reduce_serology"]

    if ard_config["reduce_v2"] and ard.is_v2(locus_allele):
        return True
    if ard_config["reduce_3field"] and is_3field(locus_allele):
        return True
    if ard_config["reduce_P"] and is_P(allele):
        return True
    if ard_config["reduce_XX"] and ard.is_XX(locus_allele):
        return True
    # MACs that are really XX codes are handled by the XX switch above.
    if ard_config["reduce_MAC"] and ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
        return True

    return False
75+
76+
77+
def reduce(allele, locus):
    """Reduce a single typing and return it in the same form it came in.

    If the input had no locus prefix, the prefix is stripped back off the
    reduced result before returning.
    """
    # Qualify with the locus unless the allele already carries one
    # (explicit '*' or configured to be present).
    if '*' in allele or ard_config["locus_in_allele_name"]:
        locus_allele = allele
    else:
        locus_allele = f"{locus}*{allele}"

    # Consult the config to see whether this typing should be reduced at all.
    if should_be_reduced(allele, locus_allele):
        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
        if reduced_allele:
            # Drop the locus prefix from every member of the reduced list.
            allele = "/".join(part.split('*')[1] for part in reduced_allele.split('/'))
        elif verbose:
            print(f"Failed to reduce {locus_allele}")

    if verbose:
        print(f"\t{locus_allele} => {allele}")
    return allele
101+
102+
103+
if __name__ == '__main__':
    # Build the reducer once; keep invalid typings instead of raising so a
    # bad cell is reported rather than aborting the whole batch.
    ard = pyard.ARD(remove_invalid=False)

    # Read every column as str; blank cells become '' so clean_locus can
    # pass them through unchanged.
    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
    df.fillna('', inplace=True)

    for column in ard_config["columns_to_check"]:
        if verbose:
            print(f"Column:{column} =>")
        if ard_config["new_column_for_redux"]:
            # Insert the reduced values as a new column right after the original.
            new_column_name = f"reduced_{column}"
            new_column_index = df.columns.get_loc(column) + 1
            df.insert(new_column_index, new_column_name, df[column].apply(clean_locus, column_name=column))
        else:
            # Reduce in place.
            df[column] = df[column].apply(clean_locus, column_name=column)

    if ard_config["output_file_format"] == 'xlsx':
        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
        df.to_excel(out_file_name, index=False)
    else:
        # Keep the base filename either way; the original f-string produced
        # an EMPTY filename whenever compression was turned off.
        # NOTE(review): '.gz' is appended even for 'zip' compression —
        # presumably only 'gzip' is used in practice; verify if 'zip' matters.
        out_file_name = ard_config['out_csv_filename'] + ('.gz' if ard_config['apply_compression'] else '')
        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
    if verbose:
        print(f"Saved result to file:{out_file_name}")

extras/sample.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
2+
123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
3+
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
4+
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01

pyard/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424
from .pyard import ARD
2525

2626
__author__ = """NMDP Bioinformatics"""
27-
__version__ = '0.6.1'
27+
__version__ = '0.6.2'

pyard/broad_splits.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#
2929
# Mapping Generated from `dna_relshp.csv` file
3030
#
31-
broad_splits_mapping = {
31+
broad_splits_dna_mapping = {
3232
'A*09': ['A*23', 'A*24'],
3333
'A*10': ['A*25', 'A*26', 'A*34', 'A*66'],
3434
'A*19': ['A*29', 'A*30', 'A*31', 'A*32', 'A*33', 'A*74'],
@@ -45,3 +45,23 @@
4545
'DRB1*02': ['DRB1*15', 'DRB1*16'],
4646
'DRB1*06': ['DRB1*13', 'DRB1*14']
4747
}
48+
# Mapping of broad serology antigens to their split antigens,
# keyed by serology name (no '*' / fields), e.g. 'A9' -> its splits.
broad_splits_ser_mapping = {
    'A9': ['A23', 'A24'],
    'A10': ['A25', 'A26', 'A34', 'A66'],
    'A19': ['A29', 'A30', 'A31', 'A32', 'A33', 'A74'],
    'A28': ['A68', 'A69'],
    'B5': ['B51', 'B52'],
    'B12': ['B44', 'B45'],
    'B15': ['B62', 'B63', 'B75', 'B76', 'B77'],
    'B16': ['B38', 'B39'],
    'B17': ['B57', 'B58'],
    'B21': ['B49', 'B50'],
    'B22': ['B54', 'B55', 'B56'],
    'B40': ['B60', 'B61'],
    'B70': ['B71', 'B72'],
    'DQ1': ['DQ5', 'DQ6'],
    'DR2': ['DR15', 'DR16'],
    'DR3': ['DR17', 'DR18'],
    'DR5': ['DR11', 'DR12'],
    'DR6': ['DR13', 'DR14']
}

0 commit comments

Comments
 (0)