Support reducing glstring column in a csv file

pbashyal-nmdp · pbashyal-nmdp · commit 245b7948dacb · 2023-03-16T12:33:07.000-05:00
diff --git a/extras/README.md b/extras/README.md
@@ -150,6 +150,20 @@ The column names corresponding to the loci will be reduced and must appear in th
 }
 ```
 
+### GL String Columns
+
+Instead of providing single locus alleles per column with `locus_column_mapping`, a GL String describing the whole
+genotype can be provided per column. Use `glstring_columns` to provide a list of GL String columns to reduce.
+
+```json
+  "glstring_columns": [
+    "donor_gl",
+    "recip_gl"
+  ],
+```
+
+Depending upon the data, only one of `locus_column_mapping` or `glstring_columns` needs to be provided.
+
 ### Redux Options
 
 `redux_type` Reduction Type
diff --git a/extras/reduce_conf.json b/extras/reduce_conf.json
@@ -2,7 +2,8 @@
   "in_csv_filename": "sample.csv",
   "out_csv_filename": "clean_sample.csv",
   "columns_from_csv": [
-    "nmdp_id",
+    "rid",
+    "did",
     "r_a_typ1",
     "r_a_typ2",
     "r_b_typ1",
diff --git a/extras/reduce_conf_glstring.json b/extras/reduce_conf_glstring.json
@@ -0,0 +1,33 @@
+{
+  "in_csv_filename": "sample_glstring.csv",
+  "out_csv_filename": "clean_sample_glstring.csv",
+  "columns_from_csv": [
+    "did",
+    "rid",
+    "donor_gl",
+    "recip_gl"
+  ],
+  "glstring_columns": [
+    "donor_gl",
+    "recip_gl"
+  ],
+  "redux_type": "lgx",
+  "redux_cache_size": 1000,
+  "reduce_serology": false,
+  "reduce_v2": true,
+  "convert_v2_to_v3": false,
+  "reduce_2field": true,
+  "reduce_3field": true,
+  "reduce_P": true,
+  "reduce_XX": false,
+  "reduce_MAC": true,
+  "map_drb345_to_drbx": false,
+  "locus_in_allele_name": true,
+  "keep_locus_in_allele_name": true,
+  "new_column_for_redux": true,
+  "reduced_column_prefix": "reduced_",
+  "generate_glstring": true,
+  "output_file_format": "csv",
+  "apply_compression": "gzip",
+  "verbose_log": true
+}
diff --git a/extras/sample.csv b/extras/sample.csv
@@ -1,4 +1,4 @@
-nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2
-123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
-456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
-789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01
+rid,did,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2
+2110,123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
+2111,456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
+2113,789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01
diff --git a/extras/sample_glstring.csv b/extras/sample_glstring.csv
@@ -0,0 +1,3 @@
+rid,did,recip_gl,donor_gl
+123,456,A*02:GNF+A*03:XYZ^B*07:ABD+B*44:AWA,A*02:01:01+A*03:01:01^B*07:RVXR+B*44:XYAG
+789,345,A*01:TUS+A*24:02:01G^B*08:ARGR+B*08:ARGS,A*02:01:01+A*01:PXTD^B*51:01:01G+B*40:BWUP
diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv
@@ -32,13 +32,14 @@ import argparse
 import json
 import re
 import sys
+from urllib.error import HTTPError
 
 import pandas as pd
 
 import pyard
 from pyard.db import similar_alleles
 import pyard.drbx as drbx
-from pyard.exceptions import PyArdError
+from pyard.exceptions import PyArdError, InvalidTypingError
 from pyard.misc import get_data_dir, get_imgt_version, download_to_file
 
 
@@ -171,6 +172,91 @@ def create_drbx(row, locus_in_allele_name):
     return drbx.map_drbx(row.values, locus_in_allele_name)
 
 
+def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose):
+    """Reduce the single-locus allele columns of ``df`` in place.
+
+    Every column listed in ``locus_column_mapping`` is run through
+    ``clean_locus``. Depending on ``ard_config`` the result either replaces
+    the column or is inserted right after it as a new, prefixed column.
+    Optionally DRB3/DRB4/DRB5 columns are mapped to ``DRBX_1``/``DRBX_2`` and
+    a ``<subject>_gl`` GL String column is generated per subject.
+
+    :param df: DataFrame with the CSV data; modified in place.
+    :param ard_config: configuration dictionary read from the JSON config.
+    :param locus_column_mapping: ``{subject: {locus: [column, ...]}}``. When
+        ``new_column_for_redux`` is set, the column names in these lists are
+        replaced in place by the names of the inserted reduced columns.
+    :param verbose: when true, print each column name before reducing it.
+    """
+    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
+    for subject in locus_column_mapping:
+        for locus in locus_column_mapping[subject]:
+            # Reduce each of the specified columns
+            locus_columns = locus_column_mapping[subject][locus]
+            for column in locus_columns:
+                if verbose:
+                    print(f"Column:{column} =>")
+                if ard_config.get("new_column_for_redux"):
+                    # insert a new column
+                    new_column_name = f"{reduce_prefix}{column}"
+                    new_column_index = df.columns.get_loc(column) + 1
+                    # Apply clean_locus function to the column and insert as a new column
+                    df.insert(
+                        new_column_index,
+                        new_column_name,
+                        df[column].apply(clean_locus, locus=locus, column_name=column),
+                    )
+                    # Point the mapping at the reduced column so the GL String
+                    # generation below uses the reduced values.
+                    locus_columns[locus_columns.index(column)] = new_column_name
+                else:
+                    # Apply clean_locus function to the column and replace the column
+                    df[column] = df[column].apply(
+                        clean_locus, locus=locus, column_name=column
+                    )
+    # Map DRB3,DRB4,DRB5 to DRBX if specified
+    # New columns DRBX_1 and DRBX_2 are created
+    if ard_config.get("map_drb345_to_drbx"):
+        drbx_loci = ["DRB3", "DRB4", "DRB5"]
+        drbx_columns = []
+        for col_name in df.columns:
+            # The locus is taken from the second "_"-separated part of the
+            # column name. Columns without an underscore (e.g. "rid", "did")
+            # have no such part and are skipped instead of raising IndexError.
+            name_parts = col_name.split("_")
+            if len(name_parts) > 1 and name_parts[1] in drbx_loci:
+                drbx_columns.append(col_name)
+        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
+            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
+            df_drbx = df[drbx_columns].apply(
+                create_drbx, axis=1, args=(locus_in_allele_name,)
+            )
+            df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
+
+    if ard_config.get("generate_glstring"):
+        for subject in locus_column_mapping:
+            slug_columns = []
+            for locus in locus_column_mapping[subject]:
+                # Temporary per-locus column: "allele1+allele2", or the single
+                # allele when only one column is mapped for the locus.
+                slug_column = locus + "_slug"
+                slug_columns.append(slug_column)
+                columns = locus_column_mapping[subject][locus]
+                if len(columns) > 1:
+                    df[slug_column] = df[columns[0]] + "+" + df[columns[1]]
+                else:
+                    df[slug_column] = df[columns[0]]
+
+            # Join the loci with "^" to form the subject's GL String
+            df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
+            # NOTE(review): presumably removes the slug of a locus whose
+            # alleles are both empty ("^+") - confirm against clean_locus.
+            df[subject + "_gl"] = df[subject + "_gl"].apply(
+                lambda gl: gl.replace("^+", "")
+            )
+            df.drop(columns=slug_columns, inplace=True)
+
+
+def reduce_glstring(glstring: str) -> str:
+    """Reduce a single GL String with the script-level ``ard`` object.
+
+    The reduction type is read from ``ard_config["redux_type"]``. When the
+    typing is rejected with ``InvalidTypingError`` the error is reported on
+    stderr and the literal string ``"Failed"`` is returned instead.
+    """
+    try:
+        reduced = ard.redux(glstring, ard_config["redux_type"])
+    except InvalidTypingError as error:
+        print(f"Error reducing {glstring} \n", error.message, file=sys.stderr)
+        return "Failed"
+    return reduced
+
+
+def reduce_glstring_columns(df, ard_config, glstring_columns):
+    """Reduce every GL String column named in ``glstring_columns``.
+
+    Each value of a listed column is passed through ``reduce_glstring``. With
+    ``new_column_for_redux`` set in ``ard_config`` the reduced values go into
+    a new column, named with ``reduced_column_prefix`` and placed directly
+    after the original; otherwise the original column is overwritten.
+    ``df`` is modified in place.
+    """
+    prefix = ard_config.get("reduced_column_prefix", "reduced_")
+    for gl_column in glstring_columns:
+        if not ard_config.get("new_column_for_redux"):
+            # Overwrite the column with its reduced GL Strings
+            df[gl_column] = df[gl_column].apply(reduce_glstring)
+            continue
+        # Keep the original column and insert the reduced one right after it
+        position = df.columns.get_loc(gl_column) + 1
+        reduced_name = f"{prefix}{gl_column}"
+        df.insert(position, reduced_name, df[gl_column].apply(reduce_glstring))
+
+
 if __name__ == "__main__":
     # config is specified with a -c parameter
     parser = argparse.ArgumentParser()
@@ -207,15 +293,20 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     if args.generate:
-        config_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/reduce_conf.json"
-        sample_config = "sample_reduce_conf.json"
-        download_to_file(config_url, sample_config)
-        print(f"Created {sample_config}")
-
-        sample_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/sample.csv"
-        sample_csv = "sample.csv"
-        download_to_file(sample_url, sample_csv)
-        print(f"Created {sample_csv}")
+        # Remote file name -> local file name. Config files are saved with a
+        # "sample_" prefix; CSV files keep their own name because the sample
+        # configs refer to them by that name through "in_csv_filename"
+        # ("sample.csv" and "sample_glstring.csv").
+        sample_files = {
+            "reduce_conf.json": "sample_reduce_conf.json",
+            "sample.csv": "sample.csv",
+            "reduce_conf_glstring.json": "sample_reduce_conf_glstring.json",
+            "sample_glstring.csv": "sample_glstring.csv",
+        }
+        base_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras"
+        for sample_file, sample_local_file in sample_files.items():
+            try:
+                download_to_file(f"{base_url}/{sample_file}", sample_local_file)
+            except HTTPError:
+                # Keep going so one missing file does not block the others
+                print(f"Download failed for {sample_file}")
+            else:
+                print(f"Created {sample_local_file}")
         sys.exit(0)
 
     config_filename = args.config
@@ -248,8 +339,21 @@ if __name__ == "__main__":
     data_dir = get_data_dir(args.data_dir)
     imgt_version = get_imgt_version(args.imgt_version)
     max_cache_size = ard_config.get("redux_cache_size", pyard.DEFAULT_CACHE_SIZE)
+    csv_redux_config = {
+        "reduce_serology": ard_config.get("reduce_serology", True),
+        "reduce_v2": ard_config.get("reduce_v2", True),
+        "reduce_3field": ard_config.get("reduce_3field", True),
+        "reduce_P": ard_config.get("reduce_P", True),
+        "reduce_XX": ard_config.get("reduce_XX", True),
+        "reduce_MAC": ard_config.get("reduce_MAC", True),
+        "map_drb345_to_drbx": ard_config.get("map_drb345_to_drbx", True),
+        "verbose_log": ard_config.get("verbose_log", True),
+    }
     ard = pyard.init(
-        imgt_version=imgt_version, data_dir=data_dir, cache_size=max_cache_size
+        imgt_version=imgt_version,
+        data_dir=data_dir,
+        cache_size=max_cache_size,
+        config=csv_redux_config,
     )
 
     # Read the Input File
@@ -268,68 +372,14 @@ if __name__ == "__main__":
         print(f"File not found {ard_config.get('in_csv_filename')}", file=sys.stderr)
         sys.exit(1)
 
-    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
-
     failed_to_reduce_alleles = []
-    locus_column_mapping = ard_config["locus_column_mapping"]
-    for subject in locus_column_mapping:
-        for locus in locus_column_mapping[subject]:
-            # Reduce each of the specified columns
-            locus_columns = locus_column_mapping[subject][locus]
-            for column in locus_columns:
-                if verbose:
-                    print(f"Column:{column} =>")
-                if ard_config.get("new_column_for_redux"):
-                    # insert a new column
-                    new_column_name = f"{reduce_prefix}{column}"
-                    new_column_index = df.columns.get_loc(column) + 1
-                    # Apply clean_locus function to the column and insert as a new column
-                    df.insert(
-                        new_column_index,
-                        new_column_name,
-                        df[column].apply(clean_locus, locus=locus, column_name=column),
-                    )
-                    locus_columns[locus_columns.index(column)] = new_column_name
-                else:
-                    # Apply clean_locus function to the column and replace the column
-                    df[column] = df[column].apply(
-                        clean_locus, locus=locus, column_name=column
-                    )
-
-    # Map DRB3,DRB4,DRB5 to DRBX if specified
-    # New columns DRBX_1 and DRBX_2 are created
-    if ard_config.get("map_drb345_to_drbx"):
-        drbx_loci = ["DRB3", "DRB4", "DRB5"]
-        drbx_columns = [
-            col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci
-        ]
-        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
-            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
-            df_drbx = df[drbx_columns].apply(
-                create_drbx, axis=1, args=(locus_in_allele_name,)
-            )
-            df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
-
-    if ard_config.get("generate_glstring"):
-        for subject in locus_column_mapping:
-            slug_columns = []
-            for locus in locus_column_mapping[subject]:
-                slug_column = locus + "_slug"
-                slug_columns.append(slug_column)
-                if len(locus_column_mapping[subject][locus]) > 1:
-                    df[slug_column] = (
-                        df[locus_column_mapping[subject][locus][0]]
-                        + "+"
-                        + df[locus_column_mapping[subject][locus][1]]
-                    )
-                else:
-                    df[slug_column] = df[locus_column_mapping[subject][locus][0]]
+    locus_column_mapping = ard_config.get("locus_column_mapping", None)
+    if locus_column_mapping:
+        reduce_locus_columns(df, ard_config, locus_column_mapping, verbose)
 
-            df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
-            df[subject + "_gl"] = df[subject + "_gl"].apply(
-                lambda gl: gl.replace("^+", "")
-            )
-            df.drop(columns=slug_columns, inplace=True)
+    glstring_columns = ard_config.get("glstring_columns", None)
+    if glstring_columns:
+        reduce_glstring_columns(df, ard_config, glstring_columns)
 
     # Save as XLSX if specified
     if ard_config["output_file_format"] == "xlsx":

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+rid,did,recip_gl,donor_gl`
	`2`	`+123,456,A*02:GNF+A*03:XYZ^B*07:ABD+B*44:AWA,A*02:01:01+A*03:01:01^B*07:RVXR+B*44:XYAG`
	`3`	`+789,345,A*01:TUS+A*24:02:01G^B*08:ARGR+B*08:ARGS,A*02:01:01+A*01:PXTD^B*51:01:01G+B*40:BWUP`