Commit b9ca634

Extra Helper Scripts for Batch processing CSV File (#69)
`reduce_csv.py`: the driver file
`conf.py`: configuration
`sample.csv`: sample CSV file
1 parent 923f95b commit b9ca634

File tree

4 files changed: +309 -0 lines changed


extras/README.md

Lines changed: 100 additions & 0 deletions
# Extras

# Batch Script for CSV File

**Example Scripts to batch reduce HLA typings from a CSV File**

The `reduce_csv.py` and `conf.py` scripts can be used to take a CSV file with HLA
typing data, reduce certain columns, and produce a new CSV or Excel file.

For most use cases, installing `py-ard`, specifying the changes in the `conf.py` file
and running `python reduce_csv.py` will produce results based on the configuration
in `conf.py`.

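For example, a typical session might look like this (assuming `py-ard` is installed from PyPI):

```shell
pip install py-ard
python reduce_csv.py
```

The configuration in `conf.py` looks like this:
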
```python
#
# Configurations for processing CSV files
#

# The column names that are in the CSV
# The output file will have these columns
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# E.g.: for column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How the typings should be reduced
    # Valid options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # E.g. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}
```

The included sample CSV file `sample.csv` can be processed using the script:

```shell
python reduce_csv.py
```
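
Under the hood, `reduce_csv.py` builds a locus-prefixed allele for each typing and calls `py-ard`'s `redux_gl` on it. A minimal sketch of that core call (the allele below is taken from `sample.csv`; the reduced value depends on the IMGT/HLA database version that `py-ard` has loaded):

```python
import pyard

# Same initialization that reduce_csv.py uses
ard = pyard.ARD(remove_invalid=False)

# Reduce a single typing to the "lgx" level configured in conf.py
print(ard.redux_gl("DPB1*193:01:01", "lgx"))
```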

extras/conf.py

Lines changed: 78 additions & 0 deletions
```python
#
# Configurations for processing CSV files
#

# The column names that are in the CSV
# The output file will have these columns
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# E.g.: for column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How the typings should be reduced
    # Valid options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # E.g. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}
```

extras/reduce_csv.py

Lines changed: 127 additions & 0 deletions
```python
#
# Quick script to reduce alleles from a CSV file
#
# Use `conf.py` to set up the configuration used here.
# For Excel output, the openpyxl library needs to be installed:
#   pip install openpyxl
#

import re

import pandas as pd
import pyard

from conf import ard_config

verbose = ard_config["verbose_log"]
white_space_regex = re.compile(r"\s+")


def is_serology(allele: str) -> bool:
    # Serology typings have a single field (no ':' separators)
    return len(allele.split(':')) == 1


def is_3field(allele: str) -> bool:
    return len(allele.split(':')) > 2


def is_P(allele: str) -> bool:
    if allele.endswith('P'):
        fields = allele.split(':')
        if len(fields) == 2:  # Ps are 2 fields
            return fields[0].isdigit() and fields[1][:-1].isdigit()
    return False


def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, apply reduce to all alleles
        if '/' in allele:
            return "/".join(reduce(a, locus) for a in allele.split('/'))
        else:
            return reduce(allele, locus)
    return allele


def should_be_reduced(allele, locus_allele):
    if is_serology(allele):
        return ard_config["reduce_serology"]

    if ard_config["reduce_v2"]:
        if ard.is_v2(locus_allele):
            return True

    if ard_config["reduce_3field"]:
        if is_3field(locus_allele):
            return True

    if ard_config["reduce_P"]:
        if is_P(allele):
            return True

    if ard_config["reduce_XX"]:
        if ard.is_XX(locus_allele):
            return True

    if ard_config["reduce_MAC"]:
        if ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
            return True

    return False


def reduce(allele, locus):
    # Does the allele name have the locus in it?
    if '*' in allele:
        locus_allele = allele
    elif ard_config["locus_in_allele_name"]:
        locus_allele = allele
    else:
        locus_allele = f"{locus}*{allele}"

    # Check the config to see if this allele should be reduced
    if should_be_reduced(allele, locus_allele):
        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
        if reduced_allele:
            # Strip the locus prefix from each allele in the reduced GL string
            allele = "/".join(map(lambda a: a.split('*')[1],
                                  reduced_allele.split('/')))
        else:
            if verbose:
                print(f"Failed to reduce {locus_allele}")

    if verbose:
        print(f"\t{locus_allele} => {allele}")
    return allele


if __name__ == '__main__':
    ard = pyard.ARD(remove_invalid=False)

    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
    df.fillna('', inplace=True)

    for column in ard_config["columns_to_check"]:
        if verbose:
            print(f"Column:{column} =>")
        if ard_config["new_column_for_redux"]:
            # Insert a new column next to the original one
            new_column_name = f"reduced_{column}"
            new_column_index = df.columns.get_loc(column) + 1
            df.insert(new_column_index, new_column_name, df[column].apply(clean_locus, column_name=column))
        else:
            df[column] = df[column].apply(clean_locus, column_name=column)

    if ard_config["output_file_format"] == 'xlsx':
        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
        df.to_excel(out_file_name, index=False)
    else:
        out_file_name = ard_config['out_csv_filename'] + ('.gz' if ard_config['apply_compression'] else '')
        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
    if verbose:
        print(f"Saved result to file: {out_file_name}")
```

extras/sample.csv

Lines changed: 4 additions & 0 deletions
```
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01
```
