Skip to content

Commit 290272f

Browse files
authored
Merge pull request #30 from pbashyal-nmdp/make_graph_module
Update README and graph generation
2 parents 7673c13 + 625e921 commit 290272f

File tree

5 files changed

+226
-87
lines changed

5 files changed

+226
-87
lines changed

README.md

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,140 @@
1111

1212
![GRIM Dependencies](images/py-graph-imputation.png)
1313

14+
### Use `py-graph-imputation`
15+
16+
#### Install `py-graph-imputation` from PyPi
17+
```
18+
pip install py-graph-imputation
19+
```
20+
21+
#### Get Frequency Data and Subject Data and Configuration File
22+
23+
For an example, copy the folders to your working directory:
24+
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/data
25+
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/conf
26+
27+
so it appears as:
28+
29+
```
30+
.
31+
├── conf
32+
│   ├── README.md
33+
│   └── minimal-configuration.json
34+
├── data
35+
│   ├── freqs
36+
│   │   └── CAU.freqs.gz
37+
│   └── subjects
38+
│   └── donor.csv
39+
```
40+
41+
#### Modify the configuration.json to suit your need
42+
43+
44+
#### Produce HPF csv file from Frequency Data
45+
46+
```
47+
>>> from graph_generation.generate_hpf import produce_hpf
48+
>>> produce_hpf(conf_file='conf/minimal-configuration.json')
49+
****************************************************************************************************
50+
Conversion to HPF file based on following configuration:
51+
Population: ['CAU']
52+
Frequency File Directory: data/freqs
53+
Output File: output/hpf.csv
54+
****************************************************************************************************
55+
Reading Frequency File: data/freqs/CAU.freqs.gz
56+
Writing hpf File: output/hpf.csv
57+
```
58+
59+
This will produce the files which will be used for graph generation:
60+
61+
```
62+
├── output
63+
│   ├── hpf.csv # CSV file of Haplotype, Population, Freq
64+
│   └── pop_counts_file.txt # Size of each population
65+
```
66+
67+
#### Generate the Graph (nodes and edges) files
68+
69+
```
70+
>>> from grim.grim import graph_freqs
71+
72+
>>> graph_freqs(conf_file='conf/minimal-configuration.json')
73+
****************************************************************************************************
74+
Performing graph generation based on following configuration:
75+
Population: ['CAU']
76+
Freq File: output/hpf.csv
77+
Freq Trim Threshold: 1e-05
78+
****************************************************************************************************
79+
```
80+
81+
This will produce the following files:
82+
83+
```
84+
├── output
85+
│   ├── csv
86+
│   │   ├── edges.csv
87+
│   │   ├── info_node.csv
88+
│   │   ├── nodes.csv
89+
│   │   └── top_links.csv
90+
91+
```
92+
93+
#### Produce Imputation Results for Subjects
94+
95+
```
96+
>>> from grim.grim import graph_freqs, impute
97+
>>> impute(conf_file='conf/minimal-configuration.json')
98+
****************************************************************************************************
99+
Performing imputation based on:
100+
Population: ['CAU']
101+
Priority: {'alpha': 0.4999999, 'eta': 0, 'beta': 1e-07, 'gamma': 1e-07, 'delta': 0.4999999}
102+
UNK priority: SR
103+
Epsilon: 0.001
104+
Plan B: True
105+
Number of Results: 10
106+
Number of Population Results: 100
107+
Nodes File: output/csv/nodes.csv
108+
Top Links File: output/csv/edges.csv
109+
Input File: data/subjects/donor.csv
110+
Output UMUG Format: True
111+
Output UMUG Freq Filename: output/don.umug
112+
Output UMUG Pops Filename: output/don.umug.pops
113+
Output Haplotype Format: True
114+
Output HAP Freq Filename: output/don.pmug
115+
Output HAP Pops Filename: output/don.pmug.pops
116+
Output Miss Filename: output/don.miss
117+
Output Problem Filename: output/don.problem
118+
Factor Missing Data: 0.0001
119+
Loci Map: {'A': 1, 'B': 2, 'C': 3, 'DQB1': 4, 'DRB1': 5}
120+
Plan B Matrix: [[[1, 2, 3, 4, 5]], [[1, 2, 3], [4, 5]], [[1], [2, 3], [4, 5]], [[1, 2, 3], [4], [5]], [[1], [2, 3], [4], [5]], [[1], [2], [3], [4], [5]]]
121+
Pops Count File: output/pop_counts_file.txt
122+
Use Pops Count File: output/pop_counts_file.txt
123+
Number of Options Threshold: 100000
124+
Max Number of haplotypes in phase: 100
125+
Save space mode: False
126+
****************************************************************************************************
127+
0 Subject: D1 8400 haplotypes
128+
0 Subject: D1 6028 haplotypes
129+
0.09946062499999186
130+
```
131+
132+
This will produce files in `output` directory as:
133+
134+
```
135+
├── output
136+
│   ├── don.miss # Cases that failed imputation (e.g. incorrect typing etc.)
137+
│   ├── don.pmug # Phased imputation as PMUG GL String
138+
│   ├── don.pmug.pops # Population for Phased Imputation
139+
│   ├── don.problem # List of errors
140+
│   ├── don.umug # Unphased imputation as UMUG GL String
141+
│   ├── don.umug.pops # Population for Unphased Imputation
142+
```
143+
144+
145+
146+
147+
14148
### Development
15149
How to develop on the project locally.
16150

graph_generation/generate_hpf.py

Lines changed: 89 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -4,89 +4,94 @@
44
import pathlib
55
import argparse
66

7+
8+
def produce_hpf(conf_file):
    """Convert per-population frequency files into a single HPF csv.

    Reads the JSON configuration at *conf_file*, loads each population's
    ``<pop>.freqs.gz`` file from the configured frequency directory, and
    writes:

    * an HPF (Haplotype, Population, Frequency) csv file, and
    * a population-counts file listing each population's size and its
      ratio of the overall total.

    The ``Haplo`` header row and zero-frequency haplotypes are skipped.

    :param conf_file: path to the JSON configuration file
    """
    # Read configuration file and load properties
    with open(conf_file) as f:
        conf = json.load(f)

    pops = conf.get("populations")
    freq_data_dir = project_dir + conf.get("freq_data_dir")
    output_dir = project_dir + conf.get("graph_files_path")
    pop_ratio_dir = project_dir + conf.get("pops_count_file")
    # Output in HaplotypePopulationFrequency (hpf) csv file
    freq_file = project_dir + conf.get("freq_file")

    # Create output directory if it doesn't exist
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Display the configurations we are using
    print(
        "****************************************************************************************************"
    )
    print("Conversion to HPF file based on following configuration:")
    print("\tPopulation: {}".format(pops))
    print("\tFrequency File Directory: {}".format(freq_data_dir))
    print("\tOutput File: {}".format(freq_file))
    print(
        "****************************************************************************************************"
    )

    haplist_overall = {}  # list of haplotypes across all populations
    pop_hap_combos = {}  # "<pop>-<haplotype>" -> frequency

    list_pop_count = []
    #### Load initial frequency files
    for pop in pops:
        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
        print("Reading Frequency File:\t {}".format(in_freq_file))
        with gzip.open(in_freq_file, "rb") as zf:
            count_pop = 0
            lines = [x.decode("utf8").strip() for x in zf.readlines()]
            for hap_line in lines:
                haplotype, count, freq = hap_line.split(",")
                if haplotype == "Haplo":  # skip the header row
                    continue
                freq = float(freq)
                # Ignore lines with 0 freq
                if freq == 0.0:
                    continue

                pop_haplotype = pop + "-" + haplotype
                haplist_overall[haplotype] = 1
                pop_hap_combos[pop_haplotype] = freq

                count_pop += float(count)
        list_pop_count.append(count_pop)

    # BUGFIX: write the population counts through a context manager so the
    # file is always flushed and closed (the original leaked the handle).
    sum_pops = sum(list_pop_count)
    with open(pop_ratio_dir, "w") as pop_ratio_file:
        for pop, ratio in zip(pops, list_pop_count):
            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))

    header = ["hap", "pop", "freq"]

    print("Writing hpf File:\t {}".format(freq_file))
    with open(freq_file, mode="w") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
        csv_writer.writerow(header)
        for pop_haplotype in pop_hap_combos:
            # BUGFIX: split on the first "-" only, so a haplotype name that
            # itself contains "-" does not raise ValueError.
            (pop, haplotype) = pop_haplotype.split("-", 1)
            freq = pop_hap_combos[pop_haplotype]
            csv_writer.writerow([haplotype, pop, freq])


# Global
project_dir = ""
881

9-
parser = argparse.ArgumentParser()
10-
parser.add_argument(
11-
"-c",
12-
"--config",
13-
required=False,
14-
default="../../conf/minimal-configuration.json",
15-
help="Configuration JSON file",
16-
type=str,
17-
)
18-
19-
args = parser.parse_args()
20-
configuration_file = args.config
21-
22-
# Read configuration file and load properties
23-
with open(configuration_file) as f:
24-
conf = json.load(f)
25-
26-
pops = conf.get("populations")
27-
freq_data_dir = project_dir + conf.get("freq_data_dir")
28-
output_dir = project_dir + conf.get("graph_files_path")
29-
pop_ratio_dir = project_dir + conf.get("pops_count_file")
30-
# Output in HaplotypePopulationFrequency (hpf) csv file
31-
freq_file = project_dir + conf.get("freq_file")
32-
33-
34-
# Create output directory if it doesn't exist
35-
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
36-
37-
# Display the configurations we are using
38-
print(
39-
"****************************************************************************************************"
40-
)
41-
print("Conversion to HPF file based on following configuration:")
42-
print("\tPopulation: {}".format(pops))
43-
print("\tFrequency File Directory: {}".format(freq_data_dir))
44-
print("\tOutput File: {}".format(freq_file))
45-
print(
46-
"****************************************************************************************************"
47-
)
48-
49-
haplist_overall = {} # list of haplotypes across all populations
50-
pop_hap_combos = {}
51-
52-
list_pop_count = []
53-
#### Load initial frequency files
54-
for pop in pops:
55-
in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
56-
print("Reading Frequency File:\t {}".format(in_freq_file))
57-
with gzip.open(in_freq_file, "rb") as zf:
58-
count_pop = 0
59-
lines = [x.decode("utf8").strip() for x in zf.readlines()]
60-
for hap_line in lines:
61-
haplotype, count, freq = hap_line.split(",")
62-
if haplotype == "Haplo":
63-
continue
64-
freq = float(freq)
65-
# Ignore lines with 0 freq
66-
if freq == 0.0:
67-
continue
68-
69-
pop_haplotype = pop + "-" + haplotype
70-
haplist_overall[haplotype] = 1
71-
pop_hap_combos[pop_haplotype] = freq
72-
73-
count_pop += float(count)
74-
list_pop_count.append(count_pop)
75-
76-
sum_pops = sum(list_pop_count)
77-
pop_ratio_file = open(pop_ratio_dir, "w")
78-
for pop, ratio in zip(pops, list_pop_count):
79-
pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))
80-
81-
82-
header = ["hap", "pop", "freq"]
83-
84-
85-
print("Writing hpf File:\t {}".format(freq_file))
86-
with open(freq_file, mode="w") as csv_file:
87-
csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
88-
csv_writer.writerow(header)
89-
for pop_haplotype in pop_hap_combos:
90-
(pop, haplotype) = pop_haplotype.split("-")
91-
freq = pop_hap_combos[pop_haplotype]
92-
csv_writer.writerow([haplotype, pop, freq])
82+
if __name__ == "__main__":
    # Command-line entry point: read the configuration path from the
    # -c/--config option (defaulting to the bundled minimal configuration)
    # and run the HPF conversion.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-c",
        "--config",
        type=str,
        required=False,
        default="../../conf/minimal-configuration.json",
        help="Configuration JSON file",
    )
    cli_args = arg_parser.parse_args()
    produce_hpf(cli_args.config)

grim/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@
2626
"""Top-level package for py-grim."""
2727

2828
__organization__ = "NMDP/CIBMTR Bioinformatics"
29-
__version__ = "0.0.12"
29+
__version__ = "0.0.13"

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.0.12
2+
current_version = 0.0.13
33
commit = True
44
tag = True
55

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
setup(
5454
name="py-graph-imputation",
55-
version="0.0.12",
55+
version="0.0.13",
5656
author="Pradeep Bashyal",
5757
author_email="[email protected]",
5858
python_requires=">=3.8",

0 commit comments

Comments
 (0)