Commit cb7c4de: Get GRMA to work

1 parent 2f2fb30 commit cb7c4de

28 files changed: +6314 -17 lines changed

.gitignore

Lines changed: 7 additions & 0 deletions

```diff
@@ -134,3 +134,10 @@ dmypy.json
 # behave
 pretty.output
 allure_report/
+
+# graph output dirs
+output/
+results/
+
+lol_graph.c
+cutils.c
```

Makefile

Lines changed: 3 additions & 1 deletion

```diff
@@ -89,12 +89,14 @@ docker: docker-build ## build a docker image and run the service
 
 install: clean ## install the package to the active Python's site-packages
 	pip install --upgrade pip
-	python setup.py install
+	pip install git+https://github.com/nmdp-bioinformatics/py-graph-imputation
 	pip install -r requirements.txt
 	pip install -r requirements-tests.txt
 	pip install -r requirements-dev.txt
 	pip install -r requirements-deploy.txt
 	pre-commit install
+	python setup.py build_ext --inplace
+	python setup.py install
 
 venv: ## creates a Python3 virtualenv environment in venv
 	python3 -m venv venv --prompt $(PROJECT_NAME)-venv
```

README.md

Lines changed: 178 additions & 10 deletions

````diff
@@ -1,15 +1,183 @@
-My Project Template
+py-graph-match
 ===================
 
+Matching with Graphs
 
-How to use the template:
+`grma` is a package for finding HLA matches using a graph-based approach.
+The matching is based on [grim's](https://github.com/nmdp-bioinformatics/py-graph-imputation) imputation.
 
-1. Create a template by clicking on the "Use this template" button. Make sure to select all branches
-   This will create a new repository with the given name e.g. `urban-potato`
+
+## Pre-requisites
+
+### Data Directory Structure
+
+```
+data
+├── donors_dir
+│   └── donors.txt
+├── hpf.csv
+└── patients.txt
+```
+
+### conf Directory Structure
+
+```
+conf
+└── minimal-configuration.json
+```
+
+Follow these steps for finding matches:
+
+Set up a virtual environment (venv) and run:
+```
+make install
+```
+
+## Quick Getting Started
+
+Get started with a built-in example.
+
+### Build the Donors' Graph
+
+```
+python test_build_donors_graph.py
+```
+
+### Find Matches
+
+Use the grma algorithm to find matches efficiently by running `test_matching.py`:
+```
+python test_matching.py
+```
+
+Find the match results in the `results` directory.
+
+# Full Walkthrough
+### Building the Donors' Graph
+
+The donors' graph is a graph that contains all the donors (the search space). It is implemented using a LOL (List of Lists) representation written in Cython for better time and memory efficiency.
+Building it can take a lot of memory and time, so it's recommended to save the graph to a pickle file.
+
+Before building the donors' graph, all the donors' HLAs must be imputed using `grim`.
+Then all the imputation files must be saved under the same directory.
+
+```python
+import os
+from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
+
+PATH_TO_DONORS_DIR = "data/donors_dir"
+PATH_TO_DONORS_GRAPH = "output/donors_graph.pkl"
+
+os.makedirs("output", exist_ok=True)
+
+build_matching = BuildMatchingGraph(PATH_TO_DONORS_DIR)
+graph = build_matching.graph  # access the donors' graph
+
+build_matching.to_pickle(PATH_TO_DONORS_GRAPH)  # save the donors' graph to pickle
+```
+
+### Search & Match Before Imputation of Patients
+The function `matching` finds matches with up to 3 mismatches and returns a `pandas.DataFrame` of the matches, sorted by number of mismatches and their score.
+
+The function takes these parameters:
+* match_graph: a grma donors' graph object - `grma.match.Graph`
+* grim_config_file: a path to the `grim` configuration file
+
+```python
+from grma.match import Graph, matching
+
+PATH_TO_DONORS_GRAPH = "data/donors_graph.pkl"
+PATH_CONFIG_FILE = "conf/minimal-configuration.json"
+
+# The donors' graph we built earlier
+donors_graph = Graph.from_pickle(PATH_TO_DONORS_GRAPH)
+
+# matching_results is a dict - {patient_id: the patient's result dataframe}
+matching_results = matching(donors_graph, PATH_CONFIG_FILE, search_id=1, donors_info=[],
+                            threshold=0.1, cutof=100, save_to_csv=True, output_dir="results")
+```
+
+`matching` takes some optional parameters, which you might want to change:
+
+* search_id: An integer identifier for the search. Default is 0.
+* donors_info: An iterable of fields from the database to include in the results. Default is None.
+* threshold: Minimal score value for a valid match. Default is 0.1.
+* cutof: Maximum number of matches to return. Default is 50.
+* verbose: A boolean flag for whether to print verbose output. Default is False.
+* save_to_csv: A boolean flag for whether to save the matching results to a CSV file. Default is False. If set to True, the function generates a directory named `search_1` upon completion.
+* `output_dir`: Output directory to write the match results file to.
+
+### Search & Match After Imputation of Patients
+
+The function `find_matches` finds matches with up to 3 mismatches and returns a `pandas.DataFrame` of the matches,
+sorted by number of mismatches and their score.
+
+It takes these parameters:
+* imputation_filename: a path to the file with the patients' typings.
+* match_graph: a grma donors' graph object - `grma.match.Graph`
+
+```python
+from grma.match import Graph, find_matches
+
+PATH_TO_PATIENTS_FILE = "data/patients_file.txt"
+PATH_TO_DONORS_GRAPH = "output/donors_graph.pkl"
+
+# The donors' graph we built earlier
+donors_graph = Graph.from_pickle(PATH_TO_DONORS_GRAPH)
+matching_results = find_matches(PATH_TO_PATIENTS_FILE, donors_graph)
+
+# matching_results is a dict - {patient_id: the patient's result dataframe}
+
+for patient, df in matching_results.items():
+    # Use the dataframe 'df' with the results for 'patient' here
+    print(patient, df)
+```
+
+`find_matches` takes some optional parameters, which you might want to change:
+* search_id: An integer identifier for the search. Default is 0.
+* donors_info: An iterable of fields from the database to include in the results. Default is None.
+* threshold: Minimal score value for a valid match. Default is 0.1.
+* cutof: Maximum number of matches to return. Default is 50.
+* verbose: A boolean flag for whether to print verbose output. Default is False.
+* save_to_csv: A boolean flag for whether to save the matching results to a CSV file. Default is False.
+If set to True, the function generates a directory named `Matching_Results_1` upon completion.
+* calculate_time: A boolean flag for whether to return the matching time per patient. Default is False.
+In case `calculate_time=True`, the output will be a dict like this: `{patient_id: (results_dataframe, time)}`
+* `output_dir`: Output directory to write the match results file to.
+
+### Set Database
+To include more donor information in the matching results than the matching fields alone,
+one can set a database that holds all the donors' information.
+The database must be a `pandas.DataFrame` whose index holds the donors' IDs.
+
+After setting the database, when calling one of the matching functions,
+you may pass in the `donors_info` parameter a `list` of the column names you want joined to the result dataframe from the database.
+
+Example of setting the database:
+
+```python
+import pandas as pd
+from grma.match import set_database
+
+donors = [0, 1, 2]
+database = pd.DataFrame([[30], [32], [25]], columns=["Age"], index=donors)
+
+set_database(database)
+```
+
+
+# How to contribute:
+
+1. Fork the repository: https://github.com/nmdp-bioinformatics/py-graph-match.git
 2. Clone the repository locally
 ```shell
-    git clone git@github.com:pbashyal-nmdp/urban-potato.git
-    cd urban-potato
+    git clone https://github.com/<Your-Github-ID>/py-graph-match.git
+    cd py-graph-match
 ```
 3. Make a virtual environment and activate it, run `make venv`
 ```shell
@@ -58,18 +226,18 @@ How to use the template:
 | |-- HLA_alleles.py
 | `-- SLUG_match.py
 `-- unit
-    `-- test_my_project_template.py
+    `-- test_py-graph-match.py
 ```
-8. Package Module files go in the `my_project_template` directory.
+8. Package Module files go in the `py-graph-match` directory.
 ```
-my_project_template
+py-graph-match
 |-- __init__.py
 |-- algorithm
 |   `-- match.py
 |-- model
 |   |-- allele.py
 |   `-- slug.py
-`-- my_project_template.py
+`-- py-graph-match.py
 ```
 9. Run all tests with `make test` or different tests with `make behave` or `make pytest`. `make behave` will generate report files and open the browser to the report.
 10. Use `python app.py` to run the Flask service app in debug mode. Service will be available at http://localhost:8080/
````
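The README above describes the match results as per-patient DataFrames sorted by number of mismatches and then by score, filtered by `threshold` and capped by `cutof`. As a rough illustration of that ordering only (a sketch with hypothetical match records and invented field names, not grma's actual result schema or internal logic):

```python
# Hypothetical match records; "donor_id", "mismatches", "score" are illustrative
# names, not necessarily the columns grma produces.
matches = [
    {"donor_id": 13, "mismatches": 1, "score": 0.82},
    {"donor_id": 12, "mismatches": 0, "score": 0.95},
    {"donor_id": 14, "mismatches": 1, "score": 0.91},
]

threshold = 0.1  # minimal score for a valid match (cf. the `threshold` parameter)
cutoff = 50      # maximum number of matches to keep (cf. the `cutof` parameter)

# Sort by fewest mismatches first, then by highest score, as the README describes.
ranked = sorted(
    (m for m in matches if m["score"] >= threshold),
    key=lambda m: (m["mismatches"], -m["score"]),
)[:cutoff]

print([m["donor_id"] for m in ranked])  # -> [12, 14, 13]
```

The perfect match (0 mismatches) comes first regardless of score; ties on mismatch count are broken by score.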

conf/minimal-configuration.json

Lines changed: 55 additions & 0 deletions

```diff
@@ -0,0 +1,55 @@
+{
+    "populations": [
+        "CAU"
+    ],
+    "freq_trim_threshold": 1e-5,
+    "priority": {
+        "alpha": 0.4999999,
+        "eta": 0,
+        "beta": 1e-7,
+        "gamma": 1e-7,
+        "delta": 0.4999999
+    },
+    "UNK_priors": "SR",
+    "FULL_LOCI": "ABCQR",
+    "loci_map": {
+        "A": 1,
+        "B": 2,
+        "C": 3,
+        "DQB1": 4,
+        "DRB1": 5
+    },
+
+    "factor_missing_data": 0.0001,
+    "Plan_B_Matrix": [
+        [[1, 2, 3, 4, 5]],
+        [[1, 2, 3], [4, 5]],
+        [[1], [2, 3], [4, 5]],
+        [[1, 2, 3], [4], [5]],
+        [[1], [2, 3], [4], [5]],
+        [[1], [2], [3], [4], [5]]
+    ],
+    "planb": true,
+    "number_of_options_threshold": 100000,
+    "epsilon": 1e-3,
+    "number_of_results": 10,
+    "number_of_pop_results": 100,
+    "output_MUUG": true,
+    "output_haplotypes": true,
+    "freq_data_dir": "data/freqs",
+    "freq_file": "data/hpf.csv",
+    "graph_files_path": "output/csv/",
+    "node_csv_file": "nodes.csv",
+    "edges_csv_file": "edges.csv",
+    "info_node_csv_file": "info_node.csv",
+    "top_links_csv_file": "top_links.csv",
+    "imputation_in_file": "data/patients.txt",
+    "imputation_out_umug_freq_filename": "don.umug",
+    "imputation_out_umug_pops_filename": "don.umug.pops",
+    "imputation_out_hap_freq_filename": "don.pmug",
+    "imputation_out_hap_pops_filename": "don.pmug.pops",
+    "imputation_out_miss_filename": "don.miss",
+    "imputation_out_problem_filename": "don.problem",
+    "max_haplotypes_number_in_phase": 100,
+    "imuptation_out_path": "output"
+}
```
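This configuration file is the one passed to `matching` as `grim_config_file`. A minimal sketch of loading it and reading the locus mapping with the standard library (the config fragment is inlined here for illustration; in practice you would `json.load` the file at `conf/minimal-configuration.json`, and the sanity check is my own, not part of grim):

```python
import json

# Trimmed copy of the configuration shown above, inlined for illustration.
raw = """
{
    "populations": ["CAU"],
    "freq_trim_threshold": 1e-5,
    "loci_map": {"A": 1, "B": 2, "C": 3, "DQB1": 4, "DRB1": 5},
    "planb": true,
    "imputation_in_file": "data/patients.txt"
}
"""

conf = json.loads(raw)

# loci_map assigns each locus a 1-based position; the Plan_B_Matrix entries in
# the full file group loci by these numbers.
loci = sorted(conf["loci_map"], key=conf["loci_map"].get)
print(loci)  # -> ['A', 'B', 'C', 'DQB1', 'DRB1']
```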

data/donors_dir/donors.txt

Lines changed: 3 additions & 0 deletions

```diff
@@ -0,0 +1,3 @@
+12,A*01:01+A*01:01^B*07:02+B*57:01^C*06:02+C*07:02^DQB1*03:03+DQB1*06:02^DRB1*07:01+DRB1*15:01,1,0
+13,A*01:02+A*01:01^B*07:02+B*57:01^C*06:02+C*07:02^DQB1*03:03+DQB1*06:02^DRB1*07:01+DRB1*15:01,1,0
+14,A*01:02+A*02:02^B*07:02+B*57:01^C*01:02+C*07:02^DQB1*03:03+DQB1*06:02^DRB1*07:01+DRB1*15:01,1,0
```
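Each donor line above is a comma-separated record whose second field is a GL-string-style typing: loci separated by `^`, and the two alleles at a locus joined by `+`. A small parsing sketch, as my own illustration of the format (grma/grim provide their own parsers; `parse_donor_line` is a hypothetical helper):

```python
def parse_donor_line(line: str):
    """Split one donors.txt record into (donor_id, {locus: [allele1, allele2]})."""
    fields = line.strip().split(",")
    donor_id, typing = fields[0], fields[1]
    genotype = {}
    for locus_block in typing.split("^"):  # loci are separated by '^'
        alleles = locus_block.split("+")   # the two alleles at a locus by '+'
        locus = alleles[0].split("*")[0]   # e.g. 'A*01:01' -> 'A'
        genotype[locus] = alleles
    return donor_id, genotype

line = ("12,A*01:01+A*01:01^B*07:02+B*57:01^C*06:02+C*07:02"
        "^DQB1*03:03+DQB1*06:02^DRB1*07:01+DRB1*15:01,1,0")
donor_id, genotype = parse_donor_line(line)
print(donor_id, genotype["DRB1"])  # -> 12 ['DRB1*07:01', 'DRB1*15:01']
```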
