Add script to parse disgenet data

JDRomano2 · JDRomano2 · commit ae7d6c8fe611 · 2022-11-07T09:30:31.000-05:00
diff --git a/BUILD.org b/BUILD.org
@@ -103,10 +103,10 @@ from within Windows, etc.).
 
 Now that you have the two data files, you should run the AlzKB script
 we wrote to filter for rows in those files corresponding to
-Alzheimer's Disease. This script is in the =scripts/= directory of the
-AlzKB repository, so either find it on your local filesystem if you
-already have a copy of the repository, or find it on the AlzKB page of
-GitHub.
+Alzheimer's Disease. This script, =alzkb_parse_disgenet.py=, is in the
+=scripts/= directory of the AlzKB repository, so either find it on
+your local filesystem if you already have a copy of the repository, or
+open the AlzKB GitHub repository in your web browser and find it there.
 
 You can then run the Python script from within the =disgenet/=
 directory, which should deposit two filtered data files in the
diff --git a/scripts/alzkb_parse_disgenet.py b/scripts/alzkb_parse_disgenet.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+## created by Yun Hao and Joe Romano @MooreLab 2022
+## This script parses DisGeNET gene-disease relationship data to extract relationships specific to Alzheimer's disease
+
+# NOTE: This file must be run from the `disgenet/` directory containing the original TSV files referenced below!
+# Both output files will be deposited into the `disgenet/CUSTOM/` directory.
+
+import pandas as pd
+
+from pathlib import Path
+
+# Load the two DisGeNET TSV files (tab-separated, first row is the header).
+disgenet_df = pd.read_csv("./disease_mappings_to_attributes.tsv", sep="\t", header=0)
+disgenet_do_df = pd.read_csv("./disease_mappings.tsv", sep="\t", header=0)
+# na=False: a missing name counts as a non-match, so the mask never holds NaN (which .loc rejects).
+disgenet_ad_df = disgenet_df.loc[disgenet_df["name"].str.contains("Alzheimer", na=False), :]
+cuis = list(disgenet_ad_df["diseaseId"].unique())
+
+# For adding disease ontology identifiers: keep only rows whose diseaseId matched above
+disgenet_ad_do_df = disgenet_do_df.loc[disgenet_do_df["diseaseId"].isin(cuis), :]
+
+# if we don't have the CUSTOM subdirectory, create it (no error if it already exists)
+Path("CUSTOM").mkdir(exist_ok=True)
+disgenet_ad_df.to_csv("./CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", sep="\t", header=True, index=False)
+disgenet_ad_do_df.to_csv("./CUSTOM/disease_mappings_alzheimer.tsv", sep="\t", header=True, index=False)