Skip to content

Commit 2e9b894

Browse files
committed
added data processing scripts
1 parent d9f27e5 commit 2e9b894

File tree

9 files changed

+2753
-0
lines changed

9 files changed

+2753
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.DS_Store

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
# AddictionKB
22
A knowledge base for substance-use disorders
3+
4+
![image](img/AddictionKB_Schema.png)

data/addictionkb.rdf

Lines changed: 1462 additions & 0 deletions
Large diffs are not rendered by default.

img/AddictionKB_Schema.png

233 KB
Loading

scripts/All_Keywords.csv

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
Keywords
2+
HIV
3+
Human immunodeficiency virus
4+
AIDS
5+
Acquired immunodeficiency syndrome
6+
Pneumocystis pneumonia
7+
PCP
8+
Kaposi's sarcoma
9+
Cytomegalovirus infection
10+
Tuberculosis
11+
TB
12+
Candidiasis oral
13+
Candidiasis esophageal
14+
Cryptococcosis
15+
Toxoplasmosis
16+
HIV-associated neurocognitive disorder
17+
Non-Hodgkin lymphoma
18+
Wasting syndrome
19+
Herpes simplex virus infection
20+
Progressive multifocal leukoencephalopathy
21+
PML
22+
Hepatitis B virus infection
23+
Hepatitis C virus infection
24+
Human papillomavirus infection
25+
HPV
26+
Mycobacterium avium complex infection
27+
MAC
28+
Recurrent bacterial pneumonia
29+
Isosporiasis
30+
Histoplasmosis
31+
Coccidioidomycosis
32+
Aspergillosis
33+
Cytomegalovirus retinitis
34+
Cryptosporidiosis
35+
Salmonella septicemia
36+
Microsporidiosis
37+
HIV encephalopathy
38+
Peripheral neuropathy
39+
Seizure disorders
40+
Depression
41+
Bipolar disorder
42+
Cervical cancer
43+
Primary central nervous system lymphoma
44+
Anal squamous cell carcinoma
45+
Recurrent sinusitis
46+
Chronic obstructive pulmonary disease
47+
COPD
48+
Bronchiectasis
49+
Esophageal ulceration
50+
Non-alcoholic fatty liver disease
51+
NAFLD
52+
Pancreatitis
53+
Chronic diarrhea
54+
Pulmonary hypertension
55+
Atherosclerosis
56+
Diabetes mellitus
57+
Dyslipidemia
58+
HIV-associated nephropathy
59+
HIVAN
60+
Chronic kidney disease
61+
CKD
62+
Seborrheic dermatitis
63+
Psoriasis
64+
Eosinophilic folliculitis
65+
Osteoporosis
66+
Myositis
67+
Thrombocytopenia
68+
Anemia
69+
Rheumatoid arthritis
70+
Systemic lupus erythematosus
71+
Syphilis
72+
Gonorrhea
73+
Human T-cell leukemia virus type 1
74+
HTLV-1
75+
Leishmaniasis
76+
Strongyloidiasis
77+
Fat redistribution syndrome
78+
Lipodystrophy
79+
Metabolic syndrome
80+
Chronic fatigue syndrome
81+
HIV wasting syndrome
82+
Immune reconstitution inflammatory syndrome
83+
IRIS
84+
Post-traumatic stress disorder
85+
PTSD
86+
Substance use disorders
87+
Anxiety disorders
88+
Substance use
89+
Addiction
90+
Dependence
91+
Withdrawal
92+
Intoxication
93+
Craving
94+
Overdose
95+
Sedative
96+
Hypnotic
97+
Barbiturate
98+
Benzodiazepine
99+
Hallucination-inducing
100+
Psychedelic
101+
Ecstasy
102+
MDMA
103+
Synthetic drug
104+
Fentanyl
105+
Kratom
106+
Designer drug
107+
Mania
108+
Cyclothymic
109+
Dysthymic
110+
Suicide
111+
Self-harm
112+
Eating disorder
113+
Anorexia
114+
Bulimia
115+
Binge eating
116+
Insomnia
117+
Somatic symptom
118+
Conversion disorder
119+
Hypochondriasis
120+
Dissociative disorder
121+
Identity disorder
122+
Amnesia
123+
Fugue
124+
Paranoia
125+
Impulse-control disorder
126+
Conduct disorder
127+
Oppositional defiant disorder
128+
ODD
129+
Pathological gambling
130+
Hoarding
131+
Trichotillomania
132+
Skin-picking
133+
Body dysmorphic disorder
134+
Intermittent explosive disorder
135+
Somatoform
136+
Delirium
137+
Psychosis
138+
Developmental disorder
139+
Learning disability
140+
Cognitive impairment
141+
Intellectual disability
142+
Speech disorder
143+
Language disorder
144+
Social anxiety
145+
Executive dysfunction
146+
Acute stress disorder
147+
Trauma
148+
Resilience
149+
Emotional dysregulation
150+
Behavioral health
151+
Mental health
152+
Emotional disturbance
153+
Psychological distress
154+
Psychosomatic
155+
Drug
156+
Opioid
157+
Morphine
158+
Narcotic
159+
Opiate
160+
Heroin
161+
Cocaine
162+
Nicotine
163+
Tobacco
164+
Alcohol
165+
Cannabis
166+
Marijuana
167+
Amphetamine
168+
Methamphetamine
169+
Stimulant
170+
Hallucinogen
171+
Inhalant
172+
Schizophrenia
173+
Psychotic disorder
174+
Delusion
175+
Bipolar
176+
Depression
177+
Depressive
178+
Affective
179+
Mood
180+
Phobia
181+
Anxiety
182+
OCD
183+
Obsessive
184+
Stress
185+
Fear of
186+
Neurotic
187+
PTSD
188+
Post-traumatic
189+
Adjustment
190+
Mood disorder
191+
Personality disorder
192+
Mental disorder
193+
Autistic
194+
Rett
195+
Asperger
196+
ADHD
197+
Attention-deficit hyperactivity
198+
Tic
199+
Tourette
200+
Mental disorder
201+
Psychiatric
202+
Hallucination

scripts/addkb_parse_disgenet.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# This script parses DisGeNET gene-disease relationship data to extract relationships specific to HIV and substance use disorders.

import pandas as pd
from pathlib import Path

# Files downloaded from the disgenet website - all files are stored in the OneDrive folder.
# The values below are placeholders: replace them with the real locations before running.
DISEASE_ATTRIBUTES_FILE = "Path to file disease_mappings_to_attributes.tsv from disgenet"
DISEASE_MAPPINGS_FILE = "Path to file disease_mappings.tsv from disgenet"

# Keywords used to filter the data - suggested by literature review and domain experts.
# Resolved relative to this script so the run does not depend on one user's home directory.
KEYWORDS_FILE = Path(__file__).resolve().parent / "All_Keywords.csv"

# Destinations for the filtered tables (placeholders, edit before running).
FILTERED_ATTRIBUTES_FILE = "/Path to store filtered data/disease_mappings_to_attributes_addkb_all.tsv"
FILTERED_MAPPINGS_FILE = "Path to store filtered data/disease_mappings_addkb_all.tsv"


def _unique_keywords(keywords):
    """Return the non-missing keywords as strings, de-duplicated in first-seen order.

    De-duplication ignores case because the matching below is case-insensitive;
    without it a keyword listed twice would emit every matching row twice.
    """
    unique = {}
    for raw in keywords:
        if pd.isna(raw):
            # str(NaN) is "nan", which would otherwise be used as a search term.
            continue
        keyword = str(raw)
        if keyword:
            unique.setdefault(keyword.casefold(), keyword)
    return list(unique.values())


def filter_by_keywords(disease_df, keywords):
    """Select the rows of *disease_df* whose ``name`` contains any keyword.

    Args:
        disease_df: DataFrame with a string ``name`` column.
        keywords: Iterable of search terms (missing values are ignored).

    Returns:
        A new DataFrame holding one copy of each matching row per matching
        keyword, with an extra ``keyword`` column naming the term that
        matched. The input frame is not modified.
    """
    names = disease_df["name"]
    matches = []
    for keyword in _unique_keywords(keywords):
        print(keyword)
        # regex=False: keywords are literal text, not patterns.
        # na=False: rows without a name never match (a NaN mask would make .loc fail).
        # NOTE(review): this is a substring test, so short keywords such as "TB"
        # or "Tic" also match inside longer words - confirm that is intended.
        mask = names.str.contains(keyword, case=False, regex=False, na=False)
        matched = disease_df.loc[mask, :].copy()
        matched["keyword"] = keyword
        matches.append(matched)

    if not matches:
        # pd.concat raises on an empty list; return an empty frame of the same shape.
        return disease_df.iloc[0:0].assign(keyword="")
    return pd.concat(matches)


def join_keywords_per_disease(matched_df):
    """Return one row per ``diseaseId`` with its keywords joined by ``", "``."""
    return (
        matched_df.groupby("diseaseId")["keyword"]
        .apply(lambda group: ", ".join(group))
        .reset_index()
    )


def main():
    """Read the DisGeNET tables, filter them by keyword and write the results."""
    disgenet_df = pd.read_csv(DISEASE_ATTRIBUTES_FILE, sep="\t", header=0)
    disgenet_do_df = pd.read_csv(DISEASE_MAPPINGS_FILE, sep="\t", header=0)
    keyword_list = pd.read_csv(KEYWORDS_FILE)

    disgenet_opioid_all_df = filter_by_keywords(disgenet_df, keyword_list["Keywords"])
    print("Size of disgenet_opioid_all_df: ", disgenet_opioid_all_df.shape)

    # Get unique disease IDs
    cuis = disgenet_opioid_all_df["diseaseId"].unique()

    # Filter the disease ontology DataFrame
    disgenet_opioid_all_do_df = disgenet_do_df.loc[disgenet_do_df.diseaseId.isin(cuis), :].copy()

    # Group keywords by disease ID and attach them to the ontology mappings
    keyword_groups = join_keywords_per_disease(disgenet_opioid_all_df)
    disgenet_opioid_all_do_df = disgenet_opioid_all_do_df.merge(keyword_groups, on="diseaseId", how="left")

    # print the shape of the final DataFrame
    print("Size of disgenet_opioid_all_do_df: ", disgenet_opioid_all_do_df.shape)

    # Convert the DataFrames to tab-separated files
    disgenet_opioid_all_df.to_csv(FILTERED_ATTRIBUTES_FILE, sep="\t", header=True, index=False)
    disgenet_opioid_all_do_df.to_csv(FILTERED_MAPPINGS_FILE, sep="\t", header=True, index=False)


if __name__ == "__main__":
    main()

scripts/addkb_parse_ncbigene.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# This script parses NCBI human gene data and Bgee expression data
2+
3+
# Module-level accumulator: processLargeTextFile adds one column value per input line
# (with any 'Ensembl:' prefix removed), and the set is later passed to fileIndexFinder
# as the collection of identifiers to keep.
my_set = set()
4+
5+
def processLargeTextFile(source, compare_index, separator, keep_set=None):
    """Collect one column of a delimited text file into a set.

    Every line of *source* is read, including the first one, so a header
    label ends up in the set as well. Any ``'Ensembl:'`` text in the value is
    removed before it is stored.

    Args:
        source: Path of the delimited text file to read.
        compare_index: Zero-based index of the column to collect.
        separator: Column delimiter.
        keep_set: Set to add the values to. Defaults to the module-level
            ``my_set`` so existing three-argument calls behave as before.

    Returns:
        The set that received the values.

    Raises:
        IndexError: If a line has fewer than ``compare_index + 1`` columns.
    """
    if keep_set is None:
        keep_set = my_set

    # count to see how many lines are added to the set
    count = 0
    with open(source, "r") as r:
        for line in r:
            # Drop the line terminator first so it is not stored as part of
            # the value when the requested column is the last one.
            columns = line.rstrip("\n").split(separator)
            keep_set.add(columns[compare_index].replace('Ensembl:', ''))
            count += 1
    # The with-statement has already closed the file; no explicit close needed.
    print("Length from processLargeTextFile: ", count)
    return keep_set
15+
16+
def keepDesiredColums(row, keep_index, separator):
    """Return *row* reduced to the columns listed in *keep_index*.

    Args:
        row: One line of delimited text, with or without its line terminator.
        keep_index: Zero-based column indices to keep, in output order.
        separator: Column delimiter, also used to join the kept columns.

    Returns:
        The selected columns joined by *separator*, without a line terminator.

    Raises:
        IndexError: If *row* has fewer columns than an index in *keep_index*.
    """
    # Strip the terminator before splitting; otherwise it stays glued to the
    # last column and is emitted inside the output whenever that column is kept.
    columns = row.rstrip("\r\n").split(separator)
    return separator.join([columns[index] for index in keep_index])
24+
25+
def filterLargeTextFile(source, destination, delimiter, keep_index):
    """Copy *source* to *destination*, keeping only the selected columns.

    The first line is treated as the header and is projected like every other
    row. Completely empty lines are skipped instead of raising IndexError.

    Args:
        source: Path of the delimited text file to read.
        destination: Path of the file to write (overwritten if it exists).
        delimiter: Column delimiter of both files.
        keep_index: Zero-based column indices to keep, in output order.

    Returns:
        The number of body rows written (the header is not counted).

    Raises:
        IndexError: If a non-empty line has fewer columns than *keep_index* needs.
    """
    count = 0
    with open(source, "r") as r, open(destination, "w") as w:
        # load header row; an empty source file has none
        header = r.readline()
        if header.strip("\r\n"):
            w.write(keepDesiredColums(header, keep_index, delimiter) + '\n')

        # load body
        for line in r:
            # Lines yielded by file iteration are never None; the meaningful
            # case to guard against is an empty line, which has no columns.
            if not line.strip("\r\n"):
                continue
            count += 1
            w.write(keepDesiredColums(line, keep_index, delimiter) + '\n')
    # Both files are closed by the with-statement.
    print("Length from filterLargeTextFile: ", count)
    return count
38+
39+
def fileIndexFinder(source, destination, keep_set, compare_column_index, separator):
    """Write the rows of *source* whose Ensembl identifier is in *keep_set*.

    The cross-reference column is a ``|``-separated list of ``Database:ID``
    entries. The entry that starts with ``Ensembl:`` is located by its prefix
    rather than by a fixed position, so rows with more or fewer
    cross-references are still matched. When no such entry exists the raw
    column value is compared against *keep_set* instead.

    Each matching row is written to *destination* prefixed with the matched
    identifier; the header line is copied with an ``Ensembl`` label prepended.

    Args:
        source: Path of the delimited text file to read.
        destination: Path of the file to write (overwritten if it exists).
        keep_set: Collection of identifiers to keep.
        compare_column_index: Zero-based index of the cross-reference column.
        separator: Column delimiter of both files.

    Returns:
        The number of rows written (the header is not counted).
    """
    ensembl_prefix = 'Ensembl:'
    count_rows = 0
    with open(source, "r") as r, open(destination, "w") as w:
        w.write('Ensembl' + separator + r.readline())

        for line in r:
            # Split without the terminator so the last column compares cleanly.
            columns = line.rstrip("\n").split(separator)
            if len(columns) <= compare_column_index:
                # Blank or truncated line: nothing to compare.
                continue

            parsed_column = columns[compare_column_index]
            for xref in parsed_column.split('|'):
                if xref.startswith(ensembl_prefix):
                    parsed_column = xref[len(ensembl_prefix):]
                    break

            if parsed_column in keep_set:  # remove the if condition if you want to follow AlzKB conventions
                count_rows += 1
                w.write(parsed_column + separator + line)
    # Both files are closed by the with-statement.
    print("Length from fileIndexFinder: ", count_rows)
    return count_rows
60+
61+
62+
63+
# Input files (placeholders - replace with the real locations before running).
brain_file='Path to file Homo_sapiens_expr_advanced.tsv' #https://bgee.org/?page=download&action=expr_calls#id1
gene_file='Path to file Homo_sapiens.gene_info' #https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz

# Intermediate file: gene_file reduced to the columns in keep_index.
gene_dest_file='Path to save Homo_sapiens.gene_info_filtered'

# Final file: filtered gene rows whose identifier was seen in brain_file.
final_out='Path to save output.tsv'

delimiter = '\t'
# Columns of gene_file to keep.
# NOTE(review): presumably GeneID, Symbol, Synonyms, dbXrefs, chromosome,
# description and type_of_gene in the NCBI gene_info layout - verify.
keep_index = [1,2,4,5,6,8,9]
# Column of brain_file holding the identifier to collect.
compare_index = 0

# Run the pipeline only when executed as a script, so importing this module
# for its helper functions does not start reading and writing files.
if __name__ == "__main__":
    processLargeTextFile(brain_file, compare_index, delimiter)

    filterLargeTextFile(gene_file, gene_dest_file, delimiter, keep_index)

    print("Length of my_set: ", len(my_set))

    # 3 is the position inside keep_index of original column 5, i.e. the
    # column of the filtered file that fileIndexFinder parses for identifiers.
    fileIndexFinder(gene_dest_file, final_out, my_set, 3, delimiter)

0 commit comments

Comments
 (0)