EpistasisLab
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎data/addictionkb.rdf‎
Lines changed: 1462 additions & 0 deletions b/‎data/addictionkb.rdf‎
Lines changed: 1462 additions & 0 deletions
diff --git a/‎img/AddictionKB_Schema.png‎
233 KB b/‎img/AddictionKB_Schema.png‎
233 KB
diff --git a/‎scripts/All_Keywords.csv‎
Lines changed: 202 additions & 0 deletions b/‎scripts/All_Keywords.csv‎
Lines changed: 202 additions & 0 deletions
diff --git a/‎scripts/addkb_parse_disgenet.py‎
Lines changed: 44 additions & 0 deletions b/‎scripts/addkb_parse_disgenet.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎scripts/addkb_parse_ncbigene.py‎
Lines changed: 80 additions & 0 deletions b/‎scripts/addkb_parse_ncbigene.py‎
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1 @@
+.DS_Store
@@ -1,2 +1,4 @@
 # AddictionKB
 A knowledge base for substance-use disorders
+
+![AddictionKB schema](img/AddictionKB_Schema.png)
@@ -0,0 +1,202 @@
+Keywords
+HIV
+Human immunodeficiency virus
+AIDS
+Acquired immunodeficiency syndrome
+Pneumocystis pneumonia 
+PCP
+Kaposi's sarcoma
+Cytomegalovirus infection
+Tuberculosis 
+TB
+Candidiasis oral 
+Candidiasis esophageal
+Cryptococcosis
+Toxoplasmosis
+HIV-associated neurocognitive disorder 
+Non-Hodgkin lymphoma
+Wasting syndrome
+Herpes simplex virus infection
+Progressive multifocal leukoencephalopathy 
+PML
+Hepatitis B virus infection
+Hepatitis C virus infection
+Human papillomavirus infection
+HPV
+Mycobacterium avium complex infection
+MAC
+Recurrent bacterial pneumonia
+Isosporiasis
+Histoplasmosis
+Coccidioidomycosis
+Aspergillosis
+Cytomegalovirus retinitis
+Cryptosporidiosis
+Salmonella septicemia
+Microsporidiosis
+HIV encephalopathy
+Peripheral neuropathy
+Seizure disorders
+Depression
+Bipolar disorder
+Cervical cancer
+Primary central nervous system lymphoma
+Anal squamous cell carcinoma
+Recurrent sinusitis
+Chronic obstructive pulmonary disease
+COPD
+Bronchiectasis
+Esophageal ulceration
+Non-alcoholic fatty liver disease 
+NAFLD
+Pancreatitis
+Chronic diarrhea
+Pulmonary hypertension
+Atherosclerosis
+Diabetes mellitus
+Dyslipidemia
+HIV-associated nephropathy 
+HIVAN
+Chronic kidney disease 
+CKD
+Seborrheic dermatitis
+Psoriasis
+Eosinophilic folliculitis
+Osteoporosis
+Myositis
+Thrombocytopenia
+Anemia
+Rheumatoid arthritis
+Systemic lupus erythematosus
+Syphilis
+Gonorrhea
+Human T-cell leukemia virus type 1 
+HTLV-1
+Leishmaniasis
+Strongyloidiasis
+Fat redistribution syndrome 
+Lipodystrophy
+Metabolic syndrome
+Chronic fatigue syndrome
+HIV wasting syndrome
+Immune reconstitution inflammatory syndrome 
+IRIS
+Post-traumatic stress disorder 
+PTSD
+Substance use disorders
+Anxiety disorders
+Substance use
+Addiction
+Dependence
+Withdrawal
+Intoxication
+Craving
+Overdose
+Sedative
+Hypnotic
+Barbiturate
+Benzodiazepine
+Hallucination-inducing
+Psychedelic
+Ecstasy 
+MDMA
+Synthetic drug
+Fentanyl
+Kratom
+Designer drug
+Mania
+Cyclothymic
+Dysthymic
+Suicide
+Self-harm
+Eating disorder
+Anorexia
+Bulimia
+Binge eating
+Insomnia
+Somatic symptom
+Conversion disorder
+Hypochondriasis
+Dissociative disorder
+Identity disorder
+Amnesia
+Fugue
+Paranoia
+Impulse-control disorder
+Conduct disorder
+Oppositional defiant disorder 
+ODD
+Pathological gambling
+Hoarding
+Trichotillomania
+Skin-picking
+Body dysmorphic disorder
+Intermittent explosive disorder
+Somatoform
+Delirium
+Psychosis
+Developmental disorder
+Learning disability
+Cognitive impairment
+Intellectual disability
+Speech disorder
+Language disorder
+Social anxiety
+Executive dysfunction
+Acute stress disorder
+Trauma
+Resilience
+Emotional dysregulation
+Behavioral health
+Mental health
+Emotional disturbance
+Psychological distress
+Psychosomatic
+Drug
+Opioid
+Morphine
+Narcotic
+Opiate
+Heroin
+Cocaine
+Nicotine
+Tobacco
+Alcohol
+Cannabis
+Marijuana
+Amphetamine
+Methamphetamine
+Stimulant
+Hallucinogen
+Inhalant
+Schizophrenia
+Psychotic disorder
+Delusion
+Bipolar
+Depression
+Depressive
+Affective
+Mood
+Phobia
+Anxiety
+OCD
+Obsessive
+Stress
+Fear of
+Neurotic
+PTSD
+Post-traumatic
+Adjustment
+Mood disorder
+Personality disorder
+Mental disorder
+Autistic
+Rett
+Asperger
+ADHD
+Attention-deficit hyperactivity
+Tic
+Tourette
+Mental disorder
+Psychiatric
+Hallucination
@@ -0,0 +1,44 @@
+# This script parses DisGeNET gene-disease relationship data to extract relationships specific to HIV and substance use disorders.
+
+import pandas as pd
+from pathlib import Path
+
+# Input files downloaded from the DisGeNET website (the originals are kept in
+# the project's OneDrive folder). Replace the placeholders with real paths.
+DISGENET_ATTRIBUTES_FILE = "Path to file disease_mappings_to_attributes.tsv from disgenet"
+DISGENET_MAPPINGS_FILE = "Path to file disease_mappings.tsv from disgenet"
+
+# Keywords used to filter the data - suggested by literature review and domain
+# experts. The CSV sits next to this script, so it is resolved relative to the
+# script instead of through one developer's home directory.
+KEYWORDS_FILE = Path(__file__).resolve().parent / "All_Keywords.csv"
+
+# Output locations (placeholders - replace before running).
+FILTERED_ATTRIBUTES_FILE = "Path to store filtered data/disease_mappings_to_attributes_addkb_all.tsv"
+FILTERED_MAPPINGS_FILE = "Path to store filtered data/disease_mappings_addkb_all.tsv"
+
+
+def clean_keywords(raw_keywords):
+    """Return usable keywords from an iterable of raw CSV values.
+
+    Missing values are dropped, surrounding whitespace is stripped (several
+    CSV entries carry a trailing space that would otherwise prevent a match),
+    empty strings are discarded and exact duplicates are removed while the
+    original order is preserved.
+    """
+    stripped = (str(value).strip() for value in raw_keywords if pd.notna(value))
+    return list(dict.fromkeys(value for value in stripped if value))
+
+
+def filter_by_keywords(disease_df, keywords, name_column="name"):
+    """Select the rows whose name contains any of the keywords.
+
+    Args:
+        disease_df: DataFrame with a text column called ``name_column``.
+        keywords: iterable of plain-text keywords.
+        name_column: column searched for the keywords.
+
+    Returns:
+        A DataFrame holding one copy of every matching row per keyword that
+        matched it, with the matching keyword stored in a new ``keyword``
+        column. Matching is a case-insensitive literal substring test.
+    """
+    matches = []
+    for keyword in keywords:
+        print(keyword)
+        # regex=False: keywords are literal text, not patterns.
+        # na=False: rows without a name never match (a NaN mask would raise).
+        mask = disease_df[name_column].str.contains(
+            keyword, case=False, regex=False, na=False
+        )
+        matched = disease_df.loc[mask, :].copy()
+        matched['keyword'] = keyword
+        matches.append(matched)
+
+    if not matches:
+        # pd.concat([]) raises; return an empty frame with the same columns.
+        return disease_df.iloc[0:0].assign(keyword=pd.Series(dtype="object"))
+    return pd.concat(matches)
+
+
+def main():
+    """Filter the DisGeNET disease tables by keyword and write the results."""
+    disgenet_df = pd.read_csv(DISGENET_ATTRIBUTES_FILE, sep="\t", header=0)
+    disgenet_do_df = pd.read_csv(DISGENET_MAPPINGS_FILE, sep="\t", header=0)
+    keywords = clean_keywords(pd.read_csv(KEYWORDS_FILE)['Keywords'])
+
+    disgenet_opioid_all_df = filter_by_keywords(disgenet_df, keywords)
+    print("Size of disgenet_opioid_all_df: ", disgenet_opioid_all_df.shape)
+
+    # Get unique disease IDs
+    cuis = disgenet_opioid_all_df['diseaseId'].unique()
+
+    # Filter the disease ontology DataFrame
+    disgenet_opioid_all_do_df = disgenet_do_df.loc[disgenet_do_df.diseaseId.isin(cuis), :].copy()
+
+    # Group keywords by disease ID and join them (each keyword listed once)
+    keyword_groups = (
+        disgenet_opioid_all_df.groupby('diseaseId')['keyword']
+        .apply(lambda kws: ', '.join(dict.fromkeys(kws)))
+        .reset_index()
+    )
+    disgenet_opioid_all_do_df = disgenet_opioid_all_do_df.merge(keyword_groups, on='diseaseId', how='left')
+
+    # print the shape of the final DataFrame
+    print("Size of disgenet_opioid_all_do_df: ", disgenet_opioid_all_do_df.shape)
+
+    # Convert the DataFrames to TSV files
+    disgenet_opioid_all_df.to_csv(FILTERED_ATTRIBUTES_FILE, sep="\t", header=True, index=False)
+    disgenet_opioid_all_do_df.to_csv(FILTERED_MAPPINGS_FILE, sep="\t", header=True, index=False)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,80 @@
+# This script parses NCBI human gene data and Bgee expression data
+
+# Module-level set of identifiers: processLargeTextFile() adds to it, and the
+# driver code at the bottom of the script passes it to fileIndexFinder() as
+# the set of identifiers to keep.
+my_set = set()
+
+def processLargeTextFile(source, compare_index, separator, target=None):
+    """Collect one column of a delimited text file into a set.
+
+    Every non-blank line of ``source`` (including a header line, if the file
+    has one) is split on ``separator``; the value at ``compare_index`` has any
+    ``Ensembl:`` prefix removed and is added to ``target``.
+
+    Args:
+        source: path of the text file to read.
+        compare_index: zero-based index of the column to collect.
+        separator: column delimiter.
+        target: set that receives the values; defaults to the module-level
+            ``my_set``.
+
+    Returns:
+        The number of lines processed.
+
+    Raises:
+        IndexError: if a non-blank line has no column ``compare_index``.
+    """
+    if target is None:
+        target = my_set
+
+    # count to see how many lines are added to the set
+    count = 0
+    with open(source, "r") as r:
+        for line in r:
+            # Drop the line terminator so it cannot end up inside the value
+            # when the requested column is the last one on the line.
+            line = line.rstrip("\r\n")
+            if not line:
+                continue
+            columns = line.split(separator)
+            target.add(columns[compare_index].replace('Ensembl:', ''))
+            count += 1
+    print("Length from processLargeTextFile: ", count)
+    return count
+
+def keepDesiredColums(row, keep_index, separator):
+    """Return only the selected columns of one delimited row.
+
+    Args:
+        row: one line of text; a trailing line terminator is ignored.
+        keep_index: iterable of zero-based column indexes, in output order.
+        separator: column delimiter, also used to join the result.
+
+    Returns:
+        The selected columns joined by ``separator``, without a line
+        terminator.
+
+    Raises:
+        IndexError: if the row has fewer columns than an index requires.
+    """
+    # Strip the terminator first; otherwise it stays glued to the last column
+    # and callers that append '\n' themselves emit a doubled newline.
+    columns = row.rstrip("\r\n").split(separator)
+    return separator.join(columns[index] for index in keep_index)
+
+def filterLargeTextFile(source, destination, delimiter, keep_index):
+    """Copy a delimited file, keeping only the selected columns.
+
+    The first line of ``source`` is treated as the header and is filtered the
+    same way as the body. Blank lines are skipped. Prints the number of body
+    rows written.
+
+    Args:
+        source: path of the file to read.
+        destination: path of the file to write (overwritten).
+        delimiter: column delimiter of both files.
+        keep_index: zero-based indexes of the columns to keep, in output order.
+    """
+    count = 0
+    with open(source, "r") as r, open(destination, "w") as w:
+        # load header row (an empty source file has none)
+        header = r.readline().rstrip("\r\n")
+        if header:
+            w.write(keepDesiredColums(header, keep_index, delimiter) + '\n')
+
+        # load body
+        for line in r:
+            # The terminator is removed here so exactly one '\n' is written
+            # per row, whichever columns are kept.
+            line = line.rstrip("\r\n")
+            if not line:
+                continue
+            count += 1
+            w.write(keepDesiredColums(line, keep_index, delimiter) + '\n')
+        print("Length from filterLargeTextFile: ", count)
+
+def fileIndexFinder(source, destination, keep_set, compare_column_index, separator, verbose=False):
+    """Write the rows of ``source`` whose identifier is in ``keep_set``.
+
+    The column at ``compare_column_index`` may hold a plain identifier or a
+    ``|``-separated list of cross references such as
+    ``MIM:1|HGNC:HGNC:5|Ensembl:ENSG...``. Every ``Ensembl:`` entry is
+    considered, wherever it sits in the list; if the column has no such
+    entry, its raw value is used as the identifier. Each kept row is written
+    with the matched identifier prepended as a new first column, and the
+    header line gets a leading ``Ensembl`` column.
+
+    Args:
+        source: path of the delimited file to read (first line is a header).
+        destination: path of the file to write (overwritten).
+        keep_set: identifiers to keep.
+        compare_column_index: zero-based index of the identifier column.
+        separator: column delimiter.
+        verbose: when true, print every parsed column value (debug aid).
+
+    Raises:
+        IndexError: if a non-blank row has no column ``compare_column_index``.
+    """
+    prefix = 'Ensembl:'
+    count_rows = 0
+    with open(source, "r") as r, open(destination, "w") as w:
+        w.write('Ensembl' + separator + r.readline().rstrip("\r\n") + '\n')
+
+        for line in r:
+            stripped = line.rstrip("\r\n")
+            if not stripped:
+                continue
+            parsed_column = stripped.split(separator)[compare_column_index]
+            if verbose:
+                print("Parsed column before splitting or processing: ", parsed_column)
+
+            # Look the Ensembl entry up by its prefix rather than by position:
+            # its place in the xref list shifts when other xrefs are absent.
+            candidates = [
+                xref[len(prefix):]
+                for xref in parsed_column.split('|')
+                if xref.startswith(prefix)
+            ] or [parsed_column]
+
+            # remove the membership test if you want to follow AlzKB conventions
+            match = next((c for c in candidates if c in keep_set), None)
+            if match is not None:
+                count_rows += 1
+                w.write(match + separator + line)
+    print("Length from fileIndexFinder: ", count_rows)
+
+
+
+# Input files (placeholders - replace with real paths before running).
+brain_file='Path to file Homo_sapiens_expr_advanced.tsv' #https://bgee.org/?page=download&action=expr_calls#id1
+gene_file='Path to file Homo_sapiens.gene_info' #https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
+
+# Intermediate file: gene_file reduced to the columns listed in keep_index.
+gene_dest_file='Path to save Homo_sapiens.gene_info_filtered'
+
+# Final output: filtered gene rows whose identifier also occurs in brain_file.
+final_out='Path to save output.tsv'
+
+delimiter = '\t'
+# Columns of gene_file that are copied into gene_dest_file.
+keep_index = [1,2,4,5,6,8,9]
+# Column of brain_file that holds the identifiers to collect.
+compare_index = 0
+
+
+def main():
+    """Run the three processing steps in order."""
+    processLargeTextFile(brain_file, compare_index, delimiter)
+
+    filterLargeTextFile(gene_file, gene_dest_file, delimiter, keep_index)
+
+    print("Length of my_set: ", len(my_set))
+
+    # Index 3 of the filtered file is original column 5 (the fourth entry of
+    # keep_index) - presumably the dbXrefs column of gene_info; confirm
+    # against the NCBI column layout if keep_index changes.
+    fileIndexFinder(gene_dest_file, final_out, my_set, 3,  delimiter)
+
+
+# Guarded so the functions above can be imported without the pipeline running
+# against the placeholder paths.
+if __name__ == "__main__":
+    main()