Skip to content

Commit c043f3d

Browse files
committed
Update ongoing projects
1 parent f6de8b2 commit c043f3d

File tree

49 files changed

+167224
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+167224
-0
lines changed

MLMarker_app/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .database import Database
2+
from .atlas import Atlas
3+
from .mlmarker import MLMarker
4+
from .utils import sample_predict_class, sample_predict_class_adjusted
5+
from .predictor_atlas import PredictorAtlas

MLMarker_app/atlas.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import pandas as pd
2+
import mysql.connector
3+
4+
class Atlas:
    """Build a protein-expression atlas from a local expression database.

    Pulls peptide/protein/tissue/assay tables from MySQL, keeps proteotypic
    human proteins, computes per-assay NSAF scores and pivots them into a
    protein x tissue matrix.
    """

    def __init__(self, db, tissue, percentage, disease_status):
        """Open the database connection and validate the grouping column.

        db             -- MySQL database name.
        tissue         -- grouping column: 'tissue_name' or 'cell_type'.
        percentage     -- fraction (0-1) of a group's assays a protein must
                          appear in to be kept by filter_protein_data().
        disease_status -- 'Healthy' restricts the final atlas to healthy tissue.

        Raises ValueError when tissue is not a supported column name.
        """
        self.db = db
        self.percentage = percentage
        # NOTE(review): credentials are hard-coded; move to config/env vars.
        self.conn = mysql.connector.connect(user='root', password='password', host='127.0.0.1', port='3306', database=self.db)
        if tissue not in ('tissue_name', 'cell_type'):
            # Raise with a message instead of print() followed by a bare raise.
            raise ValueError(f"Tissue type not defined: {tissue!r} (expected 'tissue_name' or 'cell_type')")
        self.tissue = tissue
        self.disease_status = disease_status

    def check_connection(self):
        """Print whether the MySQL connection is alive."""
        if self.conn.is_connected():
            print("Connection successful")
        else:
            print("No connection")

    def get_pepseq(self):
        """Retrieve protein lengths from the database; caches on self.seqData."""
        seqsql = "SELECT uniprot_id, length FROM protein WHERE length IS NOT NULL"
        seqData = pd.read_sql_query(seqsql, self.conn)
        # Lengths may arrive as strings; non-numeric values become NaN.
        seqData['length'] = pd.to_numeric(seqData['length'], errors='coerce')
        self.seqData = seqData
        return self.seqData

    def get_tissue_data(self):
        """Retrieve the tissue table; caches on self.tissueData."""
        tissuesql = "SELECT tissue_id, tissue_name, cell_type, disease_status FROM tissue"
        tissueData = pd.read_sql_query(tissuesql, self.conn)
        self.tissueData = tissueData
        return self.tissueData

    def get_assay_data(self):
        """Retrieve peptide quantifications per assay; caches on self.assayData."""
        assaysql = "SELECT assay_id, peptide_id, quantification FROM peptide_to_assay"
        assayData = pd.read_sql_query(assaysql, self.conn)
        self.assayData = assayData
        return self.assayData

    def get_assay_tissue_data(self):
        """Retrieve the assay-to-tissue mapping; caches on self.assaytissueData."""
        assaytissuesql = "SELECT assay_id, tissue_id FROM tissue_to_assay"
        assaytissueData = pd.read_sql_query(assaytissuesql, self.conn)
        self.assaytissueData = assaytissueData
        return self.assaytissueData

    def get_pep_data(self):
        """Retrieve the peptide-to-protein mapping; caches on self.pepData."""
        pepsql = "SELECT peptide_to_protein.peptide_id, peptide_to_protein.uniprot_id FROM peptide_to_protein"
        pepData = pd.read_sql_query(pepsql, self.conn)
        self.pepData = pepData
        return self.pepData

    def get_filtered_proteins(self):
        """Keep proteotypic peptides of human proteins with >2 such peptides.

        A peptide is proteotypic when it maps to exactly one protein; a
        protein is kept when it has more than two proteotypic peptides and
        is not a known contaminant/standard (non-human accessions below).
        """
        pepData = self.get_pep_data()
        # Peptides mapping to exactly one protein are proteotypic.
        proteotypicData = pepData.groupby("peptide_id").filter(lambda x: len(x) == 1)
        # Require more than two proteotypic peptides per protein.
        proteins = proteotypicData.groupby("uniprot_id").filter(lambda x: len(x) > 2)
        # Common contaminants / spike-in standards to exclude.
        non_human_proteins = ['TRYP_PIG', 'TRY2_BOVIN', 'TRY1_BOVIN', 'SSPA_STAAU', 'SRPP_HEVBR', 'REF_HEVBR', 'ADH1_YEAST', 'ALBU_BOVIN', 'CAS1_BOVIN', 'CAS2_BOVIN', 'CASK_BOVIN', 'CASB_BOVIN', 'OVAL_CHICK', 'ALDOA_RABIT', 'BGAL_ECOLI', 'CAH2_BOVIN', 'CTRA_BOVIN', 'CTRB_BOVIN', 'CYC_HORSE', 'DHE3_BOVIN', 'GAG_SCVLA', 'GFP_AEQVI', 'K1C15_SHEEP', 'K1M1_SHEEP', 'K2M2_SHEEP', 'K2M3_SHEEP', 'KRA3A_SHEEP', 'KRA3_SHEEP', 'KRA61_SHEEP', 'LALBA_BOVIN', 'LYSC_CHICK', 'LYSC_LYSEN', 'MYG_HORSE', 'K1M2_SHEEP', 'K2M1_SHEEP']
        proteins = proteins[~proteins['uniprot_id'].isin(non_human_proteins)]
        self.proteins = proteins
        return self.proteins

    def get_protein_data(self):
        """Merge protein, tissue and assay data into one long table.

        Keeps assay_id, quantification, uniprot_id and the chosen tissue
        grouping column (self.tissue); caches on self.protData.
        """
        seqData = self.get_pepseq()  # also caches self.seqData for calculate_NSAF()
        tissueData = self.get_tissue_data()
        assayData = self.get_assay_data()
        assaytissueData = self.get_assay_tissue_data()
        tissue_assay = pd.merge(assaytissueData, tissueData, on='tissue_id', how='left')
        tissue_assay = pd.merge(assayData, tissue_assay, on='assay_id', how='left')
        proteins = self.get_filtered_proteins()
        protData = pd.merge(tissue_assay, proteins, on='peptide_id').sort_values(['assay_id', 'uniprot_id'])
        # Drop whichever tissue column is NOT the chosen grouping column.
        if self.tissue == 'tissue_name':
            del protData['cell_type']
        elif self.tissue == 'cell_type':
            del protData['tissue_name']
        del protData['peptide_id']
        del protData['tissue_id']
        del protData['disease_status']
        self.protData = protData
        return self.protData

    def filter_protein_data(self):
        """Filter proteins per tissue group by the percentage threshold.

        Within each group a protein must appear in more than
        percentage * (number of assays in the group) rows. The total number
        of proteins removed is stored on self.reduction.
        """
        protData = self.get_protein_data()
        frames = []
        reduction = []
        for key in protData[self.tissue].unique():
            group = protData[protData[self.tissue] == key]
            perc = self.percentage * len(pd.unique(group['assay_id']))
            before = group['uniprot_id'].nunique()
            group = group.groupby('uniprot_id').filter(lambda x: len(x) > perc)
            after = group['uniprot_id'].nunique()
            reduction.append(before - after)
            frames.append(group)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement (also O(n) instead of repeated copies).
        filteredData = pd.concat(frames) if frames else pd.DataFrame()
        if self.tissue in filteredData.columns:
            del filteredData[self.tissue]
        self.filteredData = filteredData
        self.reduction = sum(reduction)
        return self.filteredData

    def calculate_NSAF(self):
        """Compute NSAF scores per assay.

        SAF = summed spectral counts / protein length; NSAF normalises SAF
        by the per-assay SAF total. Returns assay_id, uniprot_id, NSAF rows;
        caches on self.proteinData.
        """
        filteredData = self.filter_protein_data()
        per_assay = []
        for key in filteredData['assay_id'].unique():
            assay = filteredData[filteredData['assay_id'] == key]
            # drop(columns=...) instead of pop(): the original pop() mutated a
            # slice of filteredData in place (SettingWithCopy hazard).
            grouped = assay.drop(columns='assay_id').groupby('uniprot_id').sum().reset_index()
            # self.seqData was cached by get_pepseq() via get_protein_data().
            seqAddedDF = pd.merge(grouped, self.seqData, on='uniprot_id')
            seqAddedDF['SAF'] = seqAddedDF['quantification'] / seqAddedDF['length']
            seqAddedDF['NSAF'] = seqAddedDF['SAF'] / seqAddedDF['SAF'].sum()
            seqAddedDF = seqAddedDF.drop(columns=['length', 'quantification', 'SAF'])
            seqAddedDF.insert(loc=0, column='assay_id', value=key)
            per_assay.append(seqAddedDF)
        # pd.concat replaces the removed DataFrame.append.
        proteinData = pd.concat(per_assay) if per_assay else pd.DataFrame()
        self.proteinData = proteinData
        return self.proteinData

    def get_predictor_atlas(self):
        """Generate the predictor atlas: a uniprot_id x tissue NSAF matrix.

        When disease_status == 'Healthy', only healthy tissues are included.
        Missing protein/tissue combinations are filled with 0.
        """
        proteinData = self.calculate_NSAF()
        tissueData = self.get_tissue_data()
        if self.disease_status == 'Healthy':
            tissueData = tissueData[tissueData['disease_status'] == "Healthy"]
        self.atlas = pd.merge(proteinData, tissueData, on='assay_id')
        # NOTE(review): the pivot always uses 'tissue_name' as columns, even
        # when the atlas was built with tissue='cell_type' — confirm intended.
        self.atlas = pd.pivot_table(self.atlas, values='NSAF', index='uniprot_id', columns='tissue_name').fillna(0)
        return self.atlas

MLMarker_app/cell_predictors.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pandas as pd
2+
import numpy as np
3+
from xgboost import XGBClassifier
4+
from sklearn.svm import SVC
5+
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
6+
import time
7+
8+
class CellPredictor:
    """Train and cross-validate XGBoost and SVM cell-type classifiers."""

    # (row label in the results table, sklearn scoring name), in the order
    # the averages are reported and the raw score arrays are returned.
    _SCORING = (
        ('Accuracy', 'accuracy'),
        ('f1_macro', 'f1_macro'),
        ('precision_macro', 'precision_macro'),
        ('recall_macro', 'recall_macro'),
        ('f1_weighted', 'f1_weighted'),
        ('precision_weighted', 'precision_weighted'),
        ('recall_weighted', 'recall_weighted'),
    )

    def __init__(self, X_train_path, X_test_path, y_train_path, y_test_path, dict_train_label_weight_path):
        """Load train/test splits and per-class weights from CSV files.

        dict_train_label_weight_path -- CSV whose first column is the class
        label and second column the class weight.
        """
        self.X_train = pd.read_csv(X_train_path)
        self.X_test = pd.read_csv(X_test_path)
        self.y_train = pd.read_csv(y_train_path)
        self.y_test = pd.read_csv(y_test_path)
        dict_train_label_weight = pd.read_csv(dict_train_label_weight_path)
        self.dict_train_label_weights = {row[0]: row[1] for row in dict_train_label_weight.values.tolist()}
        self.train_weights = list(self.dict_train_label_weights.values())
        self.num_classes = len(np.unique(self.y_train))
        print('Data loaded')

    def cv_comparison(self, models, names, X, y, cv):
        """Cross-validate each model on every metric in _SCORING.

        Returns (cv_scores, accs, f1s, precs, recs, f1s_w, precs_w, recs_w):
        a DataFrame of per-model rounded averages indexed by metric label,
        followed by the raw per-fold score arrays for each metric — the same
        tuple layout as before, now produced by one loop instead of seven
        copy-pasted stanzas.
        """
        raw = {scoring: [] for _, scoring in self._SCORING}
        cv_scores = pd.DataFrame()
        for model, name in zip(models, names):
            print(name)
            start = time.time()
            averages = []
            for _, scoring in self._SCORING:
                scores = np.round(cross_val_score(model, X, y, scoring=scoring, cv=cv), 4)
                raw[scoring].append(scores)
                # NaN folds (e.g. a metric undefined on a fold) are ignored
                # when averaging.
                averages.append(round(np.mean(scores[~np.isnan(scores)]), 4))
            cv_scores[name] = averages
            print(time.time() - start)
        cv_scores.index = [label for label, _ in self._SCORING]
        return (cv_scores, raw['accuracy'], raw['f1_macro'], raw['precision_macro'],
                raw['recall_macro'], raw['f1_weighted'], raw['precision_weighted'],
                raw['recall_weighted'])

    def train_models(self, output_prefix):
        """Cross-validate balanced and unbalanced XGBoost/SVM variants and
        write the score tables to '<output_prefix>_XGB.csv' and '_SVM.csv'."""
        xgb_unbal = XGBClassifier(random_state=42, objective='multi:softprob', eval_metric='mlogloss', num_class=self.num_classes, n_jobs=-1)
        # NOTE(review): 'weight' is not an XGBClassifier constructor parameter;
        # class balancing is normally done via sample_weight in fit(). Kept
        # as-is pending confirmation of the intended behaviour.
        xgb = XGBClassifier(random_state=42, objective='multi:softprob', eval_metric='mlogloss', num_class=self.num_classes, weight=self.train_weights, n_jobs=-1)
        svm_unbal = SVC(random_state=42)
        svm = SVC(random_state=42, class_weight=self.dict_train_label_weights)

        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

        models_xgb = [xgb_unbal, xgb]
        names_xgb = ['XGBClassifier unbalanced', 'XGBClassifier dict balanced']
        comp_xgb, *_ = self.cv_comparison(models_xgb, names_xgb, self.X_train, self.y_train, cv=cv)
        # NOTE(review): sep='/' produces slash-separated output — confirm this
        # is intended rather than the default comma.
        comp_xgb.to_csv(f'{output_prefix}_XGB.csv', sep='/')

        models_svm = [svm_unbal, svm]
        names_svm = ['SVM unbalanced', 'SVM']
        comp_svm, *_ = self.cv_comparison(models_svm, names_svm, self.X_train, self.y_train, cv=cv)
        comp_svm.to_csv(f'{output_prefix}_SVM.csv', sep='/')

MLMarker_app/database.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import pandas as pd
2+
import mysql.connector
3+
from collections import defaultdict
4+
import glob
5+
import os
6+
import logging
7+
8+
class Database:
    """Thin wrapper around the expression-atlas MySQL database.

    Handles inserting projects, modifications, tissues and assays, and
    parsing/storing ionbot search results as peptide quantifications.
    All insert methods commit per row via self.conn.
    """

    def __init__(self, db_name='expression_atlas2', user='root', password='password', host='127.0.0.1', port='3306'):
        """Connect to MySQL and open a buffered cursor.

        NOTE(review): default credentials are hard-coded — consider moving
        them to configuration or environment variables.
        """
        self.conn = mysql.connector.connect(user=user, password=password, host=host, port=port, database=db_name)
        self.mycursor = self.conn.cursor(buffered=True)
        # Check the connection
        if self.conn.is_connected():
            print("Connection successful")
        else:
            print("No connection")

    def check_projects(self, new_projects):
        """Validate PXD accessions and report which are already stored.

        Returns False when any name does not start with 'PXD', otherwise the
        (possibly empty) list of accessions already in the 'project' table.
        NOTE(review): the mixed False/list return works with truthiness
        checks (as used by build_project_table) but is fragile.
        """
        # Check if a project is already in the DB. Returns empty list if no duplicates have been found
        for x in new_projects:
            if x[0:3] != 'PXD':
                print('Project name must begin with "PXD"')
                return False

        query = "SELECT PXD_accession FROM project"
        old_projects = pd.read_sql_query(query, self.conn)['PXD_accession'].values.tolist()
        duplicates = []
        for p in new_projects:
            if p in old_projects:
                print(f'{p} is already in the database')
                duplicates.append(p)
        print('Projects checked.')
        return duplicates

    def build_project_table(self, meta_df, list_of_pxds):
        """Insert project metadata rows for the given PXD accessions.

        Aborts without inserting anything if any accession is malformed or
        already present (all-or-nothing at the check level).
        """
        # Populate project table using a dataframe with the necessary metadata and a list of PXD accessions
        meta_df = meta_df[meta_df['accession'].isin(list_of_pxds)]
        check = self.check_projects(meta_df.accession.unique().tolist())
        if check:
            print(f"Duplicates detected: {check}. \nNo entries have been added")
            return

        # All metadata columns are stored as strings.
        meta_df = meta_df[['accession', 'experimentTypes', 'instrumentNames', 'keywords', 'references']].astype(str)
        meta_tuples = list(meta_df.to_records(index=False))
        for i in meta_tuples:
            # Parameterized insert — values are bound, not string-formatted.
            project = "INSERT INTO project(PXD_accession, experiment_type, instrument, keywords, ref) VALUES (%s, %s, %s, %s, %s)"
            self.mycursor.execute(project, list(i))
            self.conn.commit()
        print(f"{len(meta_tuples)} projects added to table 'project'.")

    def build_mod_table(self, mod_df):
        """Insert peptide modifications; duplicates are skipped via INSERT IGNORE."""
        # Insert modifications into the modifications table
        mod_tuples = list(mod_df.to_records(index=False))
        for i in mod_tuples:
            mod = "INSERT IGNORE INTO modifications(mod_id, modification_type, mass_difference) VALUES(%s, %s, %s)"
            self.mycursor.execute(mod, list(i))
            self.conn.commit()
        print(f"{len(mod_tuples)} modifications added.")

    def build_tissue_table(self, tissue_df):
        """Insert unique (tissue, cell_type, status) rows into the tissue table."""
        # Insert tissues into the tissue table
        tissue_df = tissue_df[['tissue', 'cell_type', 'status']].drop_duplicates()
        tissue_tuples = list(tissue_df.to_records(index=False))
        for i in tissue_tuples:
            tissue = "INSERT INTO tissue(tissue_name, cell_type, disease_status) VALUES (%s, %s, %s)"
            self.mycursor.execute(tissue, list(i))
            self.conn.commit()
        print(f"{len(tissue_tuples)} tissues added.")

    def build_assay_cell_table(self, assay_df):
        """Insert assays and link each to its tissue row.

        Expects assay_df rows of (accession, filename, pride_tissue,
        cell_type, tissue, status); pride_tissue itself is not stored.
        """
        # Insert assays into the assay table and link them with tissues
        assay_tuples = list(assay_df.to_records(index=False))
        for i in assay_tuples:
            accession, filename, pride_tissue, cell_type, tissue, status = i
            # Resolve the parent project; fetchone()[0] assumes the project
            # row exists (inserted by build_project_table beforehand).
            self.mycursor.execute("SELECT project_id FROM project WHERE PXD_accession = %s", (accession,))
            projectID = self.mycursor.fetchone()[0]
            assay = "INSERT INTO assay(project_id, filename) VALUES(%s, %s)"
            self.mycursor.execute(assay, (projectID, filename))
            self.conn.commit()
            # lastrowid is the auto-increment id of the assay just inserted.
            assayID = self.mycursor.lastrowid
            self.mycursor.execute("SELECT tissue_id FROM tissue WHERE tissue_name = %s AND cell_type = %s AND disease_status = %s", (tissue, cell_type, status))
            tissueID = self.mycursor.fetchone()[0]
            tissue_to_assay = "INSERT INTO tissue_to_assay(assay_id, tissue_id) VALUES(%s, %s)"
            self.mycursor.execute(tissue_to_assay, (assayID, tissueID))
            self.conn.commit()
        print(f"{len(assay_tuples)} assays added.")

    def ionbot_parse(self, file):
        """Parse an ionbot result CSV and compute peptide spectral counts.

        Keeps only best PSMs with q-value <= 0.01 from the target DB, drops
        peptides matching multiple proteins ('||'-separated accessions), and
        fills missing modifications with an 'unmodified' placeholder.

        Returns (filtered_df, spectral_counts) on success, or False when the
        file is empty or nothing survives filtering.
        """
        # Parse ionbot output files and filter based on specific criteria
        df = pd.read_csv(file, sep=',')
        if df.empty:
            logging.debug(f"File {file} is empty")
            return False
        df = df[(df['best_psm'] == 1) & (df['q_value'] <= 0.01) & (df['DB'] == 'T')]
        if df.empty:
            logging.debug(f"{file} did not pass filtering")
            return False
        # '||' separates multiple protein accessions; such rows are ambiguous.
        df = df[~df['proteins'].str.contains('||', regex=False)]
        df['modifications'] = df['modifications'].fillna('x|[2030]unmodified')
        if df.empty:
            logging.debug(f"{file} all peptides are linked to multiple proteins or do not pass the filtering")
            return False
        # Spectral count = number of PSMs per matched peptide sequence.
        spectral_counts = defaultdict(int)
        for pep in df['matched_peptide'].tolist():
            spectral_counts[pep] += 1
        spectral_counts = dict(sorted(spectral_counts.items(), key=lambda item: item[1], reverse=True))
        return df, spectral_counts

    def ionbot_store(self, file, filename):
        """Store parsed ionbot PSMs as peptide/protein/modification rows.

        file     -- path to the ionbot CSV to parse.
        filename -- path whose basename (without extension) must match an
                    existing assay's filename.
        """
        # Store parsed ionbot data into the database
        filename = filename.split('/')[-1].split('.')[0]
        self.mycursor.execute("SELECT assay_id FROM assay WHERE filename = %s", (filename,))
        assayID = self.mycursor.fetchone()
        if not assayID:
            # NOTE(review): this f-string has no placeholder — the filename
            # was presumably meant to be interpolated; confirm and restore.
            print(f'(unknown) is not in assays')
            return
        assayID = assayID[0]
        parser = self.ionbot_parse(file)
        if not parser:
            # NOTE(review): same missing-placeholder issue as above.
            logging.warning(f"parser failed for (unknown)")
            return
        df, spectral_counts = parser
        for _, row in df.iterrows():
            protID, pepseq, mod = row['proteins'], row['matched_peptide'], row['modifications']
            # Upsert-style no-op on duplicate keeps the existing peptide row.
            self.mycursor.execute("INSERT INTO peptide(peptide_sequence) VALUES (%s) ON DUPLICATE KEY UPDATE peptide_sequence=peptide_sequence", (pepseq,))
            self.conn.commit()
            self.mycursor.execute("SELECT peptide_id FROM peptide WHERE peptide_sequence = %s", (pepseq,))
            pepID = self.mycursor.fetchone()[0]
            # Accession format is like 'sp|P12345|NAME'; the middle field is
            # the UniProt id.
            uniprotID = protID.split('|')[1]
            self.mycursor.execute("INSERT INTO protein(uniprot_id) VALUES (%s) ON DUPLICATE KEY UPDATE uniprot_id=uniprot_id", (uniprotID,))
            self.conn.commit()
            self.mycursor.execute("INSERT INTO peptide_to_protein(uniprot_id, peptide_id) VALUES (%s,%s) ON DUPLICATE KEY UPDATE peptide_id=peptide_id, uniprot_id=uniprot_id", (uniprotID, pepID))
            self.conn.commit()
            # Modifications come as ';'-separated 'location|[mod_id]name'.
            for m in mod.split(';'):
                location, modID = m.split('|')[0], m[m.find("[")+1:m.find("]")]
                self.mycursor.execute("SELECT mod_id FROM modifications WHERE mod_id = %s", (modID,))
                modID = self.mycursor.fetchone()
                if modID:
                    modID = modID[0]
                    # Duplicate (peptide, location, mod, assay) increments count.
                    self.mycursor.execute("INSERT INTO peptide_modifications(peptide_id, location, mod_id, assay_id) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE count = count + 1", (pepID, location, modID, assayID))
                    self.conn.commit()
            # NOTE(review): the float('inf') default stores infinity when the
            # peptide is missing from spectral_counts — confirm intended.
            count = spectral_counts.get(pepseq, float('inf'))
            self.mycursor.execute("INSERT INTO peptide_to_assay(peptide_id, assay_id, quantification) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE quantification=%s", (pepID, assayID, count, count))
            self.conn.commit()
        # NOTE(review): placeholder-less f-string — likely meant f'{filename} ...'.
        logging.info(f'(unknown) was stored')

    def find_ionbot_files(self, projects):
        """Locate ionbot result files for each PXD project and store them.

        Searches three mounted PRIDE data roots for each project directory
        and processes every non-empty '*.mgf.gzip.ionbot.csv' found.
        """
        # Find and process ionbot files for given projects
        logging.basicConfig(filename='ionbot_assays.log', level=logging.DEBUG)
        number_of_files = 0
        for pxd in projects:
            path = None
            for base in ['/home/compomics/conode53_pride/PRIDE_DATA/', '/home/compomics/conode54_pride/PRIDE_DATA/', '/home/compomics/conode55_pride/PRIDE_DATA/']:
                if os.path.exists(base + str(pxd)):
                    path = base + str(pxd)
                    break
            if not path:
                continue
            for file in glob.glob(path + "/*.mgf.gzip/*.mgf.gzip.ionbot.csv"):
                number_of_files += 1
                # NOTE(review): loggerDict maps logger *names*, not file
                # paths, so this membership test is effectively always True —
                # confirm what de-duplication was intended here.
                if file not in logging.root.manager.loggerDict:
                    if os.path.getsize(file) != 0:
                        self.ionbot_store(file, file)
        print(number_of_files)

0 commit comments

Comments
 (0)