Skip to content

Commit c043f3d

Browse files
committed
Update ongoing projects
1 parent f6de8b2 commit c043f3d

File tree

49 files changed

+167224
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+167224
-0
lines changed

MLMarker_app/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .database import Database
2+
from .atlas import Atlas
3+
from .mlmarker import MLMarker
4+
from .utils import sample_predict_class, sample_predict_class_adjusted
5+
from .predictor_atlas import PredictorAtlas

MLMarker_app/atlas.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import pandas as pd
2+
import mysql.connector
3+
4+
class Atlas:
    """Build a protein-expression atlas from a local expression database.

    Pulls peptide/protein/tissue/assay tables from MySQL, keeps proteotypic
    human proteins, computes per-assay NSAF scores and pivots them into a
    protein x tissue matrix.
    """

    def __init__(self, db, tissue, percentage, disease_status):
        """Open the database connection and validate the grouping column.

        db             -- MySQL database name.
        tissue         -- grouping column: 'tissue_name' or 'cell_type'.
        percentage     -- fraction (0-1) of a group's assays a protein must
                          appear in to be kept by filter_protein_data().
        disease_status -- 'Healthy' restricts the final atlas to healthy tissue.

        Raises ValueError when tissue is not a supported column name.
        """
        self.db = db
        self.percentage = percentage
        # NOTE(review): credentials are hard-coded; move to config/env vars.
        self.conn = mysql.connector.connect(user='root', password='password', host='127.0.0.1', port='3306', database=self.db)
        if tissue not in ('tissue_name', 'cell_type'):
            # Raise with a message instead of print() followed by a bare raise.
            raise ValueError(f"Tissue type not defined: {tissue!r} (expected 'tissue_name' or 'cell_type')")
        self.tissue = tissue
        self.disease_status = disease_status

    def check_connection(self):
        """Print whether the MySQL connection is alive."""
        if self.conn.is_connected():
            print("Connection successful")
        else:
            print("No connection")

    def get_pepseq(self):
        """Retrieve protein lengths from the database; caches on self.seqData."""
        seqsql = "SELECT uniprot_id, length FROM protein WHERE length IS NOT NULL"
        seqData = pd.read_sql_query(seqsql, self.conn)
        # Lengths may arrive as strings; non-numeric values become NaN.
        seqData['length'] = pd.to_numeric(seqData['length'], errors='coerce')
        self.seqData = seqData
        return self.seqData

    def get_tissue_data(self):
        """Retrieve the tissue table; caches on self.tissueData."""
        tissuesql = "SELECT tissue_id, tissue_name, cell_type, disease_status FROM tissue"
        tissueData = pd.read_sql_query(tissuesql, self.conn)
        self.tissueData = tissueData
        return self.tissueData

    def get_assay_data(self):
        """Retrieve peptide quantifications per assay; caches on self.assayData."""
        assaysql = "SELECT assay_id, peptide_id, quantification FROM peptide_to_assay"
        assayData = pd.read_sql_query(assaysql, self.conn)
        self.assayData = assayData
        return self.assayData

    def get_assay_tissue_data(self):
        """Retrieve the assay-to-tissue mapping; caches on self.assaytissueData."""
        assaytissuesql = "SELECT assay_id, tissue_id FROM tissue_to_assay"
        assaytissueData = pd.read_sql_query(assaytissuesql, self.conn)
        self.assaytissueData = assaytissueData
        return self.assaytissueData

    def get_pep_data(self):
        """Retrieve the peptide-to-protein mapping; caches on self.pepData."""
        pepsql = "SELECT peptide_to_protein.peptide_id, peptide_to_protein.uniprot_id FROM peptide_to_protein"
        pepData = pd.read_sql_query(pepsql, self.conn)
        self.pepData = pepData
        return self.pepData

    def get_filtered_proteins(self):
        """Keep proteotypic peptides of human proteins with >2 such peptides.

        A peptide is proteotypic when it maps to exactly one protein; a
        protein is kept when it has more than two proteotypic peptides and
        is not a known contaminant/standard (non-human accessions below).
        """
        pepData = self.get_pep_data()
        # Peptides mapping to exactly one protein are proteotypic.
        proteotypicData = pepData.groupby("peptide_id").filter(lambda x: len(x) == 1)
        # Require more than two proteotypic peptides per protein.
        proteins = proteotypicData.groupby("uniprot_id").filter(lambda x: len(x) > 2)
        # Common contaminants / spike-in standards to exclude.
        non_human_proteins = ['TRYP_PIG', 'TRY2_BOVIN', 'TRY1_BOVIN', 'SSPA_STAAU', 'SRPP_HEVBR', 'REF_HEVBR', 'ADH1_YEAST', 'ALBU_BOVIN', 'CAS1_BOVIN', 'CAS2_BOVIN', 'CASK_BOVIN', 'CASB_BOVIN', 'OVAL_CHICK', 'ALDOA_RABIT', 'BGAL_ECOLI', 'CAH2_BOVIN', 'CTRA_BOVIN', 'CTRB_BOVIN', 'CYC_HORSE', 'DHE3_BOVIN', 'GAG_SCVLA', 'GFP_AEQVI', 'K1C15_SHEEP', 'K1M1_SHEEP', 'K2M2_SHEEP', 'K2M3_SHEEP', 'KRA3A_SHEEP', 'KRA3_SHEEP', 'KRA61_SHEEP', 'LALBA_BOVIN', 'LYSC_CHICK', 'LYSC_LYSEN', 'MYG_HORSE', 'K1M2_SHEEP', 'K2M1_SHEEP']
        proteins = proteins[~proteins['uniprot_id'].isin(non_human_proteins)]
        self.proteins = proteins
        return self.proteins

    def get_protein_data(self):
        """Merge protein, tissue and assay data into one long table.

        Keeps assay_id, quantification, uniprot_id and the chosen tissue
        grouping column (self.tissue); caches on self.protData.
        """
        seqData = self.get_pepseq()  # also caches self.seqData for calculate_NSAF()
        tissueData = self.get_tissue_data()
        assayData = self.get_assay_data()
        assaytissueData = self.get_assay_tissue_data()
        tissue_assay = pd.merge(assaytissueData, tissueData, on='tissue_id', how='left')
        tissue_assay = pd.merge(assayData, tissue_assay, on='assay_id', how='left')
        proteins = self.get_filtered_proteins()
        protData = pd.merge(tissue_assay, proteins, on='peptide_id').sort_values(['assay_id', 'uniprot_id'])
        # Drop whichever tissue column is NOT the chosen grouping column.
        if self.tissue == 'tissue_name':
            del protData['cell_type']
        elif self.tissue == 'cell_type':
            del protData['tissue_name']
        del protData['peptide_id']
        del protData['tissue_id']
        del protData['disease_status']
        self.protData = protData
        return self.protData

    def filter_protein_data(self):
        """Filter proteins per tissue group by the percentage threshold.

        Within each group a protein must appear in more than
        percentage * (number of assays in the group) rows. The total number
        of proteins removed is stored on self.reduction.
        """
        protData = self.get_protein_data()
        frames = []
        reduction = []
        for key in protData[self.tissue].unique():
            group = protData[protData[self.tissue] == key]
            perc = self.percentage * len(pd.unique(group['assay_id']))
            before = group['uniprot_id'].nunique()
            group = group.groupby('uniprot_id').filter(lambda x: len(x) > perc)
            after = group['uniprot_id'].nunique()
            reduction.append(before - after)
            frames.append(group)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement (also O(n) instead of repeated copies).
        filteredData = pd.concat(frames) if frames else pd.DataFrame()
        if self.tissue in filteredData.columns:
            del filteredData[self.tissue]
        self.filteredData = filteredData
        self.reduction = sum(reduction)
        return self.filteredData

    def calculate_NSAF(self):
        """Compute NSAF scores per assay.

        SAF = summed spectral counts / protein length; NSAF normalises SAF
        by the per-assay SAF total. Returns assay_id, uniprot_id, NSAF rows;
        caches on self.proteinData.
        """
        filteredData = self.filter_protein_data()
        per_assay = []
        for key in filteredData['assay_id'].unique():
            assay = filteredData[filteredData['assay_id'] == key]
            # drop(columns=...) instead of pop(): the original pop() mutated a
            # slice of filteredData in place (SettingWithCopy hazard).
            grouped = assay.drop(columns='assay_id').groupby('uniprot_id').sum().reset_index()
            # self.seqData was cached by get_pepseq() via get_protein_data().
            seqAddedDF = pd.merge(grouped, self.seqData, on='uniprot_id')
            seqAddedDF['SAF'] = seqAddedDF['quantification'] / seqAddedDF['length']
            seqAddedDF['NSAF'] = seqAddedDF['SAF'] / seqAddedDF['SAF'].sum()
            seqAddedDF = seqAddedDF.drop(columns=['length', 'quantification', 'SAF'])
            seqAddedDF.insert(loc=0, column='assay_id', value=key)
            per_assay.append(seqAddedDF)
        # pd.concat replaces the removed DataFrame.append.
        proteinData = pd.concat(per_assay) if per_assay else pd.DataFrame()
        self.proteinData = proteinData
        return self.proteinData

    def get_predictor_atlas(self):
        """Generate the predictor atlas: a uniprot_id x tissue NSAF matrix.

        When disease_status == 'Healthy', only healthy tissues are included.
        Missing protein/tissue combinations are filled with 0.
        """
        proteinData = self.calculate_NSAF()
        tissueData = self.get_tissue_data()
        if self.disease_status == 'Healthy':
            tissueData = tissueData[tissueData['disease_status'] == "Healthy"]
        self.atlas = pd.merge(proteinData, tissueData, on='assay_id')
        # NOTE(review): the pivot always uses 'tissue_name' as columns, even
        # when the atlas was built with tissue='cell_type' — confirm intended.
        self.atlas = pd.pivot_table(self.atlas, values='NSAF', index='uniprot_id', columns='tissue_name').fillna(0)
        return self.atlas

MLMarker_app/cell_predictors.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pandas as pd
2+
import numpy as np
3+
from xgboost import XGBClassifier
4+
from sklearn.svm import SVC
5+
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
6+
import time
7+
8+
class CellPredictor:
    """Train and cross-validate XGBoost and SVM cell-type classifiers."""

    # (row label in the results table, sklearn scoring name), in the order
    # the averages are reported and the raw score arrays are returned.
    _SCORING = (
        ('Accuracy', 'accuracy'),
        ('f1_macro', 'f1_macro'),
        ('precision_macro', 'precision_macro'),
        ('recall_macro', 'recall_macro'),
        ('f1_weighted', 'f1_weighted'),
        ('precision_weighted', 'precision_weighted'),
        ('recall_weighted', 'recall_weighted'),
    )

    def __init__(self, X_train_path, X_test_path, y_train_path, y_test_path, dict_train_label_weight_path):
        """Load train/test splits and per-class weights from CSV files.

        dict_train_label_weight_path -- CSV whose first column is the class
        label and second column the class weight.
        """
        self.X_train = pd.read_csv(X_train_path)
        self.X_test = pd.read_csv(X_test_path)
        self.y_train = pd.read_csv(y_train_path)
        self.y_test = pd.read_csv(y_test_path)
        dict_train_label_weight = pd.read_csv(dict_train_label_weight_path)
        self.dict_train_label_weights = {row[0]: row[1] for row in dict_train_label_weight.values.tolist()}
        self.train_weights = list(self.dict_train_label_weights.values())
        self.num_classes = len(np.unique(self.y_train))
        print('Data loaded')

    def cv_comparison(self, models, names, X, y, cv):
        """Cross-validate each model on every metric in _SCORING.

        Returns (cv_scores, accs, f1s, precs, recs, f1s_w, precs_w, recs_w):
        a DataFrame of per-model rounded averages indexed by metric label,
        followed by the raw per-fold score arrays for each metric — the same
        tuple layout as before, now produced by one loop instead of seven
        copy-pasted stanzas.
        """
        raw = {scoring: [] for _, scoring in self._SCORING}
        cv_scores = pd.DataFrame()
        for model, name in zip(models, names):
            print(name)
            start = time.time()
            averages = []
            for _, scoring in self._SCORING:
                scores = np.round(cross_val_score(model, X, y, scoring=scoring, cv=cv), 4)
                raw[scoring].append(scores)
                # NaN folds (e.g. a metric undefined on a fold) are ignored
                # when averaging.
                averages.append(round(np.mean(scores[~np.isnan(scores)]), 4))
            cv_scores[name] = averages
            print(time.time() - start)
        cv_scores.index = [label for label, _ in self._SCORING]
        return (cv_scores, raw['accuracy'], raw['f1_macro'], raw['precision_macro'],
                raw['recall_macro'], raw['f1_weighted'], raw['precision_weighted'],
                raw['recall_weighted'])

    def train_models(self, output_prefix):
        """Cross-validate balanced and unbalanced XGBoost/SVM variants and
        write the score tables to '<output_prefix>_XGB.csv' and '_SVM.csv'."""
        xgb_unbal = XGBClassifier(random_state=42, objective='multi:softprob', eval_metric='mlogloss', num_class=self.num_classes, n_jobs=-1)
        # NOTE(review): 'weight' is not an XGBClassifier constructor parameter;
        # class balancing is normally done via sample_weight in fit(). Kept
        # as-is pending confirmation of the intended behaviour.
        xgb = XGBClassifier(random_state=42, objective='multi:softprob', eval_metric='mlogloss', num_class=self.num_classes, weight=self.train_weights, n_jobs=-1)
        svm_unbal = SVC(random_state=42)
        svm = SVC(random_state=42, class_weight=self.dict_train_label_weights)

        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

        models_xgb = [xgb_unbal, xgb]
        names_xgb = ['XGBClassifier unbalanced', 'XGBClassifier dict balanced']
        comp_xgb, *_ = self.cv_comparison(models_xgb, names_xgb, self.X_train, self.y_train, cv=cv)
        # NOTE(review): sep='/' produces slash-separated output — confirm this
        # is intended rather than the default comma.
        comp_xgb.to_csv(f'{output_prefix}_XGB.csv', sep='/')

        models_svm = [svm_unbal, svm]
        names_svm = ['SVM unbalanced', 'SVM']
        comp_svm, *_ = self.cv_comparison(models_svm, names_svm, self.X_train, self.y_train, cv=cv)
        comp_svm.to_csv(f'{output_prefix}_SVM.csv', sep='/')

MLMarker_app/database.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import pandas as pd
2+
import mysql.connector
3+
from collections import defaultdict
4+
import glob
5+
import os
6+
import logging
7+
8+
class Database:
    """Thin wrapper around the expression-atlas MySQL database.

    Handles inserting projects, modifications, tissues and assays, and
    parsing/storing ionbot search results as peptide quantifications.
    All insert methods commit per row via self.conn.
    """

    def __init__(self, db_name='expression_atlas2', user='root', password='password', host='127.0.0.1', port='3306'):
        """Connect to MySQL and open a buffered cursor.

        NOTE(review): default credentials are hard-coded — consider moving
        them to configuration or environment variables.
        """
        self.conn = mysql.connector.connect(user=user, password=password, host=host, port=port, database=db_name)
        self.mycursor = self.conn.cursor(buffered=True)
        # Check the connection
        if self.conn.is_connected():
            print("Connection successful")
        else:
            print("No connection")

    def check_projects(self, new_projects):
        """Validate PXD accessions and report which are already stored.

        Returns False when any name does not start with 'PXD', otherwise the
        (possibly empty) list of accessions already in the 'project' table.
        NOTE(review): the mixed False/list return works with truthiness
        checks (as used by build_project_table) but is fragile.
        """
        # Check if a project is already in the DB. Returns empty list if no duplicates have been found
        for x in new_projects:
            if x[0:3] != 'PXD':
                print('Project name must begin with "PXD"')
                return False

        query = "SELECT PXD_accession FROM project"
        old_projects = pd.read_sql_query(query, self.conn)['PXD_accession'].values.tolist()
        duplicates = []
        for p in new_projects:
            if p in old_projects:
                print(f'{p} is already in the database')
                duplicates.append(p)
        print('Projects checked.')
        return duplicates

    def build_project_table(self, meta_df, list_of_pxds):
        """Insert project metadata rows for the given PXD accessions.

        Aborts without inserting anything if any accession is malformed or
        already present (all-or-nothing at the check level).
        """
        # Populate project table using a dataframe with the necessary metadata and a list of PXD accessions
        meta_df = meta_df[meta_df['accession'].isin(list_of_pxds)]
        check = self.check_projects(meta_df.accession.unique().tolist())
        if check:
            print(f"Duplicates detected: {check}. \nNo entries have been added")
            return

        # All metadata columns are stored as strings.
        meta_df = meta_df[['accession', 'experimentTypes', 'instrumentNames', 'keywords', 'references']].astype(str)
        meta_tuples = list(meta_df.to_records(index=False))
        for i in meta_tuples:
            # Parameterized insert — values are bound, not string-formatted.
            project = "INSERT INTO project(PXD_accession, experiment_type, instrument, keywords, ref) VALUES (%s, %s, %s, %s, %s)"
            self.mycursor.execute(project, list(i))
            self.conn.commit()
        print(f"{len(meta_tuples)} projects added to table 'project'.")

    def build_mod_table(self, mod_df):
        """Insert peptide modifications; duplicates are skipped via INSERT IGNORE."""
        # Insert modifications into the modifications table
        mod_tuples = list(mod_df.to_records(index=False))
        for i in mod_tuples:
            mod = "INSERT IGNORE INTO modifications(mod_id, modification_type, mass_difference) VALUES(%s, %s, %s)"
            self.mycursor.execute(mod, list(i))
            self.conn.commit()
        print(f"{len(mod_tuples)} modifications added.")

    def build_tissue_table(self, tissue_df):
        """Insert unique (tissue, cell_type, status) rows into the tissue table."""
        # Insert tissues into the tissue table
        tissue_df = tissue_df[['tissue', 'cell_type', 'status']].drop_duplicates()
        tissue_tuples = list(tissue_df.to_records(index=False))
        for i in tissue_tuples:
            tissue = "INSERT INTO tissue(tissue_name, cell_type, disease_status) VALUES (%s, %s, %s)"
            self.mycursor.execute(tissue, list(i))
            self.conn.commit()
        print(f"{len(tissue_tuples)} tissues added.")

    def build_assay_cell_table(self, assay_df):
        """Insert assays and link each to its tissue row.

        Expects assay_df rows of (accession, filename, pride_tissue,
        cell_type, tissue, status); pride_tissue itself is not stored.
        """
        # Insert assays into the assay table and link them with tissues
        assay_tuples = list(assay_df.to_records(index=False))
        for i in assay_tuples:
            accession, filename, pride_tissue, cell_type, tissue, status = i
            # Resolve the parent project; fetchone()[0] assumes the project
            # row exists (inserted by build_project_table beforehand).
            self.mycursor.execute("SELECT project_id FROM project WHERE PXD_accession = %s", (accession,))
            projectID = self.mycursor.fetchone()[0]
            assay = "INSERT INTO assay(project_id, filename) VALUES(%s, %s)"
            self.mycursor.execute(assay, (projectID, filename))
            self.conn.commit()
            # lastrowid is the auto-increment id of the assay just inserted.
            assayID = self.mycursor.lastrowid
            self.mycursor.execute("SELECT tissue_id FROM tissue WHERE tissue_name = %s AND cell_type = %s AND disease_status = %s", (tissue, cell_type, status))
            tissueID = self.mycursor.fetchone()[0]
            tissue_to_assay = "INSERT INTO tissue_to_assay(assay_id, tissue_id) VALUES(%s, %s)"
            self.mycursor.execute(tissue_to_assay, (assayID, tissueID))
            self.conn.commit()
        print(f"{len(assay_tuples)} assays added.")

    def ionbot_parse(self, file):
        """Parse an ionbot result CSV and compute peptide spectral counts.

        Keeps only best PSMs with q-value <= 0.01 from the target DB, drops
        peptides matching multiple proteins ('||'-separated accessions), and
        fills missing modifications with an 'unmodified' placeholder.

        Returns (filtered_df, spectral_counts) on success, or False when the
        file is empty or nothing survives filtering.
        """
        # Parse ionbot output files and filter based on specific criteria
        df = pd.read_csv(file, sep=',')
        if df.empty:
            logging.debug(f"File {file} is empty")
            return False
        df = df[(df['best_psm'] == 1) & (df['q_value'] <= 0.01) & (df['DB'] == 'T')]
        if df.empty:
            logging.debug(f"{file} did not pass filtering")
            return False
        # '||' separates multiple protein accessions; such rows are ambiguous.
        df = df[~df['proteins'].str.contains('||', regex=False)]
        df['modifications'] = df['modifications'].fillna('x|[2030]unmodified')
        if df.empty:
            logging.debug(f"{file} all peptides are linked to multiple proteins or do not pass the filtering")
            return False
        # Spectral count = number of PSMs per matched peptide sequence.
        spectral_counts = defaultdict(int)
        for pep in df['matched_peptide'].tolist():
            spectral_counts[pep] += 1
        spectral_counts = dict(sorted(spectral_counts.items(), key=lambda item: item[1], reverse=True))
        return df, spectral_counts

    def ionbot_store(self, file, filename):
        """Store parsed ionbot PSMs as peptide/protein/modification rows.

        file     -- path to the ionbot CSV to parse.
        filename -- path whose basename (without extension) must match an
                    existing assay's filename.
        """
        # Store parsed ionbot data into the database
        filename = filename.split('/')[-1].split('.')[0]
        self.mycursor.execute("SELECT assay_id FROM assay WHERE filename = %s", (filename,))
        assayID = self.mycursor.fetchone()
        if not assayID:
            # NOTE(review): this f-string has no placeholder — the filename
            # was presumably meant to be interpolated; confirm and restore.
            print(f'(unknown) is not in assays')
            return
        assayID = assayID[0]
        parser = self.ionbot_parse(file)
        if not parser:
            # NOTE(review): same missing-placeholder issue as above.
            logging.warning(f"parser failed for (unknown)")
            return
        df, spectral_counts = parser
        for _, row in df.iterrows():
            protID, pepseq, mod = row['proteins'], row['matched_peptide'], row['modifications']
            # Upsert-style no-op on duplicate keeps the existing peptide row.
            self.mycursor.execute("INSERT INTO peptide(peptide_sequence) VALUES (%s) ON DUPLICATE KEY UPDATE peptide_sequence=peptide_sequence", (pepseq,))
            self.conn.commit()
            self.mycursor.execute("SELECT peptide_id FROM peptide WHERE peptide_sequence = %s", (pepseq,))
            pepID = self.mycursor.fetchone()[0]
            # Accession format is like 'sp|P12345|NAME'; the middle field is
            # the UniProt id.
            uniprotID = protID.split('|')[1]
            self.mycursor.execute("INSERT INTO protein(uniprot_id) VALUES (%s) ON DUPLICATE KEY UPDATE uniprot_id=uniprot_id", (uniprotID,))
            self.conn.commit()
            self.mycursor.execute("INSERT INTO peptide_to_protein(uniprot_id, peptide_id) VALUES (%s,%s) ON DUPLICATE KEY UPDATE peptide_id=peptide_id, uniprot_id=uniprot_id", (uniprotID, pepID))
            self.conn.commit()
            # Modifications come as ';'-separated 'location|[mod_id]name'.
            for m in mod.split(';'):
                location, modID = m.split('|')[0], m[m.find("[")+1:m.find("]")]
                self.mycursor.execute("SELECT mod_id FROM modifications WHERE mod_id = %s", (modID,))
                modID = self.mycursor.fetchone()
                if modID:
                    modID = modID[0]
                    # Duplicate (peptide, location, mod, assay) increments count.
                    self.mycursor.execute("INSERT INTO peptide_modifications(peptide_id, location, mod_id, assay_id) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE count = count + 1", (pepID, location, modID, assayID))
                    self.conn.commit()
            # NOTE(review): the float('inf') default stores infinity when the
            # peptide is missing from spectral_counts — confirm intended.
            count = spectral_counts.get(pepseq, float('inf'))
            self.mycursor.execute("INSERT INTO peptide_to_assay(peptide_id, assay_id, quantification) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE quantification=%s", (pepID, assayID, count, count))
            self.conn.commit()
        # NOTE(review): placeholder-less f-string — likely meant f'{filename} ...'.
        logging.info(f'(unknown) was stored')

    def find_ionbot_files(self, projects):
        """Locate ionbot result files for each PXD project and store them.

        Searches three mounted PRIDE data roots for each project directory
        and processes every non-empty '*.mgf.gzip.ionbot.csv' found.
        """
        # Find and process ionbot files for given projects
        logging.basicConfig(filename='ionbot_assays.log', level=logging.DEBUG)
        number_of_files = 0
        for pxd in projects:
            path = None
            for base in ['/home/compomics/conode53_pride/PRIDE_DATA/', '/home/compomics/conode54_pride/PRIDE_DATA/', '/home/compomics/conode55_pride/PRIDE_DATA/']:
                if os.path.exists(base + str(pxd)):
                    path = base + str(pxd)
                    break
            if not path:
                continue
            for file in glob.glob(path + "/*.mgf.gzip/*.mgf.gzip.ionbot.csv"):
                number_of_files += 1
                # NOTE(review): loggerDict maps logger *names*, not file
                # paths, so this membership test is effectively always True —
                # confirm what de-duplication was intended here.
                if file not in logging.root.manager.loggerDict:
                    if os.path.getsize(file) != 0:
                        self.ionbot_store(file, file)
        print(number_of_files)

0 commit comments

Comments
 (0)