|
| 1 | +import pandas as pd |
| 2 | +import mysql.connector |
| 3 | +from collections import defaultdict |
| 4 | +import glob |
| 5 | +import os |
| 6 | +import logging |
| 7 | + |
class Database:
    """MySQL access layer for an expression-atlas schema.

    Wraps one connection/cursor pair and provides loaders for the
    project, modification, tissue and assay tables, plus parsing and
    storage of ionbot search results.
    """

    def __init__(self, db_name='expression_atlas2', user='root', password='password', host='127.0.0.1', port='3306'):
        """Connect to MySQL and create a buffered cursor.

        Parameters are standard mysql.connector connection settings;
        defaults target a local server.
        """
        self.conn = mysql.connector.connect(user=user, password=password, host=host, port=port, database=db_name)
        self.mycursor = self.conn.cursor(buffered=True)
        # Sanity-check the connection so failures surface immediately.
        if self.conn.is_connected():
            print("Connection successful")
        else:
            print("No connection")

    def check_projects(self, new_projects):
        """Validate accessions and report which already exist in the DB.

        Returns False if any accession does not start with 'PXD';
        otherwise a (possibly empty) list of duplicate accessions.
        """
        for accession in new_projects:
            if not accession.startswith('PXD'):
                print('Project name must begin with "PXD"')
                return False

        query = "SELECT PXD_accession FROM project"
        # Set gives O(1) membership tests instead of a list scan per project.
        old_projects = set(pd.read_sql_query(query, self.conn)['PXD_accession'].values.tolist())
        duplicates = [p for p in new_projects if p in old_projects]
        for p in duplicates:
            print(f'{p} is already in the database')
        print('Projects checked.')
        return duplicates

    def build_project_table(self, meta_df, list_of_pxds):
        """Populate the 'project' table from PRIDE metadata.

        meta_df must contain the columns accession, experimentTypes,
        instrumentNames, keywords and references; only rows whose
        accession is in list_of_pxds are inserted. Aborts without
        inserting on invalid or duplicate accessions.
        """
        meta_df = meta_df[meta_df['accession'].isin(list_of_pxds)]
        check = self.check_projects(meta_df.accession.unique().tolist())
        # check_projects returns False for malformed accessions; previously
        # that falsy value passed the `if check:` test below and the
        # malformed rows were inserted anyway.
        if check is False:
            print("Invalid accession(s) detected. \nNo entries have been added")
            return
        if check:
            print(f"Duplicates detected: {check}. \nNo entries have been added")
            return

        meta_df = meta_df[['accession', 'experimentTypes', 'instrumentNames', 'keywords', 'references']].astype(str)
        meta_tuples = list(meta_df.to_records(index=False))
        insert = "INSERT INTO project(PXD_accession, experiment_type, instrument, keywords, ref) VALUES (%s, %s, %s, %s, %s)"
        for record in meta_tuples:
            self.mycursor.execute(insert, list(record))
        # Single commit keeps the load atomic and avoids per-row round trips.
        self.conn.commit()
        print(f"{len(meta_tuples)} projects added to table 'project'.")

    def build_mod_table(self, mod_df):
        """Insert rows of mod_df (mod_id, modification_type, mass_difference)
        into 'modifications'; existing ids are skipped via INSERT IGNORE."""
        mod_tuples = list(mod_df.to_records(index=False))
        insert = "INSERT IGNORE INTO modifications(mod_id, modification_type, mass_difference) VALUES(%s, %s, %s)"
        for record in mod_tuples:
            self.mycursor.execute(insert, list(record))
        self.conn.commit()
        print(f"{len(mod_tuples)} modifications added.")

    def build_tissue_table(self, tissue_df):
        """Insert the unique (tissue, cell_type, status) combinations of
        tissue_df into the 'tissue' table."""
        tissue_df = tissue_df[['tissue', 'cell_type', 'status']].drop_duplicates()
        tissue_tuples = list(tissue_df.to_records(index=False))
        insert = "INSERT INTO tissue(tissue_name, cell_type, disease_status) VALUES (%s, %s, %s)"
        for record in tissue_tuples:
            self.mycursor.execute(insert, list(record))
        self.conn.commit()
        print(f"{len(tissue_tuples)} tissues added.")

    def build_assay_cell_table(self, assay_df):
        """Insert assays and link each one to its tissue.

        assay_df rows: (accession, filename, pride_tissue, cell_type,
        tissue, status). The referenced project and tissue rows must
        already exist; a missing lookup raises TypeError on fetchone().
        """
        assay_tuples = list(assay_df.to_records(index=False))
        insert_assay = "INSERT INTO assay(project_id, filename) VALUES(%s, %s)"
        insert_link = "INSERT INTO tissue_to_assay(assay_id, tissue_id) VALUES(%s, %s)"
        for record in assay_tuples:
            accession, filename, pride_tissue, cell_type, tissue, status = record
            self.mycursor.execute("SELECT project_id FROM project WHERE PXD_accession = %s", (accession,))
            project_id = self.mycursor.fetchone()[0]
            self.mycursor.execute(insert_assay, (project_id, filename))
            self.conn.commit()  # persist the assay before reading lastrowid
            assay_id = self.mycursor.lastrowid
            self.mycursor.execute(
                "SELECT tissue_id FROM tissue WHERE tissue_name = %s AND cell_type = %s AND disease_status = %s",
                (tissue, cell_type, status))
            tissue_id = self.mycursor.fetchone()[0]
            self.mycursor.execute(insert_link, (assay_id, tissue_id))
            self.conn.commit()
        print(f"{len(assay_tuples)} assays added.")

    def ionbot_parse(self, file, q_value_cutoff=0.01):
        """Parse an ionbot result CSV and return (filtered_df, spectral_counts).

        Keeps best PSMs from the target database ('DB' == 'T') with
        q-value <= q_value_cutoff that map to a single protein; missing
        modifications are normalised to 'x|[2030]unmodified'. Returns
        False when nothing survives filtering. spectral_counts maps each
        peptide sequence to its PSM count, sorted by decreasing count.
        """
        df = pd.read_csv(file, sep=',')
        if df.empty:
            logging.debug(f"File {file} is empty")
            return False
        df = df[(df['best_psm'] == 1) & (df['q_value'] <= q_value_cutoff) & (df['DB'] == 'T')]
        if df.empty:
            logging.debug(f"{file} did not pass filtering")
            return False
        # '||' in the proteins column marks peptides shared by several proteins.
        # .copy() avoids pandas' SettingWithCopyWarning on the assignment below.
        df = df[~df['proteins'].str.contains('||', regex=False)].copy()
        df['modifications'] = df['modifications'].fillna('x|[2030]unmodified')
        if df.empty:
            logging.debug(f"{file} all peptides are linked to multiple proteins or do not pass the filtering")
            return False
        spectral_counts = defaultdict(int)
        for pep in df['matched_peptide'].tolist():
            spectral_counts[pep] += 1
        spectral_counts = dict(sorted(spectral_counts.items(), key=lambda item: item[1], reverse=True))
        return df, spectral_counts

    def ionbot_store(self, file, filename):
        """Parse one ionbot result file and persist its peptides, proteins,
        modifications and spectral counts for the matching assay.

        `file` is the path handed to ionbot_parse; `filename` is reduced
        to its basename stem to look up the assay.
        """
        filename = filename.split('/')[-1].split('.')[0]
        self.mycursor.execute("SELECT assay_id FROM assay WHERE filename = %s", (filename,))
        assay_row = self.mycursor.fetchone()
        if not assay_row:
            # Original message read '(unknown)': a stripped placeholder.
            print(f'{filename} is not in assays')
            return
        assay_id = assay_row[0]
        parser = self.ionbot_parse(file)
        if not parser:
            logging.warning(f"parser failed for {filename}")
            return
        df, spectral_counts = parser
        for _, row in df.iterrows():
            prot_id, pepseq, mod = row['proteins'], row['matched_peptide'], row['modifications']
            self.mycursor.execute("INSERT INTO peptide(peptide_sequence) VALUES (%s) ON DUPLICATE KEY UPDATE peptide_sequence=peptide_sequence", (pepseq,))
            self.conn.commit()
            self.mycursor.execute("SELECT peptide_id FROM peptide WHERE peptide_sequence = %s", (pepseq,))
            pep_id = self.mycursor.fetchone()[0]
            uniprot_id = prot_id.split('|')[1]
            self.mycursor.execute("INSERT INTO protein(uniprot_id) VALUES (%s) ON DUPLICATE KEY UPDATE uniprot_id=uniprot_id", (uniprot_id,))
            self.conn.commit()
            self.mycursor.execute("INSERT INTO peptide_to_protein(uniprot_id, peptide_id) VALUES (%s,%s) ON DUPLICATE KEY UPDATE peptide_id=peptide_id, uniprot_id=uniprot_id", (uniprot_id, pep_id))
            self.conn.commit()
            for m in mod.split(';'):
                # ionbot encodes each modification as 'location|[id]name'.
                location, mod_id = m.split('|')[0], m[m.find("[") + 1:m.find("]")]
                self.mycursor.execute("SELECT mod_id FROM modifications WHERE mod_id = %s", (mod_id,))
                known = self.mycursor.fetchone()
                if known:
                    self.mycursor.execute("INSERT INTO peptide_modifications(peptide_id, location, mod_id, assay_id) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE count = count + 1", (pep_id, location, known[0], assay_id))
                    self.conn.commit()
            # Default 0, not float('inf'): a peptide absent from the counts has
            # no PSMs (the old inf default was unreachable but nonsensical).
            count = spectral_counts.get(pepseq, 0)
            self.mycursor.execute("INSERT INTO peptide_to_assay(peptide_id, assay_id, quantification) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE quantification=%s", (pep_id, assay_id, count, count))
            self.conn.commit()
        logging.info(f'{filename} was stored')

    def find_ionbot_files(self, projects):
        """Locate ionbot result files for each project and store them.

        Searches the three conode PRIDE mounts for each PXD directory and
        stores every non-empty '*.mgf.gzip.ionbot.csv' file found; prints
        the total number of files encountered.
        """
        logging.basicConfig(filename='ionbot_assays.log', level=logging.DEBUG)
        bases = ['/home/compomics/conode53_pride/PRIDE_DATA/',
                 '/home/compomics/conode54_pride/PRIDE_DATA/',
                 '/home/compomics/conode55_pride/PRIDE_DATA/']
        number_of_files = 0
        for pxd in projects:
            path = next((base + str(pxd) for base in bases if os.path.exists(base + str(pxd))), None)
            if not path:
                continue
            for file in glob.glob(path + "/*.mgf.gzip/*.mgf.gzip.ionbot.csv"):
                number_of_files += 1
                # NOTE(review): the original also required
                # `file not in logging.root.manager.loggerDict`, but that dict
                # holds logger *names*, never file paths, so the condition was
                # always true and has been dropped.
                if os.path.getsize(file) != 0:
                    self.ionbot_store(file, file)
        print(number_of_files)
0 commit comments