
Commit 124dc85

Added scripts for plots and protein pair analysis
1 parent 1b4581c commit 124dc85

4 files changed: +220 -2 lines changed


code/analysis.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
import pandas as pd


def analyse_non_clusters(uniref_id_mapping_file: str, non_clustered_file: str, taxon_file: str) -> None:
    """
    Analyses and compares the non-clustered pairs with the latest UniProt release 2023_01 to check whether any of
    the pairs have already been clustered. Adds this information as a new column "clustered", set to True if the
    pair is now clustered. Additionally, adds the taxon name of both accessions of each pair as two additional
    columns and creates a file containing all pairs that are to date still not clustered (true non-clusters).

    Parameters
    ----------
    uniref_id_mapping_file : str
        TSV file with information about each Eukaryota entry and whether it belongs to any cluster
        as per release 2023_01.
    non_clustered_file : str
        TSV filepath to the score matrix of non-clustered pairs.
    taxon_file : str
        TSV filepath for Eukaryota functions and taxonomy data.
    """
    # maps each accession ('From') to its UniRef cluster ID as per release 2023_01
    df_id_mapping = pd.read_csv(uniref_id_mapping_file, sep='\t', usecols=['From', 'Cluster ID'], index_col=0).squeeze('columns')
    dictionary = df_id_mapping.to_dict()

    df1 = pd.read_csv(non_clustered_file, sep='\t')
    # adds True if the pair has been clustered as per the latest release, else False
    for idx, row in df1.iterrows():
        if dictionary[row['accession1']] == dictionary[row['accession2']]:
            df1.at[idx, 'clustered'] = True
        else:
            df1.at[idx, 'clustered'] = False

    # Saves along with the additional 'clustered' column
    df1.to_csv("./data/output/scores/not_clustered_score_matrix_word2doc2vec_with_new_clusters.tsv", sep='\t')

    # adds the taxon for both accessions
    df_euk = pd.read_csv(taxon_file, sep='\t')
    accession_to_taxon = dict(zip(df_euk['accession'], df_euk['taxon']))
    df1['taxon_acc_1'] = df1['accession1'].map(accession_to_taxon)
    df1['taxon_acc_2'] = df1['accession2'].map(accession_to_taxon)
    df1.to_csv("./data/output/scores/not_clustered_score_matrix_word2doc2vec_with_taxon.tsv", sep='\t')

    # Saves only those that have still not been clustered (true non-clusters)
    df1 = df1.loc[df1['clustered'] == False]
    df1.to_csv("./data/output/scores/true_non_clusters.tsv", sep='\t')


def analyse_cases(true_non_cluster_file: str) -> None:
    """
    Analyses the true non-clusters, groups them into two cases and creates two separate TSV files.
    Case 1: high sequence identity and high cosine similarity.
    Case 2: high sequence identity and low cosine similarity.

    Parameters
    ----------
    true_non_cluster_file : str
        TSV filepath containing all true non-clusters.
    """
    df = pd.read_csv(true_non_cluster_file, sep='\t')
    df1 = df.loc[(df['sequence_identity_score'] >= 0.90) & (df['cosine_score'] >= 0.90)]
    df1.to_csv("./data/output/score/case1.tsv", sep='\t')
    df2 = df.loc[(df['sequence_identity_score'] >= 0.90) & (df['cosine_score'] < 0.90)]
    df2.to_csv("./data/output/score/case2.tsv", sep='\t')


if __name__ == "__main__":
    analyse_non_clusters("./data/output/scores/uniprot-compressed_true_download_true_fields_id_2Cname_2Ctypes_2Ccou-2023.03.09-14.53.27.42.tsv",
                         "./data/output/scores/not_clustered_score_matrix_word2doc2vec.tsv",
                         "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv")
    analyse_cases("./data/output/scores/true_non_clusters.tsv")
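
For orientation, analyse_non_clusters treats a pair as clustered in 2023_01 exactly when both accessions map to the same 'Cluster ID' in the UniRef ID-mapping file. A minimal sketch of that check, with invented accessions and cluster IDs purely for illustration:

import pandas as pd
from io import StringIO

# hypothetical excerpt of the UniRef ID-mapping TSV; the column names match the script, the values are invented
id_mapping_tsv = "From\tCluster ID\nP00001\tUniRef50_X1\nP00002\tUniRef50_X1\nP00003\tUniRef50_Y2\n"
mapping = pd.read_csv(StringIO(id_mapping_tsv), sep='\t', index_col=0).squeeze('columns').to_dict()

def is_clustered(acc1: str, acc2: str) -> bool:
    # same comparison as in analyse_non_clusters: identical cluster IDs mean the pair is now clustered
    return mapping[acc1] == mapping[acc2]

print(is_clustered('P00001', 'P00002'))  # True  -> no longer a true non-cluster
print(is_clustered('P00001', 'P00003'))  # False -> still a true non-cluster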

code/parse_fasta.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def write_fasta(fast_file: str) -> None:
         Filepath to the fasta file containing all Eukaryota sequences.
     """
     record_iter = [i for i in SeqIO.parse(open(fast_file), "fasta")]
-    for i, batch in enumerate(batch_iterator(record_iter, 10000)):
+    for i, batch in enumerate(batch_iterator(record_iter, 100)):
         filename = "eukaryota_group_%i.fasta" % (i + 1)
         with open(filename, "w") as handle:
             count = SeqIO.write(batch, handle, "fasta")

@@ -84,6 +84,6 @@ def write_fasta(fast_file: str) -> None:
 if __name__ == "__main__":
     eukaryota_accessions = extract_eukaryota_accessions("./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv")
     eukaryota_fasta = parse_fasta("./data/uniprot/swissprot/uniprot_sprot-only2022_02/uniprot_sprot.fasta", eukaryota_accessions)
-    write_fasta("data/output/functions/swissprot-eukaryota.fasta")
+    write_fasta("./data/output/functions/swissprot-eukaryota.fasta")
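
The only functional change here shrinks each output FASTA from 10000 records to 100. The batch_iterator helper itself is not part of this diff; a minimal sketch of what such a helper typically does, assuming it follows the grouping recipe from the Biopython tutorial (the signature and behaviour below are an assumption, not the repository's code):

from typing import Iterable, Iterator, List

def batch_iterator(records: Iterable, batch_size: int) -> Iterator[List]:
    # assumption: yields consecutive lists of at most batch_size records, like the Biopython cookbook recipe
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

print([len(b) for b in batch_iterator(range(250), 100)])  # [100, 100, 50]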

code/plot.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import pandas as pd
import matplotlib.pyplot as plt


def plot_sequence_embedding(input_filepath: str, output_filepath: str):
    """
    Plots a scatter plot of sequence similarity (BLAST percentage identity score) vs. embedding similarity
    (cosine similarity) for clustered pairs.

    Parameters
    ----------
    input_filepath : str
        TSV filepath for the clustered pairs of proteins.
    output_filepath : str
        Filepath to save the plot.
    """
    df = pd.read_csv(input_filepath, sep='\t')
    x = df['cosine_score']
    y = df['sequence_identity_score']
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, s=5)
    # reference line from the lower-left to the upper-right corner of the data range
    ax.plot([min(x), max(x)], [min(y), max(y)], color='red')
    ax.set_xlabel('Embedding similarity (cosine similarity)')
    ax.set_ylabel('Sequence similarity (percentage identity)')
    ax.set_title('Sequence vs. Embedding similarity')
    plt.savefig(output_filepath)


if __name__ == "__main__":
    plot_sequence_embedding("./data/output/scores/clustered_score_matrix_word2doc2vec.tsv",
                            "./data/output/plots/scatter_plot_clustered_word2doc2vec.png")

    plot_sequence_embedding("./data/output/scores/clustered_score_matrix_hybrid.tsv",
                            "./data/output/plots/scatter_plot_clustered_hybrid.png")

code/score_matrix.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
import numpy as np
import pandas as pd


def read_cosine_matrix(matrix_file):
    matrix = np.load(matrix_file)
    cosine_matrix = matrix['arr_0']
    return cosine_matrix


def create_cluster_score_matrix(cluster_file: str, cosine_matrix: np.ndarray, blast_file: str, taxon_file: str, output_file: str) -> None:
    """
    Adds the corresponding cosine similarity score, BLAST percentage identity score and taxa for each of the
    clustered pairs.

    Parameters
    ----------
    cluster_file : str
        TSV filepath to the clustered pairs of proteins.
    cosine_matrix : np.ndarray
        Cosine matrix of dimensions 161354 x 161354 of the best model.
    blast_file : str
        TSV filepath for the BLAST percentage identity scores of all Eukaryota proteins.
    taxon_file : str
        TSV filepath for Eukaryota functions and taxonomy data.
    output_file : str
        Output filepath to save the resulting score matrix.
    """
    df = pd.read_csv(cluster_file, sep='\t')
    ref_indices = df['accession1_index'].to_numpy()
    asd_indices = df['accession2_index'].to_numpy()
    array = np.array(list(zip(ref_indices, asd_indices)), dtype=object)
    # adds the cosine score; index with (smaller, larger) so both orderings of a pair read the same matrix entry
    cosine_array = np.zeros((len(ref_indices)), dtype=object)
    for idx, i in enumerate(array):
        cosine_array[idx] = cosine_matrix[min(i[0], i[1])][max(i[0], i[1])]
    df['cosine_score'] = cosine_array
    # adds the BLAST percentage identity score
    blast = pd.read_csv(blast_file, sep='\t')
    blast_grouped = blast.groupby(['accession1', 'accession2'])['sequence_identity_score'].max()
    result = pd.merge(df, blast_grouped, on=['accession1', 'accession2'], how='left')
    result['sequence_identity_score'] = result['sequence_identity_score'].fillna(
        result.groupby(['accession2', 'accession1'])['sequence_identity_score'].transform('max'))
    # adds the taxon for both accessions
    df_euk = pd.read_csv(taxon_file, sep='\t')
    accession_to_taxon = dict(zip(df_euk['accession'], df_euk['taxon']))
    result['taxon_acc_1'] = result['accession1'].map(accession_to_taxon)
    result['taxon_acc_2'] = result['accession2'].map(accession_to_taxon)
    result.to_csv(output_file, sep='\t')
    # # df = pd.read_csv("./data/output/scores/score_hybrid.tsv", sep='\t')
    # # df = df[df['sequence_identity_score'].isnull()]
    # df = pd.read_csv("./add.tsv", sep='\t')
    # print(len(df))
    # df.to_csv("./add2.tsv", sep='\t')
    # result = pd.read_csv("./data/output/scores/clustered_score_matrix_hybrid.tsv", sep='\t')
    # new = pd.read_csv("./values2.tsv", sep='\t')
    # merged = pd.concat([result, new])
    # merged.dropna(subset=['sequence_identity_score'], inplace=True)
    # merged.to_csv("./data/output/scores/clustered_score_matrix_hybrid.tsv", sep='\t')
    # print(len(result.loc[result['cosine_score'] < 0.9]))
    # merged.to_csv("./data/clustered_score_matrix_word2doc2vec.tsv", sep='\t', index=False)


def create_non_clustered_score_matrix(cluster_file: str, cosine_matrix: np.ndarray, output_file: str) -> None:
    """
    Adds the corresponding cosine similarity score for each of the non-clustered pairs.

    Parameters
    ----------
    cluster_file : str
        TSV filepath to the non-clustered pairs of proteins.
    cosine_matrix : np.ndarray
        Cosine matrix of dimensions 161354 x 161354 of the best model.
    output_file : str
        Output filepath to save the resulting score matrix.
    """
    df = pd.read_csv(cluster_file, sep='\t')
    ref_indices = df['accession1_index'].to_numpy()
    asd_indices = df['accession2_index'].to_numpy()
    array = np.array(list(zip(ref_indices, asd_indices)), dtype=object)
    cosine_array = np.zeros((len(ref_indices)), dtype=object)
    for idx, i in enumerate(array):
        # index with (smaller, larger), as above
        cosine_array[idx] = cosine_matrix[min(i[0], i[1])][max(i[0], i[1])]
    df['cosine_score'] = cosine_array
    df.to_csv(output_file, sep='\t')


if __name__ == "__main__":
    # Word2doc2Vec
    cosine_matrix = read_cosine_matrix("./data/output/cosine/cosine_word2doc2vev_bestmodel.npz")
    create_cluster_score_matrix("./data/output/uniref/clustered_pairs_index.tsv",
                                cosine_matrix,
                                "./data/output/blast.tsv",
                                "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv",
                                "./data/output/scores/clustered_score_matrix_word2doc2vec.tsv")

    create_non_clustered_score_matrix("./data/output/uniref/not_clustered_pairs_index.tsv",
                                      cosine_matrix,
                                      "./data/output/scores/not_clustered_score_matrix_word2doc2vec.tsv")

    # Hybrid-Word2doc2Vec
    cosine_matrix = read_cosine_matrix("./data/output/cosine/cosine_hybrid_bestmodel.npz")
    create_cluster_score_matrix("./data/output/uniref/clustered_pairs_index.tsv",
                                cosine_matrix,
                                "./data/output/blast.tsv",
                                "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv",
                                "./data/output/scores/clustered_score_matrix_hybrid.tsv")
    create_non_clustered_score_matrix("./data/output/uniref/not_clustered_pairs_index.tsv",
                                      cosine_matrix,
                                      "./data/output/scores/not_clustered_score_matrix_hybrid.tsv")
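
A note on the cosine lookups above: the matrix is only ever read with the smaller pair index first, so either ordering of a pair resolves to the same entry. A toy sketch of that access pattern (the 3 x 3 matrix below is invented; the real one is 161354 x 161354 and loaded from the .npz archive by read_cosine_matrix):

import numpy as np

# invented 3 x 3 example with scores kept above the diagonal
cosine_matrix = np.array([[0.0, 0.8, 0.3],
                          [0.0, 0.0, 0.6],
                          [0.0, 0.0, 0.0]])

def pair_score(i: int, j: int) -> float:
    # the same (min, max) indexing used in the score-matrix functions
    return cosine_matrix[min(i, j)][max(i, j)]

print(pair_score(1, 2))  # 0.6
print(pair_score(2, 1))  # 0.6, regardless of pair order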
