
Commit 124dc85

Added scripts for plots and protein pair analysis
1 parent 1b4581c commit 124dc85

4 files changed: +220 -2 lines changed


code/analysis.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
import pandas as pd


def analyse_non_clusters(uniref_id_mapping_file: str, non_clustered_file: str, taxon_file: str) -> None:
    """
    Analyses and compares the non-clustered pairs with the latest UniProt release 2023_01 to check whether any of
    the pairs have already been clustered. Adds this information as a new column "clustered", set to True if the
    pair is now clustered. Additionally, adds the taxon name of both accessions of each pair as two additional
    columns and creates a file containing all pairs that are to date still not clustered (true non-clusters).

    Parameters
    ----------
    uniref_id_mapping_file : str
        TSV file with information about each Eukaryota entry and whether it belongs to any cluster
        as per release 2023_01.
    non_clustered_file : str
        TSV filepath to the score matrix of non-clustered pairs.
    taxon_file : str
        TSV filepath for Eukaryota functions and taxonomy data.
    """
    # maps each accession ('From') to its UniRef cluster ID as per release 2023_01
    df_id_mapping = pd.read_csv(uniref_id_mapping_file, sep='\t', usecols=['From', 'Cluster ID'], index_col=0).squeeze('columns')
    dictionary = df_id_mapping.to_dict()

    df1 = pd.read_csv(non_clustered_file, sep='\t')
    # adds True if the pair has been clustered as per the latest release, else False
    for idx, row in df1.iterrows():
        if dictionary[row['accession1']] == dictionary[row['accession2']]:
            df1.at[idx, 'clustered'] = True
        else:
            df1.at[idx, 'clustered'] = False

    # Saves along with the additional 'clustered' column
    df1.to_csv("./data/output/scores/not_clustered_score_matrix_word2doc2vec_with_new_clusters.tsv", sep='\t')

    # adds the taxon for both accessions
    df_euk = pd.read_csv(taxon_file, sep='\t')
    accession_to_taxon = dict(zip(df_euk['accession'], df_euk['taxon']))
    df1['taxon_acc_1'] = df1['accession1'].map(accession_to_taxon)
    df1['taxon_acc_2'] = df1['accession2'].map(accession_to_taxon)
    df1.to_csv("./data/output/scores/not_clustered_score_matrix_word2doc2vec_with_taxon.tsv", sep='\t')

    # Saves only those that have still not been clustered (true non-clusters)
    df1 = df1.loc[df1['clustered'] == False]
    df1.to_csv("./data/output/scores/true_non_clusters.tsv", sep='\t')


def analyse_cases(true_non_cluster_file: str) -> None:
    """
    Analyses the true non-clusters, groups them into two cases and creates two separate TSV files.
    Case 1: high sequence identity and high cosine similarity.
    Case 2: high sequence identity and low cosine similarity.

    Parameters
    ----------
    true_non_cluster_file : str
        TSV filepath containing all true non-clusters.
    """
    df = pd.read_csv(true_non_cluster_file, sep='\t')
    df1 = df.loc[(df['sequence_identity_score'] >= 0.90) & (df['cosine_score'] >= 0.90)]
    df1.to_csv("./data/output/score/case1.tsv", sep='\t')
    df2 = df.loc[(df['sequence_identity_score'] >= 0.90) & (df['cosine_score'] < 0.90)]
    df2.to_csv("./data/output/score/case2.tsv", sep='\t')


if __name__ == "__main__":
    analyse_non_clusters("./data/output/scores/uniprot-compressed_true_download_true_fields_id_2Cname_2Ctypes_2Ccou-2023.03.09-14.53.27.42.tsv",
                         "./data/output/scores/not_clustered_score_matrix_word2doc2vec.tsv",
                         "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv")
    analyse_cases("./data/output/scores/true_non_clusters.tsv")
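
For orientation, analyse_non_clusters treats a pair as clustered in 2023_01 exactly when both accessions map to the same 'Cluster ID' in the UniRef ID-mapping file. A minimal sketch of that check, with invented accessions and cluster IDs purely for illustration:

import pandas as pd
from io import StringIO

# hypothetical excerpt of the UniRef ID-mapping TSV; the column names match the script, the values are invented
id_mapping_tsv = "From\tCluster ID\nP00001\tUniRef50_X1\nP00002\tUniRef50_X1\nP00003\tUniRef50_Y2\n"
mapping = pd.read_csv(StringIO(id_mapping_tsv), sep='\t', index_col=0).squeeze('columns').to_dict()

def is_clustered(acc1: str, acc2: str) -> bool:
    # same comparison as in analyse_non_clusters: identical cluster IDs mean the pair is now clustered
    return mapping[acc1] == mapping[acc2]

print(is_clustered('P00001', 'P00002'))  # True  -> no longer a true non-cluster
print(is_clustered('P00001', 'P00003'))  # False -> still a true non-cluster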

code/parse_fasta.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def write_fasta(fast_file: str) -> None:
         Filepath to the fasta file containing all Eukaryota sequences.
     """
     record_iter = [i for i in SeqIO.parse(open(fast_file), "fasta")]
-    for i, batch in enumerate(batch_iterator(record_iter, 10000)):
+    for i, batch in enumerate(batch_iterator(record_iter, 100)):
         filename = "eukaryota_group_%i.fasta" % (i + 1)
         with open(filename, "w") as handle:
             count = SeqIO.write(batch, handle, "fasta")

@@ -84,6 +84,6 @@ def write_fasta(fast_file: str) -> None:
 if __name__ == "__main__":
     eukaryota_accessions = extract_eukaryota_accessions("./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv")
     eukaryota_fasta = parse_fasta("./data/uniprot/swissprot/uniprot_sprot-only2022_02/uniprot_sprot.fasta", eukaryota_accessions)
-    write_fasta("data/output/functions/swissprot-eukaryota.fasta")
+    write_fasta("./data/output/functions/swissprot-eukaryota.fasta")
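
The only functional change here shrinks each output FASTA from 10000 records to 100. The batch_iterator helper itself is not part of this diff; a minimal sketch of what such a helper typically does, assuming it follows the grouping recipe from the Biopython tutorial (the signature and behaviour below are an assumption, not the repository's code):

from typing import Iterable, Iterator, List

def batch_iterator(records: Iterable, batch_size: int) -> Iterator[List]:
    # assumption: yields consecutive lists of at most batch_size records, like the Biopython cookbook recipe
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

print([len(b) for b in batch_iterator(range(250), 100)])  # [100, 100, 50]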

code/plot.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import pandas as pd
import matplotlib.pyplot as plt


def plot_sequence_embedding(input_filepath: str, output_filepath: str):
    """
    Plots a scatter plot of sequence similarity (BLAST percentage identity score) vs. embedding similarity
    (cosine similarity) for clustered pairs.

    Parameters
    ----------
    input_filepath : str
        TSV filepath for the clustered pairs of proteins.
    output_filepath : str
        Filepath to save the plot.
    """
    df = pd.read_csv(input_filepath, sep='\t')
    x = df['cosine_score']
    y = df['sequence_identity_score']
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(x, y, s=5)
    # reference line from the lower-left to the upper-right corner of the data range
    ax.plot([min(x), max(x)], [min(y), max(y)], color='red')
    ax.set_xlabel('Embedding similarity (cosine similarity)')
    ax.set_ylabel('Sequence similarity (percentage identity)')
    ax.set_title('Sequence vs. Embedding similarity')
    plt.savefig(output_filepath)


if __name__ == "__main__":
    plot_sequence_embedding("./data/output/scores/clustered_score_matrix_word2doc2vec.tsv",
                            "./data/output/plots/scatter_plot_clustered_word2doc2vec.png")

    plot_sequence_embedding("./data/output/scores/clustered_score_matrix_hybrid.tsv",
                            "./data/output/plots/scatter_plot_clustered_hybrid.png")

code/score_matrix.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
import numpy as np
import pandas as pd


def read_cosine_matrix(matrix_file):
    matrix = np.load(matrix_file)
    cosine_matrix = matrix['arr_0']
    return cosine_matrix


def create_cluster_score_matrix(cluster_file: str, cosine_matrix: np.ndarray, blast_file: str, taxon_file: str, output_file: str) -> None:
    """
    Adds the corresponding cosine similarity score, BLAST percentage identity score and taxa for each of the
    clustered pairs.

    Parameters
    ----------
    cluster_file : str
        TSV filepath to the clustered pairs of proteins.
    cosine_matrix : np.ndarray
        Cosine matrix of dimensions 161354 x 161354 of the best model.
    blast_file : str
        TSV filepath for the BLAST percentage identity scores of all Eukaryota proteins.
    taxon_file : str
        TSV filepath for Eukaryota functions and taxonomy data.
    output_file : str
        Output filepath to save the resulting score matrix.
    """
    df = pd.read_csv(cluster_file, sep='\t')
    ref_indices = df['accession1_index'].to_numpy()
    asd_indices = df['accession2_index'].to_numpy()
    array = np.array(list(zip(ref_indices, asd_indices)), dtype=object)
    # adds the cosine score; index with (smaller, larger) so both orderings of a pair read the same matrix entry
    cosine_array = np.zeros((len(ref_indices)), dtype=object)
    for idx, i in enumerate(array):
        cosine_array[idx] = cosine_matrix[min(i[0], i[1])][max(i[0], i[1])]
    df['cosine_score'] = cosine_array
    # adds the BLAST percentage identity score
    blast = pd.read_csv(blast_file, sep='\t')
    blast_grouped = blast.groupby(['accession1', 'accession2'])['sequence_identity_score'].max()
    result = pd.merge(df, blast_grouped, on=['accession1', 'accession2'], how='left')
    result['sequence_identity_score'] = result['sequence_identity_score'].fillna(
        result.groupby(['accession2', 'accession1'])['sequence_identity_score'].transform('max'))
    # adds the taxon for both accessions
    df_euk = pd.read_csv(taxon_file, sep='\t')
    accession_to_taxon = dict(zip(df_euk['accession'], df_euk['taxon']))
    result['taxon_acc_1'] = result['accession1'].map(accession_to_taxon)
    result['taxon_acc_2'] = result['accession2'].map(accession_to_taxon)
    result.to_csv(output_file, sep='\t')
    # # df = pd.read_csv("./data/output/scores/score_hybrid.tsv", sep='\t')
    # # df = df[df['sequence_identity_score'].isnull()]
    # df = pd.read_csv("./add.tsv", sep='\t')
    # print(len(df))
    # df.to_csv("./add2.tsv", sep='\t')
    # result = pd.read_csv("./data/output/scores/clustered_score_matrix_hybrid.tsv", sep='\t')
    # new = pd.read_csv("./values2.tsv", sep='\t')
    # merged = pd.concat([result, new])
    # merged.dropna(subset=['sequence_identity_score'], inplace=True)
    # merged.to_csv("./data/output/scores/clustered_score_matrix_hybrid.tsv", sep='\t')
    # print(len(result.loc[result['cosine_score'] < 0.9]))
    # merged.to_csv("./data/clustered_score_matrix_word2doc2vec.tsv", sep='\t', index=False)


def create_non_clustered_score_matrix(cluster_file: str, cosine_matrix: np.ndarray, output_file: str) -> None:
    """
    Adds the corresponding cosine similarity score for each of the non-clustered pairs.

    Parameters
    ----------
    cluster_file : str
        TSV filepath to the non-clustered pairs of proteins.
    cosine_matrix : np.ndarray
        Cosine matrix of dimensions 161354 x 161354 of the best model.
    output_file : str
        Output filepath to save the resulting score matrix.
    """
    df = pd.read_csv(cluster_file, sep='\t')
    ref_indices = df['accession1_index'].to_numpy()
    asd_indices = df['accession2_index'].to_numpy()
    array = np.array(list(zip(ref_indices, asd_indices)), dtype=object)
    cosine_array = np.zeros((len(ref_indices)), dtype=object)
    for idx, i in enumerate(array):
        # index with (smaller, larger), as above
        cosine_array[idx] = cosine_matrix[min(i[0], i[1])][max(i[0], i[1])]
    df['cosine_score'] = cosine_array
    df.to_csv(output_file, sep='\t')


if __name__ == "__main__":
    # Word2doc2Vec
    cosine_matrix = read_cosine_matrix("./data/output/cosine/cosine_word2doc2vev_bestmodel.npz")
    create_cluster_score_matrix("./data/output/uniref/clustered_pairs_index.tsv",
                                cosine_matrix,
                                "./data/output/blast.tsv",
                                "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv",
                                "./data/output/scores/clustered_score_matrix_word2doc2vec.tsv")

    create_non_clustered_score_matrix("./data/output/uniref/not_clustered_pairs_index.tsv",
                                      cosine_matrix,
                                      "./data/output/scores/not_clustered_score_matrix_word2doc2vec.tsv")

    # Hybrid-Word2doc2Vec
    cosine_matrix = read_cosine_matrix("./data/output/cosine/cosine_hybrid_bestmodel.npz")
    create_cluster_score_matrix("./data/output/uniref/clustered_pairs_index.tsv",
                                cosine_matrix,
                                "./data/output/blast.tsv",
                                "./data/output/functions/rev-20220525-UniProtKB-eukaryota.tsv",
                                "./data/output/scores/clustered_score_matrix_hybrid.tsv")
    create_non_clustered_score_matrix("./data/output/uniref/not_clustered_pairs_index.tsv",
                                      cosine_matrix,
                                      "./data/output/scores/not_clustered_score_matrix_hybrid.tsv")
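
A note on the cosine lookups above: the matrix is only ever read with the smaller pair index first, so either ordering of a pair resolves to the same entry. A toy sketch of that access pattern (the 3 x 3 matrix below is invented; the real one is 161354 x 161354 and loaded from the .npz archive by read_cosine_matrix):

import numpy as np

# invented 3 x 3 example with scores kept above the diagonal
cosine_matrix = np.array([[0.0, 0.8, 0.3],
                          [0.0, 0.0, 0.6],
                          [0.0, 0.0, 0.0]])

def pair_score(i: int, j: int) -> float:
    # the same (min, max) indexing used in the score-matrix functions
    return cosine_matrix[min(i, j)][max(i, j)]

print(pair_score(1, 2))  # 0.6
print(pair_score(2, 1))  # 0.6, regardless of pair order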
