HistoricalEmotionAnalysis/RandomWalk.py at main · NotJona/HistoricalEmotionAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
This file implements Random Walk

Written with the help of GenAI
"""
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize as sk_normalize
from scipy.sparse import diags, csr_matrix
from functions import load_VAD_seed_lexion, save_as_VAD, load_seed_data, load_GOLD_data, get_gold_words


def retrieve_embeddings(embedding_name, lexicon_path, gold_standard_path):
    """
    returns dict of form word: word_vector for all words in the COHA
    :param embedding_name: str, name of embedding
    :param lexicon_path: str, path to seed word lexicon
    :param gold_standard_path: str, path to gold standard lexicon
    :return: dict of form word: word_vector
    """
    seed_data = load_seed_data(embedding_name, lexicon_path)
    GOLD_data = load_GOLD_data(embedding_name, gold_standard_path)
    word_vector_dict = dict()
    for element in seed_data:
        word_vector_dict[element[0]] = element[2]
    for row in GOLD_data:
        word_vector_dict[row[0]] = row[1]
    return word_vector_dict

def random_walk(embedding_name, lexicon_path, gold_standard_path, vad_lexicon_name, save_folder_name,
                beta=0.9, max_iter=100, n_neighbors=25):
    """
    performs a random walk to induce historical VAD scores. Saves result under:
    'HistoricalVAD/RandomWalk/vad_lexicon_name _ model_name _ historicalVAD.tsv'
    :param embedding_name: str, name of the embedding
    :param lexicon_path: str, path to seed word lexicon
    :param gold_standard_path: str, path to gold standard lexicon
    :param vad_lexicon_name: str, name of the seed word lexicon (important for saving)
    :param save_folder_name: str, name of the saved folder
    :param beta: float, weight term
    :param max_iter: int, maximum number of iterations
    :param n_neighbors: int, number of neighbors
    """
    word_embeddings = retrieve_embeddings(embedding_name, lexicon_path, gold_standard_path)
    seed_vad_scores = load_VAD_seed_lexion(lexicon_path)

    # Step 0: Prepare data and graph
    words = list(word_embeddings.keys())
    embeddings = np.array([word_embeddings[w] for w in words])
    n_words = len(words)
    word_to_idx = {word: i for i, word in enumerate(words)}

    # Step 1: Build graph with angular distance weights
    cosine_sim = cosine_similarity(embeddings)

    # Create edge weight matrix E using angular distance for top neighbors
    E = np.zeros_like(cosine_sim)
    for i in range(n_words):
        # Get top n_neighbors by cosine similarity (excluding self)
        top_indices = np.argpartition(-cosine_sim[i], n_neighbors + 1)[1:n_neighbors + 1]

        # Compute angular distance weights
        for j in top_indices:
            cos_ij = max(min(cosine_sim[i, j], 1.0), -1.0)  # Numerical safety
            E[i, j] = np.arccos(-cos_ij)  # Your specified E_ij formula

    # Symmetrize the graph (undirected)
    E = np.maximum(E, E.T)

    # Step 2: Construct normalized transition matrix
    # 1. Compute row sums of E (W in the paper) -> D_ii = sum_j W_ij
    row_sums_E = E.sum(axis=1, keepdims=True)
    E = E / (row_sums_E + 1e-10)
    E = csr_matrix(E)
    D_rowsums = E.sum(axis=1).A1  # .A1 converts to 1D numpy array

    # 2. Compute D^{-1/2} (inverse square root of row sums)
    D_inv_sqrt = diags(1 / np.sqrt(D_rowsums + 1e-10))  # Add epsilon to avoid division by zero

    # 3. Construct T = D^{-1/2} E D^{-1/2} (E is W in the paper) # Ensure E is sparse
    T = D_inv_sqrt @ (E @ D_inv_sqrt)  # Parentheses optimize order

    # Initialize matrices for positive (S+) and negative (S-) seeds
    # Step 3: Prepare positive and negative seeds
    s_pos = np.zeros((n_words, 3))
    s_neg = np.zeros((n_words, 3))

    for word, (v, a, d) in seed_vad_scores.items():
        idx = word_to_idx[word]
        s_pos[idx] = [v, a, d]
        s_neg[idx] = [10 - v, 10 - a, 10 - d]  # Inverted around center (5)

    # Normalize seed vectors
    s_pos = sk_normalize(s_pos, axis=0, norm='l1')
    s_neg = sk_normalize(s_neg, axis=0, norm='l1')

    # Step 4: Run random walks with restart
    p_pos = np.ones((n_words, 3)) / n_words  # Uniform initialization
    p_neg = np.ones((n_words, 3)) / n_words

    # Run random walks for P+ and P-
    for i in range(max_iter):
        p_pos = beta * (T @ p_pos) + (1 - beta) * s_pos
        p_neg = beta * (T @ p_neg) + (1 - beta) * s_neg

    # Step 5: Compute final VAD scores (scaled to 1-9)
    epsilon = 1e-10  # Avoid division by zero
    P_final = p_pos / (p_pos + p_neg + epsilon)

    #saving and upscaling
    GOLD_words = get_gold_words(gold_standard_path)
    result_VAD_lexicon = [[word, P_final[i][0] * 8 + 1, P_final[i][1] * 8 + 1, P_final[i][2] * 8 + 1]
                          for i, word in enumerate(words) if word in GOLD_words]
    save_as_VAD(save_folder_name, vad_lexicon_name, embedding_name, result_VAD_lexicon)