|
| 1 | +#------------------------------------------------------------- |
| 2 | +# |
| 3 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 4 | +# or more contributor license agreements. See the NOTICE file |
| 5 | +# distributed with this work for additional information |
| 6 | +# regarding copyright ownership. The ASF licenses this file |
| 7 | +# to you under the Apache License, Version 2.0 (the |
| 8 | +# "License"); you may not use this file except in compliance |
| 9 | +# with the License. You may obtain a copy of the License at |
| 10 | +# |
| 11 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 12 | +# |
| 13 | +# Unless required by applicable law or agreed to in writing, |
| 14 | +# software distributed under the License is distributed on an |
| 15 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 16 | +# KIND, either express or implied. See the License for the |
| 17 | +# specific language governing permissions and limitations |
| 18 | +# under the License. |
| 19 | +#------------------------------------------------------------- |
| 20 | + |
init = function(matrix[double] cooc_matrix, double x_max, double alpha)
  return(matrix[double] weights, matrix[double] log_cooc_matrix){
  /*
   * Prepares the two constant matrices used by the GloVe training loop.
   *
   * Inputs:
   * - cooc_matrix: Co-occurrence counts.
   * - x_max: Count at which the weighting function saturates.
   * - alpha: Exponent of the weighting function.
   *
   * Outputs:
   * - weights: Cell-wise min(1, (min(x, x_max) / x_max)^alpha), same shape as cooc_matrix.
   * - log_cooc_matrix: Cell-wise natural log of cooc_matrix; cells that are not > 0 are set to 0.
   */

  # clip counts at x_max before scaling, then cap the resulting weight at 1
  bounded = pmin(cooc_matrix, x_max);
  weights = pmin(1, (bounded / x_max) ^ alpha);

  # single-argument log() is the exact natural logarithm; a truncated constant
  # for e as explicit base would bias every entry by a small relative error
  log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix), 0);
}
| 28 | + |
gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it)
  return (frame[Unknown] G){
  /*
   * Computes the vector embeddings for words from a precomputed co-occurrence matrix.
   *
   * Inputs:
   * - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N).
   * - cooc_index: Index frame mapping words to their positions in the co-occurrence matrix.
   *               Its second column is prepended to the result, so it should hold the word
   *               list in the same order as the matrix rows.
   * - seed: Random seed for reproducibility (seed .. seed+3 are used for the four initializations).
   * - vector_size: Dimensionality of word vectors, V.
   * - alpha: Weighting function parameter, recommended value: 0.75.
   * - eta: Learning rate for optimization, recommended value: 0.05.
   * - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
   * - tol: Convergence threshold; training stops once the absolute change of the loss
   *        between two consecutive iterations drops below it, recommended value: 1e-4.
   * - iterations: Maximum number of training iterations.
   * - print_loss_it: Interval (in iterations) for printing the loss.
   *
   * Outputs:
   * - G: frame of shape (N, V+1): the second column of cooc_index followed by the
   *      V columns of the summed word and context vectors (W + C).
   */

  vocab_size = nrow(cooc_matrix);

  # word vectors, context vectors and their biases, uniform in (-0.5, 0.5) / vector_size
  W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size;
  C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed+1)-0.5)/vector_size;
  bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+2)-0.5)/vector_size;
  bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+3)-0.5)/vector_size;
  [weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha);

  # accumulators of squared gradients (AdaGrad-style), started slightly above 0.1
  momentum_W = 1e-8 + 0.1 * matrix(1, nrow(W), ncol(W));
  momentum_C = 1e-8 + 0.1 * matrix(1, nrow(C), ncol(C));
  momentum_bw = 1e-8 + 0.1 * matrix(1, nrow(bw), ncol(bw));
  momentum_bc = 1e-8 + 0.1 * matrix(1, nrow(bc), ncol(bc));

  error = 0;
  iter = 0;
  # defined up front so the summary below is valid even if no update is ever applied
  final_iter = 0;
  previous_error = 1e10;
  conti = TRUE;

  while (conti) {

    # compute predictions for all co-occurring word pairs at once
    predictions = W %*% t(C) + bw + t(bc);
    diffs = predictions - log_cooc_matrix;
    weighted_diffs = weights * diffs;

    error = sum(0.5 * (weights * (diffs ^ 2)));
    iter = iter + 1;

    # keep training only while the loss still moves and the iteration budget is not exhausted
    if ((abs(previous_error - error) >= tol) & (iter <= iterations)) {

      # compute gradients (only needed when an update is actually applied)
      wgrad = weighted_diffs %*% C;
      cgrad = t(weighted_diffs) %*% W;
      bwgrad = rowSums(weighted_diffs);
      bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc));

      # accumulate squared gradients and take the scaled step
      momentum_W = momentum_W + (wgrad ^ 2);
      momentum_C = momentum_C + (cgrad ^ 2);
      momentum_bw = momentum_bw + (bwgrad ^ 2);
      momentum_bc = momentum_bc + (bcgrad ^ 2);

      W = W - (eta * wgrad / (sqrt(momentum_W) + 1e-8));
      C = C - (eta * cgrad / (sqrt(momentum_C) + 1e-8));
      bw = bw - (eta * bwgrad / (sqrt(momentum_bw) + 1e-8));
      bc = bc - (eta * bcgrad / (sqrt(momentum_bc) + 1e-8));

      previous_error = error;
      final_iter = iter;
    } else {
      conti = FALSE;
    }

    # print the loss every print_loss_it iterations (iter modulo print_loss_it == 0)
    if (iter - floor(iter / print_loss_it) * print_loss_it == 0) {
      print("iteration: " + iter + " error: " + error);
    }
  }

  print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error);

  # final embedding is the sum of word and context vectors, computed once after training;
  # add the word index to the word vectors
  embeddings = W + C;
  G = cbind(cooc_index[,2], as.frame(embeddings));
}
| 121 | + |
glove = function(
    Frame[Unknown] input,
    int seed, int vector_size,
    double alpha, double eta,
    double x_max,
    double tol,
    int iterations,
    int print_loss_it,
    Int maxTokens,
    Int windowSize,
    Boolean distanceWeighting,
    Boolean symmetric)
  return (frame[Unknown] G){

  /*
   * Entry point: computes vector embeddings for the words of a text corpus.
   * The corpus is first turned into a co-occurrence matrix plus word index via
   * cooccurrenceMatrix, which are then handed to gloveWithCoocMatrix for training.
   *
   * INPUT:
   * ------------------------------------------------------------------------------
   * - input (Frame[Unknown]): Input corpus, passed unchanged to cooccurrenceMatrix.
   * - seed: Random seed for reproducibility.
   * - vector_size: Dimensionality of word vectors, V.
   * - alpha: Weighting function parameter, recommended value: 0.75.
   * - eta: Learning rate for optimization, recommended value: 0.05.
   * - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
   * - tol: Tolerance on the loss used as stopping criterion, recommended value: 1e-4.
   * - iterations: Total number of training iterations.
   * - print_loss_it: Interval (in iterations) for printing the loss.
   * - maxTokens (Int): Maximum number of tokens per text entry.
   * - windowSize (Int): Context window size.
   * - distanceWeighting (Boolean): Whether to apply distance-based weighting.
   * - symmetric (Boolean): Whether the co-occurrence matrix is symmetric (TRUE) or asymmetric (FALSE).
   * ------------------------------------------------------------------------------
   * OUTPUT:
   * ------------------------------------------------------------------------------
   * G (Frame[Unknown]): The result of gloveWithCoocMatrix, i.e. the word indices
   *                     together with their word vectors.
   * ------------------------------------------------------------------------------
   */

  # step 1: corpus -> co-occurrence statistics and the matching word index
  [counts, word_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);

  # step 2: train the embeddings on those statistics
  G = gloveWithCoocMatrix(counts, word_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
}
0 commit comments