|
18 | 18 | # under the License. |
19 | 19 | #------------------------------------------------------------- |
20 | 20 |
|
| 21 | + |
| 22 | +# Computes GloVe word embeddings for a text corpus: builds the co-occurrence matrix, then trains the word vectors on it. |
| 23 | +# |
| 24 | +# INPUT: |
| 25 | +# -------------------------------------------------------------------------------- |
| 26 | +# input 1D input corpus in CSV format. |
| 27 | +# seed Random seed for reproducibility. |
| 28 | +# vector_size Dimensionality of word vectors, V. |
| 29 | +# alpha Weighting function parameter, recommended value: 0.75. |
| 30 | +# eta Learning rate for optimization, recommended value: 0.05. |
| 31 | +# x_max Maximum co-occurrence value as per the GloVe paper: 100. |
| 32 | +# tol Tolerance value to avoid overfitting, recommended value: 1e-4. |
| 33 | +# iterations Total number of training iterations. |
| 34 | +# print_loss_it Interval (in iterations) for printing the loss. |
| 35 | +# maxTokens Maximum number of tokens per text entry. |
| 36 | +# windowSize Context window size. |
| 37 | +# distanceWeighting Whether to apply distance-based weighting. |
| 38 | +# symmetric Whether the co-occurrence matrix is symmetric (TRUE) or asymmetric (FALSE). |
| 39 | +# ------------------------------------------------------------------------------ |
| 40 | +# |
| 41 | +# OUTPUT: |
| 42 | +# ------------------------------------------------------------------------------ |
| 43 | +# G The word indices and their word vectors, of shape (N, V). Each word is represented as a vector of shape (1, V). |
| 44 | +# ------------------------------------------------------------------------------ |
| 45 | + |
| 46 | + |
| 47 | +f_glove = function( |
| 48 | + Frame[Unknown] input, |
| 49 | + int seed, int vector_size, |
| 50 | + double alpha, double eta, |
| 51 | + double x_max, |
| 52 | + double tol, |
| 53 | + int iterations, |
| 54 | + int print_loss_it, |
| 55 | + Int maxTokens, |
| 56 | + Int windowSize, |
| 57 | + Boolean distanceWeighting, |
| 58 | + Boolean symmetric) |
| 59 | + return (frame[Unknown] G){ |
| 60 | + |
| 61 | + [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric); |
| 62 | + G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); |
| 63 | +} |
| 64 | + |
| 65 | + |
21 | 66 | init = function(matrix[double] cooc_matrix, double x_max, double alpha) |
22 | 67 | return(matrix[double] weights, matrix[double] log_cooc_matrix){ |
23 | 68 | E = 2.718281828; |
@@ -118,45 +163,3 @@ gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_i |
118 | 163 | print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error); |
119 | 164 | G = cbind(cooc_index[,2], as.frame(G)); |
120 | 165 | } |
121 | | - |
122 | | -glove = function( |
123 | | - Frame[Unknown] input, |
124 | | - int seed, int vector_size, |
125 | | - double alpha, double eta, |
126 | | - double x_max, |
127 | | - double tol, |
128 | | - int iterations, |
129 | | - int print_loss_it, |
130 | | - Int maxTokens, |
131 | | - Int windowSize, |
132 | | - Boolean distanceWeighting, |
133 | | - Boolean symmetric) |
134 | | - return (frame[Unknown] G){ |
135 | | - |
136 | | - /* |
137 | | - * Main function to Computes the vector embeddings for words in a large text corpus. |
138 | | - * INPUT: |
139 | | - * ------------------------------------------------------------------------------ |
140 | | - * - input (Frame[Unknown]): 1DInput corpus in CSV format. |
141 | | - * - seed: Random seed for reproducibility. |
142 | | - * - vector_size: Dimensionality of word vectors, V. |
143 | | - * - eta: Learning rate for optimization, recommended value: 0.05. |
144 | | - * - alpha: Weighting function parameter, recommended value: 0.75. |
145 | | - * - x_max: Maximum co-occurrence value as per the GloVe paper: 100. |
146 | | - * - tol: Tolerance value to avoid overfitting, recommended value: 1e-4. |
147 | | - * - iterations: Total number of training iterations. |
148 | | - * - print_loss_it: Interval (in iterations) for printing the loss. |
149 | | - * - maxTokens (Int): Maximum number of tokens per text entry. |
150 | | - * - windowSize (Int): Context window size. |
151 | | - * - distanceWeighting (Boolean): Whether to apply distance-based weighting. |
152 | | - * - symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE). |
153 | | - * ------------------------------------------------------------------------------ |
154 | | - * OUTPUT: |
155 | | - * ------------------------------------------------------------------------------ |
156 | | - * G (Frame[Unknown]): The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V) |
157 | | - * ------------------------------------------------------------------------------ |
158 | | - */ |
159 | | - |
160 | | - [cooc_matrix, cooc_index] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric); |
161 | | - G = gloveWithCoocMatrix(cooc_matrix, cooc_index, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it); |
162 | | -} |
|
0 commit comments