|
| 1 | +from scipy.special.cython_special cimport gammaln |
| 2 | +from scipy.special.cython_special cimport psi |
| 3 | + |
| 4 | +from libc.math cimport exp, log, abs |
| 5 | +from libc.string cimport memcpy |
| 6 | +from libc.stdlib cimport malloc, free |
| 7 | + |
| 8 | +cimport numpy as np |
| 9 | + |
| 10 | +import numpy as np |
| 11 | + |
| 12 | +ctypedef np.float64_t REAL_t |
| 13 | + |
| 14 | + |
| 15 | +cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi, |
| 16 | + int * word_ids, REAL_t * lda_topics, const int num_topics, |
| 17 | + const int doc_length): |
| 18 | + |
| 19 | + """Update variational multinomial parameters, based on a document and a time-slice. |
| 20 | +
|
| 21 | + This is done based on the original Blei-LDA paper, where: |
| 22 | + log_phi := beta * exp(Ψ(gamma)), over every topic for every word. |
| 23 | +
|
| 24 | + TODO: incorporate lee-sueng trick used in |
| 25 | + **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. |
| 26 | +
|
| 27 | + Parameters |
| 28 | + ---------- |
| 29 | + gamma : size of num_topics. in-parameter |
| 30 | +
|
| 31 | + phi : size of (max_doc_len, num_topics). int/out-parameter |
| 32 | +
|
| 33 | + log_phi: size of (max_doc_len, num_topics). in/out-parameter |
| 34 | +
|
| 35 | + word_ids: size of doc_length. in-parameter |
| 36 | +
|
| 37 | + lda_topics: size of (vocab_len, num_topics). in-parameter |
| 38 | +
|
| 39 | + num_topics: number of topics in the model. in-parameter |
| 40 | +
|
| 41 | + doc_length: length of the document (number of words in the document). in-parameter |
| 42 | +
|
| 43 | + """ |
| 44 | + |
| 45 | + cdef int k, i |
| 46 | + |
| 47 | + # digamma values |
| 48 | + cdef REAL_t * dig = <REAL_t *> malloc(num_topics * sizeof(REAL_t)) |
| 49 | + if dig == NULL: |
| 50 | + raise |
| 51 | + if dig == NULL: |
| 52 | + print("NULL pointer") |
| 53 | + |
| 54 | + for k in range(num_topics): |
| 55 | + dig[k] = psi(gamma[k]) |
| 56 | + |
| 57 | + cdef REAL_t *log_phi_row = NULL |
| 58 | + cdef REAL_t *phi_row = NULL |
| 59 | + |
| 60 | + for i in range(doc_length): |
| 61 | + for k in range(num_topics): |
| 62 | + # TODO check if proper indexing |
| 63 | + log_phi[i * num_topics + k] = \ |
| 64 | + dig[k] + lda_topics[word_ids[i] * num_topics + k] |
| 65 | + |
| 66 | + log_phi_row = log_phi + i * num_topics |
| 67 | + |
| 68 | + phi_row = phi + i * num_topics |
| 69 | + |
| 70 | + # log normalize |
| 71 | + v = log_phi_row[0] |
| 72 | + for i in range(1, num_topics): |
| 73 | + v = log(exp(v) + exp(log_phi_row[i])) |
| 74 | + |
| 75 | + # subtract every element by v |
| 76 | + # TODO blas |
| 77 | + for i in range(num_topics): |
| 78 | + log_phi_row[i] = log_phi_row[i] - v |
| 79 | + |
| 80 | + for i in range(num_topics): |
| 81 | + phi_row[i] = exp(log_phi_row[i]) |
| 82 | + |
| 83 | + free(dig) |
| 84 | + |
| 85 | + |
| 86 | +cdef update_phi_fixed(): |
| 87 | + return |
| 88 | + |
| 89 | + |
| 90 | +cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha, |
| 91 | + const int *word_counts, const int num_topics, const int doc_length): |
| 92 | + """Update variational dirichlet parameters. |
| 93 | +
|
| 94 | + This operations is described in the original Blei LDA paper: |
| 95 | + gamma = alpha + sum(phi), over every topic for every word. |
| 96 | +
|
| 97 | + Parameters |
| 98 | + ---------- |
| 99 | + gamma: size of num_topics. int/out-parameter |
| 100 | + |
| 101 | + phi: size of (max_doc_len, num_topics). in-parameter |
| 102 | + |
| 103 | + lda_alpha: size of num_topics. in-parameter |
| 104 | + |
| 105 | + word_counts: size of doc_length. in-parameter |
| 106 | + |
| 107 | + num_topics: number of topics in the model |
| 108 | + |
| 109 | + doc_length: length of the document |
| 110 | +
|
| 111 | + """ |
| 112 | + memcpy(gamma, lda_alpha, num_topics * sizeof(REAL_t)) |
| 113 | + |
| 114 | + cdef int i, k |
| 115 | + # TODO BLAS matrix*vector |
| 116 | + for i in range(doc_length): |
| 117 | + for k in range(num_topics): |
| 118 | + gamma[k] += phi[i * num_topics + k] * word_counts[i] |
| 119 | + |
| 120 | + |
| 121 | +cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi, |
| 122 | + const REAL_t * lda_alpha, const REAL_t *lda_topics, |
| 123 | + int *word_counts, int *word_ids, |
| 124 | + const int num_topics, const int doc_length): |
| 125 | + """Compute the log likelihood bound. |
| 126 | + Parameters |
| 127 | + ---------- |
| 128 | + gamma: size of num_topics |
| 129 | + |
| 130 | + lhood: size of num_topics + 1 |
| 131 | + |
| 132 | + phi: size of (max_doc_len, num_topics). in-parameter |
| 133 | + |
| 134 | + log_phi: size of (max_doc_len, num_topics). in-parameter |
| 135 | + |
| 136 | + lda_alpha: size of num_topics. in-parameter |
| 137 | +
|
| 138 | + lda_topics: size of (vocab_len, num_topics). in-parameter |
| 139 | + |
| 140 | + word_counts: size of doc_len |
| 141 | + |
| 142 | + word_ids: size of doc_len |
| 143 | + |
| 144 | + num_topics: number of topics in the model |
| 145 | + |
| 146 | + doc_length: length of the document |
| 147 | +
|
| 148 | + Returns |
| 149 | + ------- |
| 150 | + float |
| 151 | + The optimal lower bound for the true posterior using the approximate distribution. |
| 152 | +
|
| 153 | + """ |
| 154 | + cdef int i |
| 155 | + |
| 156 | + cdef REAL_t gamma_sum = 0.0 |
| 157 | + |
| 158 | + for i in range(num_topics): |
| 159 | + gamma_sum += gamma[i] |
| 160 | + |
| 161 | + cdef REAL_t alpha_sum = 0.0 |
| 162 | + for i in range(num_topics): |
| 163 | + alpha_sum += lda_alpha[i] |
| 164 | + |
| 165 | + cdef REAL_t lhood_v = gammaln(alpha_sum) - gammaln(gamma_sum) |
| 166 | + lhood[num_topics] = lhood_v |
| 167 | + |
| 168 | + cdef REAL_t digsum = psi(gamma_sum) |
| 169 | + cdef REAL_t lhood_term, e_log_theta_k |
| 170 | + |
| 171 | + for k in range(num_topics): |
| 172 | + |
| 173 | + e_log_theta_k = psi(gamma[k]) - digsum |
| 174 | + |
| 175 | + lhood_term = \ |
| 176 | + (lda_alpha[k] - gamma[k]) * e_log_theta_k + \ |
| 177 | + gammaln(gamma[k]) - gammaln(lda_alpha[k]) |
| 178 | + |
| 179 | + # TODO: check why there's an IF |
| 180 | + for i in range(doc_length): |
| 181 | + if phi[i * num_topics + k] > 0: |
| 182 | + lhood_term += \ |
| 183 | + word_counts[i] * phi[i * num_topics + k] \ |
| 184 | + * (e_log_theta_k + lda_topics[word_ids[i] * num_topics + k] |
| 185 | + - log_phi[i * num_topics + k]) |
| 186 | + |
| 187 | + lhood[k] = lhood_term |
| 188 | + lhood_v += lhood_term |
| 189 | + |
| 190 | + return lhood_v |
| 191 | + |
| 192 | +cdef init_lda_post(REAL_t *gamma, REAL_t * phi, const int *word_counts, |
| 193 | + const REAL_t *lda_alpha, |
| 194 | + const int doc_length, const int num_topics): |
| 195 | + |
| 196 | + """Initialize variational posterior. """ |
| 197 | + |
| 198 | + cdef int i, j |
| 199 | + |
| 200 | + cdef int total = 0 |
| 201 | + |
| 202 | + # BLAS sum of absolute numbers |
| 203 | + for i in range(doc_length): |
| 204 | + total += word_counts[i] |
| 205 | + |
| 206 | + cdef REAL_t init_value = lda_alpha[0] + float(total) / num_topics |
| 207 | + |
| 208 | + for i in range(num_topics): |
| 209 | + gamma[i] = init_value |
| 210 | + |
| 211 | + init_value = 1.0 / num_topics |
| 212 | + |
| 213 | + for i in range(doc_length): |
| 214 | + phi_doc = phi + i * num_topics |
| 215 | + for j in range(num_topics): |
| 216 | + phi_doc[j] = init_value |
| 217 | + |
| 218 | + |
| 219 | +def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8, |
| 220 | + lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None): |
| 221 | + """Posterior inference for lda. |
| 222 | +
|
| 223 | + Parameters |
| 224 | + ---------- |
| 225 | +
|
| 226 | + Returns |
| 227 | + ------- |
| 228 | + float |
| 229 | + The optimal lower bound for the true posterior using the approximate distribution. |
| 230 | + """ |
| 231 | + |
| 232 | + cdef int i |
| 233 | + |
| 234 | + ############### |
| 235 | + # Setup C structures |
| 236 | + ############### |
| 237 | + |
| 238 | + cdef int num_topics = self.lda.num_topics |
| 239 | + cdef int vocab_len = len(self.lda.id2word) |
| 240 | + |
| 241 | + cdef int doc_length = len(self.doc) |
| 242 | + cdef int max_doc_len = doc_length |
| 243 | + |
| 244 | + # TODO adopt implementation to avoid memory allocation for every document. |
| 245 | + # E.g. create numpy array field of Python class. Be careful with the array size, it should be at least |
| 246 | + # the length of the longest document |
| 247 | + cdef int * word_ids = <int *> malloc(doc_length * sizeof(int)) |
| 248 | + if word_ids == NULL: |
| 249 | + raise |
| 250 | + cdef int * word_counts = <int *> malloc(doc_length * sizeof(int)) |
| 251 | + if word_counts == NULL: |
| 252 | + raise |
| 253 | + # TODO Would it be better to create numpy array first? |
| 254 | + for i in range(doc_length): |
| 255 | + word_ids[i] = self.doc[i][0] |
| 256 | + word_counts[i] = self.doc[i][0] |
| 257 | + |
| 258 | + cdef REAL_t * gamma = <REAL_t *> (np.PyArray_DATA(self.gamma)) |
| 259 | + cdef REAL_t * phi = <REAL_t *> (np.PyArray_DATA(self.phi)) |
| 260 | + cdef REAL_t * log_phi = <REAL_t *> (np.PyArray_DATA(self.log_phi)) |
| 261 | + cdef REAL_t * lhood = <REAL_t *> (np.PyArray_DATA(self.lhood)) |
| 262 | + |
| 263 | + cdef REAL_t * lda_topics = <REAL_t *> (np.PyArray_DATA(self.lda.topics)) |
| 264 | + cdef REAL_t * lda_alpha = <REAL_t *> (np.PyArray_DATA(self.lda.alpha)) |
| 265 | + |
| 266 | + ############### |
| 267 | + # Finished setup of c structures here |
| 268 | + ############### |
| 269 | + |
| 270 | + init_lda_post(gamma, phi, word_counts, lda_alpha, doc_length, num_topics) |
| 271 | + |
| 272 | + # sum of counts in a doc |
| 273 | + cdef REAL_t total = sum(count for word_id, count in self.doc) |
| 274 | + |
| 275 | + cdef REAL_t lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, |
| 276 | + lda_alpha, lda_topics, word_counts, word_ids, |
| 277 | + num_topics, doc_length) |
| 278 | + |
| 279 | + cdef REAL_t lhood_old = 0.0 |
| 280 | + cdef REAL_t converged = 0.0 |
| 281 | + cdef int iter_ = 0 |
| 282 | + |
| 283 | + # TODO Why first iteration starts here is done outside of the loop? |
| 284 | + iter_ += 1 |
| 285 | + lhood_old = lhood_v |
| 286 | + update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length) |
| 287 | + |
| 288 | + model = "DTM" |
| 289 | + |
| 290 | + # if model == "DTM" or sslm is None: |
| 291 | + update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) |
| 292 | + |
| 293 | + lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, |
| 294 | + lda_alpha, lda_topics, word_counts, word_ids, |
| 295 | + num_topics, doc_length) |
| 296 | + |
| 297 | + converged = abs((lhood_old - lhood_v) / (lhood_old * total)) |
| 298 | + |
| 299 | + while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter: |
| 300 | + |
| 301 | + iter_ += 1 |
| 302 | + lhood_old = lhood_v |
| 303 | + update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length) |
| 304 | + model = "DTM" |
| 305 | + |
| 306 | + update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length) |
| 307 | + |
| 308 | + lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi, |
| 309 | + lda_alpha, lda_topics, word_counts, word_ids, |
| 310 | + num_topics, doc_length) |
| 311 | + |
| 312 | + converged = np.fabs((lhood_old - lhood_v) / (lhood_old * total)) |
| 313 | + |
| 314 | + free(word_ids) |
| 315 | + free(word_counts) |
| 316 | + |
| 317 | + return lhood_v |
0 commit comments