Skip to content

Commit 1de110b

Browse files
committed
Cythonized Python code for sslm and LdaPost classes
1 parent 57b3af3 commit 1de110b

File tree

7 files changed

+2096
-0
lines changed

7 files changed

+2096
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,7 @@ gensim/models/fasttext_inner.c
8989
gensim/models/nmf_pgd.c
9090
gensim/models/word2vec_corpusfile.cpp
9191
gensim/models/word2vec_inner.c
92+
gensim/models/ldaseq_sslm_inner.c
93+
gensim/models/ldaseq_posterior_inner.c
9294

9395
.ipynb_checkpoints

gensim/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from .fasttext import FastText # noqa:F401
2323
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401
2424
from .nmf import Nmf # noqa:F401
25+
from .ldaseqmodel_optimized import LdaSeqModel_V4
2526

2627
from gensim import interfaces, utils
2728

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
from scipy.special.cython_special cimport gammaln
2+
from scipy.special.cython_special cimport psi
3+
4+
from libc.math cimport exp, log, abs
5+
from libc.string cimport memcpy
6+
from libc.stdlib cimport malloc, free
7+
8+
cimport numpy as np
9+
10+
import numpy as np
11+
12+
ctypedef np.float64_t REAL_t
13+
14+
15+
cdef update_phi(REAL_t * gamma, REAL_t *phi, REAL_t * log_phi,
16+
int * word_ids, REAL_t * lda_topics, const int num_topics,
17+
const int doc_length):
18+
19+
"""Update variational multinomial parameters, based on a document and a time-slice.
20+
21+
This is done based on the original Blei-LDA paper, where:
22+
log_phi := beta * exp(Ψ(gamma)), over every topic for every word.
23+
24+
TODO: incorporate lee-sueng trick used in
25+
**Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.
26+
27+
Parameters
28+
----------
29+
gamma : size of num_topics. in-parameter
30+
31+
phi : size of (max_doc_len, num_topics). int/out-parameter
32+
33+
log_phi: size of (max_doc_len, num_topics). in/out-parameter
34+
35+
word_ids: size of doc_length. in-parameter
36+
37+
lda_topics: size of (vocab_len, num_topics). in-parameter
38+
39+
num_topics: number of topics in the model. in-parameter
40+
41+
doc_length: length of the document (number of words in the document). in-parameter
42+
43+
"""
44+
45+
cdef int k, i
46+
47+
# digamma values
48+
cdef REAL_t * dig = <REAL_t *> malloc(num_topics * sizeof(REAL_t))
49+
if dig == NULL:
50+
raise
51+
if dig == NULL:
52+
print("NULL pointer")
53+
54+
for k in range(num_topics):
55+
dig[k] = psi(gamma[k])
56+
57+
cdef REAL_t *log_phi_row = NULL
58+
cdef REAL_t *phi_row = NULL
59+
60+
for i in range(doc_length):
61+
for k in range(num_topics):
62+
# TODO check if proper indexing
63+
log_phi[i * num_topics + k] = \
64+
dig[k] + lda_topics[word_ids[i] * num_topics + k]
65+
66+
log_phi_row = log_phi + i * num_topics
67+
68+
phi_row = phi + i * num_topics
69+
70+
# log normalize
71+
v = log_phi_row[0]
72+
for i in range(1, num_topics):
73+
v = log(exp(v) + exp(log_phi_row[i]))
74+
75+
# subtract every element by v
76+
# TODO blas
77+
for i in range(num_topics):
78+
log_phi_row[i] = log_phi_row[i] - v
79+
80+
for i in range(num_topics):
81+
phi_row[i] = exp(log_phi_row[i])
82+
83+
free(dig)
84+
85+
86+
cdef update_phi_fixed():
87+
return
88+
89+
90+
cdef update_gamma(REAL_t * gamma, const REAL_t * phi, const REAL_t *lda_alpha,
91+
const int *word_counts, const int num_topics, const int doc_length):
92+
"""Update variational dirichlet parameters.
93+
94+
This operations is described in the original Blei LDA paper:
95+
gamma = alpha + sum(phi), over every topic for every word.
96+
97+
Parameters
98+
----------
99+
gamma: size of num_topics. int/out-parameter
100+
101+
phi: size of (max_doc_len, num_topics). in-parameter
102+
103+
lda_alpha: size of num_topics. in-parameter
104+
105+
word_counts: size of doc_length. in-parameter
106+
107+
num_topics: number of topics in the model
108+
109+
doc_length: length of the document
110+
111+
"""
112+
memcpy(gamma, lda_alpha, num_topics * sizeof(REAL_t))
113+
114+
cdef int i, k
115+
# TODO BLAS matrix*vector
116+
for i in range(doc_length):
117+
for k in range(num_topics):
118+
gamma[k] += phi[i * num_topics + k] * word_counts[i]
119+
120+
121+
cdef REAL_t compute_lda_lhood(REAL_t * lhood, const REAL_t *gamma, const REAL_t * phi, const REAL_t *log_phi,
122+
const REAL_t * lda_alpha, const REAL_t *lda_topics,
123+
int *word_counts, int *word_ids,
124+
const int num_topics, const int doc_length):
125+
"""Compute the log likelihood bound.
126+
Parameters
127+
----------
128+
gamma: size of num_topics
129+
130+
lhood: size of num_topics + 1
131+
132+
phi: size of (max_doc_len, num_topics). in-parameter
133+
134+
log_phi: size of (max_doc_len, num_topics). in-parameter
135+
136+
lda_alpha: size of num_topics. in-parameter
137+
138+
lda_topics: size of (vocab_len, num_topics). in-parameter
139+
140+
word_counts: size of doc_len
141+
142+
word_ids: size of doc_len
143+
144+
num_topics: number of topics in the model
145+
146+
doc_length: length of the document
147+
148+
Returns
149+
-------
150+
float
151+
The optimal lower bound for the true posterior using the approximate distribution.
152+
153+
"""
154+
cdef int i
155+
156+
cdef REAL_t gamma_sum = 0.0
157+
158+
for i in range(num_topics):
159+
gamma_sum += gamma[i]
160+
161+
cdef REAL_t alpha_sum = 0.0
162+
for i in range(num_topics):
163+
alpha_sum += lda_alpha[i]
164+
165+
cdef REAL_t lhood_v = gammaln(alpha_sum) - gammaln(gamma_sum)
166+
lhood[num_topics] = lhood_v
167+
168+
cdef REAL_t digsum = psi(gamma_sum)
169+
cdef REAL_t lhood_term, e_log_theta_k
170+
171+
for k in range(num_topics):
172+
173+
e_log_theta_k = psi(gamma[k]) - digsum
174+
175+
lhood_term = \
176+
(lda_alpha[k] - gamma[k]) * e_log_theta_k + \
177+
gammaln(gamma[k]) - gammaln(lda_alpha[k])
178+
179+
# TODO: check why there's an IF
180+
for i in range(doc_length):
181+
if phi[i * num_topics + k] > 0:
182+
lhood_term += \
183+
word_counts[i] * phi[i * num_topics + k] \
184+
* (e_log_theta_k + lda_topics[word_ids[i] * num_topics + k]
185+
- log_phi[i * num_topics + k])
186+
187+
lhood[k] = lhood_term
188+
lhood_v += lhood_term
189+
190+
return lhood_v
191+
192+
cdef init_lda_post(REAL_t *gamma, REAL_t * phi, const int *word_counts,
193+
const REAL_t *lda_alpha,
194+
const int doc_length, const int num_topics):
195+
196+
"""Initialize variational posterior. """
197+
198+
cdef int i, j
199+
200+
cdef int total = 0
201+
202+
# BLAS sum of absolute numbers
203+
for i in range(doc_length):
204+
total += word_counts[i]
205+
206+
cdef REAL_t init_value = lda_alpha[0] + float(total) / num_topics
207+
208+
for i in range(num_topics):
209+
gamma[i] = init_value
210+
211+
init_value = 1.0 / num_topics
212+
213+
for i in range(doc_length):
214+
phi_doc = phi + i * num_topics
215+
for j in range(num_topics):
216+
phi_doc[j] = init_value
217+
218+
219+
def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8,
220+
lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None):
221+
"""Posterior inference for lda.
222+
223+
Parameters
224+
----------
225+
226+
Returns
227+
-------
228+
float
229+
The optimal lower bound for the true posterior using the approximate distribution.
230+
"""
231+
232+
cdef int i
233+
234+
###############
235+
# Setup C structures
236+
###############
237+
238+
cdef int num_topics = self.lda.num_topics
239+
cdef int vocab_len = len(self.lda.id2word)
240+
241+
cdef int doc_length = len(self.doc)
242+
cdef int max_doc_len = doc_length
243+
244+
# TODO adopt implementation to avoid memory allocation for every document.
245+
# E.g. create numpy array field of Python class. Be careful with the array size, it should be at least
246+
# the length of the longest document
247+
cdef int * word_ids = <int *> malloc(doc_length * sizeof(int))
248+
if word_ids == NULL:
249+
raise
250+
cdef int * word_counts = <int *> malloc(doc_length * sizeof(int))
251+
if word_counts == NULL:
252+
raise
253+
# TODO Would it be better to create numpy array first?
254+
for i in range(doc_length):
255+
word_ids[i] = self.doc[i][0]
256+
word_counts[i] = self.doc[i][0]
257+
258+
cdef REAL_t * gamma = <REAL_t *> (np.PyArray_DATA(self.gamma))
259+
cdef REAL_t * phi = <REAL_t *> (np.PyArray_DATA(self.phi))
260+
cdef REAL_t * log_phi = <REAL_t *> (np.PyArray_DATA(self.log_phi))
261+
cdef REAL_t * lhood = <REAL_t *> (np.PyArray_DATA(self.lhood))
262+
263+
cdef REAL_t * lda_topics = <REAL_t *> (np.PyArray_DATA(self.lda.topics))
264+
cdef REAL_t * lda_alpha = <REAL_t *> (np.PyArray_DATA(self.lda.alpha))
265+
266+
###############
267+
# Finished setup of c structures here
268+
###############
269+
270+
init_lda_post(gamma, phi, word_counts, lda_alpha, doc_length, num_topics)
271+
272+
# sum of counts in a doc
273+
cdef REAL_t total = sum(count for word_id, count in self.doc)
274+
275+
cdef REAL_t lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi,
276+
lda_alpha, lda_topics, word_counts, word_ids,
277+
num_topics, doc_length)
278+
279+
cdef REAL_t lhood_old = 0.0
280+
cdef REAL_t converged = 0.0
281+
cdef int iter_ = 0
282+
283+
# TODO Why first iteration starts here is done outside of the loop?
284+
iter_ += 1
285+
lhood_old = lhood_v
286+
update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length)
287+
288+
model = "DTM"
289+
290+
# if model == "DTM" or sslm is None:
291+
update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length)
292+
293+
lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi,
294+
lda_alpha, lda_topics, word_counts, word_ids,
295+
num_topics, doc_length)
296+
297+
converged = abs((lhood_old - lhood_v) / (lhood_old * total))
298+
299+
while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter:
300+
301+
iter_ += 1
302+
lhood_old = lhood_v
303+
update_gamma(gamma, phi, lda_alpha, word_counts, num_topics, doc_length)
304+
model = "DTM"
305+
306+
update_phi(gamma, phi, log_phi, word_ids, lda_topics, num_topics, doc_length)
307+
308+
lhood_v = compute_lda_lhood(lhood, gamma, phi, log_phi,
309+
lda_alpha, lda_topics, word_counts, word_ids,
310+
num_topics, doc_length)
311+
312+
converged = np.fabs((lhood_old - lhood_v) / (lhood_old * total))
313+
314+
free(word_ids)
315+
free(word_counts)
316+
317+
return lhood_v
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
cimport numpy as np
2+
ctypedef np.float64_t REAL_t
3+
4+
5+
6+
cdef struct StateSpaceLanguageModelConfig:
7+
int num_time_slices, vocab_len
8+
REAL_t chain_variance, obs_variance
9+
10+
REAL_t * obs
11+
REAL_t * mean
12+
REAL_t * variance
13+
14+
REAL_t * fwd_mean
15+
REAL_t * fwd_variance
16+
REAL_t * zeta
17+
REAL_t * e_log_prob
18+
# T
19+
REAL_t * word_counts
20+
# T
21+
REAL_t * totals
22+
23+
REAL_t * deriv
24+
# T * (T+1)
25+
REAL_t * mean_deriv_mtx
26+
int word
27+

0 commit comments

Comments
 (0)