@@ -107,7 +107,7 @@ class LdaMulticore(LdaModel):
 
     """
     def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
-                 chunksize=2000, passes=1, batch=False, alpha='symmetric',
+                 chunksize=2000, passes=1, update_every=1, alpha='symmetric',
                  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
                  gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
                  minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
@@ -133,6 +133,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             Number of documents to be used in each training chunk.
         passes : int, optional
             Number of passes through the corpus during training.
+        update_every : int, optional
+            Number of chunks to be iterated through before each M step of EM.
+            Set to 0 for batch learning, > 1 for online iterative learning.
         alpha : {float, numpy.ndarray of float, list of float, str}, optional
             A-priori belief on document-topic distribution, this can be:
                 * scalar for a symmetric prior over document-topic distribution,
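
For context, a minimal usage sketch of the new parameter (assuming this patch is applied; `common_corpus` and `common_dictionary` are gensim's bundled toy data, and the hyperparameter values are arbitrary):

```python
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary

# Online learning: an M step is performed after every update_every * workers chunks.
online_lda = LdaMulticore(
    corpus=common_corpus, id2word=common_dictionary,
    num_topics=10, workers=2, update_every=1, chunksize=2000, passes=2,
)

# Batch learning: sufficient statistics are accumulated over the whole corpus
# before each M step (update_every=0 replaces the old batch=True flag).
batch_lda = LdaMulticore(
    corpus=common_corpus, id2word=common_dictionary,
    num_topics=10, workers=2, update_every=0, passes=2,
)
```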
@@ -184,18 +187,15 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
             raise NotImplementedError("auto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.")
 
         super(LdaMulticore, self).__init__(
-            corpus=corpus, num_topics=num_topics,
-            id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
+            corpus=corpus, num_topics=num_topics, id2word=id2word, distributed=False,  # not distributed across machines
+            chunksize=chunksize, passes=passes, update_every=update_every, alpha=alpha, eta=eta,
             decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
-            gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
+            gamma_threshold=gamma_threshold, minimum_probability=minimum_probability, random_state=random_state,
             minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype,
         )
 
     def update(self, corpus, chunks_as_numpy=False):
-        """Train the model with new documents, by EM-iterating over `corpus` until the topics converge
-        (or until the maximum number of allowed iterations is reached).
-
-        Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
+        """Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
         the maximum number of allowed iterations is reached. `corpus` must be an iterable. The E step is distributed
         into the several processes.
 
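
The hunk below mirrors LdaModel's online/batch update scheduling, but sizes updates by worker processes rather than cluster machines. A standalone sketch of that arithmetic (illustrative only, not part of the patch; names are hypothetical):

```python
# How often an M step and a perplexity evaluation happen, given the corpus size
# and the multicore settings, following the logic introduced in the hunk below.
def update_schedule(lencorpus, update_every, workers, chunksize, eval_every):
    if update_every:      # online: M step after every update_every * workers chunks
        updateafter = min(lencorpus, update_every * workers * chunksize)
    else:                 # batch: one M step over the full corpus per pass
        updateafter = lencorpus
    evalafter = min(lencorpus, (eval_every or 0) * workers * chunksize)
    return updateafter, evalafter

# e.g. 10,000 documents, update_every=1, 3 workers, chunksize=2000, eval_every=10
print(update_schedule(10000, 1, 3, 2000, 10))  # -> (6000, 10000)
```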
@@ -231,14 +231,20 @@ def update(self, corpus, chunks_as_numpy=False):
 
         self.state.numdocs += lencorpus
 
-        if self.batch:
+        # Same as in LdaModel but self.workers (processes) is used instead of self.numworkers (machines)
+        if self.update_every:
+            updatetype = "online"
+            if self.passes == 1:
+                updatetype += " (single-pass)"
+            else:
+                updatetype += " (multi-pass)"
+            updateafter = min(lencorpus, self.update_every * self.workers * self.chunksize)
+        else:
             updatetype = "batch"
             updateafter = lencorpus
-        else:
-            updatetype = "online"
-            updateafter = self.chunksize * self.workers
+
         eval_every = self.eval_every or 0
-        evalafter = min(lencorpus, eval_every * updateafter)
+        evalafter = min(lencorpus, eval_every * self.workers * self.chunksize)
 
         updates_per_pass = max(1, lencorpus / updateafter)
         logger.info(