@@ -104,7 +104,39 @@ class GraphBuilder(object):
104104 """Computes the similarity graph from a set of (dense) embeddings."""
105105
def __init__(self, graph_builder_config):
  """Sets up a GraphBuilder driven by the supplied configuration.

  Args:
    graph_builder_config: A `nsl.configs.GraphBuilderConfig` instance.

  Raises:
    ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
  """
  self.config = config = graph_builder_config

  # Reject inconsistent LSH settings up front. `lsh_rounds` is only
  # consulted when LSH is actually enabled (`lsh_bits > 0`).
  if config.lsh_bits < 0:
    raise ValueError('lsh_bits < 0')
  lsh_enabled = config.lsh_bits > 0
  if lsh_enabled and config.lsh_rounds < 1:
    raise ValueError('lsh_bits > 0 but lsh_rounds < 1')

  # The same edge can be produced more than once only when LSH is enabled
  # and more than one bucketing round is performed. In that case we track
  # the edges already emitted; otherwise `edge_set` stays `None` and no
  # de-duplication bookkeeping is done. Each bi-directional edge is stored
  # once in canonical (source < target) order to save space. The weight is
  # not stored because the cosine similarity of a given (source, target)
  # pair never changes.
  needs_dedup = lsh_enabled and config.lsh_rounds > 1
  self.edge_set = set() if needs_dedup else None
131+
def _is_new_edge(self, src, tgt):
  """Records the edge between `src` and `tgt`; returns `True` iff it is new.

  The edge is treated as bi-directional: it is stored under a canonical
  ordering of its endpoints, so `(src, tgt)` and `(tgt, src)` count as the
  same edge. Requires `self.edge_set` to be a `set` (not `None`).

  Args:
    src: ID of one endpoint of the edge.
    tgt: ID of the other endpoint of the edge.

  Returns:
    `True` if the edge had not been recorded before this call, else `False`.
  """
  if src < tgt:
    edge_key = (src, tgt)
  else:
    edge_key = (tgt, src)
  seen_edges = self.edge_set
  # set.add() returns None, so detect insertion by comparing the set size
  # before and after. This hashes 'edge_key' only once, unlike a membership
  # test followed by add().
  count_before = len(seen_edges)
  seen_edges.add(edge_key)
  return len(seen_edges) != count_before
108140
109141 def _bucket (self , lsh_matrix , embedding ):
110142 """Returns the bucket ID of the given `embedding` relative to `lsh_matrix`.
@@ -147,17 +179,13 @@ def _generate_lsh_buckets(self, embeddings):
147179 # Generate a random matrix of values in the range [-1.0, 1.0] to use
148180 # to create the LSH buckets.
149181 num_dims = next (iter (embeddings .values ())).size
150- np .random .seed (self .config .random_seed )
151182 lsh_matrix = np .random .rand (self .config .lsh_bits , num_dims ) * 2 - 1
152183
153184 # Add each embedding to its appropriate bucket
154- start_time = time .time ()
155185 bucket_map = {}
156186 for key , embedding in six .iteritems (embeddings ):
157187 s = bucket_map .setdefault (self ._bucket (lsh_matrix , embedding ), set ())
158188 s .add (key )
159- logging .info ('Bucketed embeddings into %d bucket(s) in %.2f seconds.' ,
160- len (bucket_map ), time .time () - start_time )
161189 return bucket_map
162190
163191 def _generate_edges_for_bucket (self , bucket , embeddings ):
@@ -174,15 +202,15 @@ def _generate_edges_for_bucket(self, bucket, embeddings):
174202 for src , tgt in itertools .combinations (bucket , 2 ):
175203 weight = np .dot (embeddings [src ], embeddings [tgt ])
176204 if weight >= self .config .similarity_threshold :
177- yield (src , tgt , weight )
205+ if self .edge_set is None or self ._is_new_edge (src , tgt ):
206+ yield (src , tgt , weight )
178207
179208 def _generate_edges (self , embeddings ):
180209 """Generates edges among pairs of the given `embeddings`.
181210
182- This function considers all distinct pairs of nodes in `embeddings`,
183- computes the similarity between all such pairs (by calling the `_similarity`
184- method), and yields any edge for which the similarity is at least
185- `self.similarity_threshold`.
211+ This function considers related pairs of nodes in `embeddings`,
212+ computes the cosine similarity between all such pairs, and yields any edge
213+ for which the cosine similarity is at least `self.similarity_threshold`.
186214
187215 Args:
188216 embeddings: A `dict`: node_id -> embedding.
@@ -191,16 +219,27 @@ def _generate_edges(self, embeddings):
191219 A tuple (source, target, weight) denoting a (directed) edge from 'source'
192220 to 'target' with the given 'weight'.
193221 """
194- start_time = time .time ()
195- edge_cnt = 0
196- bucket_map = self ._generate_lsh_buckets (embeddings )
197- for bucket in bucket_map .values ():
198- for edge in self ._generate_edges_for_bucket (bucket , embeddings ):
199- yield edge
200- edge_cnt += 1
201- if (edge_cnt % 1000000 ) == 0 :
202- logging .info ('Created %d bi-directional edges in %.2f seconds....' ,
203- edge_cnt , time .time () - start_time )
222+ for lsh_round in range (max (1 , self .config .lsh_rounds )):
223+ start_time = time .time ()
224+ edge_cnt = 0
225+ bucket_map = self ._generate_lsh_buckets (embeddings )
226+ logging_prefix = 'LSH bucketing round {}' .format (lsh_round )
227+ logging .info ('%s: created %d bucket(s) in %.2f seconds.' , logging_prefix ,
228+ len (bucket_map ),
229+ time .time () - start_time )
230+ for bucket in bucket_map .values ():
231+ for edge in self ._generate_edges_for_bucket (bucket , embeddings ):
232+ edge_cnt += 1
233+ if (edge_cnt % 1000000 ) == 0 :
234+ logging .info (
235+ '%s: generated %d new bi-directional edges in %.2f seconds....' ,
236+ logging_prefix , edge_cnt ,
237+ time .time () - start_time )
238+ yield edge
239+ logging .info (
240+ '%s completed: generated %d new bi-directional edges in %.2f seconds.' ,
241+ logging_prefix , edge_cnt ,
242+ time .time () - start_time )
204243
205244 def build (self , embedding_files , output_graph_path ):
206245 """Reads embeddings and writes the similarity graph to `output_graph_path`.
@@ -220,6 +259,8 @@ def build(self, embedding_files, output_graph_path):
220259 start_time = time .time ()
221260 logging .info ('Building graph and writing edges to TSV file: %s' ,
222261 output_graph_path )
262+ np .random .seed (self .config .random_seed )
263+ logging .info ('Using random seed value: %s' , self .config .random_seed )
223264 edge_cnt = 0
224265 with open (output_graph_path , 'w' ) as f :
225266 for (src , tgt , wt ) in self ._generate_edges (embeddings ):
@@ -267,14 +308,60 @@ def build_graph_from_config(embedding_files, output_graph_path,
267308 The `lsh_bits` configuration attribute is used to control the maximum number
268309 of LSH buckets. In particular, if `lsh_bits` has the value `n`, then there
269310 can be at most `2^n` LSH buckets. Using a larger value for `lsh_bits` will
270- (generally) result in a larger number of buckets, and therefore, faster
271- running times. The disadvantage to using too many LSH buckets, however, is
272- that we may not create a graph edge between two instances that are otherwise
273- highly similar because they happened to be randomly hashed into two different
274- LSH buckets. A good rule of thumb is to set
275- `lsh_bits = ceiling(log_2(num_instances / 1000))`.
276-
277- The resulting graph edges are written to the TSV file named by
311+ (generally) result in a larger number of buckets, and therefore, a smaller
312+ number of instances in each bucket that need to be compared to each other.
313+ As a result, increasing `lsh_bits` can lead to dramatically faster running
314+ times.
315+
316+ The disadvantage to using too many LSH buckets, however, is that we won't
317+ create a graph edge between two instances that are highly similar if they
318+ happen to be randomly hashed into two different LSH buckets. To address
319+ that problem, the `lsh_rounds` parameter can be used to perform multiple
320+ rounds of the LSH bucketing process. Even if two similar instances may get
321+ hashed to different LSH buckets during the first round, they may get hashed
322+ into the same LSH bucket on a subsequent round. An edge is created in the
323+ output graph if two instances are hashed into the same bucket and deemed to
324+ be similar enough on *any* of the LSH rounds (i.e., the resulting graph is the
325+ *union* of the graph edges generated on each LSH round).
326+
327+ To illustrate these concepts and how various `lsh_bits` and `lsh_rounds`
328+ values correlate with graph building running times, we performed multiple runs
329+ of the graph builder on a dataset containing 50,000 instances, each with a
330+ 100-dimensional embedding. When `lsh_bits = 0`, the program has to compare
331+ each instance against every other instance, for a total of roughly 2.5B
332+ comparisons, which takes nearly half an hour to complete and generates a total
333+ of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_bits` is
334+ increased, we lose recall (i.e., fewer than 35,313 edges are generated), but
335+ the recall can then be improved by increasing `lsh_rounds`. This table shows
336+ the minimum `lsh_rounds` value required to achieve a recall of >= 99.7%
337+ (except for the `lsh_bits = 1` case), as well as the elapsed running time:
338+
339+ ```none
340+ lsh_bits lsh_rounds Recall Running time
341+ 0 N/A 100.0% 27m 46s
342+ 1 2 99.4% 24m 33s
343+ 2 3 99.8% 15m 35s
344+ 3 4 99.7% 9m 37.9s
345+ 4 6 99.9% 7m 07.5s
346+ 5 8 99.9% 4m 59.2s
347+ 6 9 99.7% 3m 01.2s
348+ 7 11 99.8% 2m 02.3s
349+ 8 13 99.8% 1m 20.8s
350+ 9 16 99.7% 58.5s
351+ 10 18 99.7% 43.6s
352+ ```
353+
354+ As the table illustrates, by increasing both `lsh_bits` and `lsh_rounds`, we
355+ can dramatically decrease the running time of the graph builder without
356+ sacrificing edge recall. We have found that a good rule of thumb is to set
357+ `lsh_bits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH bucket
358+ size will be at most 1000. However, if your instances are clustered or you
359+ want an even faster run, you may want to use a larger `lsh_bits` value. Note,
360+ however, that when the similarity threshold is lower, recall rates are reduced
361+ more quickly the larger the value of `lsh_bits` is, so be careful not to set
362+ that parameter too high for smaller `similarity_threshold` values.
363+
364+ The generated graph edges are written to the TSV file named by
278365 `output_graph_path`. Each output edge is represented by a TSV line with the
279366 following form:
280367
@@ -292,6 +379,9 @@ def build_graph_from_config(embedding_files, output_graph_path,
292379 should be written.
293380 graph_builder_config: A `nsl.configs.GraphBuilderConfig` specifying the
294381 graph building parameters.
382+
383+ Raises:
384+ ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
295385 """
296386 graph_builder = GraphBuilder (graph_builder_config )
297387 graph_builder .build (embedding_files , output_graph_path )
@@ -302,7 +392,9 @@ def build_graph(embedding_files,
302392 similarity_threshold = 0.8 ,
303393 id_feature_name = 'id' ,
304394 embedding_feature_name = 'embedding' ,
305- lsh_bits = 0 ):
395+ lsh_bits = 0 ,
396+ lsh_rounds = 2 ,
397+ random_seed = None ):
306398 """Like `nsl.tools.build_graph_from_config`, but with individual parameters.
307399
308400 This API exists to maintain backward compatibility, but is deprecated in favor
@@ -322,15 +414,26 @@ def build_graph(embedding_files,
322414 lsh_bits: Determines the maximum number of LSH buckets into which input data
323415 points will be bucketed by the graph builder. See the
324416 `nsl.tools.build_graph_from_config` documentation for details.
417+ lsh_rounds: The number of rounds of LSH bucketing to perform when
418+ `lsh_bits > 0`. This is also the number of LSH buckets each point will be
419+ hashed into.
420+ random_seed: Value used to seed the random number generator used to perform
421+ randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
422+ the generator will be initialized randomly, but setting this to any
423+ integer will initialize it deterministically.
424+
425+ Raises:
426+ ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
325427 """
326428 build_graph_from_config (
327- embedding_files ,
328- output_graph_path ,
429+ embedding_files , output_graph_path ,
329430 nsl_configs .GraphBuilderConfig (
330431 id_feature_name = id_feature_name ,
331432 embedding_feature_name = embedding_feature_name ,
332433 similarity_threshold = similarity_threshold ,
333- lsh_bits = lsh_bits ))
434+ lsh_bits = lsh_bits ,
435+ lsh_rounds = lsh_rounds ,
436+ random_seed = random_seed ))
334437
335438
336439def _main (argv ):
@@ -349,6 +452,7 @@ def _main(argv):
349452 embedding_feature_name = flag .embedding_feature_name ,
350453 similarity_threshold = flag .similarity_threshold ,
351454 lsh_bits = flag .lsh_bits ,
455+ lsh_rounds = flag .lsh_rounds ,
352456 random_seed = flag .random_seed ))
353457
354458
@@ -371,6 +475,10 @@ def _main(argv):
371475 potential buckets for better performance. The larger your number of
372476 input instances, the larger this value should be. A good rule of thumb is
373477 to set `lsh_bits = ceiling(log_2(num_instances / 1000))`.""" )
478+ flags .DEFINE_integer (
479+ 'lsh_rounds' , 2 ,
480+ """The number of rounds of LSH bucketing to perform when `lsh_bits > 0`.
481+ This is also the number of LSH buckets each point will be hashed into.""" )
374482 flags .DEFINE_integer (
375483 'random_seed' , None ,
376484 """Value used to seed the random number generator used to perform
0 commit comments