@@ -110,23 +110,23 @@ def __init__(self, graph_builder_config):
110110 graph_builder_config: A `nsl.configs.GraphBuilderConfig` instance.
111111
112112 Raises:
113- ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
113+ ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
114114 """
115115 self .config = graph_builder_config
116- if self .config .lsh_bits < 0 :
117- raise ValueError ('lsh_bits < 0' )
118- if self .config .lsh_bits > 0 and self .config .lsh_rounds < 1 :
119- raise ValueError ('lsh_bits > 0 but lsh_rounds < 1' )
116+ if self .config .lsh_splits < 0 :
117+ raise ValueError ('lsh_splits < 0' )
118+ if self .config .lsh_splits > 0 and self .config .lsh_rounds < 1 :
119+ raise ValueError ('lsh_splits > 0 but lsh_rounds < 1' )
120120
121121 # Keep a set of previously written edges if it's possible we might
122122 # generate the same edge multiple times. This can happen only if both
123- # 'lsh_bits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
123+ # 'lsh_splits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
124124 # ordering (source < target) for each bi-directional edge. Note that we
125125 # do not need to store the edge weight as well because for any
126126 # (source, target) pair, the cosine similarity between them will never
127127 # change.
128128 self .edge_set = None
129- if self .config .lsh_bits > 0 and self .config .lsh_rounds > 1 :
129+ if self .config .lsh_splits > 0 and self .config .lsh_rounds > 1 :
130130 self .edge_set = set ()
131131
132132 def _is_new_edge (self , src , tgt ):
@@ -146,7 +146,7 @@ def _bucket(self, lsh_matrix, embedding):
146146 embedding: A 1-D vector representing the dense embedding for a point.
147147
148148 Returns:
149- The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_bits `.
149+ The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_splits `.
150150 Bit `i` of the result (where bit 0 corresponds to the least significant
151151 bit) is 1 if and only if the dot product of row `i` of `lsh_matrix` and
152152 `embedding` is positive.
@@ -159,10 +159,10 @@ def _bucket(self, lsh_matrix, embedding):
159159 return bucket
160160
161161 def _generate_lsh_buckets (self , embeddings ):
162- """Buckets the given `embeddings` according to `config.lsh_bits `.
162+ """Buckets the given `embeddings` according to `config.lsh_splits `.
163163
164164 The embeddings can be bucketed into a total of at most `2^n` different
165- buckets, where `n` is given by the value of `config.lsh_bits `. If `n` is
165+ buckets, where `n` is given by the value of `config.lsh_splits `. If `n` is
166166 not positive, then all of the given `embeddings` keys will be bucketed into
167167 bucket 0.
168168
@@ -172,14 +172,14 @@ def _generate_lsh_buckets(self, embeddings):
172172 Returns:
173173 A dictionary mapping bucket IDs to sets of embedding IDs in each bucket.
174174 The bucket IDs are integers in the half-open interval `[0, 2^n)`, where
175- `n = config.lsh_bits `.
175+ `n = config.lsh_splits `.
176176 """
177- if self .config .lsh_bits <= 0 : return {0 : set (embeddings .keys ())}
177+ if self .config .lsh_splits <= 0 : return {0 : set (embeddings .keys ())}
178178
179179 # Generate a random matrix of values in the range [-1.0, 1.0] to use
180180 # to create the LSH buckets.
181181 num_dims = next (iter (embeddings .values ())).size
182- lsh_matrix = np .random .rand (self .config .lsh_bits , num_dims ) * 2 - 1
182+ lsh_matrix = np .random .rand (self .config .lsh_splits , num_dims ) * 2 - 1
183183
184184 # Add each embedding to its appropriate bucket
185185 bucket_map = {}
@@ -305,12 +305,12 @@ def build_graph_from_config(embedding_files, output_graph_path,
305305 compare just the pairs of points within each bucket for similarity, which can
306306 lead to dramatically faster running times.
307307
308- The `lsh_bits ` configuration attribute is used to control the maximum number
309- of LSH buckets. In particular, if `lsh_bits ` has the value `n`, then there
310- can be at most `2^n` LSH buckets. Using a larger value for `lsh_bits ` will
308+ The `lsh_splits ` configuration attribute is used to control the maximum number
309+ of LSH buckets. In particular, if `lsh_splits ` has the value `n`, then there
310+ can be at most `2^n` LSH buckets. Using a larger value for `lsh_splits ` will
311311 (generally) result in a larger number of buckets, and therefore a smaller
312312 number of instances in each bucket that need to be compared to each other.
313- As a result, increasing `lsh_bits ` can lead to dramatically faster running
313+ As a result, increasing `lsh_splits ` can lead to dramatically faster running
314314 times.
315315
316316 The disadvantage to using too many LSH buckets, however, is that we won't
@@ -324,42 +324,42 @@ def build_graph_from_config(embedding_files, output_graph_path,
324324 be similar enough on *any* of the LSH rounds (i.e., the resulting graph is the
325325 *union* of the graph edges generated on each LSH round).
326326
327- To illustrate these concepts and how various `lsh_bits ` and `lsh_rounds`
327+ To illustrate these concepts and how various `lsh_splits ` and `lsh_rounds`
328328 values correlate with graph building running times, we performed multiple runs
329329 of the graph builder on a dataset containing 50,000 instances, each with a
330- 100-dimensional embedding. When `lsh_bits = 0`, the program has to compare
330+ 100-dimensional embedding. When `lsh_splits = 0`, the program has to compare
331331 each instance against every other instance, for a total of roughly 2.5B
332332 comparisons, which takes nearly half an hour to complete and generates a total
333- of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_bits ` is
333+ of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_splits ` is
334334 increased, we lose recall (i.e., fewer than 35,313 edges are generated), but
335335 the recall can then be improved by increasing `lsh_rounds`. This table shows
336336 the minimum `lsh_rounds` value required to achieve a recall of >= 99.7%
337- (except for the `lsh_bits = 1` case), as well as the elapsed running time:
337+ (except for the `lsh_splits = 1` case), as well as the elapsed running time:
338338
339339 ```none
340- lsh_bits lsh_rounds Recall Running time
341- 0 N/A 100.0% 27m 46s
342- 1 2 99.4% 24m 33s
343- 2 3 99.8% 15m 35s
344- 3 4 99.7% 9m 37.9s
345- 4 6 99.9% 7m 07.5s
346- 5 8 99.9% 4m 59.2s
347- 6 9 99.7% 3m 01.2s
348- 7 11 99.8% 2m 02.3s
349- 8 13 99.8% 1m 20.8s
350- 9 16 99.7% 58.5s
351- 10 18 99.7% 43.6s
340+ lsh_splits lsh_rounds Recall Running time
341+ 0 N/A 100.0% 27m 46s
342+ 1 2 99.4% 24m 33s
343+ 2 3 99.8% 15m 35s
344+ 3 4 99.7% 9m 37.9s
345+ 4 6 99.9% 7m 07.5s
346+ 5 8 99.9% 4m 59.2s
347+ 6 9 99.7% 3m 01.2s
348+ 7 11 99.8% 2m 02.3s
349+ 8 13 99.8% 1m 20.8s
350+ 9 16 99.7% 58.5s
351+ 10 18 99.7% 43.6s
352352 ```
353353
354- As the table illustrates, by increasing both `lsh_bits ` and `lsh_rounds`, we
355- can dramatically decrease the running time of the graph builder without
354+ As the table illustrates, by increasing both `lsh_splits ` and `lsh_rounds`,
355+ we can dramatically decrease the running time of the graph builder without
356356 sacrificing edge recall. We have found that a good rule of thumb is to set
357- `lsh_bits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH bucket
358- size will be at most 1000. However, if your instances are clustered or you
359- want an even faster run, you may want to use a larger `lsh_bits ` value. Note,
360- however, that when the similarity threshold is lower, recall rates are reduced
361- more quickly the larger the value of `lsh_bits ` is, so be careful not to set
362- that parameter too high for smaller `similarity_threshold` values.
357+ `lsh_splits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH
358+ bucket size will be at most 1000. However, if your instances are clustered or
359+ you want an even faster run, you may want to use a larger `lsh_splits ` value.
360+ Note, however, that when the similarity threshold is lower, recall rates are
361+ reduced more quickly the larger the value of `lsh_splits ` is, so be careful
362+ not to set that parameter too high for smaller `similarity_threshold` values.
363363
364364 The generated graph edges are written to the TSV file named by
365365 `output_graph_path`. Each output edge is represented by a TSV line with the
@@ -381,7 +381,7 @@ def build_graph_from_config(embedding_files, output_graph_path,
381381 graph building parameters.
382382
383383 Raises:
384- ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
384+ ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
385385 """
386386 graph_builder = GraphBuilder (graph_builder_config )
387387 graph_builder .build (embedding_files , output_graph_path )
@@ -392,7 +392,7 @@ def build_graph(embedding_files,
392392 similarity_threshold = 0.8 ,
393393 id_feature_name = 'id' ,
394394 embedding_feature_name = 'embedding' ,
395- lsh_bits = 0 ,
395+ lsh_splits = 0 ,
396396 lsh_rounds = 2 ,
397397 random_seed = None ):
398398 """Like `nsl.tools.build_graph_from_config`, but with individual parameters.
@@ -411,27 +411,27 @@ def build_graph(embedding_files,
411411 objects representing the ID of examples.
412412 embedding_feature_name: The name of the feature in the input
413413 `tf.train.Example` objects representing the embedding of examples.
414- lsh_bits : Determines the maximum number of LSH buckets into which input data
415- points will be bucketed by the graph builder. See the
414+ lsh_splits : Determines the maximum number of LSH buckets into which input
415+ data points will be bucketed by the graph builder. See the
416416 `nsl.tools.build_graph_from_config` documentation for details.
417417 lsh_rounds: The number of rounds of LSH bucketing to perform when
418- `lsh_bits > 0`. This is also the number of LSH buckets each point will be
419- hashed into.
418+ `lsh_splits > 0`. This is also the number of LSH buckets each point will
419+ be hashed into.
420420 random_seed: Value used to seed the random number generator used to perform
421- randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
421+ randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
422422 the generator will be initialized randomly, but setting this to any
423423 integer will initialize it deterministically.
424424
425425 Raises:
426- ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
426+ ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
427427 """
428428 build_graph_from_config (
429429 embedding_files , output_graph_path ,
430430 nsl_configs .GraphBuilderConfig (
431431 id_feature_name = id_feature_name ,
432432 embedding_feature_name = embedding_feature_name ,
433433 similarity_threshold = similarity_threshold ,
434- lsh_bits = lsh_bits ,
434+ lsh_splits = lsh_splits ,
435435 lsh_rounds = lsh_rounds ,
436436 random_seed = random_seed ))
437437
@@ -451,7 +451,7 @@ def _main(argv):
451451 id_feature_name = flag .id_feature_name ,
452452 embedding_feature_name = flag .embedding_feature_name ,
453453 similarity_threshold = flag .similarity_threshold ,
454- lsh_bits = flag .lsh_bits ,
454+ lsh_splits = flag .lsh_splits ,
455455 lsh_rounds = flag .lsh_rounds ,
456456 random_seed = flag .random_seed ))
457457
@@ -470,19 +470,20 @@ def _main(argv):
470470 """Lower bound on the cosine similarity required for an edge
471471 to be created between two nodes.""" )
472472 flags .DEFINE_integer (
473- 'lsh_bits' , 0 ,
474- """The input instances will be randomly bucketed into 2^(lsh_bits)
475- potential buckets for better performance. The larger your number of
473+ 'lsh_splits' , 0 ,
474+ """On each LSH bucketing round, the space containing the input instances
475+ will be randomly split/partitioned this many times for better performance,
476+ resulting in up to 2^(lsh_splits) LSH buckets. The larger your number of
476477 input instances, the larger this value should be. A good rule of thumb is
477- to set `lsh_bits = ceiling(log_2(num_instances / 1000))`.""" )
478+ to set `lsh_splits = ceiling(log_2(num_instances / 1000))`.""" )
478479 flags .DEFINE_integer (
479480 'lsh_rounds' , 2 ,
480- """The number of rounds of LSH bucketing to perform when `lsh_bits > 0`.
481+ """The number of rounds of LSH bucketing to perform when `lsh_splits > 0`.
481482 This is also the number of LSH buckets each point will be hashed into.""" )
482483 flags .DEFINE_integer (
483484 'random_seed' , None ,
484485 """Value used to seed the random number generator used to perform
485- randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
486+ randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
486487 the generator will be initialized randomly, but setting this to any
487488 integer will initialize it deterministically.""" )
488489
0 commit comments