Skip to content

Commit 0d50227

Browse files
Neural-Link Team authored and tensorflow-copybara committed
Switch the graph builder flag name from --lsh_bits to --lsh_splits.
Also updated API documentation to better describe the name of the flag. PiperOrigin-RevId: 320296734
1 parent ad4c2c7 commit 0d50227

File tree

3 files changed

+72
-71
lines changed

3 files changed

+72
-71
lines changed

neural_structured_learning/configs/configs.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -306,24 +306,24 @@ class GraphBuilderConfig(object):
306306
Defaults to `'embedding'`.
307307
similarity_threshold: Threshold used to determine which edges to retain in
308308
the resulting graph. Defaults to 0.8.
309-
lsh_bits: Determines the maximum number of LSH buckets into which input data
310-
points will be bucketed by the graph builder. See the
311-
`nsl.tools.build_graph_from_config` documentation for details. This
312-
defaults to 0, in which case all pairs of inputs will be compared,
313-
probably resulting in slow running times on larger input sets. Defaults
314-
to 0.
309+
lsh_splits: On each LSH bucketing round, the space containing the input
310+
instances will be randomly split/partitioned this many times for better
311+
graph builder performance. See the `nsl.tools.build_graph_from_config`
312+
documentation for details. Defaults to 0, in which case all pairs of
313+
inputs will be compared, probably resulting in slow running times on
314+
larger input sets.
315315
lsh_rounds: The number of rounds of LSH bucketing to perform when
316-
`lsh_bits > 0`. This is also the number of LSH buckets each point will be
317-
hashed into. Defaults to 2.
316+
`lsh_splits > 0`. This is also the number of LSH buckets each point will
317+
be hashed into. Defaults to 2.
318318
random_seed: Value used to seed the random number generator used to perform
319-
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
319+
randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
320320
the generator will be initialized randomly, but setting this to any
321321
integer will initialize it deterministically. Defaults to `None`.
322322
"""
323323
id_feature_name = attr.ib(default='id')
324324
embedding_feature_name = attr.ib(default='embedding')
325325
similarity_threshold = attr.ib(default=0.8)
326-
lsh_bits = attr.ib(default=0)
326+
lsh_splits = attr.ib(default=0)
327327
lsh_rounds = attr.ib(default=2)
328328
random_seed = attr.ib(default=None)
329329

neural_structured_learning/tools/build_graph.py

Lines changed: 58 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -110,23 +110,23 @@ def __init__(self, graph_builder_config):
110110
graph_builder_config: A `nsl.configs.GraphBuilderConfig` instance.
111111
112112
Raises:
113-
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
113+
ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
114114
"""
115115
self.config = graph_builder_config
116-
if self.config.lsh_bits < 0:
117-
raise ValueError('lsh_bits < 0')
118-
if self.config.lsh_bits > 0 and self.config.lsh_rounds < 1:
119-
raise ValueError('lsh_bits > 0 but lsh_rounds < 1')
116+
if self.config.lsh_splits < 0:
117+
raise ValueError('lsh_splits < 0')
118+
if self.config.lsh_splits > 0 and self.config.lsh_rounds < 1:
119+
raise ValueError('lsh_splits > 0 but lsh_rounds < 1')
120120

121121
# Keep a set of previously written edges if it's possible we might
122122
# generate the same edge multiple times. This can happen only if both
123-
# 'lsh_bits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
123+
# 'lsh_splits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
124124
# ordering (source < target) for each bi-directional edge. Note that we
125125
# do not need to store the edge weight as well because for any
126126
# (source, target) pair, the cosine similarity between them will never
127127
# change.
128128
self.edge_set = None
129-
if self.config.lsh_bits > 0 and self.config.lsh_rounds > 1:
129+
if self.config.lsh_splits > 0 and self.config.lsh_rounds > 1:
130130
self.edge_set = set()
131131

132132
def _is_new_edge(self, src, tgt):
@@ -146,7 +146,7 @@ def _bucket(self, lsh_matrix, embedding):
146146
embedding: A 1-D vector representing the dense embedding for a point.
147147
148148
Returns:
149-
The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_bits`.
149+
The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_splits`.
150150
Bit `i` of the result (where bit 0 corresponds to the least significant
151151
bit) is 1 if and only if the dot product of row `i` of `lsh_matrix` and
152152
`embedding` is positive.
@@ -159,10 +159,10 @@ def _bucket(self, lsh_matrix, embedding):
159159
return bucket
160160

161161
def _generate_lsh_buckets(self, embeddings):
162-
"""Buckets the given `embeddings` according to `config.lsh_bits`.
162+
"""Buckets the given `embeddings` according to `config.lsh_splits`.
163163
164164
The embeddings can be bucketed into a total of at most `2^n` different
165-
buckets, where `n` is given by the value of `config.lsh_bits`. If `n` is
165+
buckets, where `n` is given by the value of `config.lsh_splits`. If `n` is
166166
not positive, then all of the given `embeddings` keys will be bucketed into
167167
bucket 0.
168168
@@ -172,14 +172,14 @@ def _generate_lsh_buckets(self, embeddings):
172172
Returns:
173173
A dictionary mapping bucket IDs to sets of embedding IDs in each bucket.
174174
The bucket IDs are integers in the half-open interval `[0, 2^n)`, where
175-
`n = config.lsh_bits`.
175+
`n = config.lsh_splits`.
176176
"""
177-
if self.config.lsh_bits <= 0: return {0: set(embeddings.keys())}
177+
if self.config.lsh_splits <= 0: return {0: set(embeddings.keys())}
178178

179179
# Generate a random matrix of values in the range [-1.0, 1.0) to use
180180
# to create the LSH buckets.
181181
num_dims = next(iter(embeddings.values())).size
182-
lsh_matrix = np.random.rand(self.config.lsh_bits, num_dims) * 2 - 1
182+
lsh_matrix = np.random.rand(self.config.lsh_splits, num_dims) * 2 - 1
183183

184184
# Add each embedding to its appropriate bucket
185185
bucket_map = {}
@@ -305,12 +305,12 @@ def build_graph_from_config(embedding_files, output_graph_path,
305305
compare just the pairs of points within each bucket for similarity, which can
306306
lead to dramatically faster running times.
307307
308-
The `lsh_bits` configuration attribute is used to control the maximum number
309-
of LSH buckets. In particular, if `lsh_bits` has the value `n`, then there
310-
can be at most `2^n` LSH buckets. Using a larger value for `lsh_bits` will
308+
The `lsh_splits` configuration attribute is used to control the maximum number
309+
of LSH buckets. In particular, if `lsh_splits` has the value `n`, then there
310+
can be at most `2^n` LSH buckets. Using a larger value for `lsh_splits` will
311311
(generally) result in a larger number of buckets, and therefore, smaller
312312
number of instances in each bucket that need to be compared to each other.
313-
As a result, increasing `lsh_bits` can lead to dramatically faster running
313+
As a result, increasing `lsh_splits` can lead to dramatically faster running
314314
times.
315315
316316
The disadvantage to using too many LSH buckets, however, is that we won't
@@ -324,42 +324,42 @@ def build_graph_from_config(embedding_files, output_graph_path,
324324
be similar enough on *any* of the LSH rounds (i.e., the resulting graph is the
325325
*union* of the graph edges generated on each LSH round).
326326
327-
To illustrate these concepts and how various `lsh_bits` and `lsh_rounds`
327+
To illustrate these concepts and how various `lsh_splits` and `lsh_rounds`
328328
values correlate with graph building running times, we performed multiple runs
329329
of the graph builder on a dataset containing 50,000 instances, each with a
330-
100-dimensional embedding. When `lsh_bits = 0`, the program has to compare
330+
100-dimensional embedding. When `lsh_splits = 0`, the program has to compare
331331
each instance against every other instance, for a total of roughly 2.5B
332332
comparisons, which takes nearly half an hour to complete and generates a total
333-
of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_bits` is
333+
of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_splits` is
334334
increased, we lose recall (i.e., fewer than 35,313 edges are generated), but
335335
the recall can then be improved by increasing `lsh_rounds`. This table shows
336336
the minimum `lsh_rounds` value required to achieve a recall of >= 99.7%
337-
(except for the `lsh_bits = 1` case), as well as the elapsed running time:
337+
(except for the `lsh_splits = 1` case), as well as the elapsed running time:
338338
339339
```none
340-
lsh_bits lsh_rounds Recall Running time
341-
0 N/A 100.0% 27m 46s
342-
1 2 99.4% 24m 33s
343-
2 3 99.8% 15m 35s
344-
3 4 99.7% 9m 37.9s
345-
4 6 99.9% 7m 07.5s
346-
5 8 99.9% 4m 59.2s
347-
6 9 99.7% 3m 01.2s
348-
7 11 99.8% 2m 02.3s
349-
8 13 99.8% 1m 20.8s
350-
9 16 99.7% 58.5s
351-
10 18 99.7% 43.6s
340+
lsh_splits lsh_rounds Recall Running time
341+
0 N/A 100.0% 27m 46s
342+
1 2 99.4% 24m 33s
343+
2 3 99.8% 15m 35s
344+
3 4 99.7% 9m 37.9s
345+
4 6 99.9% 7m 07.5s
346+
5 8 99.9% 4m 59.2s
347+
6 9 99.7% 3m 01.2s
348+
7 11 99.8% 2m 02.3s
349+
8 13 99.8% 1m 20.8s
350+
9 16 99.7% 58.5s
351+
10 18 99.7% 43.6s
352352
```
353353
354-
As the table illustrates, by increasing both `lsh_bits` and `lsh_rounds`, we
355-
can dramatically decrease the running time of the graph builder without
354+
As the table illustrates, by increasing both `lsh_splits` and `lsh_rounds`,
355+
we can dramatically decrease the running time of the graph builder without
356356
sacrificing edge recall. We have found that a good rule of thumb is to set
357-
`lsh_bits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH bucket
358-
size will be at most 1000. However, if your instances are clustered or you
359-
want an even faster run, you may want to use a larger `lsh_bits` value. Note,
360-
however, that when the similarity threshold is lower, recall rates are reduced
361-
more quickly the larger the value of `lsh_bits` is, so be careful not to set
362-
that parameter too high for smaller `similarity_threshold` values.
357+
`lsh_splits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH
358+
bucket size will be at most 1000. However, if your instances are clustered or
359+
you want an even faster run, you may want to use a larger `lsh_splits` value.
360+
Note, however, that when the similarity threshold is lower, recall rates are
361+
reduced more quickly the larger the value of `lsh_splits` is, so be careful
362+
not to set that parameter too high for smaller `similarity_threshold` values.
363363
364364
The generated graph edges are written to the TSV file named by
365365
`output_graph_path`. Each output edge is represented by a TSV line with the
@@ -381,7 +381,7 @@ def build_graph_from_config(embedding_files, output_graph_path,
381381
graph building parameters.
382382
383383
Raises:
384-
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
384+
ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
385385
"""
386386
graph_builder = GraphBuilder(graph_builder_config)
387387
graph_builder.build(embedding_files, output_graph_path)
@@ -392,7 +392,7 @@ def build_graph(embedding_files,
392392
similarity_threshold=0.8,
393393
id_feature_name='id',
394394
embedding_feature_name='embedding',
395-
lsh_bits=0,
395+
lsh_splits=0,
396396
lsh_rounds=2,
397397
random_seed=None):
398398
"""Like `nsl.tools.build_graph_from_config`, but with individual parameters.
@@ -411,27 +411,27 @@ def build_graph(embedding_files,
411411
objects representing the ID of examples.
412412
embedding_feature_name: The name of the feature in the input
413413
`tf.train.Example` objects representing the embedding of examples.
414-
lsh_bits: Determines the maximum number of LSH buckets into which input data
415-
points will be bucketed by the graph builder. See the
414+
lsh_splits: Determines the maximum number of LSH buckets into which input
415+
data points will be bucketed by the graph builder. See the
416416
`nsl.tools.build_graph_from_config` documentation for details.
417417
lsh_rounds: The number of rounds of LSH bucketing to perform when
418-
`lsh_bits > 0`. This is also the number of LSH buckets each point will be
419-
hashed into.
418+
`lsh_splits > 0`. This is also the number of LSH buckets each point will
419+
be hashed into.
420420
random_seed: Value used to seed the random number generator used to perform
421-
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
421+
randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
422422
the generator will be initialized randomly, but setting this to any
423423
integer will initialize it deterministically.
424424
425425
Raises:
426-
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
426+
ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
427427
"""
428428
build_graph_from_config(
429429
embedding_files, output_graph_path,
430430
nsl_configs.GraphBuilderConfig(
431431
id_feature_name=id_feature_name,
432432
embedding_feature_name=embedding_feature_name,
433433
similarity_threshold=similarity_threshold,
434-
lsh_bits=lsh_bits,
434+
lsh_splits=lsh_splits,
435435
lsh_rounds=lsh_rounds,
436436
random_seed=random_seed))
437437

@@ -451,7 +451,7 @@ def _main(argv):
451451
id_feature_name=flag.id_feature_name,
452452
embedding_feature_name=flag.embedding_feature_name,
453453
similarity_threshold=flag.similarity_threshold,
454-
lsh_bits=flag.lsh_bits,
454+
lsh_splits=flag.lsh_splits,
455455
lsh_rounds=flag.lsh_rounds,
456456
random_seed=flag.random_seed))
457457

@@ -470,19 +470,20 @@ def _main(argv):
470470
"""Lower bound on the cosine similarity required for an edge
471471
to be created between two nodes.""")
472472
flags.DEFINE_integer(
473-
'lsh_bits', 0,
474-
"""The input instances will be randomly bucketed into 2^(lsh_bits)
475-
potential buckets for better performance. The larger your number of
473+
'lsh_splits', 0,
474+
"""On each LSH bucketing round, the space containing the input instances
475+
will be randomly split/partitioned this many times for better performance,
476+
resulting in up to 2^(lsh_splits) LSH buckets. The larger your number of
476477
input instances, the larger this value should be. A good rule of thumb is
477-
to set `lsh_bits = ceiling(log_2(num_instances / 1000))`.""")
478+
to set `lsh_splits = ceiling(log_2(num_instances / 1000))`.""")
478479
flags.DEFINE_integer(
479480
'lsh_rounds', 2,
480-
"""The number of rounds of LSH bucketing to perform when `lsh_bits > 0`.
481+
"""The number of rounds of LSH bucketing to perform when `lsh_splits > 0`.
481482
This is also the number of LSH buckets each point will be hashed into.""")
482483
flags.DEFINE_integer(
483484
'random_seed', None,
484485
"""Value used to seed the random number generator used to perform
485-
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
486+
randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
486487
the generator will be initialized randomly, but setting this to any
487488
integer will initialize it deterministically.""")
488489

neural_structured_learning/tools/build_graph_test.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,11 @@ def _create_graph_file(self):
8888

8989
def testBuildGraphInvalidLshBitsValue(self):
9090
with self.assertRaises(ValueError):
91-
build_graph_lib.build_graph([], None, lsh_bits=-1)
91+
build_graph_lib.build_graph([], None, lsh_splits=-1)
9292

9393
def testBuildGraphInvalidLshRoundsValue(self):
9494
with self.assertRaises(ValueError):
95-
build_graph_lib.build_graph([], None, lsh_bits=1, lsh_rounds=0)
95+
build_graph_lib.build_graph([], None, lsh_splits=1, lsh_rounds=0)
9696

9797
def testBuildGraphNoThresholdingNoLSH(self):
9898
"""All edges whose weight is greater than 0 are retained."""
@@ -175,7 +175,7 @@ def testBuildGraphWithThresholdWithLSHInsufficientLSHRounds(self):
175175
build_graph_lib.build_graph([embedding_path],
176176
graph_path,
177177
similarity_threshold=0.9,
178-
lsh_bits=2,
178+
lsh_splits=2,
179179
lsh_rounds=1,
180180
random_seed=12345)
181181
g_actual = graph_utils.read_tsv_graph(graph_path)
@@ -200,7 +200,7 @@ def testBuildGraphWithThresholdWithLSHSufficientLSHRounds(self):
200200
build_graph_lib.build_graph([embedding_path],
201201
graph_path,
202202
similarity_threshold=0.9,
203-
lsh_bits=2,
203+
lsh_splits=2,
204204
lsh_rounds=4,
205205
random_seed=12345)
206206
g_actual = graph_utils.read_tsv_graph(graph_path)

0 commit comments

Comments
 (0)