Skip to content

Commit a31a5ac

Browse files
Neural-Link Team authored and tensorflow-copybara committed
Adds a new --lsh_rounds flag to the NSL graph builder.
Why? As --lsh_bits is increased, the graph builder loses edge recall. Recall can be restored by performing multiple rounds of the LSH bucketing process. See the comment of the `nsl.tools.build_graph_from_config` function for details. PiperOrigin-RevId: 319900351
1 parent 7058e04 commit a31a5ac

File tree

4 files changed

+302
-83
lines changed

4 files changed

+302
-83
lines changed

neural_structured_learning/configs/configs.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -300,25 +300,31 @@ class GraphBuilderConfig(object):
300300
301301
Attributes:
302302
id_feature_name: The name of the feature in the input `tf.train.Example`
303-
objects representing the ID of examples.
303+
objects representing the ID of examples. Defaults to `'id'`.
304304
embedding_feature_name: The name of the feature in the input
305305
`tf.train.Example` objects representing the embedding of examples.
306+
Defaults to `'embedding'`.
306307
similarity_threshold: Threshold used to determine which edges to retain in
307-
the resulting graph.
308+
the resulting graph. Defaults to 0.8.
308309
lsh_bits: Determines the maximum number of LSH buckets into which input data
309310
points will be bucketed by the graph builder. See the
310311
`nsl.tools.build_graph_from_config` documentation for details. This
311312
defaults to 0, in which case all pairs of inputs will be compared,
312-
probably resulting in slow running times on larger input sets.
313+
probably resulting in slow running times on larger input sets. Defaults
314+
to 0.
315+
lsh_rounds: The number of rounds of LSH bucketing to perform when
316+
`lsh_bits > 0`. This is also the number of LSH buckets each point will be
317+
hashed into. Defaults to 2.
313318
random_seed: Value used to seed the random number generator used to perform
314319
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
315320
the generator will be initialized randomly, but setting this to any
316-
integer will initialize it deterministically.
321+
integer will initialize it deterministically. Defaults to `None`.
317322
"""
318323
id_feature_name = attr.ib(default='id')
319324
embedding_feature_name = attr.ib(default='embedding')
320325
similarity_threshold = attr.ib(default=0.8)
321326
lsh_bits = attr.ib(default=0)
327+
lsh_rounds = attr.ib(default=2)
322328
random_seed = attr.ib(default=None)
323329

324330

neural_structured_learning/tools/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ py_test(
7979
":graph_utils",
8080
# package protobuf,
8181
# package absl/testing:absltest
82+
# package six
8283
# package tensorflow
8384
],
8485
)

neural_structured_learning/tools/build_graph.py

Lines changed: 139 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,39 @@ class GraphBuilder(object):
104104
"""Computes the similarity graph from a set of (dense) embeddings."""
105105

106106
def __init__(self, graph_builder_config):
107+
"""Initializes this GraphBuilder from the given configuration instance.
108+
109+
Args:
110+
graph_builder_config: A `nsl.configs.GraphBuilderConfig` instance.
111+
112+
Raises:
113+
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
114+
"""
107115
self.config = graph_builder_config
116+
if self.config.lsh_bits < 0:
117+
raise ValueError('lsh_bits < 0')
118+
if self.config.lsh_bits > 0 and self.config.lsh_rounds < 1:
119+
raise ValueError('lsh_bits > 0 but lsh_rounds < 1')
120+
121+
# Keep a set of previously written edges if it's possible we might
122+
# generate the same edge multiple times. This can happen only if both
123+
# 'lsh_bits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
124+
# ordering (source < target) for each bi-directional edge. Note that we
125+
# do not need to store the edge weight as well because for any
126+
# (source, target) pair, the cosine similarity between them will never
127+
# change.
128+
self.edge_set = None
129+
if self.config.lsh_bits > 0 and self.config.lsh_rounds > 1:
130+
self.edge_set = set()
131+
132+
def _is_new_edge(self, src, tgt):
133+
"""Returns `True` iff the edge `src` to `tgt` has not been generated yet."""
134+
canonical_edge = (src, tgt) if src < tgt else (tgt, src)
135+
# Remember set size before calling add() because add() returns None.
136+
# This way we don't have to hash 'canonical_edge' twice.
137+
set_size_before_add = len(self.edge_set)
138+
self.edge_set.add(canonical_edge)
139+
return len(self.edge_set) > set_size_before_add
108140

109141
def _bucket(self, lsh_matrix, embedding):
110142
"""Returns the bucket ID of the given `embedding` relative to `lsh_matrix`.
@@ -147,17 +179,13 @@ def _generate_lsh_buckets(self, embeddings):
147179
# Generate a random matrix of values in the range [-1.0, 1.0] to use
148180
# to create the LSH buckets.
149181
num_dims = next(iter(embeddings.values())).size
150-
np.random.seed(self.config.random_seed)
151182
lsh_matrix = np.random.rand(self.config.lsh_bits, num_dims) * 2 - 1
152183

153184
# Add each embedding to its appropriate bucket
154-
start_time = time.time()
155185
bucket_map = {}
156186
for key, embedding in six.iteritems(embeddings):
157187
s = bucket_map.setdefault(self._bucket(lsh_matrix, embedding), set())
158188
s.add(key)
159-
logging.info('Bucketed embeddings into %d bucket(s) in %.2f seconds.',
160-
len(bucket_map), time.time() - start_time)
161189
return bucket_map
162190

163191
def _generate_edges_for_bucket(self, bucket, embeddings):
@@ -174,15 +202,15 @@ def _generate_edges_for_bucket(self, bucket, embeddings):
174202
for src, tgt in itertools.combinations(bucket, 2):
175203
weight = np.dot(embeddings[src], embeddings[tgt])
176204
if weight >= self.config.similarity_threshold:
177-
yield (src, tgt, weight)
205+
if self.edge_set is None or self._is_new_edge(src, tgt):
206+
yield (src, tgt, weight)
178207

179208
def _generate_edges(self, embeddings):
180209
"""Generates edges among pairs of the given `embeddings`.
181210
182-
This function considers all distinct pairs of nodes in `embeddings`,
183-
computes the similarity between all such pairs (by calling the `_similarity`
184-
method), and yields any edge for which the similarity is at least
185-
`self.similarity_threshold`.
211+
This function considers related pairs of nodes in `embeddings`,
212+
computes the cosine similarity between all such pairs, and yields any edge
213+
for which the cosine similarity is at least `self.config.similarity_threshold`.
186214
187215
Args:
188216
embeddings: A `dict`: node_id -> embedding.
@@ -191,16 +219,27 @@ def _generate_edges(self, embeddings):
191219
A tuple (source, target, weight) denoting a (directed) edge from 'source'
192220
to 'target' with the given 'weight'.
193221
"""
194-
start_time = time.time()
195-
edge_cnt = 0
196-
bucket_map = self._generate_lsh_buckets(embeddings)
197-
for bucket in bucket_map.values():
198-
for edge in self._generate_edges_for_bucket(bucket, embeddings):
199-
yield edge
200-
edge_cnt += 1
201-
if (edge_cnt % 1000000) == 0:
202-
logging.info('Created %d bi-directional edges in %.2f seconds....',
203-
edge_cnt, time.time() - start_time)
222+
for lsh_round in range(max(1, self.config.lsh_rounds)):
223+
start_time = time.time()
224+
edge_cnt = 0
225+
bucket_map = self._generate_lsh_buckets(embeddings)
226+
logging_prefix = 'LSH bucketing round {}'.format(lsh_round)
227+
logging.info('%s: created %d bucket(s) in %.2f seconds.', logging_prefix,
228+
len(bucket_map),
229+
time.time() - start_time)
230+
for bucket in bucket_map.values():
231+
for edge in self._generate_edges_for_bucket(bucket, embeddings):
232+
edge_cnt += 1
233+
if (edge_cnt % 1000000) == 0:
234+
logging.info(
235+
'%s: generated %d new bi-directional edges in %.2f seconds....',
236+
logging_prefix, edge_cnt,
237+
time.time() - start_time)
238+
yield edge
239+
logging.info(
240+
'%s completed: generated %d new bi-directional edges in %.2f seconds.',
241+
logging_prefix, edge_cnt,
242+
time.time() - start_time)
204243

205244
def build(self, embedding_files, output_graph_path):
206245
"""Reads embeddings and writes the similarity graph to `output_graph_path`.
@@ -220,6 +259,8 @@ def build(self, embedding_files, output_graph_path):
220259
start_time = time.time()
221260
logging.info('Building graph and writing edges to TSV file: %s',
222261
output_graph_path)
262+
np.random.seed(self.config.random_seed)
263+
logging.info('Using random seed value: %s', self.config.random_seed)
223264
edge_cnt = 0
224265
with open(output_graph_path, 'w') as f:
225266
for (src, tgt, wt) in self._generate_edges(embeddings):
@@ -267,14 +308,60 @@ def build_graph_from_config(embedding_files, output_graph_path,
267308
The `lsh_bits` configuration attribute is used to control the maximum number
268309
of LSH buckets. In particular, if `lsh_bits` has the value `n`, then there
269310
can be at most `2^n` LSH buckets. Using a larger value for `lsh_bits` will
270-
(generally) result in a larger number of buckets, and therefore, faster
271-
running times. The disadvantage to using too many LSH buckets, however, is
272-
that we may not create a graph edge between two instances that are otherwise
273-
highly similar because they happened to be randomly hashed into two different
274-
LSH buckets. A good rule of thumb is to set
275-
`lsh_bits = ceiling(log_2(num_instances / 1000))`.
276-
277-
The resulting graph edges are written to the TSV file named by
311+
(generally) result in a larger number of buckets, and therefore, a smaller
312+
number of instances in each bucket that need to be compared to each other.
313+
As a result, increasing `lsh_bits` can lead to dramatically faster running
314+
times.
315+
316+
The disadvantage to using too many LSH buckets, however, is that we won't
317+
create a graph edge between two instances that are highly similar if they
318+
happen to be randomly hashed into two different LSH buckets. To address
319+
that problem, the `lsh_rounds` parameter can be used to perform multiple
320+
rounds of the LSH bucketing process. Even if two similar instances may get
321+
hashed to different LSH buckets during the first round, they may get hashed
322+
into the same LSH bucket on a subsequent round. An edge is created in the
323+
output graph if two instances are hashed into the same bucket and deemed to
324+
be similar enough on *any* of the LSH rounds (i.e., the resulting graph is the
325+
*union* of the graph edges generated on each LSH round).
326+
327+
To illustrate these concepts and how various `lsh_bits` and `lsh_rounds`
328+
values correlate with graph building running times, we performed multiple runs
329+
of the graph builder on a dataset containing 50,000 instances, each with a
330+
100-dimensional embedding. When `lsh_bits = 0`, the program has to compare
331+
each instance against every other instance, for a total of roughly 2.5B
332+
comparisons, which takes nearly half an hour to complete and generates a total
333+
of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_bits` is
334+
increased, we lose recall (i.e., fewer than 35,313 edges are generated), but
335+
the recall can then be improved by increasing `lsh_rounds`. This table shows
336+
the minimum `lsh_rounds` value required to achieve a recall of >= 99.7%
337+
(except for the `lsh_bits = 1` case), as well as the elapsed running time:
338+
339+
```none
340+
lsh_bits lsh_rounds Recall Running time
341+
0 N/A 100.0% 27m 46s
342+
1 2 99.4% 24m 33s
343+
2 3 99.8% 15m 35s
344+
3 4 99.7% 9m 37.9s
345+
4 6 99.9% 7m 07.5s
346+
5 8 99.9% 4m 59.2s
347+
6 9 99.7% 3m 01.2s
348+
7 11 99.8% 2m 02.3s
349+
8 13 99.8% 1m 20.8s
350+
9 16 99.7% 58.5s
351+
10 18 99.7% 43.6s
352+
```
353+
354+
As the table illustrates, by increasing both `lsh_bits` and `lsh_rounds`, we
355+
can dramatically decrease the running time of the graph builder without
356+
sacrificing edge recall. We have found that a good rule of thumb is to set
357+
`lsh_bits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH bucket
358+
size will be at most 1000. However, if your instances are clustered or you
359+
want an even faster run, you may want to use a larger `lsh_bits` value. Note,
360+
however, that when the similarity threshold is lower, recall rates are reduced
361+
more quickly the larger the value of `lsh_bits` is, so be careful not to set
362+
that parameter too high for smaller `similarity_threshold` values.
363+
364+
The generated graph edges are written to the TSV file named by
278365
`output_graph_path`. Each output edge is represented by a TSV line with the
279366
following form:
280367
@@ -292,6 +379,9 @@ def build_graph_from_config(embedding_files, output_graph_path,
292379
should be written.
293380
graph_builder_config: A `nsl.configs.GraphBuilderConfig` specifying the
294381
graph building parameters.
382+
383+
Raises:
384+
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
295385
"""
296386
graph_builder = GraphBuilder(graph_builder_config)
297387
graph_builder.build(embedding_files, output_graph_path)
@@ -302,7 +392,9 @@ def build_graph(embedding_files,
302392
similarity_threshold=0.8,
303393
id_feature_name='id',
304394
embedding_feature_name='embedding',
305-
lsh_bits=0):
395+
lsh_bits=0,
396+
lsh_rounds=2,
397+
random_seed=None):
306398
"""Like `nsl.tools.build_graph_from_config`, but with individual parameters.
307399
308400
This API exists to maintain backward compatibility, but is deprecated in favor
@@ -322,15 +414,26 @@ def build_graph(embedding_files,
322414
lsh_bits: Determines the maximum number of LSH buckets into which input data
323415
points will be bucketed by the graph builder. See the
324416
`nsl.tools.build_graph_from_config` documentation for details.
417+
lsh_rounds: The number of rounds of LSH bucketing to perform when
418+
`lsh_bits > 0`. This is also the number of LSH buckets each point will be
419+
hashed into.
420+
random_seed: Value used to seed the random number generator used to perform
421+
randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
422+
the generator will be initialized randomly, but setting this to any
423+
integer will initialize it deterministically.
424+
425+
Raises:
426+
ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
325427
"""
326428
build_graph_from_config(
327-
embedding_files,
328-
output_graph_path,
429+
embedding_files, output_graph_path,
329430
nsl_configs.GraphBuilderConfig(
330431
id_feature_name=id_feature_name,
331432
embedding_feature_name=embedding_feature_name,
332433
similarity_threshold=similarity_threshold,
333-
lsh_bits=lsh_bits))
434+
lsh_bits=lsh_bits,
435+
lsh_rounds=lsh_rounds,
436+
random_seed=random_seed))
334437

335438

336439
def _main(argv):
@@ -349,6 +452,7 @@ def _main(argv):
349452
embedding_feature_name=flag.embedding_feature_name,
350453
similarity_threshold=flag.similarity_threshold,
351454
lsh_bits=flag.lsh_bits,
455+
lsh_rounds=flag.lsh_rounds,
352456
random_seed=flag.random_seed))
353457

354458

@@ -371,6 +475,10 @@ def _main(argv):
371475
potential buckets for better performance. The larger your number of
372476
input instances, the larger this value should be. A good rule of thumb is
373477
to set `lsh_bits = ceiling(log_2(num_instances / 1000))`.""")
478+
flags.DEFINE_integer(
479+
'lsh_rounds', 2,
480+
"""The number of rounds of LSH bucketing to perform when `lsh_bits > 0`.
481+
This is also the number of LSH buckets each point will be hashed into.""")
374482
flags.DEFINE_integer(
375483
'random_seed', None,
376484
"""Value used to seed the random number generator used to perform

0 commit comments

Comments
 (0)