Switch the graph builder flag name from --lsh_bits to --lsh_splits.

Neural-Link Team · tensorflow-copybara · commit 0d50227f0155 · 2020-07-08T17:13:37.000-07:00
Also updated API documentation to better describe the name of the flag.

PiperOrigin-RevId: 320296734
diff --git a/neural_structured_learning/configs/configs.py b/neural_structured_learning/configs/configs.py
@@ -306,24 +306,24 @@ class GraphBuilderConfig(object):
       Defaults to `'embedding'`.
     similarity_threshold: Threshold used to determine which edges to retain in
       the resulting graph. Defaults to 0.8.
-    lsh_bits: Determines the maximum number of LSH buckets into which input data
-      points will be bucketed by the graph builder. See the
-      `nsl.tools.build_graph_from_config` documentation for details. This
-      defaults to 0, in which case all pairs of inputs will be compared,
-      probably resulting in slow running times on larger input sets. Defaults
-      to 0.
+    lsh_splits: On each LSH bucketing round, the space containing the input
+      instances will be randomly split/partitioned this many times for better
+      graph builder performance. See the `nsl.tools.build_graph_from_config`
+      documentation for details. Defaults to 0, in which case all pairs of
+      inputs will be compared, probably resulting in slow running times on
+      larger input sets.
     lsh_rounds: The number of rounds of LSH bucketing to perform when
-      `lsh_bits > 0`. This is also the number of LSH buckets each point will be
-      hashed into. Defaults to 2.
+      `lsh_splits > 0`. This is also the number of LSH buckets each point will
+      be hashed into. Defaults to 2.
     random_seed: Value used to seed the random number generator used to perform
-      randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
+      randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
       the generator will be initialized randomly, but setting this to any
       integer will initialize it deterministically. Defaults to `None`.
   """
   id_feature_name = attr.ib(default='id')
   embedding_feature_name = attr.ib(default='embedding')
   similarity_threshold = attr.ib(default=0.8)
-  lsh_bits = attr.ib(default=0)
+  lsh_splits = attr.ib(default=0)
   lsh_rounds = attr.ib(default=2)
   random_seed = attr.ib(default=None)
 
diff --git a/neural_structured_learning/tools/build_graph.py b/neural_structured_learning/tools/build_graph.py
@@ -110,23 +110,23 @@ def __init__(self, graph_builder_config):
       graph_builder_config: A `nsl.configs.GraphBuilderConfig` instance.
 
     Raises:
-      ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
+      ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
     """
     self.config = graph_builder_config
-    if self.config.lsh_bits < 0:
-      raise ValueError('lsh_bits < 0')
-    if self.config.lsh_bits > 0 and self.config.lsh_rounds < 1:
-      raise ValueError('lsh_bits > 0 but lsh_rounds < 1')
+    if self.config.lsh_splits < 0:
+      raise ValueError('lsh_splits < 0')
+    if self.config.lsh_splits > 0 and self.config.lsh_rounds < 1:
+      raise ValueError('lsh_splits > 0 but lsh_rounds < 1')
 
     # Keep a set of previously written edges if it's possible we might
     # generate the same edge multiple times. This can happen only if both
-    # 'lsh_bits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
+    # 'lsh_splits > 0' and 'lsh_rounds > 1'. To save space, we pick a canonical
     # ordering (source < target) for each bi-directional edge. Note that we
     # do not need to store the edge weight as well because for any
     # (source, target) pair, the cosine similarity between them will never
     # change.
     self.edge_set = None
-    if self.config.lsh_bits > 0 and self.config.lsh_rounds > 1:
+    if self.config.lsh_splits > 0 and self.config.lsh_rounds > 1:
       self.edge_set = set()
 
   def _is_new_edge(self, src, tgt):
@@ -146,7 +146,7 @@ def _bucket(self, lsh_matrix, embedding):
       embedding: A 1-D vector representing the dense embedding for a point.
 
     Returns:
-      The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_bits`.
+      The bucket ID, a value in `[0, 2^n)`, where `n = self.config.lsh_splits`.
       Bit `i` of the result (where bit 0 corresponds to the least significant
       bit) is 1 if and only if the dot product of row `i` of `lsh_matrix' and
       `embedding` is positive.
@@ -159,10 +159,10 @@ def _bucket(self, lsh_matrix, embedding):
     return bucket
 
   def _generate_lsh_buckets(self, embeddings):
-    """Buckets the given `embeddings` according to `config.lsh_bits`.
+    """Buckets the given `embeddings` according to `config.lsh_splits`.
 
     The embeddings can be bucketed into a total of at most `2^n` different
-    buckets, where `n` is given by the value of `config.lsh_bits`. If `n` is
+    buckets, where `n` is given by the value of `config.lsh_splits`. If `n` is
     not positive, then all of the given `embeddings` keys will be bucketed into
     bucket 0.
 
@@ -172,14 +172,14 @@ def _generate_lsh_buckets(self, embeddings):
     Returns:
       A dictionary mapping bucket IDs to sets of embedding IDs in each bucket.
       The bucket IDs are integers in the half-open interval `[0, 2^n)`, where
-      `n = config.lsh_bits`.
+      `n = config.lsh_splits`.
     """
-    if self.config.lsh_bits <= 0: return {0: set(embeddings.keys())}
+    if self.config.lsh_splits <= 0: return {0: set(embeddings.keys())}
 
     # Generate a random matrix of values in the range [-1.0, 1.0] to use
     # to create the LSH buckets.
     num_dims = next(iter(embeddings.values())).size
-    lsh_matrix = np.random.rand(self.config.lsh_bits, num_dims) * 2 - 1
+    lsh_matrix = np.random.rand(self.config.lsh_splits, num_dims) * 2 - 1
 
     # Add each embedding to its appropriate bucket
     bucket_map = {}
@@ -305,12 +305,12 @@ def build_graph_from_config(embedding_files, output_graph_path,
   compare just the pairs of points within each bucket for similarity, which can
   lead to dramatically faster running times.
 
-  The `lsh_bits` configuration attribute is used to control the maximum number
-  of LSH buckets. In particular, if `lsh_bits` has the value `n`, then there
-  can be at most `2^n` LSH buckets. Using a larger value for `lsh_bits` will
+  The `lsh_splits` configuration attribute is used to control the maximum number
+  of LSH buckets. In particular, if `lsh_splits` has the value `n`, then there
+  can be at most `2^n` LSH buckets. Using a larger value for `lsh_splits` will
   (generally) result in a larger number of buckets, and therefore, smaller
   number of instances in each bucket that need to be compared to each other.
-  As a result, increasing `lsh_bits` can lead to dramatically faster running
+  As a result, increasing `lsh_splits` can lead to dramatically faster running
   times.
 
   The disadvantage to using too many LSH buckets, however, is that we won't
@@ -324,42 +324,42 @@ def build_graph_from_config(embedding_files, output_graph_path,
   be similar enough on *any* of the LSH rounds (i.e., the resulting graph is the
   *union* of the graph edges generated on each LSH round).
 
-  To illustrate these concepts and how various `lsh_bits` and `lsh_rounds`
+  To illustrate these concepts and how various `lsh_splits` and `lsh_rounds`
   values correlate with graph building running times, we performed multiple runs
   of the graph builder on a dataset containing 50,000 instances, each with a
-  100-dimensional embedding. When `lsh_bits = 0`, the program has to compare
+  100-dimensional embedding. When `lsh_splits = 0`, the program has to compare
   each instance against every other instance, for a total of roughly 2.5B
   comparisons, which takes nearly half an hour to complete and generates a total
-  of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_bits` is
+  of 35,313 graph edges (when `similarity_threshold = 0.9`). As `lsh_splits` is
   increased, we lose recall (i.e., fewer than 35,313 edges are generated), but
   the recall can then be improved by increasing `lsh_rounds`. This table shows
   the minimum `lsh_rounds` value required to achieve a recall of >= 99.7%
-  (except for the `lsh_bits = 1` case), as well as the elapsed running time:
+  (except for the `lsh_splits = 1` case), as well as the elapsed running time:
 
   ```none
-  lsh_bits   lsh_rounds    Recall    Running time
-     0          N/A        100.0%      27m 46s
-     1           2          99.4%      24m 33s
-     2           3          99.8%      15m 35s
-     3           4          99.7%       9m 37.9s
-     4           6          99.9%       7m 07.5s
-     5           8          99.9%       4m 59.2s
-     6           9          99.7%       3m 01.2s
-     7          11          99.8%       2m 02.3s
-     8          13          99.8%       1m 20.8s
-     9          16          99.7%          58.5s
-    10          18          99.7%          43.6s
+  lsh_splits  lsh_rounds    Recall    Running time
+      0          N/A        100.0%      27m 46s
+      1           2          99.4%      24m 33s
+      2           3          99.8%      15m 35s
+      3           4          99.7%       9m 37.9s
+      4           6          99.9%       7m 07.5s
+      5           8          99.9%       4m 59.2s
+      6           9          99.7%       3m 01.2s
+      7          11          99.8%       2m 02.3s
+      8          13          99.8%       1m 20.8s
+      9          16          99.7%          58.5s
+     10          18          99.7%          43.6s
   ```
 
-  As the table illustrates, by increasing both `lsh_bits` and `lsh_rounds`, we
-  can dramatically decrease the running time of the graph builder without
+  As the table illustrates, by increasing both `lsh_splits` and `lsh_rounds`,
+  we can dramatically decrease the running time of the graph builder without
   sacrificing edge recall. We have found that a good rule of thumb is to set
-  `lsh_bits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH bucket
-  size will be at most 1000. However, if your instances are clustered or you
-  want an even faster run, you may want to use a larger `lsh_bits` value. Note,
-  however, that when the similarity threshold is lower, recall rates are reduced
-  more quickly the larger the value of `lsh_bits` is, so be careful not to set
-  that parameter too high for smaller `similarity_threshold` values.
+  `lsh_splits >= ceiling(log_2(num_instances / 1000))`, so the expected LSH
+  bucket size will be at most 1000. However, if your instances are clustered or
+  you want an even faster run, you may want to use a larger `lsh_splits` value.
+  Note, however, that when the similarity threshold is lower, recall rates are
+  reduced more quickly the larger the value of `lsh_splits` is, so be careful
+  not to set that parameter too high for smaller `similarity_threshold` values.
 
   The generated graph edges are written to the TSV file named by
   `output_graph_path`. Each output edge is represented by a TSV line with the
@@ -381,7 +381,7 @@ def build_graph_from_config(embedding_files, output_graph_path,
       graph building parameters.
 
   Raises:
-    ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
+    ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
   """
   graph_builder = GraphBuilder(graph_builder_config)
   graph_builder.build(embedding_files, output_graph_path)
@@ -392,7 +392,7 @@ def build_graph(embedding_files,
                 similarity_threshold=0.8,
                 id_feature_name='id',
                 embedding_feature_name='embedding',
-                lsh_bits=0,
+                lsh_splits=0,
                 lsh_rounds=2,
                 random_seed=None):
   """Like `nsl.tools.build_graph_from_config`, but with individual parameters.
@@ -411,27 +411,27 @@ def build_graph(embedding_files,
       objects representing the ID of examples.
     embedding_feature_name: The name of the feature in the input
       `tf.train.Example` objects representing the embedding of examples.
-    lsh_bits: Determines the maximum number of LSH buckets into which input data
-      points will be bucketed by the graph builder. See the
+    lsh_splits: Determines the maximum number of LSH buckets into which input
+      data points will be bucketed by the graph builder. See the
       `nsl.tools.build_graph_from_config` documentation for details.
     lsh_rounds: The number of rounds of LSH bucketing to perform when
-      `lsh_bits > 0`. This is also the number of LSH buckets each point will be
-      hashed into.
+      `lsh_splits > 0`. This is also the number of LSH buckets each point will
+      be hashed into.
     random_seed: Value used to seed the random number generator used to perform
-      randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
+      randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
       the generator will be initialized randomly, but setting this to any
       integer will initialize it deterministically.
 
   Raises:
-      ValueError: If `lsh_bits < 0`, or if `lsh_bits > 0 and lsh_rounds < 1`.
+      ValueError: If `lsh_splits < 0` or if `lsh_splits > 0 and lsh_rounds < 1`.
   """
   build_graph_from_config(
       embedding_files, output_graph_path,
       nsl_configs.GraphBuilderConfig(
           id_feature_name=id_feature_name,
           embedding_feature_name=embedding_feature_name,
           similarity_threshold=similarity_threshold,
-          lsh_bits=lsh_bits,
+          lsh_splits=lsh_splits,
           lsh_rounds=lsh_rounds,
           random_seed=random_seed))
 
@@ -451,7 +451,7 @@ def _main(argv):
           id_feature_name=flag.id_feature_name,
           embedding_feature_name=flag.embedding_feature_name,
           similarity_threshold=flag.similarity_threshold,
-          lsh_bits=flag.lsh_bits,
+          lsh_splits=flag.lsh_splits,
           lsh_rounds=flag.lsh_rounds,
           random_seed=flag.random_seed))
 
@@ -470,19 +470,20 @@ def _main(argv):
       """Lower bound on the cosine similarity required for an edge
       to be created between two nodes.""")
   flags.DEFINE_integer(
-      'lsh_bits', 0,
-      """The input instances will be randomly bucketed into 2^(lsh_bits)
-      potential buckets for better performance. The larger your number of
+      'lsh_splits', 0,
+      """On each LSH bucketing round, the space containing the input instances
+      will be randomly split/partitioned this many times for better performance,
+      resulting in up to 2^(lsh_splits) LSH buckets. The larger your number of
       input instances, the larger this value should be. A good rule of thumb is
-      to set `lsh_bits = ceiling(log_2(num_instances / 1000))`.""")
+      to set `lsh_splits = ceiling(log_2(num_instances / 1000))`.""")
   flags.DEFINE_integer(
       'lsh_rounds', 2,
-      """The number of rounds of LSH bucketing to perform when `lsh_bits > 0`.
+      """The number of rounds of LSH bucketing to perform when `lsh_splits > 0`.
       This is also the number of LSH buckets each point will be hashed into.""")
   flags.DEFINE_integer(
       'random_seed', None,
       """Value used to seed the random number generator used to perform
-      randomized LSH bucketing of the inputs when `lsh_bits > 0`. By default,
+      randomized LSH bucketing of the inputs when `lsh_splits > 0`. By default,
       the generator will be initialized randomly, but setting this to any
       integer will initialize it deterministically.""")
 
diff --git a/neural_structured_learning/tools/build_graph_test.py b/neural_structured_learning/tools/build_graph_test.py
@@ -88,11 +88,11 @@ def _create_graph_file(self):
 
   def testBuildGraphInvalidLshBitsValue(self):
     with self.assertRaises(ValueError):
-      build_graph_lib.build_graph([], None, lsh_bits=-1)
+      build_graph_lib.build_graph([], None, lsh_splits=-1)
 
   def testBuildGraphInvalidLshRoundsValue(self):
     with self.assertRaises(ValueError):
-      build_graph_lib.build_graph([], None, lsh_bits=1, lsh_rounds=0)
+      build_graph_lib.build_graph([], None, lsh_splits=1, lsh_rounds=0)
 
   def testBuildGraphNoThresholdingNoLSH(self):
     """All edges whose weight is greater than 0 are retained."""
@@ -175,7 +175,7 @@ def testBuildGraphWithThresholdWithLSHInsufficientLSHRounds(self):
     build_graph_lib.build_graph([embedding_path],
                                 graph_path,
                                 similarity_threshold=0.9,
-                                lsh_bits=2,
+                                lsh_splits=2,
                                 lsh_rounds=1,
                                 random_seed=12345)
     g_actual = graph_utils.read_tsv_graph(graph_path)
@@ -200,7 +200,7 @@ def testBuildGraphWithThresholdWithLSHSufficientLSHRounds(self):
     build_graph_lib.build_graph([embedding_path],
                                 graph_path,
                                 similarity_threshold=0.9,
-                                lsh_bits=2,
+                                lsh_splits=2,
                                 lsh_rounds=4,
                                 random_seed=12345)
     g_actual = graph_utils.read_tsv_graph(graph_path)