Skip to content

Commit 6fc9a9e

Browse files
srvasude authored and tensorflower-gardener committed
Fix Hager-Zhang linesearch to accept intervals with zero derivative for the right endpoint.
- This improves performance of L-BFGS / BFGS substantially on test problems. PiperOrigin-RevId: 492308325
1 parent ee8fbbe commit 6fc9a9e

File tree

3 files changed

+59
-11
lines changed

3 files changed

+59
-11
lines changed

tensorflow_probability/python/optimizer/linesearch/hager_zhang_test.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,27 +122,30 @@ def fdf(x):
122122
sum(r.func_evals for r in results_mapped))
123123

124124
def test_batch_bracket_failures(self):
125-
# To bracket successfully, we must find the narrow window with positive
126-
# derivative -- roughly [1.39, 2.72].
125+
# To bracket successfully, we must find the narrow window with non-negative
126+
# derivative -- good values are roughly [1.39, 2.72].
127127
def _fdf(x):
128128
z = x - 1
129129
return ValueAndGradient(
130130
x=x,
131131
f=tf.math.exp(-z) - tf.math.exp(-z*z),
132132
df=2*z*tf.math.exp(-z*z) - tf.math.exp(-z))
133133

134-
start = tf.convert_to_tensor([0.01, 0.1, 1.0, 1.5, 2.0, 3.0])
134+
start = tf.convert_to_tensor([0.01, 0.1, 1.0, 1.5, 2.0, -5.0])
135135
results = self.evaluate(hager_zhang(
136136
_fdf, initial_step_size=start))
137137

138138
# Bracketing will do something like: check `5^0 * start`, `5^1 * start`,
139-
# `5^2 * start`, ..., looking for a point where the derivative is positive.
140-
# This search will find a point with positive derivative when `start` is
141-
# `0.1`, `1.5`, or `2.0`, but will fail for the other values.
142-
self.assertAllEqual([False, True, False, True, True, False],
143-
results.converged)
144-
self.assertAllEqual([True, False, True, False, False, True],
145-
results.failed)
139+
# `5^2 * start`, ..., looking for a point where the derivative is
140+
# non-negative. The search will start to fail for negative values where the
141+
# function is highly positive and goes steeply downward (but since it's far
142+
# enough out fails to bracket).
143+
self.assertAllEqual(
144+
[True, True, True, True, True, False],
145+
results.converged)
146+
self.assertAllEqual(
147+
[False, False, False, False, False, True],
148+
results.failed)
146149

147150
val_0 = self.evaluate(_fdf(tf.convert_to_tensor(0.0)))
148151
self.assertAllEqual(

tensorflow_probability/python/optimizer/linesearch/internal/hager_zhang_lib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -723,7 +723,7 @@ def _is_rising(val):
723723
rising: A Boolean Tensor giving whether this point is a suitable right
724724
end-point for an interval subject to secant subdivision.
725725
"""
726-
return tf.math.is_finite(val.f) & (val.df > 0)
726+
return tf.math.is_finite(val.f) & (val.df >= 0.)
727727

728728

729729
def is_finite(val_1, val_2=None):

tensorflow_probability/python/optimizer/linesearch/internal/hager_zhang_lib_test.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,51 @@ def test_bracket_simple(self):
252252
self.assertLess(result.left.df, 0) # Opposite slopes.
253253
self.assertGreaterEqual(result.right.df, 0)
254254

255+
def test_bracket_accepts_interval_zero_derivative(self):
256+
"""Tests that bracketing accepts f' = 0 for the right endpoint."""
257+
wolfe_threshold = 1e-6
258+
259+
# This example is taken from the unconstrained beale function.
260+
def beale(z):
261+
# Constrain to [-4.5, 4.5]
262+
z = 4.5 * tf.math.sigmoid(z) - 4.5 * tf.math.sigmoid(-z)
263+
x = z[..., 0]
264+
y = z[..., 1]
265+
return ((1.5 - x + x * y)**2 +
266+
(2.25 - x + x * y**2)**2 +
267+
(2.625 - x + x * y**3)**2)
268+
269+
def beale_ls(t):
270+
t = tf.convert_to_tensor(t, dtype=tf.float32)
271+
def _internal_ls(t):
272+
# Choose an initial point and step such that the step goes out towards
273+
# infinity. In that way, we guarantee the gradients are zero at the
274+
# step but aren't a suitable minima as the function increases away
275+
# from the point (3., 0.5).
276+
x = np.array([0.6, 1.35]).astype(np.float32)
277+
# Large step that pushes the function to a flat region of space.
278+
p = np.array([-100., -100.]).astype(np.float32)
279+
return beale(x + t * p)
280+
f, df = value_and_gradient(_internal_ls, t)
281+
return ValueAndGradient(x=t, f=tf.squeeze(f), df=tf.squeeze(df))
282+
283+
val_a = beale_ls(0.0) # Value at zero.
284+
val_b = beale_ls(1.0) # Value at initial step.
285+
f_lim = val_a.f + (wolfe_threshold * tf.abs(val_a.f))
286+
287+
result = self.evaluate(
288+
hzl.bracket(beale_ls, _interval(val_a, val_b), f_lim, max_iterations=5))
289+
290+
# The left endpoint has negative derivative, the right has zero derivative.
291+
# This should be a valid interval a priori.
292+
self.assertFalse(result.failed)
293+
self.assertEqual(result.iteration, 0) # Zero expansion.
294+
self.assertEqual(result.num_evals, 0) # Zero evaluations.
295+
self.assertEqual(result.left.x, 0.)
296+
self.assertEqual(result.right.x, 1.)
297+
self.assertLess(result.left.df, 0) # Opposite slopes.
298+
self.assertGreaterEqual(result.right.df, 0)
299+
255300
def test_bracket_batching(self):
256301
"""Tests that bracketing works in batching mode."""
257302
wolfe_threshold = 1e-6

0 commit comments

Comments (0)