Skip to content

Commit 0d690e8

Browse files
Add Orbit beacon metric and retention policy.
PiperOrigin-RevId: 569314229
1 parent 72e7e91 commit 0d690e8

File tree

2 files changed

+14
-0
lines changed

2 files changed

+14
-0
lines changed

orbit/controller.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,14 @@
2626

2727
import tensorflow as tf
2828

29+
# pylint: disable=g-direct-tensorflow-import
30+
from tensorflow.python.eager import monitoring
31+
# pylint: enable=g-direct-tensorflow-import
32+
33+
_orbit_api_gauge = monitoring.BoolGauge(
34+
"/tensorflow/api/orbit", "orbit api usage"
35+
)
36+
2937

3038
def _log(message: str):
3139
"""Logs `message` to the `info` log, and also prints to stdout."""
@@ -243,6 +251,9 @@ def __init__(
243251
if restored_path:
244252
_log(f"restored from checkpoint: {restored_path}")
245253

254+
# Set the Orbit framework gauge to True.
255+
_orbit_api_gauge.get_cell().set(True)
256+
246257
def train(self, steps: int, checkpoint_at_completion: bool = True):
247258
"""Runs training until the specified global step count has been reached.
248259

orbit/controller_test.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -281,6 +281,7 @@ def test_no_checkpoint(self):
281281
test_controller.train_and_evaluate(
282282
train_steps=10, eval_steps=2, eval_interval=6)
283283
self.assertEqual(test_runner.global_step, 10)
284+
self.assertTrue(controller._orbit_api_gauge.get_cell().value())
284285

285286
def test_no_checkpoint_and_summaries(self):
286287
test_runner = TestRunner()
@@ -293,6 +294,7 @@ def test_no_checkpoint_and_summaries(self):
293294
test_controller.train_and_evaluate(
294295
train_steps=10, eval_steps=2, eval_interval=6)
295296
self.assertEqual(test_runner.global_step, 10)
297+
self.assertTrue(controller._orbit_api_gauge.get_cell().value())
296298

297299
@parameterized.named_parameters(
298300
("_sync_checkpoint_saving", False),
@@ -317,6 +319,7 @@ def test_has_checkpoint_no_summaries(self, enable_async_checkpoint_saving):
317319
test_controller.train_and_evaluate(
318320
train_steps=10, eval_steps=2, eval_interval=6)
319321
self.assertEqual(test_runner.global_step, 10)
322+
self.assertTrue(controller._orbit_api_gauge.get_cell().value())
320323

321324
# No summaries are saved.
322325
self.assertEmpty(tf.io.gfile.glob(

0 commit comments

Comments
 (0)