
Commit 4d33b9c

Merge branch 'apple:main' into elastic
2 parents 4426abc + 1c137ff

38 files changed (+1320, -584 lines)

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -159,6 +159,9 @@ cython_debug/
 # Vscode
 .vscode/
 
+# Zed
+.zed/
+
 # Weights & Biases
 wandb/

axlearn/audio/frontend.py

Lines changed: 2 additions & 9 deletions
@@ -14,6 +14,7 @@
 
 from axlearn.audio.frontend_utils import (
     WindowType,
+    cast_for_rfft,
     frame,
     frame_paddings,
     linear_to_log_mel_spectrogram,
@@ -143,14 +144,6 @@ def _fft_dtype(input_dtype: jnp.dtype) -> jnp.dtype:
         raise ValueError(f"{input_dtype=} is not supported.")
 
 
-def _cast_for_rfft(x: Tensor) -> Tensor:
-    # jnp.fft.rfft input must be float32 or float64.
-    if x.dtype in (jnp.float32, jnp.float64):
-        return x
-    else:
-        return x.astype(jnp.float32)
-
-
 class LogMelFrontend(BaseFrontend):
     """Computes Log Mel spectrogram features.
 
@@ -200,7 +193,7 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
         if cfg.fft is not None:
             self._fft = cfg.fft.set(n=fft_size).instantiate()
         else:
-            self._fft = lambda x: jnp.fft.rfft(_cast_for_rfft(x), n=fft_size)
+            self._fft = lambda x: jnp.fft.rfft(cast_for_rfft(x), n=fft_size)
 
         spectrogram = maybe_set_config(
             cfg.spectrogram,

axlearn/audio/frontend_utils.py

Lines changed: 18 additions & 2 deletions
@@ -10,7 +10,6 @@
 
 import enum
 import math
-from functools import partial
 from typing import Callable, Union
 
 import jax
@@ -404,8 +403,25 @@ def sharded_fft(n: int, partition_spec: PartitionSpec) -> Callable[[Tensor], Ten
         A callable that computes FFT.
     """
     return shard_map(
-        partial(jnp.fft.rfft, n=n),
+        lambda x: jnp.fft.rfft(cast_for_rfft(x), n=n),
         mesh=thread_resources.env.physical_mesh,
         in_specs=partition_spec,
         out_specs=partition_spec,
     )
+
+
+def cast_for_rfft(x: Tensor) -> Tensor:
+    """Casts the input tensor to a valid dtype for jnp.fft.rfft if necessary.
+
+    jnp.fft.rfft requires the input to be of dtype float32 or float64.
+
+    Args:
+        x: Input tensor of arbitrary dtype.
+
+    Returns:
+        A tensor of dtype float32 or float64, suitable for jnp.fft.rfft.
+    """
+    if x.dtype in (jnp.float32, jnp.float64):
+        return x
+    else:
+        return x.astype(jnp.float32)
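A minimal usage sketch of the now-shared helper, assuming the axlearn.audio.frontend_utils module from this commit is importable; the frame shape and FFT size below are illustrative, not taken from the diff:

import jax.numpy as jnp

from axlearn.audio.frontend_utils import cast_for_rfft

# bfloat16 frames are common when the surrounding model runs in bfloat16,
# but jnp.fft.rfft only accepts float32/float64 inputs.
frames = jnp.ones((8, 100, 400), dtype=jnp.bfloat16)  # illustrative shape

# cast_for_rfft promotes unsupported dtypes to float32 and passes
# float32/float64 inputs through unchanged.
ffts = jnp.fft.rfft(cast_for_rfft(frames), n=512)
print(ffts.dtype)  # complex64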

axlearn/audio/frontend_utils_test.py

Lines changed: 5 additions & 6 deletions
@@ -23,6 +23,7 @@
 from axlearn.audio import frontend_utils
 from axlearn.audio.frontend_utils import (
     WindowType,
+    cast_for_rfft,
     frame,
     frame_paddings,
     linear_to_log_mel_spectrogram,
@@ -393,15 +394,13 @@ def _ref_log_mel_spectrogram(
 
 
 class ShardedFftTest(TestCase):
+    @parameterized.parameters(jnp.float32, jnp.bfloat16)
     @set_threefry_partitionable(False)  # TODO(Luzy): update for threefry_partitionable True
-    def test_fft(self):
+    def test_fft(self, dtype):
         input_shape = (8, 800, 400)
         fft_size = 512
         inputs = jax.random.uniform(
-            jax.random.PRNGKey(123),
-            shape=input_shape,
-            minval=-32768.0,
-            maxval=32768.0,
+            jax.random.PRNGKey(123), shape=input_shape, minval=-32768.0, maxval=32768.0, dtype=dtype
         )
         with Mesh(
             mesh_utils.create_device_mesh((len(jax.devices()), 1)), ("data", "model")
@@ -414,7 +413,7 @@ def test_fft(self):
             fft_fn = jax.jit(
                 sharded_fft(n=fft_size, partition_spec=PartitionSpec("data", None, None))
             )
-            ref_ffts = jax.jit(jnp.fft.rfft, static_argnames="n")(inputs, n=fft_size)
+            ref_ffts = jax.jit(jnp.fft.rfft, static_argnames="n")(cast_for_rfft(inputs), n=fft_size)
             test_ffts = fft_fn(inputs)
 
             assert_allclose(ref_ffts, test_ffts, rtol=1e-3)

axlearn/cloud/common/bundler.py

Lines changed: 10 additions & 1 deletion
@@ -62,6 +62,7 @@
     copy_blobs,
     get_pyproject_version,
     parse_kv_flags,
+    to_bool,
 )
 from axlearn.common.config import REQUIRED, Configurable, Required, config_class
 from axlearn.common.file_system import copy, exists, makedirs
@@ -341,9 +342,17 @@ def from_spec(cls, spec: list[str], *, fv: Optional[flags.FlagValues]) -> Config
         cfg: BaseDockerBundler.Config = super().from_spec(spec, fv=fv)
         kwargs = parse_kv_flags(spec, delimiter="=")
         cache_from = canonicalize_to_list(kwargs.pop("cache_from", None))
+        skip_bundle = to_bool(kwargs.pop("skip_bundle", False))
+        allow_dirty = to_bool(kwargs.pop("allow_dirty", False))
         # Non-config specs are treated as build args.
         build_args = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k not in cfg}
-        return cfg.set(build_args=build_args, cache_from=cache_from, **kwargs)
+        return cfg.set(
+            build_args=build_args,
+            cache_from=cache_from,
+            skip_bundle=skip_bundle,
+            allow_dirty=allow_dirty,
+            **kwargs,
+        )
 
     # pylint: disable-next=arguments-renamed
     def id(self, tag: str) -> str:

axlearn/cloud/common/event_queue.py

Lines changed: 17 additions & 13 deletions
@@ -203,7 +203,6 @@ def publish(self, event: Event):
             try:
                 # Ensure connection is established before publishing.
                 if not self._channel or not self._connection:
-                    logging.error("RabbitMQ publisher channel is closed, reconnecting...")
                     self.connect()
 
                 # Setting durable=True ensures that the queue will survive.
@@ -230,25 +229,30 @@ def publish(self, event: Event):
                 # Only retry on recoverable exceptions.
                 # AMQPConnectionError is assumed to be related to network issues,
                 # or temporary unavailable host.
-                logging.error(
-                    "Failed to publish event: %s. Error: %s. Attempt: %d",
-                    message,
-                    str(e),
-                    attempt,
-                )
                 self._handle_publish_error()
                 attempt += 1
-                if attempt <= self._num_tries:
+                if attempt < self._num_tries:
                     time.sleep(2**attempt)
+                else:
+                    logging.error(
+                        "Failed to publish event: %s after %d attempts. Error: %s.",
+                        message,
+                        attempt,
+                        str(e),
+                    )
             except Exception as e:  # pylint: disable=broad-except
-                # Unknown errors. Don't retry. Log to avoid crashing clients.
-                logging.error(
-                    "Unknown error. Failed to publish event: %s. Error: %s.", message, str(e)
-                )
                 self._handle_publish_error()
                 attempt += 1
-                if attempt <= self._num_tries:
+                if attempt < self._num_tries:
                     time.sleep(2**attempt)
+                else:
+                    # Unknown errors. Don't retry. Log to avoid crashing clients.
+                    logging.error(
+                        "Unknown error. Failed to publish event: %s after %d attempts. Error: %s.",
+                        message,
+                        attempt,
+                        str(e),
+                    )
 
     def _handle_publish_error(self):
         """Handle publish errors with retrying on connection issue."""

axlearn/cloud/common/utils.py

Lines changed: 13 additions & 0 deletions
@@ -292,6 +292,19 @@ def merge(base: dict, overrides: dict):
     return base
 
 
+def to_bool(value: Any) -> bool:
+    """Converts a string representation of truth to a bool."""
+    if isinstance(value, bool):
+        return value
+    elif isinstance(value, str):
+        val_lower = value.lower()
+        if val_lower == "true":
+            return True
+        elif val_lower == "false":
+            return False
+    raise ValueError(f"Invalid truth value: '{value}'")
+
+
 _Row = list[Any]
 
 
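A quick usage sketch of the new helper, mirroring the parameterized cases in the test file below; only real bools and case-insensitive "true"/"false" strings are accepted:

from axlearn.cloud.common.utils import to_bool

assert to_bool("true") is True
assert to_bool("False") is False
assert to_bool(True) is True

# Anything else (e.g. "yes" or 1) raises ValueError.
try:
    to_bool("yes")
except ValueError as e:
    print(e)  # Invalid truth value: 'yes'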

axlearn/cloud/common/utils_test.py

Lines changed: 18 additions & 0 deletions
@@ -222,6 +222,24 @@ def test_canonicalize(self, v_seq: Sequence[str], v_str: str, v_list: str, delim
     def test_merge(self, base, overrides, expected):
         self.assertEqual(expected, utils.merge(base, overrides))
 
+    @parameterized.parameters(
+        ("true", True),
+        ("True", True),
+        ("false", False),
+        ("False", False),
+        (True, True),
+        (False, False),
+        ("yes", ValueError),
+        (1, ValueError),
+    )
+    def test_to_bool(self, value, expected):
+        if isinstance(expected, type) and issubclass(expected, Exception):
+            with self.assertRaises(expected):
+                utils.to_bool(value)
+        else:
+            result = utils.to_bool(value)
+            self.assertEqual(result, expected)
+
     def test_infer_resources(self):
         @config_class
         class DummyConfig(ConfigBase):

axlearn/cloud/gcp/bundler.py

Lines changed: 15 additions & 6 deletions
@@ -58,7 +58,7 @@
 from axlearn.cloud.common.bundler import main_flags as bundler_main_flags
 from axlearn.cloud.common.bundler import register_bundler
 from axlearn.cloud.common.docker import registry_from_repo
-from axlearn.cloud.common.utils import canonicalize_to_list
+from axlearn.cloud.common.utils import canonicalize_to_list, to_bool
 from axlearn.cloud.gcp.cloud_build import get_cloud_build_status
 from axlearn.cloud.gcp.config import gcp_settings
 from axlearn.cloud.gcp.utils import common_flags
@@ -148,9 +148,7 @@ def from_spec(
         cfg.project = cfg.project or gcp_settings("project", required=False, fv=fv)
         cfg.repo = cfg.repo or gcp_settings("docker_repo", required=False, fv=fv)
         cfg.dockerfile = cfg.dockerfile or gcp_settings("default_dockerfile", required=False, fv=fv)
-        # The value from from_spec is a str and will result in wrong condition.
-        if isinstance(cfg.is_async, str):
-            cfg.is_async = cfg.is_async.lower() != "false"
+        cfg.is_async = to_bool(cfg.is_async)
         return cfg
 
     # pylint: disable-next=no-self-use,unused-argument
@@ -227,19 +225,30 @@ def _build_and_push(
         print(subprocess.run(cmd, check=True))
         return image
 
-    def wait_until_finished(self, name: str):
+    def wait_until_finished(self, name: str, wait_timeout=3600):
         """Waits for async CloudBuild to finish by polling for status.
 
         Is a no-op if `cfg.is_async` is False.
 
         Args:
             name: Bundle name.
+            wait_timeout: Overall timeout in seconds. Defaults to 1 hour.
 
         Raises:
-            ValueError: If async build failed.
+            TimeoutError: If the build does not complete within the overall timeout.
+            ValueError: If the async build fails.
         """
+        start_time = time.perf_counter()
         cfg: CloudBuildBundler.Config = self.config
         while cfg.is_async:
+            elapsed_time = time.perf_counter() - start_time
+            if elapsed_time > wait_timeout:
+                timeout_msg = (
+                    f"Timed out waiting for CloudBuild to finish for more than "
+                    f"{wait_timeout} seconds."
+                )
+                logging.error(timeout_msg)
+                raise TimeoutError(timeout_msg)
            try:
                 build_status = get_cloud_build_status(
                     project_id=cfg.project, image_name=self.id(name), tags=[name]
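The added wait_timeout turns the status-polling loop into a bounded wait. A minimal sketch of the same pattern in isolation; wait_for, poll_fn, and the 30-second poll interval are illustrative assumptions, not the bundler's API:

import time


def wait_for(poll_fn, *, wait_timeout: float = 3600.0, poll_interval: float = 30.0) -> None:
    """Polls poll_fn() until it returns True, raising TimeoutError past wait_timeout seconds."""
    start_time = time.perf_counter()
    while True:
        if time.perf_counter() - start_time > wait_timeout:
            raise TimeoutError(
                f"Timed out waiting for CloudBuild to finish for more than {wait_timeout} seconds."
            )
        if poll_fn():  # e.g. True once the build status is a terminal success state.
            return
        time.sleep(poll_interval)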

axlearn/cloud/gcp/bundler_test.py

Lines changed: 18 additions & 0 deletions
@@ -180,3 +180,21 @@ def test_wait_until_finished_retries_with_runtime_error(self):
             b = cfg.set(is_async=True).instantiate()
             b.wait_until_finished("test-name")
         self.assertEqual(2, mock_status.call_count)
+
+    def test_wait_until_finished_triggers_timeout(self):
+        # Tests that we raise a timeout error if wait_until_finished takes more than 1 hr.
+        cfg = self._get_test_cloud_build_bundler()
+
+        with mock.patch("time.perf_counter") as mock_perf_counter:
+            mock_perf_counter.side_effect = [0, 10, 500, 3601]
+
+            with self._mock_status(
+                None, CloudBuildStatus.PENDING, CloudBuildStatus.PENDING
+            ) as mock_status:
+                b = cfg.set(is_async=True).instantiate()
+                with self.assertRaisesRegex(
+                    TimeoutError,
+                    "Timed out waiting for CloudBuild to finish for more than 3600 seconds.",
+                ):
+                    b.wait_until_finished("test-name")
+                self.assertEqual(2, mock_status.call_count)
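A brief sketch of the time-mocking technique this test relies on: patching time.perf_counter with a side_effect list makes each call return the next value, so the fourth reading (3601) pushes the elapsed time past the 3600-second budget without any real waiting. elapsed_since is an illustrative helper, not part of the test suite:

import time
from unittest import mock


def elapsed_since(start: float) -> float:
    # Looks up time.perf_counter at call time, so the patch below applies.
    return time.perf_counter() - start


with mock.patch("time.perf_counter") as mock_perf_counter:
    # Each call to time.perf_counter() pops the next value from this list.
    mock_perf_counter.side_effect = [0, 10, 500, 3601]
    start = time.perf_counter()   # -> 0
    print(elapsed_since(start))   # 10
    print(elapsed_since(start))   # 500
    print(elapsed_since(start))   # 3601: past a 3600-second budget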
