[AutoPGLE] Cleanup compiler code.

Google-ML-Automation · Google-ML-Automation · commit 8813973d9686 · 2024-12-10T09:37:35.000-08:00
PiperOrigin-RevId: 704741308
diff --git a/jax/_src/compiler.py b/jax/_src/compiler.py
@@ -374,60 +374,24 @@ def compile_or_get_cached(
                            host_callbacks)
 
   is_multi_process = (
-      len({device.process_index for device in devices.flatten()}) > 1)
-  min_device_process_id = (
-      min(devices.flatten(), key=lambda device: device.id).process_index)
-
-  # When PGLE is enabled there might be 3 types of situations:
-  # 1. PGLE profiled module (the one which was recompiled with FDO profile) is
-  # in the persistent cache. In this case the module should be returned from
-  # cache and PGLE should be disabled for this module. Is module is stored in
-  # the persistent cache under the "pgle_profiled_module_key" which calculated
-  # with replacing FDO profile with flag which identify that module were PGLE
-  # profiled.
-  # 2. PGLE profiled module is not in the persistent cache and the module is
-  # getting built with an FDO profile. In this case we need to share FDO profile
-  # with other processes and store the result under the
-  # "pgle_profiled_module_key" so later in case 1 we will be able to find the
-  # module.
-  # 3. PGLE profiled module is not in the persistent cache and the module is
-  # getting compiled to be PGLEd (FDO profile is empty). In this case we need to
-  # simply return the non-PGLE profiled module from the persistent cache.
-  if (config.enable_pgle.value
-      and config.pgle_profiling_runs.value > 0):
-    fdo_profile = compile_options.executable_build_options.fdo_profile
-    compile_options.executable_build_options.fdo_profile = b"pgle profiled"
-
-    pgle_profiled_module_key = compilation_cache.get_cache_key(
+      len({device.process_index for device in devices.flatten()}) > 1
+  )
+  min_device_process_id = min(
+      devices.flatten(), key=lambda device: device.id
+  ).process_index
+
+  if config.enable_pgle.value and config.pgle_profiling_runs.value > 0:
+    cache_key = _resolve_pgle_module_cache_key(
         computation,
         devices,
         compile_options,
         backend,
-        cache_key_type.IgnoreCallbacks.ALL,
+        pgle_profiler,
+        is_multi_process,
+        cache_key,
+        module_name,
+        min_device_process_id,
     )
-    compile_options.executable_build_options.fdo_profile = fdo_profile
-
-    if _is_executable_in_cache(backend, pgle_profiled_module_key):
-      # Load PGLE profiled module from the persistent cache.
-      cache_key = pgle_profiled_module_key
-      if pgle_profiler is not None:
-        pgle_profiler.disable()
-    elif fdo_profile is not None and len(fdo_profile) > 0:
-      # Store module under PGLE profiled module cache key.
-      cache_key = pgle_profiled_module_key
-      if is_multi_process and distributed.global_state.client is not None:
-        compile_options.executable_build_options.fdo_profile = _share_fdo_profiles(
-          computation, devices, compile_options, backend,
-          distributed.global_state.client,
-          min_device_process_id
-        )
-      else:
-        compile_options.executable_build_options.fdo_profile = fdo_profile
-        logger.debug(
-            "Compiling module %s with FDO profile: %s",
-            module_name,
-            compile_options.executable_build_options.fdo_profile,
-        )
 
   cache_retrieval_start = time.monotonic()
   retrieved_executable, retrieved_compile_time = _cache_read(
@@ -493,6 +457,75 @@ def compile_or_get_cached(
         cache_key,
     )
 
+
+# When PGLE is enabled there might be 3 types of situations:
+# 1. PGLE profiled module (the one which was recompiled with FDO profile) is
+# in the persistent cache. In this case the module should be returned from
+# cache and PGLE should be disabled for this module. Is module is stored in
+# the persistent cache under the "pgle_profiled_module_key" which calculated
+# with replacing FDO profile with flag which identify that module were PGLE
+# profiled.
+# 2. PGLE profiled module is not in the persistent cache and the module is
+# getting built with an FDO profile. In this case we need to share FDO profile
+# with other processes and store the result under the
+# "pgle_profiled_module_key" so later in case 1 we will be able to find the
+# module.
+# 3. PGLE profiled module is not in the persistent cache and the module is
+# getting compiled to be PGLEd (FDO profile is empty). In this case we need to
+# simply return the non-PGLE profiled module from the persistent cache.
+def _resolve_pgle_module_cache_key(
+    computation: ir.Module,
+    devices: np.ndarray,
+    compile_options: xc.CompileOptions,
+    backend: xc.Client,
+    pgle_profiler: profiler.PGLEProfiler | None,
+    is_multi_process: bool,
+    cache_key: str,
+    module_name: str,
+    min_device_process_id: int,
+) -> str:
+  fdo_profile = compile_options.executable_build_options.fdo_profile
+  compile_options.executable_build_options.fdo_profile = b"pgle profiled"
+
+  pgle_profiled_module_key = compilation_cache.get_cache_key(
+      computation,
+      devices,
+      compile_options,
+      backend,
+      cache_key_type.IgnoreCallbacks.ALL,
+  )
+  compile_options.executable_build_options.fdo_profile = fdo_profile
+
+  result_key = cache_key
+  if _is_executable_in_cache(backend, pgle_profiled_module_key):
+    # Load PGLE profiled module from the persistent cache.
+    result_key = pgle_profiled_module_key
+    if pgle_profiler is not None:
+      pgle_profiler.disable()
+  elif fdo_profile is not None and len(fdo_profile) > 0:
+    # Store module under PGLE profiled module cache key.
+    result_key = pgle_profiled_module_key
+    if is_multi_process and distributed.global_state.client is not None:
+      compile_options.executable_build_options.fdo_profile = (
+          _share_fdo_profiles(
+              computation,
+              devices,
+              compile_options,
+              backend,
+              distributed.global_state.client,
+              min_device_process_id,
+          )
+      )
+    else:
+      compile_options.executable_build_options.fdo_profile = fdo_profile
+      logger.debug(
+          "Compiling module %s with FDO profile of length %d",
+          module_name,
+          len(compile_options.executable_build_options.fdo_profile),
+      )
+  return result_key
+
+
 # The process that has the lowest device ID should share FDO profile before
 # compilation with other processes.
 def _share_fdo_profiles(
@@ -510,32 +543,39 @@ def _share_fdo_profiles(
     return fdo_profile
 
   compile_options.executable_build_options.fdo_profile = b""
-  profile_key = (
-      compilation_cache.get_cache_key(
-          computation,
-          devices,
-          compile_options,
-          backend,
-          cache_key_type.IgnoreCallbacks.ALL,
-      )
-      + "_fdo_sync"
-  )
+  try:
+    profile_key = (
+        compilation_cache.get_cache_key(
+            computation,
+            devices,
+            compile_options,
+            backend,
+            cache_key_type.IgnoreCallbacks.ALL,
+        )
+        + "_fdo_sync"
+    )
+  except xc._xla.XlaRuntimeError as ex:
+    logger.error(
+        "compile_or_get_cached: unable to generate cache key, "
+        "skipping the fdo profile sharing: %s",
+        ex,
+    )
+    return fdo_profile
+
   if profile_key in _share_fdo_profiles.modules_profiles:
     return _share_fdo_profiles.modules_profiles[profile_key]
 
   share_timeout = config.share_binary_between_hosts_timeout_ms.value
   if distributed.global_state.process_id == min_process_id:
     logger.debug(
-        "Sharing FDO profile: %s. For module %s. Process %d.",
-        fdo_profile,
+        "Module %s. Sharing FDO profile. Process %d.",
         module_name,
         min_process_id,
     )
     global_client.key_value_set_bytes(profile_key, fdo_profile)
   else:
     logger.debug(
-        "Waiting for FDO profile: %s. For module %s. Should be set by process %d.",
-        fdo_profile,
+        "Module %s. Waiting for FDO profile which should be set by process %d.",
         module_name,
         min_process_id,
     )