23 changes: 23 additions & 0 deletions sphinxdocs/docs/index.md
@@ -11,6 +11,29 @@ documentation. It comes with:
While it is primarily oriented towards docgen for Starlark code, the core of it
is agnostic as to what is being documented.

### Optimization

Normally, Sphinx keeps various cache files to improve incremental building.
Unfortunately, programs that perform their own caching don't interact well
with Bazel's model of precisely declaring and strictly enforcing what the
inputs are, what the outputs are, and what files are available when a program
runs. The net effect is that a program doesn't have a prior invocation's cache
files available.

There are two mechanisms for making some cache available to Sphinx under
Bazel (a usage sketch follows the list):

* Disable sandboxing, which allows some files from prior invocations to be
visible to subsequent invocations. This can be done multiple ways:
* Set `tags = ["no-sandbox"]` on the `sphinx_docs` target
* `--modify_execution_info=SphinxBuildDocs=+no-sandbox` (Bazel flag)
* `--strategy=SphinxBuildDocs=local` (Bazel flag)
* Use persistent workers (enabled by default) by setting
  `allow_persistent_workers=True` on the `sphinx_docs` target. Note that other
  Bazel flags can disable using workers even if an action supports them. Setting
  `--strategy=SphinxBuildDocs=dynamic,worker,local,sandbox` should tell Bazel
  to use workers if possible and otherwise fall back to non-worker invocations.
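
For example, a minimal sketch of a `BUILD.bazel` target that uses both
mechanisms (the target and file names are hypothetical, and the load path
assumes the public `sphinx.bzl` entry point):

```starlark
load("@rules_python//sphinxdocs:sphinx.bzl", "sphinx_docs")

sphinx_docs(
    name = "docs",
    srcs = glob(["*.md"]),
    config = "conf.py",
    formats = ["html"],
    sphinx = ":sphinx-build",  # a sphinx_build_binary target
    # On by default; shown explicitly for clarity.
    allow_persistent_workers = True,
    # Optional: let non-worker invocations see files from prior runs.
    tags = ["no-sandbox"],
)
```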


```{toctree}
:hidden:
55 changes: 44 additions & 11 deletions sphinxdocs/private/sphinx.bzl
@@ -103,6 +103,7 @@ def sphinx_docs(
strip_prefix = "",
extra_opts = [],
tools = [],
allow_persistent_workers = True,
**kwargs):
"""Generate docs using Sphinx.

@@ -142,6 +143,9 @@
tools: {type}`list[label]` Additional tools that are used by Sphinx and its plugins.
This just makes the tools available during Sphinx execution. To locate
them, use {obj}`extra_opts` and `$(location)`.
allow_persistent_workers: {type}`bool` (experimental) If true, allow
using persistent workers for running Sphinx, if Bazel decides to do so.
This can improve incremental building of docs.
**kwargs: {type}`dict` Common attributes to pass onto rules.
"""
add_tag(kwargs, "@rules_python//sphinxdocs:sphinx_docs")
@@ -165,6 +169,7 @@
source_tree = internal_name + "/_sources",
extra_opts = extra_opts,
tools = tools,
allow_persistent_workers = allow_persistent_workers,
**kwargs
)

@@ -209,6 +214,7 @@ def _sphinx_docs_impl(ctx):
source_path = source_dir_path,
output_prefix = paths.join(ctx.label.name, "_build"),
inputs = inputs,
allow_persistent_workers = ctx.attr.allow_persistent_workers,
)
outputs[format] = output_dir
per_format_args[format] = args_env
@@ -229,6 +235,10 @@
_sphinx_docs = rule(
implementation = _sphinx_docs_impl,
attrs = {
"allow_persistent_workers": attr.bool(
doc = "(experimental) Whether to invoke Sphinx as a persistent worker.",
default = False,
),
"extra_opts": attr.string_list(
doc = "Additional options to pass onto Sphinx. These are added after " +
"other options, but before the source/output args.",
@@ -254,28 +264,45 @@ _sphinx_docs = rule(
},
)

def _run_sphinx(ctx, format, source_path, inputs, output_prefix, allow_persistent_workers):
output_dir = ctx.actions.declare_directory(paths.join(output_prefix, format))

run_args = [] # Copy of the args to forward along to debug runner
args = ctx.actions.args() # Args passed to the action

# An args file is required for persistent workers, but we don't know if
# the action will use worker mode or not (settings we can't see may
# force non-worker mode). For consistency, always use a params file.
args.use_param_file("@%s", use_always = True)
args.set_param_file_format("multiline")
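    # With the "multiline" format, the params file contains one argument per
    # line; the tool itself receives a single "@<params-file>" argv entry,
    # which sphinx_build.py expands for non-worker runs.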

# NOTE: sphinx_build.py relies on the first two args being the srcdir and
# outputdir, in that order.
args.add(source_path)
args.add(output_dir.path)

args.add("--show-traceback") # Full tracebacks on error
run_args.append("--show-traceback")
args.add("--builder", format)
run_args.extend(("--builder", format))
args.add(format, format = "--builder=%s")
run_args.append("--builder={}".format(format))

if ctx.attr._quiet_flag[BuildSettingInfo].value:
# Not added to run_args because run_args is for debugging
args.add("--quiet") # Suppress stdout informational text

    # Build in parallel, if possible
    # Don't add to run_args: parallel building breaks interactive debugging
    args.add("--jobs=auto")

# Put the doctree dir outside of the output directory.
# This allows it to be reused between invocations when possible; Bazel
# clears the output directory every action invocation.
# * For workers, they can fully re-use it.
# * For non-workers, it can be reused when sandboxing is disabled via
# the `no-sandbox` tag or execution requirement.
#
# We also use a non-dot prefixed name so it shows up more visibly.
    args.add(output_dir.path + "_doctrees", format = "--doctree-dir=%s")

for opt in ctx.attr.extra_opts:
expanded = ctx.expand_location(opt)
@@ -287,9 +314,6 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
for define in extra_defines:
run_args.extend(("--define", define))

env = dict([
v.split("=", 1)
for v in ctx.attr._extra_env_flag[_FlagInfo].value
@@ -299,6 +323,14 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
for tool in ctx.attr.tools:
tools.append(tool[DefaultInfo].files_to_run)

# NOTE: Command line flags or RBE capabilities may override the execution
# requirements and disable workers. Thus, we can't assume that these
# exec requirements will actually be respected.
execution_requirements = {}
if allow_persistent_workers:
execution_requirements["supports-workers"] = "1"
execution_requirements["requires-worker-protocol"] = "json"

ctx.actions.run(
executable = ctx.executable.sphinx,
arguments = [args],
@@ -308,6 +340,7 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
mnemonic = "SphinxBuildDocs",
progress_message = "Sphinx building {} for %{{label}}".format(format),
env = env,
execution_requirements = execution_requirements,
)
return output_dir, struct(args = run_args, env = env)

231 changes: 229 additions & 2 deletions sphinxdocs/private/sphinx_build.py
@@ -1,8 +1,235 @@
import contextlib
import io
import json
import logging
import os
import pathlib
import shutil
import sys
import traceback
import typing

import sphinx.application
from sphinx.cmd.build import main

# Type aliases used only for annotations; actual requests and responses
# are plain dicts decoded from JSON.
WorkRequest = object
WorkResponse = object

logger = logging.getLogger("sphinxdocs_build")

_WORKER_SPHINX_EXT_MODULE_NAME = "bazel_worker_sphinx_ext"

# Config value name for getting the path to the request info file
_REQUEST_INFO_CONFIG_NAME = "bazel_worker_request_info_path"


class Worker:
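    """Runs Sphinx builds as a Bazel persistent worker.

    Speaks the JSON flavor of the worker protocol: one JSON work request
    per line on stdin, one JSON work response per line on stdout.
    """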

def __init__(
self, instream: "typing.TextIO", outstream: "typing.TextIO", exec_root: str
):
# NOTE: Sphinx performs its own logging re-configuration, so any
# logging config we do isn't respected by Sphinx. Controlling where
# stdout and stderr goes are the main mechanisms. Recall that
        # Bazel sends worker stderr to the worker log file:
# outputBase=$(bazel info output_base)
# find $outputBase/bazel-workers/ -type f -printf '%T@ %p\n' | sort -n | tail -1 | awk '{print $2}'
logging.basicConfig(level=logging.WARN)
logger.info("Initializing worker")

# The directory that paths are relative to.
self._exec_root = exec_root
# Where requests are read from.
self._instream = instream
# Where responses are written to.
self._outstream = outstream

# dict[str srcdir, dict[str path, str digest]]
self._digests = {}

# Internal output directories the worker gives to Sphinx that need
# to be cleaned up upon exit.
# set[str path]
self._worker_outdirs = set()
self._extension = BazelWorkerExtension()

sys.modules[_WORKER_SPHINX_EXT_MODULE_NAME] = self._extension
sphinx.application.builtin_extensions += (_WORKER_SPHINX_EXT_MODULE_NAME,)

def __enter__(self):
return self

    def __exit__(self, exc_type, exc_value, exc_tb):
for worker_outdir in self._worker_outdirs:
shutil.rmtree(worker_outdir, ignore_errors=True)

def run(self) -> None:
logger.info("Worker started")
try:
while True:
request = None
try:
request = self._get_next_request()
if request is None:
logger.info("Empty request: exiting")
break
response = self._process_request(request)
if response:
self._send_response(response)
except Exception:
logger.exception("Unhandled error: request=%s", request)
                    request_id = 0 if not request else request.get("requestId", 0)
                    output = (
                        f"Unhandled error:\nRequest id: {request_id}\n"
                        + traceback.format_exc()
                    )
                    self._send_response(
                        {
                            "exitCode": 3,
                            "output": output,
                            "requestId": request_id,
                        }
                    )
finally:
logger.info("Worker shutting down")

def _get_next_request(self) -> "object | None":
line = self._instream.readline()
if not line:
return None
return json.loads(line)

def _send_response(self, response: "WorkResponse") -> None:
self._outstream.write(json.dumps(response) + "\n")
self._outstream.flush()
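
    # For reference, one exchange over the streams looks roughly like this
    # (illustrative values; see Bazel's JSON worker protocol for the shape):
    #   request:  {"requestId": 1, "arguments": ["src", "out", "--builder=html"],
    #              "inputs": [{"path": "src/index.md", "digest": "a1b2..."}]}
    #   response: {"requestId": 1, "exitCode": 0, "output": "..."}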

def _prepare_sphinx(self, request):
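        """Turn a work request into arguments for Sphinx.

        Tracks per-srcdir input digests to compute which files changed,
        and redirects Sphinx output to a worker-private directory.

        Returns a (worker_outdir, bazel_outdir, sphinx_args) tuple.
        """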
sphinx_args = request["arguments"]
srcdir = sphinx_args[0]

incoming_digests = {}
current_digests = self._digests.setdefault(srcdir, {})
changed_paths = []
request_info = {"exec_root": self._exec_root, "inputs": request["inputs"]}
for entry in request["inputs"]:
path = entry["path"]
digest = entry["digest"]
# Make the path srcdir-relative so Sphinx understands it.
path = path.removeprefix(srcdir + "/")
incoming_digests[path] = digest

if path not in current_digests:
logger.info("path %s new", path)
changed_paths.append(path)
elif current_digests[path] != digest:
logger.info("path %s changed", path)
changed_paths.append(path)

self._digests[srcdir] = incoming_digests
self._extension.changed_paths = changed_paths
request_info["changed_sources"] = changed_paths

bazel_outdir = sphinx_args[1]
worker_outdir = bazel_outdir + ".worker-out.d"
self._worker_outdirs.add(worker_outdir)
sphinx_args[1] = worker_outdir

request_info_path = os.path.join(srcdir, "_bazel_worker_request_info.json")
with open(request_info_path, "w") as fp:
json.dump(request_info, fp)
sphinx_args.append(f"--define={_REQUEST_INFO_CONFIG_NAME}={request_info_path}")

return worker_outdir, bazel_outdir, sphinx_args

@contextlib.contextmanager
def _redirect_streams(self):
out = io.StringIO()
orig_stdout = sys.stdout
try:
sys.stdout = out
yield out
finally:
sys.stdout = orig_stdout

def _process_request(self, request: "WorkRequest") -> "WorkResponse | None":
logger.info("Request: %s", json.dumps(request, sort_keys=True, indent=2))
if request.get("cancel"):
return None

worker_outdir, bazel_outdir, sphinx_args = self._prepare_sphinx(request)

# Prevent anything from going to stdout because it breaks the worker
# protocol. We have limited control over where Sphinx sends output.
with self._redirect_streams() as stdout:
logger.info("main args: %s", sphinx_args)
exit_code = main(sphinx_args)

if exit_code:
raise Exception(
"Sphinx main() returned failure: "
+ f" exit code: {exit_code}\n"
+ "========== STDOUT START ==========\n"
+ stdout.getvalue().rstrip("\n")
+ "\n"
+ "========== STDOUT END ==========\n"
)

        # Copying is unfortunately necessary because Bazel doesn't know to
        # implicitly bring along what the symlinks point to.
shutil.copytree(worker_outdir, bazel_outdir, dirs_exist_ok=True)

response = {
"requestId": request.get("requestId", 0),
"output": stdout.getvalue(),
"exitCode": 0,
}
return response


class BazelWorkerExtension:
"""A Sphinx extension implemented as a class acting like a module."""

def __init__(self):
# Make it look like a Module object
self.__name__ = _WORKER_SPHINX_EXT_MODULE_NAME
# set[str] of src-dir relative path names
self.changed_paths = set()

def setup(self, app):
app.add_config_value(_REQUEST_INFO_CONFIG_NAME, "", "")
app.connect("env-get-outdated", self._handle_env_get_outdated)
return {"parallel_read_safe": True, "parallel_write_safe": True}

def _handle_env_get_outdated(self, app, env, added, changed, removed):
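        """Tell Sphinx which extra docs to re-read.

        The env-get-outdated event lets an extension return additional
        docnames to re-read beyond those Sphinx detected itself; here the
        Bazel-reported digest changes are the source of truth.
        """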
        changed = {
            doc
            for p in self.changed_paths
            # NOTE: path2doc returns None if it's not a doc path
            if (doc := env.path2doc(p)) is not None
        }

logger.info("changed docs: %s", changed)
return changed


def _worker_main(stdin, stdout, exec_root):
with Worker(stdin, stdout, exec_root) as worker:
return worker.run()


def _non_worker_main():
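    """Run sphinx-build once, expanding any @params-file arguments.

    The Starlark rule always passes arguments via a "@"-prefixed params
    file with one argument per line (see sphinx.bzl), so expand it into
    argv before handing control to Sphinx.
    """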
args = []
for arg in sys.argv:
if arg.startswith("@"):
with open(arg.removeprefix("@")) as fp:
lines = [line.strip() for line in fp if line.strip()]
args.extend(lines)
else:
args.append(arg)
sys.argv[:] = args
return main()


if __name__ == "__main__":
    if "--persistent_worker" in sys.argv:
        sys.exit(_worker_main(sys.stdin, sys.stdout, os.getcwd()))
    else:
        sys.exit(_non_worker_main())