Add script to extract notebook image outputs (#3095)

frankharkins · beckykd · Eric-Arellano · web-flow · commit 81fff3ea5f28 · 2025-05-09T14:42:41.000Z
This PR adds a script to extract image outputs from notebooks and
convert them to AVIF if necessary. We're already doing this in
inner-source as part of the sync but it was taking too long with the new
tutorial notebooks. This PR only introduces the script; a follow-up PR
will extract the images and add the script to `./fix`.

---------

Co-authored-by: Rebecca Dimock <beckyd@us.ibm.com>
Co-authored-by: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com>
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -25,6 +25,11 @@ jobs:
           node-version: 18
       - name: Install Node.js dependencies
         run: npm ci
+      # The notebook normalizer shells out to a `magick` executable, so after
+      # installing the package we alias the installed `convert` binary to that name.
+      # NOTE(review): `ln -s` fails if /usr/bin/magick already exists on the
+      # runner image — confirm that is never the case.
+      - name: Install ImageMagick
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y imagemagick
+          sudo ln -s /usr/bin/convert /usr/bin/magick
 
       - name: File metadata
         run: npm run check:metadata
diff --git a/scripts/notebook-normalizer/pyproject.toml b/scripts/notebook-normalizer/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "qiskit-docs-notebook-normalizer"
+version = "0.0.1"
+authors = [
+  { name="Qiskit docs team" },
+]
+description = "A tool to extract image outputs from notebooks and convert to AVIF if needed (requires ImageMagick)."
+# The package imports `typing.TypeGuard` and builds `A | B` unions of classes at
+# import time; both require Python 3.10 or newer, so older interpreters would
+# fail on import.
+requires-python = ">=3.10"
+license = "Apache-2.0"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+  "nbformat~=5.10.4",
+]
+
+[project.scripts]
+qiskit-docs-notebook-normalizer = "qiskit_docs_notebook_normalizer:main"
diff --git a/scripts/notebook-normalizer/qiskit_docs_notebook_normalizer/__init__.py b/scripts/notebook-normalizer/qiskit_docs_notebook_normalizer/__init__.py
@@ -0,0 +1,154 @@
+# This code is a Qiskit project.
+#
+# (C) Copyright IBM 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+import argparse
+import shutil
+from dataclasses import dataclass
+from itertools import chain
+from pathlib import Path
+from typing import TypeGuard, ClassVar
+
+import nbformat
+
+from .cell_output_data import remove_circuit_drawing_html, extract_image_output, Image
+
+# Every `*.ipynb` file found recursively under `docs/` and `learning/`, resolved
+# relative to the current working directory.
+# NOTE: this is a `chain` over `rglob` generators, i.e. a lazy, single-use
+# iterator: once it has been consumed, iterating it again yields nothing.
+NOTEBOOK_PATHS = chain(
+    Path("docs").rglob("*.ipynb"),
+    Path("learning").rglob("*.ipynb"),
+)
+
+
+# Result types for normalization process
+@dataclass
+class NormalizationNeeded:
+    """Result for a notebook that normalization changed (or would change)."""
+
+    # Class-level flag (not a dataclass field) that `changes_made` reads.
+    changes: ClassVar[bool] = True
+    # The notebook object that was passed in and mutated during normalization.
+    nb: nbformat.NotebookNode
+    # Images pulled out of the notebook's outputs; empty in check-only mode.
+    images: list[Image]
+
+
+@dataclass
+class AlreadyNormalized:
+    """Result for a notebook in which normalization found nothing to change."""
+
+    # Class-level flag (not a dataclass field) that `changes_made` reads.
+    changes: ClassVar[bool] = False
+
+
+# The two possible outcomes returned by `normalize_notebook`.
+NormalizationResult = NormalizationNeeded | AlreadyNormalized
+
+
+def changes_made(result: NormalizationResult) -> TypeGuard[NormalizationNeeded]:
+    """
+    Return the result's `changes` flag.
+
+    The `TypeGuard` return annotation lets type checkers treat `result` as a
+    `NormalizationNeeded` (with `nb` and `images`) when this returns True.
+    """
+    return result.changes
+
+
+def main():
+    """
+    Search for notebooks and extract image outputs if necessary.
+
+    Without `--check`, every notebook that needs normalizing is rewritten in
+    place and its extracted images are written to its image folder, which is
+    emptied first.
+
+    With `--check`, nothing is written to disk; the notebooks that need
+    normalizing are listed and the process exits with status 1.
+    """
+    parser = argparse.ArgumentParser(prog="Qiskit/documentation notebook normalization")
+    parser.add_argument("--check", action="store_true")
+    args = parser.parse_args()
+
+    # Paths of all notebooks that needed (or, with --check, would need) changes.
+    problem_notebooks = []
+    for nb_path in NOTEBOOK_PATHS:
+        # Skip anything inside a dot-prefixed folder or with a dot-prefixed name.
+        if is_hidden(nb_path):
+            continue
+
+        nb = nbformat.read(nb_path, 4)
+        images_folder = determine_image_folder(nb_path)
+
+        result = normalize_notebook(nb, images_folder, args.check)
+        if not changes_made(result):
+            continue
+
+        problem_notebooks.append(nb_path)
+        if args.check:
+            # Check mode only reports; never touch the filesystem.
+            continue
+
+        # Wipe any previously extracted images before writing the new set, so
+        # the folder only ever contains images from this run.
+        ensure_exists_and_empty(images_folder)
+        for image in result.images:
+            image.write()
+        nbformat.write(result.nb, nb_path)
+        print(f"✍️ Written '{nb_path}' and {len(result.images)} image(s)")
+
+    if args.check and problem_notebooks:
+        print(
+            "\nThe following notebooks need normalizing:\n ",
+            "\n  ".join(map(str, problem_notebooks)),
+            "\nRun ./fix to fix them automatically.",
+        )
+        # Non-zero exit status so callers (e.g. CI) can detect the failure.
+        raise SystemExit(1)
+
+
+def normalize_notebook(
+    nb: nbformat.NotebookNode, image_folder: Path, check_only: bool = False
+) -> NormalizationResult:
+    """
+    Extracts images (converting if necessary) and returns an updated notebook.
+
+    Each output of each code cell is processed in two steps: circuit-drawing
+    HTML is dropped, then any image output is pulled out and replaced by a
+    reference to a file inside `image_folder`. The file is named after
+    `cell.id` and the output's position in the cell, so code cells with
+    outputs are expected to carry an `id`.
+
+    The notebook is mutated in place; no files are written here.
+
+    When `check_only` is true, image conversion is skipped and the function
+    returns as soon as the first required change is found, with an empty image
+    list.
+
+    Returns `NormalizationNeeded` if anything changed, else `AlreadyNormalized`.
+    """
+    images = []
+    change_made = False
+    for cell in nb.cells:
+        if cell.cell_type != "code":
+            continue
+        if "outputs" not in cell:
+            continue
+        for index, output in enumerate(cell["outputs"]):
+            if "data" not in output:
+                continue
+            data = output["data"]
+
+            # 1. Remove the HTML rendering of text circuit drawings
+            if remove_circuit_drawing_html(data):
+                change_made = True
+
+            # 2. Extract image outputs
+            filestem = Path(image_folder, f"{cell.id}-{index}")
+            if image := extract_image_output(
+                data, filestem, skip_conversion=check_only
+            ):
+                change_made = True
+                images.append(image)
+
+            if change_made and check_only:
+                # We now know the notebook needs linting so we don't need to
+                # keep looking at other cells
+                return NormalizationNeeded(nb=nb, images=[])
+
+    if change_made:
+        return NormalizationNeeded(nb=nb, images=images)
+    return AlreadyNormalized()
+
+
+def determine_image_folder(nb_path: Path) -> Path:
+    """
+    Work out which folder a notebook's extracted images belong in.
+
+    This only builds the path; it does not touch the filesystem.
+
+    For example, the following notebook path:
+        docs/guides/my-notebook.ipynb
+    Should have its images extracted to:
+        public/docs/images/guides/my-notebook/extracted-outputs/
+    """
+    top_level = nb_path.parts[0]  # i.e. "docs" or "learning"
+    # Everything below the top-level folder, with the ".ipynb" suffix dropped,
+    # e.g. ("guides", "visualize-results")
+    nested = nb_path.with_suffix("").parts[1:]
+    return Path("public").joinpath(
+        top_level, "images", *nested, "extracted-outputs"
+    )
+
+
+def ensure_exists_and_empty(folder: Path) -> None:
+    """
+    Leave `folder` existing and empty.
+
+    If `folder` already exists it is deleted along with everything inside it,
+    then it is created again together with any missing parent folders.
+    """
+    if folder.exists():
+        shutil.rmtree(folder)
+    folder.mkdir(parents=True)
+
+
+def is_hidden(path: Path) -> bool:
+    """Return True if any component of `path` starts with a dot."""
+    for component in path.parts:
+        if component.startswith("."):
+            return True
+    return False
diff --git a/scripts/notebook-normalizer/qiskit_docs_notebook_normalizer/cell_output_data.py b/scripts/notebook-normalizer/qiskit_docs_notebook_normalizer/cell_output_data.py
@@ -0,0 +1,92 @@
+import base64
+from dataclasses import dataclass
+from pathlib import Path
+from subprocess import Popen, PIPE
+
+# Qiskit's QuantumCircuit.draw() results in Jupyter outputting both a `text/html` and
+# `text/plain` entry. The HTML entry has pre-applied formatting that makes sense in
+# a Jupyter notebook, but renders horribly in our app:
+# https://github.com/Qiskit/qiskit/blob/df379876ba10d6f490a96723b6dbbf723ec45d7a/qiskit/visualization/circuit/text.py#L761-L769
+#
+# So, we instead should render the `text/plain` entry rather than `text/html`.
+CIRCUIT_DRAW_HTML_PREFIX = '<pre style="word-wrap: normal;white-space: pre;background: #fff0;line-height: 1.1;font-family: &quot;Courier New&quot;,Courier,monospace">'
+
+
+@dataclass
+class SvgImage:
+    """An SVG image (as text) and the file it should be written to."""
+
+    # The SVG document source.
+    data: str
+    # Destination file for `write`.
+    filepath: Path
+
+    def write(self):
+        """Write the SVG source to `filepath` as UTF-8, replacing any existing file."""
+        # Pin the encoding: the platform default is not UTF-8 everywhere, and
+        # SVG output may contain non-ASCII text.
+        self.filepath.write_text(self.data, encoding="utf-8")
+
+
+@dataclass
+class RasterImage:
+    """A binary image (e.g. PNG or AVIF) and the file it should be written to."""
+
+    # Raw image bytes (already base64-decoded).
+    data: bytes
+    # Destination file for `write`.
+    filepath: Path
+
+    def write(self):
+        """Write the image bytes to `filepath`, replacing any existing file."""
+        self.filepath.write_bytes(self.data)
+
+
+# Any extracted image: SVGs hold text, raster images hold raw bytes.
+Image = SvgImage | RasterImage
+
+
+def remove_circuit_drawing_html(output_data: dict) -> bool:
+    """
+    Drop the `text/html` entry if it is a text circuit drawing.
+
+    Mutates `output_data` in place and returns True if the entry was removed,
+    False if nothing was changed.
+    """
+    html = output_data.get("text/html")
+    if not html or not html.startswith(CIRCUIT_DRAW_HTML_PREFIX):
+        return False
+    del output_data["text/html"]
+    return True
+
+
+def extract_image_output(
+    output_data: dict, filestem: Path, skip_conversion: bool
+) -> Image | None:
+    """
+    Pull the image out of a cell output, if it has one.
+
+    When an image is found, `output_data` is mutated: its `text/plain` entry
+    becomes an MDX `<Image>` tag pointing at the destination file and every
+    image representation is removed. Returns the extracted image, or None
+    (leaving `output_data` untouched) when there is no image to extract.
+    """
+    extracted = _get_image(output_data, filestem, skip_conversion)
+    if extracted is not None:
+        output_data["text/plain"] = _image_mdx_component(extracted)
+        # An output may carry several representations of the same result
+        # (text, html, and images in more than one format). One image has now
+        # been extracted, so the remaining image representations are dropped.
+        for mime_type in ("image/png", "image/jpeg", "image/svg+xml"):
+            output_data.pop(mime_type, None)
+    return extracted
+
+
+def _get_image(
+    output_data: dict, filestem: Path, skip_conversion: bool
+) -> Image | None:
+    """
+    Build the image object for an output without modifying `output_data`.
+
+    An SVG representation is preferred; otherwise a PNG is base64-decoded and,
+    unless `skip_conversion` is true, converted to AVIF. Returns None when the
+    output has neither representation.
+    """
+    svg_data = output_data.get("image/svg+xml")
+    if svg_data:
+        return SvgImage(filepath=filestem.with_suffix(".svg"), data=svg_data)
+
+    png_data = output_data.get("image/png")
+    if not png_data:
+        return None
+
+    png_image = RasterImage(
+        filepath=filestem.with_suffix(".png"),
+        data=base64.b64decode(png_data),
+    )
+    return png_image if skip_conversion else _convert_to_avif(png_image)
+
+
+def _convert_to_avif(image: RasterImage) -> RasterImage:
+    """
+    Pipe image through ImageMagick subprocess to convert to AVIF.
+
+    The image bytes are sent to `magick` on stdin and the converted bytes are
+    read back from stdout; nothing is written to disk here.
+
+    Returns a new `RasterImage` holding the converted bytes, with the same
+    path as `image` but an `.avif` suffix.
+
+    Raises `RuntimeError` if ImageMagick exits with a non-zero status or
+    produces no output, so that a failed conversion can never be written out
+    as an empty image file.
+    """
+    new_path = image.filepath.with_suffix(".avif")
+    imagemagick = Popen(["magick", "-", "avif:-"], stdout=PIPE, stderr=PIPE, stdin=PIPE)
+    (new_data, stderr) = imagemagick.communicate(input=image.data)
+    if imagemagick.returncode != 0 or not new_data:
+        details = stderr.decode("utf-8", errors="replace").strip()
+        raise RuntimeError(
+            f"ImageMagick failed to convert '{image.filepath}' to AVIF "
+            f"(exit code {imagemagick.returncode}): {details}"
+        )
+    return RasterImage(filepath=new_path, data=new_data)
+
+
+def _image_mdx_component(image: Image) -> str:
+    """
+    Return the MDX `<Image>` tag that references `image`.
+
+    The `src` is the image's path relative to the `public` folder, prefixed
+    with `/`. Raises `ValueError` if the image's path is not inside `public`.
+    """
+    # `as_posix()` keeps forward slashes in the URL on every platform; plain
+    # string conversion of a path would use backslashes on Windows.
+    src = image.filepath.relative_to("public").as_posix()
+    return f'<Image src="/{src}" alt="Output of the previous code cell" />'
diff --git a/scripts/notebook-normalizer/test/test_normalization.py b/scripts/notebook-normalizer/test/test_normalization.py
diff --git a/tox.ini b/tox.ini