Skip to content

Commit 81fff3e

Browse files
frankharkinsbeckykdEric-Arellano
authored
Add script to extract notebook image outputs (#3095)
This PR adds a script to extract image outputs from notebooks and convert them to AVIF if necessary. We're already doing this in inner-source as part of the sync but it was taking too long with the new tutorial notebooks. This PR just introduces the script, a follow-up PR will extract the images and add this to `./fix`. --------- Co-authored-by: Rebecca Dimock <[email protected]> Co-authored-by: Eric Arellano <[email protected]>
1 parent 39201f7 commit 81fff3e

File tree

6 files changed

+527
-1
lines changed

6 files changed

+527
-1
lines changed

.github/workflows/main.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ jobs:
2525
node-version: 18
2626
- name: Install Node.js dependencies
2727
run: npm ci
28+
- name: Install ImageMagick
29+
run: |
30+
sudo apt-get update
31+
sudo apt-get install -y imagemagick
32+
sudo ln -s /usr/bin/convert /usr/bin/magick
2833
2934
- name: File metadata
3035
run: npm run check:metadata
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[build-system]
2+
requires = ["hatchling"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
name = "qiskit-docs-notebook-normalizer"
version = "0.0.1"
authors = [
  { name="Qiskit docs team" },
]
description = "A tool to extract image outputs from notebooks and convert to AVIF if needed (requires ImageMagick)."
# The package imports `typing.TypeGuard` and builds `A | B` unions of classes
# at import time; both were added in Python 3.10, so older interpreters fail
# on import.
requires-python = ">=3.10"
license = "Apache-2.0"
classifiers = [
  "Programming Language :: Python :: 3",
  "License :: OSI Approved :: Apache Software License",
  "Operating System :: OS Independent",
]
dependencies = [
  "nbformat~=5.10.4",
]
22+
23+
[project.scripts]
24+
qiskit-docs-notebook-normalizer = "qiskit_docs_notebook_normalizer:main"
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# This code is a Qiskit project.
2+
#
3+
# (C) Copyright IBM 2024.
4+
#
5+
# This code is licensed under the Apache License, Version 2.0. You may
6+
# obtain a copy of this license in the LICENSE file in the root directory
7+
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
8+
#
9+
# Any modifications or derivative works of this code must retain this
10+
# copyright notice, and modified files need to carry a notice indicating
11+
# that they have been altered from the originals.
12+
13+
import argparse
14+
import shutil
15+
from dataclasses import dataclass
16+
from itertools import chain
17+
from pathlib import Path
18+
from typing import TypeGuard, ClassVar
19+
20+
import nbformat
21+
22+
from .cell_output_data import remove_circuit_drawing_html, extract_image_output, Image
23+
24+
# Every notebook under the two content roots. This is a single-use iterator:
# once it has been looped over it yields nothing more.
NOTEBOOK_PATHS = chain(
    *(Path(content_root).rglob("*.ipynb") for content_root in ("docs", "learning"))
)
28+
29+
30+
# Result types for normalization process
31+
@dataclass
class NormalizationNeeded:
    """Result for a notebook that had to be changed during normalization."""

    # The notebook object that was normalized (it is mutated in place)
    nb: nbformat.NotebookNode
    # Images to write to disk; empty when produced in check-only mode
    images: list[Image]
    # Class-level flag read by `changes_made` to tell the result types apart
    changes: ClassVar[bool] = True
36+
37+
38+
@dataclass
class AlreadyNormalized:
    """Result for a notebook that needed no changes."""

    # Class-level flag read by `changes_made`; there are no per-instance fields
    changes: ClassVar[bool] = False
41+
42+
43+
NormalizationResult = NormalizationNeeded | AlreadyNormalized
44+
45+
46+
def changes_made(result: NormalizationResult) -> TypeGuard[NormalizationNeeded]:
47+
return result.changes
48+
49+
50+
def main():
    """
    Search for notebooks and extract image outputs if necessary.

    Without flags, every notebook that needs normalizing is rewritten in place
    and its extracted images are written to the notebook's image folder.

    With `--check`, nothing is written: the notebooks that need normalizing
    are listed and the process exits with status 1 if there are any.
    """
    cli = argparse.ArgumentParser(prog="Qiskit/documentation notebook normalization")
    cli.add_argument("--check", action="store_true")
    check_only = cli.parse_args().check

    needs_normalizing = []
    for nb_path in NOTEBOOK_PATHS:
        if is_hidden(nb_path):
            continue

        # Read the notebook as nbformat version 4
        notebook = nbformat.read(nb_path, 4)
        images_folder = determine_image_folder(nb_path)

        result = normalize_notebook(notebook, images_folder, check_only)
        if not changes_made(result):
            continue
        needs_normalizing.append(nb_path)

        if not check_only:
            # NOTE(review): this wipes every file already in the folder, not
            # just the ones in `result.images` -- confirm that a notebook
            # never still references images extracted by an earlier run.
            ensure_exists_and_empty(images_folder)
            for image in result.images:
                image.write()
            nbformat.write(result.nb, nb_path)
            print(f"✍️ Written '{nb_path}' and {len(result.images)} image(s)")

    if not (check_only and needs_normalizing):
        return

    print(
        "\nThe following notebooks need normalizing:\n ",
        "\n ".join(map(str, needs_normalizing)),
        "\nRun ./fix to fix them automatically.",
    )
    raise SystemExit(1)
87+
88+
89+
def normalize_notebook(
    nb: nbformat.NotebookNode, image_folder: Path, check_only: bool = False
) -> NormalizationResult:
    """
    Normalize the outputs of every code cell in `nb`, mutating it in place.

    For each output that has a "data" entry, the circuit-drawing HTML is
    removed and any image is extracted to a file path inside `image_folder`
    named "<cell id>-<output index>".

    When `check_only` is true, image conversion is skipped and the function
    returns as soon as one cell is found to need changes; the returned result
    then carries no images.

    Returns `NormalizationNeeded` if anything changed, otherwise
    `AlreadyNormalized`.
    """
    extracted = []
    dirty = False
    for cell in nb.cells:
        if cell.cell_type != "code" or "outputs" not in cell:
            continue

        for position, output in enumerate(cell["outputs"]):
            if "data" not in output:
                continue
            output_data = output["data"]

            # 1. Remove the HTML version of circuit drawings
            if remove_circuit_drawing_html(output_data):
                dirty = True

            # 2. Extract image outputs
            destination_stem = Path(image_folder, f"{cell.id}-{position}")
            image = extract_image_output(
                output_data, destination_stem, skip_conversion=check_only
            )
            if image:
                dirty = True
                extracted.append(image)

        if dirty and check_only:
            # One notebook change is enough to report it, so there is no
            # need to inspect the remaining cells
            return NormalizationNeeded(nb=nb, images=[])

    if not dirty:
        return AlreadyNormalized()
    return NormalizationNeeded(nb=nb, images=extracted)
127+
128+
129+
def determine_image_folder(nb_path: Path) -> Path:
    """
    Compute the folder that a notebook's extracted images belong in.

    This only builds the path; it does not create or empty the folder.

    For example, the following notebook path:
        docs/guides/my-notebook.ipynb
    Maps to:
        public/docs/images/guides/my-notebook/extracted-outputs/
    """
    content_root = nb_path.parts[0]  # i.e. "docs" or "learning"
    # Rest of the path with the file suffix dropped, e.g. ("guides", "visualize-results")
    nested_parts = nb_path.with_suffix("").parts[1:]
    return Path("public", content_root, "images").joinpath(
        *nested_parts, "extracted-outputs"
    )
145+
146+
147+
def ensure_exists_and_empty(folder: Path) -> None:
    """
    Leave `folder` as an existing, empty directory.

    Anything already at `folder` is deleted recursively first; missing parent
    directories are created.
    """
    already_there = folder.exists()
    if already_there:
        shutil.rmtree(folder)
    folder.mkdir(parents=True)
151+
152+
153+
def is_hidden(path: Path) -> bool:
    """Return True if any component of `path` starts with a dot."""
    for component in path.parts:
        if component.startswith("."):
            return True
    return False
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import base64
2+
from dataclasses import dataclass
3+
from pathlib import Path
4+
from subprocess import Popen, PIPE
5+
6+
# Qiskit's QuantumCircuit.draw() results in Jupyter outputting both a `text/html` and
7+
# `text/plain` entry. The HTML entry has pre-applied formatting that makes sense in
8+
# a Jupyter notebook, but renders horribly in our app:
9+
# https://github.com/Qiskit/qiskit/blob/df379876ba10d6f490a96723b6dbbf723ec45d7a/qiskit/visualization/circuit/text.py#L761-L769
10+
#
11+
# So, we instead should render the `text/plain` entry rather than `text/html`.
12+
CIRCUIT_DRAW_HTML_PREFIX = '<pre style="word-wrap: normal;white-space: pre;background: #fff0;line-height: 1.1;font-family: &quot;Courier New&quot;,Courier,monospace">'
13+
14+
15+
@dataclass
class SvgImage:
    """An SVG image taken from a notebook output, held as text."""

    data: str  # The SVG markup
    filepath: Path  # Where `write` saves the markup

    def write(self) -> None:
        """Write the SVG markup to `filepath`."""
        # Pin the encoding: without it `write_text` uses the platform's
        # default encoding, which can fail on or mangle non-ASCII characters
        # in the markup.
        self.filepath.write_text(self.data, encoding="utf-8")
22+
23+
24+
@dataclass
class RasterImage:
    """A binary (non-SVG) image taken from a notebook output."""

    data: bytes  # The raw bytes of the image file
    filepath: Path  # Where `write` saves the bytes

    def write(self) -> None:
        """Write the image bytes to `filepath`."""
        destination = self.filepath
        destination.write_bytes(self.data)
31+
32+
33+
Image = SvgImage | RasterImage
34+
35+
36+
def remove_circuit_drawing_html(output_data: dict) -> bool:
    """
    Delete the `text/html` entry of `output_data` if it is a circuit drawing.

    An entry counts as a circuit drawing when it starts with
    `CIRCUIT_DRAW_HTML_PREFIX`. The dict is mutated in place.

    Returns True if the entry was removed, False otherwise.
    """
    html = output_data.get("text/html")
    if not html or not html.startswith(CIRCUIT_DRAW_HTML_PREFIX):
        return False
    del output_data["text/html"]
    return True
43+
44+
45+
def extract_image_output(
    output_data: dict, filestem: Path, skip_conversion: bool
) -> Image | None:
    """
    Pull the image out of a cell output, if it has one.

    When an image is found, `output_data` is mutated: its `text/plain` entry
    becomes an MDX `<Image>` tag pointing at the image's destination, and all
    of its image entries are removed.

    Returns the extracted image, or None if the output had no image.
    """
    image = _get_image(output_data, filestem, skip_conversion)
    if image is not None:
        output_data["text/plain"] = _image_mdx_component(image)
        # One output can carry the same picture in several formats. A single
        # extracted file is enough, so every image entry is dropped.
        for subtype in ("png", "jpeg", "svg+xml"):
            output_data.pop(f"image/{subtype}", None)
    return image
63+
64+
65+
def _get_image(
    output_data: dict, filestem: Path, skip_conversion: bool
) -> Image | None:
    """
    Build an image object from `output_data` without modifying it.

    An `image/svg+xml` entry takes priority and is returned as-is with a
    ".svg" path. Otherwise an `image/png` entry is base64-decoded and, unless
    `skip_conversion` is true, converted to AVIF. Returns None when neither
    entry is present.
    """
    svg_data = output_data.get("image/svg+xml", None)
    if svg_data:
        return SvgImage(filepath=filestem.with_suffix(".svg"), data=svg_data)

    png_data = output_data.get("image/png", None)
    if not png_data:
        return None

    png_image = RasterImage(
        filepath=filestem.with_suffix(".png"),
        data=base64.b64decode(png_data),
    )
    return png_image if skip_conversion else _convert_to_avif(png_image)
79+
80+
81+
def _convert_to_avif(image: RasterImage) -> RasterImage:
    """
    Pipe image through ImageMagick subprocess to convert to AVIF.

    The image bytes are sent to `magick` on stdin and the converted bytes are
    read back from stdout; no temporary files are used.

    Returns a new `RasterImage` holding the converted bytes, with the same
    path as `image` but an ".avif" suffix.

    Raises `RuntimeError` if `magick` exits with a non-zero status, so that a
    failed conversion is never saved as if it were a valid image.
    """
    new_path = image.filepath.with_suffix(".avif")
    imagemagick = Popen(["magick", "-", "avif:-"], stdout=PIPE, stderr=PIPE, stdin=PIPE)
    (new_data, stderr) = imagemagick.communicate(input=image.data)
    if imagemagick.returncode != 0:
        details = stderr.decode(errors="replace").strip()
        raise RuntimeError(
            f"ImageMagick failed to convert '{image.filepath}' to AVIF "
            f"(exit code {imagemagick.returncode}): {details}"
        )
    return RasterImage(filepath=new_path, data=new_data)
89+
90+
91+
def _image_mdx_component(image: Image) -> str:
    """
    Return the MDX `<Image>` tag that displays `image`.

    The `src` is the image's path relative to the "public" folder, with a
    leading slash. Raises `ValueError` if the path is not inside "public".
    """
    # `src` is a URL, so it must use forward slashes even on systems whose
    # path separator is a backslash.
    src = image.filepath.relative_to("public").as_posix()
    return f'<Image src="/{src}" alt="Output of the previous code cell" />'

0 commit comments

Comments
 (0)