Merge pull request #1067 from juhasch/feature/pre_embedimages

juhasch · web-flow · commit 86050e8a84ba · 2018-02-17T13:13:19.000+01:00
Preprocessor to embed markdown images
diff --git a/docs/source/exporting.rst b/docs/source/exporting.rst
@@ -51,6 +51,11 @@ Converting linked SVG to PDF
 .. autoclass:: SVG2PDFPreprocessor
 
 
+Embedding images in notebooks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: EmbedImagesPreprocessor
+
 
 Postprocessors
 --------------
diff --git a/src/jupyter_contrib_nbextensions/nbconvert_support/__init__.py b/src/jupyter_contrib_nbextensions/nbconvert_support/__init__.py
@@ -10,6 +10,7 @@
 from .pp_highlighter import HighlighterPostProcessor, HighlighterPreprocessor
 from .pre_codefolding import CodeFoldingPreprocessor
 from .pre_pymarkdown import PyMarkdownPreprocessor
+from .pre_embedimages import EmbedImagesPreprocessor
 from .pre_svg2pdf import SVG2PDFPreprocessor
 from .toc2 import TocExporter
 
@@ -22,6 +23,7 @@
     'ExporterInliner',
     'HighlighterPostProcessor',
     'HighlighterPreprocessor',
+    'EmbedImagesPreprocessor',
     'NotebookLangExporter',
     'PyMarkdownPreprocessor',
     'SVG2PDFPreprocessor',
diff --git a/src/jupyter_contrib_nbextensions/nbconvert_support/pre_embedimages.py b/src/jupyter_contrib_nbextensions/nbconvert_support/pre_embedimages.py
@@ -0,0 +1,184 @@
+"""Nbconvert preprocessor to embed markdown images as notebook attachments."""
+
+import base64
+import os
+import re
+
+from ipython_genutils.ipstruct import Struct
+from nbconvert.preprocessors import Preprocessor
+from traitlets import Bool, Unicode
+
+try:
+    from urllib.request import urlopen  # py3
+except ImportError:
+    from urllib2 import urlopen
+
+
+class EmbedImagesPreprocessor(Preprocessor):
+    """
+    :mod:`nbconvert` Preprocessor to embed images in a markdown cell as
+        attachment inside the notebook itself.
+
+    This :class:`~nbconvert.preprocessors.Preprocessor` replaces the image
+    links in markdown cells with base64 encoded attachments of the cell.
+
+    The preprocessor is installed by default. To enable embedding images with
+    NbConvert, you need to set the configuration parameter
+    `EmbedImagesPreprocessor.embed_images=True`.
+    This can be done either in the `jupyter_nbconvert_config.py` file::
+
+        c.EmbedImagesPreprocessor.embed_images=True
+
+    or using a command line parameter when calling NbConvert::
+
+        $ jupyter nbconvert --to html --EmbedImagesPreprocessor.embed_images=True mynotebook.ipynb
+
+    Further options are::
+
+        EmbedImagesPreprocessor.embed_remote_images=True
+
+    to additionally embed all images referenced by an url
+    (e.g. http://jupyter.org/assets/nav_logo.svg) instead of a local file name.
+
+    Another configuration option is::
+
+        EmbedImagesPreprocessor.resize=small
+
+    Lets you scale down the size of an image. This is useful if you want to
+    save space by not embedding large images and instead use a smaller (scaled)
+    version. Works only for raster images (i.e. png, jpg).
+    Valid resize settings are: small = 500px, mid = 1000px, large = 2000px
+    for maximum size in length or width.  No upscaling of small images will
+    be performed. The Python package `PIL` needs to be installed for this
+    option to work.
+
+    Example::
+
+            $ jupyter nbconvert --to html --EmbedImagesPreprocessor.embed_images=True
+                --EmbedImagesPreprocessor.resize=large mynotebook.ipynb
+
+    *Note:* To embed images after conversion to HTML you can also use the
+           `html_embed` exporter
+    """
+
+    embed_images = Bool(False, help="Embed images as attachment").tag(config=True)
+    embed_remote_images = Bool(False, help="Embed images referenced by an url as attachment").tag(config=True)
+    resize = Unicode('', help="Resize images to save space (reduce size)").tag(config=True)
+    imgsizes = {'small': 500, 'mid': 1000, 'large': 2000}
+    # File extensions whose MIME subtype differs from the extension itself
+    mime_subtypes = {'jpg': 'jpeg', 'svg': 'svg+xml'}
+    # Alt text without ']' and url without parentheses, so that each of several
+    # images on one line is matched on its own; titled images are not matched
+    image_regex = re.compile(r'!\[([^\]"]*)\]\(([^()"]+)\)')
+
+    def preprocess(self, nb, resources):
+        """Run the preprocessor only if `embed_images` is enabled"""
+        if self.embed_images:
+            nb, resources = super(EmbedImagesPreprocessor, self).preprocess(nb, resources)
+        return nb, resources
+
+    def resize_image(self, imgname, imgformat, imgdata):
+        """Resize images if desired and PIL is installed
+
+        Parameters
+        ----------
+            imgname: str
+                Name of image (only used for logging)
+            imgformat: str
+                Format of image (png, jpg or jpeg)
+            imgdata: bytes
+                Binary image data
+
+        Returns
+        -------
+            bytes
+                Binary image data, scaled down if the image was too large
+        """
+        if imgformat in ['png', 'jpg', 'jpeg']:
+            from io import BytesIO
+            try:
+                from PIL import Image
+            except ImportError:
+                self.log.info("Pillow library not available to resize images")
+                return imgdata
+            # Only make images smaller when rescaling
+            im = Image.open(BytesIO(imgdata))
+            # float() prevents integer (floor) division on Python 2
+            factor = float(self.imgsizes[self.resize]) / max(im.size)
+            if factor < 1.0:
+                # No side may shrink to zero pixels
+                newsize = tuple(max(1, int(side * factor)) for side in im.size)
+                newim = im.resize(newsize)
+                fp = BytesIO()
+                # PIL requires JPEG instead of JPG
+                newim.save(fp, format=imgformat.replace('jpg', 'jpeg'))
+                imgdata = fp.getvalue()
+                fp.close()
+                self.log.debug("Resized %d x %d image %s to size %d x %d pixels" %
+                               (im.size[0], im.size[1], imgname, newsize[0], newsize[1]))
+        return imgdata
+
+    def replfunc_md(self, match):
+        """Read image and store as base64 encoded attachment
+
+        Parameters
+        ----------
+            match: regular expression match object
+                Markdown image with alt text (group 1) and url (group 2)
+
+        Returns
+        -------
+            str
+                Markdown image pointing to the attachment, or the unchanged
+                markdown for attachments and not embedded remote images
+        """
+        url = match.group(2)
+        imgformat = url.split('.')[-1].lower()
+        if url.startswith(('http://', 'https://')):
+            if not self.embed_remote_images:
+                return match.group(0)
+            response = urlopen(url)
+            try:  # the response is no context manager on Python 2
+                data = response.read()
+            finally:
+                response.close()
+        elif url.startswith('attachment:'):
+            return match.group(0)
+        else:
+            filename = os.path.join(self.path, url)
+            with open(filename, 'rb') as f:
+                data = f.read()
+
+        if self.resize in self.imgsizes:
+            data = self.resize_image(url, imgformat, data)
+
+        # Attachments are keyed by MIME type, e.g. image/jpeg for a jpg file
+        mimetype = 'image/' + self.mime_subtypes.get(imgformat, imgformat)
+        self.log.debug("Embedding url: %s, format: %s" % (url, imgformat))
+        b64_data = base64.b64encode(data).decode("utf-8")
+        self.attachments[url] = {mimetype: b64_data}
+
+        newimg = '![' + match.group(1) + '](attachment:' + match.group(2) + ')'
+        return newimg
+
+    def preprocess_cell(self, cell, resources, index):
+        """
+        Embed the images of a markdown cell, other cells are left untouched
+
+        Parameters
+        ----------
+        cell : NotebookNode cell
+            Notebook cell being processed
+        resources : dictionary
+            Additional resources used in the conversion process.  Allows
+            preprocessors to pass variables into the Jinja engine.
+        index : int
+            Index of the cell being processed (see base.py)
+        """
+        self.path = resources['metadata']['path']
+        self.attachments = getattr(cell, 'attachments', Struct())
+
+        if cell.cell_type == "markdown":
+            cell.source = self.image_regex.sub(self.replfunc_md, cell.source)
+            cell.attachments = self.attachments
+        return cell, resources
diff --git a/tests/data/large_image.png b/tests/data/large_image.png
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
@@ -113,6 +113,78 @@ def test_preprocessor_svg2pdf():
               'exported pdf should be referenced in exported notebook')
 
 
+def test_preprocessor_embedimages():
+    """Check that a local png image ends up as attachment in the notebook."""
+    # the import doubles as a check of the package-level shortcut
+    from jupyter_contrib_nbextensions.nbconvert_support import EmbedImagesPreprocessor  # noqa E501
+    image_markdown = "![testimage]({})".format(path_in_data('icon.png'))
+    cells = [
+        nbf.new_code_cell(source="a = 'world'"),
+        nbf.new_markdown_cell(source=image_markdown),
+    ]
+    notebook_node = nbf.new_notebook(cells=cells)
+    enabled = Config(EmbedImagesPreprocessor={'embed_images': True})
+    body, resources = export_through_preprocessor(
+        notebook_node, EmbedImagesPreprocessor, NotebookExporter, 'ipynb',
+        enabled)
+
+    mimetype = 'image/png'
+    assert_in(mimetype, body, 'Attachment {} is missing'.format(mimetype))
+
+
+def test_preprocessor_embedimages_resize():
+    """Test the resize option of the embedimages preprocessor.
+
+    The same notebook is exported without embedding, with every resize
+    setting and with embedding at the original image size. The exported
+    notebook has to grow in exactly that order.
+
+    The test is skipped if PIL is not installed.
+    """
+    # the import doubles as a check of the package-level shortcut
+    from jupyter_contrib_nbextensions.nbconvert_support import EmbedImagesPreprocessor  # noqa E501
+
+    try:
+        from PIL import Image  # noqa F401
+    except ImportError:
+        raise SkipTest('PIL not found')
+
+    image_markdown = "![testimage]({})".format(path_in_data('large_image.png'))
+    notebook_node = nbf.new_notebook(cells=[
+        nbf.new_code_cell(source="a = 'world'"),
+        nbf.new_markdown_cell(source=image_markdown),
+    ])
+
+    def exported_length(*config):
+        """Return the length of the notebook exported using `config`."""
+        body, resources = export_through_preprocessor(
+            notebook_node, EmbedImagesPreprocessor, NotebookExporter, 'ipynb',
+            *config)
+        return len(body)
+
+    def embedded_length(**options):
+        """Return the exported length with embedding enabled plus `options`."""
+        settings = dict(embed_images=True, **options)
+        return exported_length(Config(EmbedImagesPreprocessor=settings))
+
+    # preprocessor left disabled: the image stays an external link
+    len_noembed = exported_length()
+
+    # image embedded and scaled down to the respective maximum size
+    len_small = embedded_length(resize='small')
+    len_mid = embedded_length(resize='mid')
+    len_large = embedded_length(resize='large')
+
+    # image embedded at its original size
+    len_noresize = embedded_length()
+
+    # each export has to be larger than the one before
+    assert(len_noembed < len_small)
+    assert(len_small < len_mid)
+    assert(len_mid < len_large)
+    assert(len_large < len_noresize)
+
+
 def _normalize_iso8601_timezone(timestamp_str):
     # Zulu -> +00:00 offset
     timestamp_str = re.sub(r'Z$', r'+00:00', timestamp_str)
@@ -146,3 +218,4 @@ def test_preprocessor_execute_time():
                 _normalize_iso8601_timezone(etmd['end_time']),
                 _normalize_iso8601_timezone(etmd['start_time']),
                 'end_time should not be before start_time')
+