Skip to content

Commit dbfab61

Browse files
committed
Added s3 provider
1 parent d9c5da7 commit dbfab61

File tree

5 files changed

+163
-2
lines changed

5 files changed

+163
-2
lines changed

nbviewer/app.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,10 @@ class NBViewer(Application):
201201
default_value="nbviewer.providers.local.handlers.LocalFileHandler",
202202
help="The Tornado handler to use for viewing notebooks found on a local filesystem",
203203
).tag(config=True)
204+
s3_handler = Unicode(
205+
default_value="nbviewer.providers.s3.handlers.S3Handler",
206+
help="The Tornado handler to use for viewing notebooks from amazon S3",
207+
).tag(config=True)
204208
url_handler = Unicode(
205209
default_value="nbviewer.providers.url.handlers.URLHandler",
206210
help="The Tornado handler to use for viewing notebooks accessed via URL",
@@ -625,6 +629,7 @@ def init_tornado_application(self):
625629
github_user_handler=self.github_user_handler,
626630
index_handler=self.index_handler,
627631
local_handler=self.local_handler,
632+
s3_handler=self.s3_handler,
628633
url_handler=self.url_handler,
629634
user_gists_handler=self.user_gists_handler,
630635
)

nbviewer/providers/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
# -----------------------------------------------------------------------------
77

88
# Provider packages enabled by default, in registration order.
default_providers = [
    f"nbviewer.providers.{name}" for name in ("url", "github", "gist", "s3")
]

# Providers contributing uri_rewrites, in the order their rewrites are applied.
default_rewrites = [
    f"nbviewer.providers.{name}"
    for name in ("gist", "github", "dropbox", "url", "s3")
]
1616

1717

nbviewer/providers/s3/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .handlers import default_handlers
2+
from .handlers import S3Handler
3+
from .handlers import uri_rewrites

nbviewer/providers/s3/handlers.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (C) Jupyter Development Team
3+
#
4+
# Distributed under the terms of the BSD License. The full license is in
5+
# the file COPYING, distributed as part of this software.
6+
# -----------------------------------------------------------------------------
7+
import errno
import io
import os
from datetime import datetime
from urllib.parse import urlparse

import boto3
import botocore
from tornado import iostream
from tornado import web

from .. import _load_handler_from_location
from ..base import cached
from ..base import RenderingHandler
from ...utils import url_path_join
20+
21+
22+
class S3Handler(RenderingHandler):
    """Renderer for s3://

    Serving notebooks from S3 buckets.
    """

    def initialize(self, **kwargs):
        """Create the S3 client and the per-request download cache."""
        self.s3_client = boto3.client("s3")
        # Cache the last notebook fetched by this handler so that a render
        # followed by a ?download of the same path only hits S3 once.
        self._downloadable_data = None
        self._downloaded_path = None
        super().initialize(**kwargs)

    async def download(self, path):
        """Stream the notebook at *path* to the client as a file attachment."""
        headers = await self.get_notebook_headers(path)
        filename = os.path.basename(path)
        self.set_header("Content-Length", headers["ContentLength"])
        # Escape commas to workaround Chrome issue with commas in download filenames
        self.set_header(
            "Content-Disposition",
            "attachment; filename={};".format(filename.replace(",", "_")),
        )
        if self._downloaded_path == path and self._downloadable_data is not None:
            content = self._downloadable_data
        else:
            content = await self.read_s3_file(path)

        if isinstance(content, bytes):
            content = [content]
        for chunk in content:
            try:
                self.write(chunk)
                await self.flush()
            except iostream.StreamClosedError:
                # Client disconnected mid-download; nothing left to send.
                return

    async def get_notebook_data(self, path):
        """Return *path* for rendering, or None if it was served as a download."""
        is_download = self.get_query_arguments("download")
        if is_download:
            await self.download(path)
            return

        return path

    async def get_notebook_headers(self, path):
        """Return the S3 HEAD metadata (including size) for the notebook at *path*.

        Raises
        ------
        tornado.web.HTTPError
            404 if the object does not exist in the bucket.
        """
        o = urlparse(path)
        bucket = o.netloc
        # Strip the leading "/" so the key matches the object name in the bucket.
        key = o.path[1:]
        self.log.debug("Getting headers for %s from %s", key, bucket)
        try:
            head = self.s3_client.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as ex:
            if ex.response["Error"]["Code"] == "404":
                self.log.info("The notebook %s does not exist.", path)
                raise web.HTTPError(404)
            # Bare raise preserves the original traceback.
            raise
        return head

    async def read_s3_file(self, path):
        """Download the notebook at *path* from S3 and return it as text.

        Raises
        ------
        tornado.web.HTTPError
            404 if the object does not exist in the bucket.
        """
        o = urlparse(path)
        bucket = o.netloc
        key = o.path[1:]
        s3_file = io.BytesIO()
        self.log.debug("Reading %s from %s", key, bucket)
        try:
            self.s3_client.download_fileobj(bucket, key, s3_file)
        except botocore.exceptions.ClientError as ex:
            if ex.response["Error"]["Code"] == "404":
                self.log.info("The notebook %s does not exist.", path)
                raise web.HTTPError(404)
            raise
        s3_file.seek(0)
        self.log.debug("Done downloading.")
        # Notebooks are JSON text; decode once and keep a copy so a subsequent
        # ?download of the same path skips a second fetch.
        self._downloadable_data = s3_file.read().decode("utf-8")
        self._downloaded_path = path
        return self._downloadable_data

    async def deliver_notebook(self, path):
        """Fetch the notebook at *path* and render it to the client."""
        nbdata = await self.read_s3_file(path)

        # Explanation of some kwargs passed into `finish_notebook`:
        # breadcrumbs: list of dict
        #     Breadcrumb 'name' and 'url' to render as links at the top of the notebook page
        # title: str
        #     Title to use as the HTML page title (i.e., text on the browser tab)
        await self.finish_notebook(
            nbdata,
            download_url="?download",
            msg="file from s3: %s" % path,
            public=False,
            breadcrumbs=[],
            title=os.path.basename(path),
        )

    @cached
    async def get(self, path):
        """Get an s3 notebook

        Parameters
        ==========
        path: str
            s3 uri
        """
        fullpath = await self.get_notebook_data(path)

        # get_notebook_data returns None if a directory is to be shown or a notebook is to be downloaded,
        # i.e. if no notebook is supposed to be rendered, making deliver_notebook inappropriate
        if fullpath is not None:
            await self.deliver_notebook(fullpath)
138+
139+
140+
141+
def default_handlers(handlers=[], **handler_names):
    """Tornado handlers"""

    # Resolve the configured handler class (dotted path) to the actual class.
    handler_cls = _load_handler_from_location(handler_names["s3_handler"])
    # The whole URL-encoded s3 URI (s3%3A//bucket/key) is captured as the path.
    s3_routes = [(r"/(s3%3A//.*)", handler_cls, {})]
    return handlers + s3_routes
147+
148+
def uri_rewrites(rewrites=[]):
    """URI rewrites: pass s3:// URIs through unchanged.

    Appends to *rewrites* rather than replacing it — the provider rewrite
    chain accumulates each provider's entries, and returning a fresh list
    here would silently drop every other provider's rewrites.
    """
    return rewrites + [
        (r"^(s3://.*)$", "{0}"),
    ]
152+

requirements.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
elasticsearch
22
ipython
3+
boto3
34
jupyter_client
45
jupyter_server>=0.2.0
56
markdown>=3.0,==3.1.1 # pin until we workaround #909, which is a regression in 3.2

0 commit comments

Comments
 (0)