alnoda-workspaces/workspaces/notebook-old-workspace/nbviewer/nbviewer/providers/base.py

# -----------------------------------------------------------------------------
#  Copyright (C) Jupyter Development Team
#
#  Distributed under the terms of the BSD License.  The full license is in
#  the file COPYING, distributed as part of this software.
# -----------------------------------------------------------------------------
import asyncio
import functools
import hashlib
import pickle
import socket
import time
from contextlib import contextmanager
from datetime import datetime
from html import escape
from http.client import responses
from urllib.parse import quote
from urllib.parse import urlencode
from urllib.parse import urlparse
from urllib.parse import urlunparse

import statsd
from nbformat import current_nbformat
from nbformat import reads
from tornado import httpclient
from tornado import web
from tornado.concurrent import Future
from tornado.escape import url_escape
from tornado.escape import url_unescape
from tornado.escape import utf8
from tornado.ioloop import IOLoop

from ..render import NbFormatError
from ..render import render_notebook
from ..utils import EmptyClass
from ..utils import parse_header_links
from ..utils import time_block
from ..utils import url_path_join

try:
    # pycurl is optional; CurlError is taken from tornado's curl-based
    # HTTP client module when both can be imported.
    import pycurl
    from tornado.curl_httpclient import CurlError
except ImportError:
    # Keep both names defined so code in this module can still reference
    # them (e.g. in isinstance checks) when the import fails.
    pycurl = None

    class CurlError(Exception):
        # Placeholder exception type used only when the real CurlError
        # could not be imported.
        pass


format_prefix = "/format/"


class BaseHandler(web.RequestHandler):
    """Base Handler class with common utilities"""

    def initialize(self, format=None, format_prefix="", **handler_settings):
        """Store the handler's routing arguments on the instance.

        format: str, optional
            Rendering format (e.g. script, slides, html); the configured
            default format is used when this is empty or omitted.
        format_prefix: str, optional
            Kept unchanged as ``self.format_prefix``.
        handler_settings:
            Every additional keyword is copied onto the handler as an
            attribute of the same name.
        """
        self.format = format if format else self.default_format
        self.format_prefix = format_prefix
        self.http_client = httpclient.AsyncHTTPClient()
        self.date_fmt = "%a, %d %b %Y %H:%M:%S UTC"

        for attr_name, attr_value in handler_settings.items():
            setattr(self, attr_name, attr_value)

    # Overloaded methods
    def redirect(self, url, *args, **kwargs):
        """Redirect to ``url`` after re-escaping its path.

        Every "/"-separated path segment is unescaped and then escaped
        again (with ``plus=False``); scheme, netloc, params, query and
        fragment are passed through as parsed. Extra arguments are handed
        to the parent class's ``redirect``.
        """
        parts = urlparse(url)

        segments = []
        for segment in parts.path.split("/"):
            segments.append(url_escape(url_unescape(segment), plus=False))

        escaped_url = urlunparse(parts._replace(path="/".join(segments)))
        return super().redirect(escaped_url, *args, **kwargs)

    def set_default_headers(self):
        """Add the configured Content-Security-Policy header to the response."""
        policy = self.content_security_policy
        self.add_header("Content-Security-Policy", policy)

    async def prepare(self):
        """Check if the user is authenticated with JupyterHub if the hub
        API endpoint and token are configured.

        Redirect unauthenticated requests to the JupyterHub login page.
        Do nothing if not running as a JupyterHub service.

        Raises
        ------
        tornado.httpclient.HTTPError
            Any hub API failure other than a 404 is re-raised unchanged.
        """
        # if any of these are set, assume we want to do auth, even if
        # we're misconfigured (better safe than sorry!)
        if self.hub_api_url or self.hub_api_token or self.hub_base_url:

            def redirect_to_login():
                # send the browser to the hub login page, coming back to
                # the originally requested path afterwards
                self.redirect(
                    url_path_join(self.hub_base_url, "/hub/login")
                    + "?"
                    + urlencode({"next": self.request.path})
                )

            encrypted_cookie = self.get_cookie(self.hub_cookie_name)
            if not encrypted_cookie:
                # no cookie == not authenticated
                return redirect_to_login()

            try:
                # if the hub returns a success code, the user is known
                await self.http_client.fetch(
                    url_path_join(
                        self.hub_api_url,
                        "authorizations/cookie",
                        self.hub_cookie_name,
                        quote(encrypted_cookie, safe=""),
                    ),
                    headers={"Authorization": "token " + self.hub_api_token},
                )
            except httpclient.HTTPError as ex:
                # Read the status from the exception itself: ``ex.response``
                # can be None (e.g. connection failures reported as 599),
                # which would otherwise mask the real error.
                if ex.code == 404:
                    # hub does not recognize the cookie == not authenticated
                    return redirect_to_login()
                # let all other errors surface: they're unexpected
                raise

    # Properties

    @property
    def base_url(self):
        """The ``base_url`` application setting (required)."""
        return self.settings["base_url"]

    @property
    def binder_base_url(self):
        """The ``binder_base_url`` application setting (required)."""
        return self.settings["binder_base_url"]

    @property
    def cache(self):
        """The ``cache`` application setting (required)."""
        return self.settings["cache"]

    @property
    def cache_expiry_max(self):
        """The ``cache_expiry_max`` setting; set to 120 if not yet present."""
        return self.settings.setdefault("cache_expiry_max", 120)

    @property
    def cache_expiry_min(self):
        """The ``cache_expiry_min`` setting; set to 60 if not yet present."""
        return self.settings.setdefault("cache_expiry_min", 60)

    @property
    def client(self):
        """The ``client`` application setting (required)."""
        return self.settings["client"]

    @property
    def config(self):
        """The ``config`` application setting (required)."""
        return self.settings["config"]

    @property
    def content_security_policy(self):
        """The ``content_security_policy`` application setting (required)."""
        return self.settings["content_security_policy"]

    @property
    def default_format(self):
        """The ``default_format`` application setting (required)."""
        return self.settings["default_format"]

    @property
    def formats(self):
        """The ``formats`` application setting (required)."""
        return self.settings["formats"]

    @property
    def frontpage_setup(self):
        """The ``frontpage_setup`` application setting (required)."""
        return self.settings["frontpage_setup"]

    @property
    def hub_api_token(self):
        """The ``hub_api_token`` setting, or None when it is not set."""
        return self.settings.get("hub_api_token")

    @property
    def hub_api_url(self):
        """The ``hub_api_url`` setting, or None when it is not set."""
        return self.settings.get("hub_api_url")

    @property
    def hub_base_url(self):
        """The ``hub_base_url`` setting, or None when it is not set.

        Read with ``.get`` like the other hub settings so that the
        authentication check in ``prepare`` is skipped, rather than failing
        with KeyError, when the setting is absent.
        """
        return self.settings.get("hub_base_url")

    @property
    def hub_cookie_name(self):
        """Name of the cookie read for JupyterHub authentication (constant)."""
        return "jupyterhub-services"

    @property
    def index(self):
        """The ``index`` application setting (required)."""
        return self.settings["index"]

    @property
    def ipywidgets_base_url(self):
        """The ``ipywidgets_base_url`` application setting (required)."""
        return self.settings["ipywidgets_base_url"]

    @property
    def jupyter_js_widgets_version(self):
        """The ``jupyter_js_widgets_version`` application setting (required)."""
        return self.settings["jupyter_js_widgets_version"]

    @property
    def jupyter_widgets_html_manager_version(self):
        """The ``jupyter_widgets_html_manager_version`` setting (required)."""
        return self.settings["jupyter_widgets_html_manager_version"]

    @property
    def mathjax_url(self):
        """The ``mathjax_url`` application setting (required)."""
        return self.settings["mathjax_url"]

    @property
    def log(self):
        """The ``log`` application setting (required)."""
        return self.settings["log"]

    @property
    def max_cache_uris(self):
        """The ``max_cache_uris`` setting; an empty set is stored if absent."""
        return self.settings.setdefault("max_cache_uris", set())

    @property
    def pending(self):
        """The ``pending`` setting; an empty dict is stored if absent."""
        return self.settings.setdefault("pending", {})

    @property
    def pool(self):
        """The ``pool`` application setting (required)."""
        return self.settings["pool"]

    @property
    def providers(self):
        """The ``providers`` application setting (required)."""
        return self.settings["providers"]

    @property
    def rate_limiter(self):
        """The ``rate_limiter`` application setting (required)."""
        return self.settings["rate_limiter"]

    @property
    def static_url_prefix(self):
        """The ``static_url_prefix`` application setting (required)."""
        return self.settings["static_url_prefix"]

    @property
    def statsd(self):
        """Return this handler's stats client, creating it on first access.

        When the ``statsd_host`` setting is truthy a ``statsd.StatsClient``
        is built whose prefix ends with the handler class name; otherwise an
        ``EmptyClass`` instance is used instead. The result is kept on the
        instance as ``_statsd``.
        """
        if not hasattr(self, "_statsd"):
            if self.settings["statsd_host"]:
                client = statsd.StatsClient(
                    self.settings["statsd_host"],
                    self.settings["statsd_port"],
                    self.settings["statsd_prefix"] + "." + type(self).__name__,
                )
            else:
                # return an empty mock object!
                client = EmptyClass()
            self._statsd = client
        return self._statsd

    # ---------------------------------------------------------------
    # template rendering
    # ---------------------------------------------------------------

    def from_base(self, url, *args):
        """Join ``url`` and ``args``, prefixing ``base_url`` when needed.

        The base URL is prepended only for URLs that start with "/" and do
        not already start with ``base_url``.
        """
        is_relative = not url.startswith("/")
        if is_relative or url.startswith(self.base_url):
            pieces = (url,)
        else:
            pieces = (self.base_url, url)
        return url_path_join(*pieces, *args)

    def get_template(self, name):
        """Look up template ``name`` in the ``jinja2_env`` application setting."""
        environment = self.settings["jinja2_env"]
        return environment.get_template(name)

    def render_template(self, name, **namespace):
        """Render template ``name`` with ``namespace`` plus the shared namespace.

        Entries from ``self.template_namespace`` take precedence over
        caller-supplied keywords of the same name.
        """
        context = {**namespace, **self.template_namespace}
        return self.get_template(name).render(**context)

    # Wrappers to facilitate custom rendering in subclasses without having to rewrite entire GET methods
    # This would seem to mostly involve creating different template namespaces to enable custom logic in
    # extended templates, but there might be other possibilities
    def render_status_code_template(self, status_code, **namespace):
        """Render the template named after ``status_code`` (e.g. ``404.html``)."""
        template_name = "%d.html" % status_code
        return self.render_template(template_name, **namespace)

    def render_error_template(self, **namespace):
        """Render the generic ``error.html`` template with ``namespace``."""
        template_name = "error.html"
        return self.render_template(template_name, **namespace)

    @property
    def template_namespace(self):
        """Variables that ``render_template`` adds to every template render."""
        return dict(
            mathjax_url=self.mathjax_url,
            static_url=self.static_url,
            from_base=self.from_base,
            google_analytics_id=self.settings.get("google_analytics_id"),
            ipywidgets_base_url=self.ipywidgets_base_url,
            jupyter_js_widgets_version=self.jupyter_js_widgets_version,
            jupyter_widgets_html_manager_version=(
                self.jupyter_widgets_html_manager_version
            ),
        )

    # Overwrite the static_url method from Tornado to work better with our custom StaticFileHandler
    def static_url(self, url):
        """Return ``url`` joined onto the configured static URL prefix."""
        prefix = self.static_url_prefix
        return url_path_join(prefix, url)

    def breadcrumbs(self, path, base_url):
        """Build one breadcrumb per "/"-separated component of ``path``.

        Each breadcrumb is a dict with ``url`` (``base_url`` joined with all
        components up to and including this one) and ``name`` (the
        component). An empty ``path`` gives an empty list.
        """
        crumbs = []
        if path:
            current_url = base_url
            for component in path.split("/"):
                current_url = url_path_join(current_url, component)
                crumbs.append(dict(url=current_url, name=component))
        return crumbs

    def get_page_links(self, response):
        """Return ``(prev_url, next_url)`` for pagination.

        The response's ``Link`` header is parsed; for the ``prev`` and
        ``next`` relations the result is "?" followed by the query string of
        the linked URL, or None when that relation is absent.
        """
        links = parse_header_links(response.headers.get("Link", ""))

        def query_for(rel):
            # "?<query>" of the link with relation ``rel``, if there is one
            if rel not in links:
                return None
            return "?" + urlparse(links[rel]["url"]).query

        next_url = query_for("next")
        prev_url = query_for("prev")
        return prev_url, next_url

    # ---------------------------------------------------------------
    # error handling
    # ---------------------------------------------------------------

    def client_error_message(self, exc, url, body, msg=None):
        """Turn the tornado HTTP error into something useful

        Parameters
        ----------
        exc:
            The client error; its ``code`` attribute and ``str()`` are used.
        url:
            Accepted for interface compatibility; not used here.
        body: str
            Response body; included in the message when it is non-empty and
            shorter than 100 characters (HTML-escaped).
        msg: str, optional
            Message to use instead of the one derived from ``exc``/``body``.

        Returns
        -------
        (code, msg): the HTTP status to report and the message to show.
        """
        str_exc = str(exc)

        # strip the unhelpful 599 prefix
        prefix = "HTTP 599: "
        if str_exc.startswith(prefix):
            str_exc = str_exc[len(prefix):]

        if (msg is None) and body and len(body) < 100:
            # if it's a short plain-text error message, include it
            msg = "%s (%s)" % (str_exc, escape(body))

        if not msg:
            msg = str_exc

        # Now get the error code
        if exc.code == 599:
            # default: raise 400 with informative message
            code = 400
            if pycurl is not None and isinstance(exc, CurlError):
                en = getattr(exc, "errno", -1)
                # can't connect to server should be 404
                # possibly more here
                if en in (pycurl.E_COULDNT_CONNECT, pycurl.E_COULDNT_RESOLVE_HOST):
                    code = 404
        elif exc.code >= 500:
            # 5XX, server error, but not this server
            code = 502
        elif exc.code == 404:
            # client-side error, blame our client
            code = 404
            msg = "Remote %s" % msg
        else:
            code = 400

        return code, msg

    def reraise_client_error(self, exc):
        """Remote fetch raised an error

        Logs a warning with the failing URL (query string removed), the
        derived message and up to 300 escaped characters of the response
        body, then raises ``web.HTTPError`` with the code and message from
        ``client_error_message``.
        """
        try:
            url = exc.response.request.url.split("?")[0]
            body = exc.response.body.decode("utf8", "replace").strip()
        except AttributeError:
            # the error carries no usable response (or request/body)
            url = "url"
            body = ""

        code, msg = self.client_error_message(exc, url, body)

        slim_body = escape(body[:300])

        # Logger.warn is a deprecated alias that newer Python versions no
        # longer provide; warning() is the supported spelling.
        self.log.warning("Fetching %s failed with %s. Body=%s", url, msg, slim_body)
        raise web.HTTPError(code, msg)

    @contextmanager
    def catch_client_error(self):
        """Context manager translating fetch failures into web.HTTPErrors.

        ``httpclient.HTTPError`` is handed to ``reraise_client_error``;
        a ``socket.error`` becomes a 404 whose message is the error text.
        """
        try:
            yield
        except httpclient.HTTPError as client_err:
            self.reraise_client_error(client_err)
        except socket.error as socket_err:
            raise web.HTTPError(404, str(socket_err))

    @property
    def fetch_kwargs(self):
        """The ``fetch_kwargs`` setting; an empty dict is stored if absent."""
        return self.settings.setdefault("fetch_kwargs", {})

    async def fetch(self, url, **overrides):
        """Fetch ``url`` with the shared async client.

        Keyword arguments start from ``self.fetch_kwargs`` and are then
        overridden by ``overrides``. The request runs inside
        ``catch_client_error`` so client errors are converted there.
        """
        request_kwargs = {**self.fetch_kwargs, **overrides}
        with self.catch_client_error():
            return await self.client.fetch(url, **request_kwargs)

    def write_error(self, status_code, **kwargs):
        """render custom error pages

        Parameters
        ----------
        status_code: int
            HTTP status being reported; selects the ``<code>.html`` template.
        kwargs:
            ``exc_info`` (optional) is the usual exception triple; the
            exception's ``log_message`` and ``reason`` are used when present.

        The template namespace contains ``status_code``, ``status_message``,
        ``message`` and ``exception`` (None when no ``exc_info`` was given).
        If rendering the status-specific template fails, the generic error
        template is rendered instead.
        """
        exc_info = kwargs.get("exc_info")
        message = ""
        status_message = responses.get(status_code, "Unknown")
        # Bind up front: without exc_info there is no exception to report,
        # and the namespace below must still be buildable.
        exception = None
        if exc_info:
            # get the custom message, if defined
            exception = exc_info[1]
            try:
                message = exception.log_message % exception.args
            except Exception:
                # best effort: no (formattable) log_message, keep message empty
                pass

            # construct the custom reason, if defined
            reason = getattr(exception, "reason", "")
            if reason:
                status_message = reason

        # build template namespace
        namespace = dict(
            status_code=status_code,
            status_message=status_message,
            message=message,
            exception=exception,
        )

        # render the template
        try:
            html = self.render_status_code_template(status_code, **namespace)
        except Exception:
            # no usable template for this status code: use the generic page
            html = self.render_error_template(**namespace)
        self.set_header("Content-Type", "text/html")
        self.write(html)

    # ---------------------------------------------------------------
    # response caching
    # ---------------------------------------------------------------

    @property
    def cache_headers(self):
        """Response headers to store alongside a cached body.

        Only headers that are currently set on the response are included.
        """
        # are there other headers to cache?
        cacheable = ("Content-Type",)
        return {
            name: self._headers[name] for name in cacheable if name in self._headers
        }

    # digest memoised by ``cache_key`` after its first computation
    _cache_key = None
    # name of the request attribute that is hashed into the cache key
    _cache_key_attr = "uri"

    @property
    def cache_key(self):
        """SHA-1 hex digest of the request attribute named by ``_cache_key_attr``.

        A checksum is used because the cache has a size limit on keys.
        The digest is computed once and then reused.
        """
        if self._cache_key is not None:
            return self._cache_key

        source = getattr(self.request, self._cache_key_attr)
        digest = hashlib.sha1(utf8(source)).hexdigest()
        self._cache_key = digest
        return digest

    def truncate(self, s, limit=256):
        """Truncate long strings

        Strings longer than ``limit`` are shortened to their first and last
        ``limit // 2`` characters joined by "..."; shorter strings are
        returned unchanged.
        """
        if len(s) <= limit:
            return s
        half = max(limit // 2, 0)
        head = s[:half]
        # slice from an explicit index: s[-0:] would return the whole string
        tail = s[len(s) - half:]
        return "%s...%s" % (head, tail)

    async def cache_and_finish(self, content=""):
        """Finish the request with ``content`` and store the result in the cache.

        currently only works if:

        - result is not written in multiple chunks
        - custom headers are not used

        A failure while writing to the cache is logged and not raised.
        """
        elapsed = self.request.request_time()
        # set cache expiry to 120x request time
        # bounded by cache_expiry_min,max
        # a 30 second render will be cached for an hour
        expiry = min(120 * elapsed, self.cache_expiry_max)
        expiry = max(expiry, self.cache_expiry_min)

        # if it's a link from the front page, cache for a long time
        if self.request.uri in self.max_cache_uris:
            expiry = self.cache_expiry_max

        if expiry > 0:
            self.set_header("Cache-Control", "max-age=%i" % expiry)

        self.write(content)
        self.finish()

        short_url = self.truncate(self.request.path)
        payload = {"headers": self.cache_headers, "body": content}
        cache_data = pickle.dumps(payload, pickle.HIGHEST_PROTOCOL)

        # longer-than-minimum expiries are logged at info, the rest at debug
        if expiry > self.cache_expiry_min:
            log = self.log.info
        else:
            log = self.log.debug
        log("Caching (expiry=%is) %s", expiry, short_url)

        try:
            with time_block("Cache set %s" % short_url, logger=self.log):
                expires_at = int(time.time() + expiry)
                await self.cache.set(self.cache_key, cache_data, expires_at)
        except Exception:
            self.log.error("Cache set for %s failed", short_url, exc_info=True)
            return
        self.log.debug("Cache set finished %s", short_url)


def cached(method):
    """decorator for a cached page.

    This only handles getting from the cache, not writing to it.
    Writing to the cache must be handled in the decorated method.

    The wrapper:

    - bypasses the cache (after a rate-limit check) when the request has a
      truthy ``flush_cache`` argument;
    - waits for an in-flight request for the same path, if one is pending;
    - serves the cached headers and body on a hit;
    - on a miss, rate-limits, registers a pending future for the path,
      runs ``method`` and finally resolves the future for any waiters.
    """

    @functools.wraps(method)
    async def cached_method(self, *args, **kwargs):
        uri = self.request.path
        short_url = self.truncate(uri)

        if self.get_argument("flush_cache", False):
            await self.rate_limiter.check(self)
            self.log.info("Flushing cache %s", short_url)
            # call the wrapped method
            await method(self, *args, **kwargs)
            return

        pending_future = self.pending.get(uri, None)
        loop = IOLoop.current()
        if pending_future:
            self.log.info("Waiting for concurrent request at %s", short_url)
            tic = loop.time()
            await pending_future
            toc = loop.time()
            self.log.info(
                "Waited %.3fs for concurrent request at %s", toc - tic, short_url
            )

        try:
            with time_block("Cache get %s" % short_url, logger=self.log):
                cached_pickle = await self.cache.get(self.cache_key)
            if cached_pickle is not None:
                # NOTE(security): pickle.loads can execute arbitrary code.
                # This is only safe while the cache backend cannot be
                # written to by untrusted parties.
                cached = pickle.loads(cached_pickle)
            else:
                cached = None
        except Exception:
            # a broken cache entry or backend is treated as a cache miss
            self.log.error("Exception getting %s from cache", short_url, exc_info=True)
            cached = None

        if cached is not None:
            self.log.info("Cache hit %s", short_url)
            for key, value in cached["headers"].items():
                self.set_header(key, value)
            self.write(cached["body"])
        else:
            self.log.debug("Cache miss %s", short_url)
            await self.rate_limiter.check(self)
            future = self.pending[uri] = Future()
            try:
                # call the wrapped method
                await method(self, *args, **kwargs)
            finally:
                self.pending.pop(uri, None)
                # notify waiters
                future.set_result(None)

    return cached_method


class RenderingHandler(BaseHandler):
    """Base for handlers that render notebooks"""

    # notebook caches based on path (no url params)
    _cache_key_attr = "path"

    @property
    def render_timeout(self):
        """0 render_timeout means never finish early"""
        return self.settings.setdefault("render_timeout", 0)

    def initialize(self, **kwargs):
        """Run the base initialisation, then arm the slow-render timer.

        When ``render_timeout`` is non-zero, ``finish_early`` is scheduled
        on the current IOLoop that many seconds from now and the timeout
        handle is kept as ``self.slow_timeout``.
        """
        super().initialize(**kwargs)
        loop = IOLoop.current()
        if not self.render_timeout:
            return
        deadline = loop.time() + self.render_timeout
        self.slow_timeout = loop.add_timeout(deadline, self.finish_early)

    def finish_early(self):
        """When the render is slow, draw a 'waiting' page instead

        rely on the cache to deliver the page to a future request.
        """
        if self._finished:
            return
        self.log.info("Finishing early %s", self.request.uri)
        html = self.render_template("slow_notebook.html")
        self.set_status(202)  # Accepted
        self.finish(html)

        # short circuit some methods because the rest of the rendering will still happen
        self.write = self.finish = self.redirect = lambda chunk=None: None
        self.statsd.incr("rendering.waiting", 1)

    def filter_formats(self, nb, raw):
        """Generate a list of formats that can render the given nb json

        formats that do not provide a `test` method are assumed to work for
        any notebook
        """
        for name, format in self.formats.items():
            test = format.get("test", None)
            try:
                if test is None or test(nb, raw):
                    yield (name, format)
            except Exception as err:
                self.log.info("Failed to test %s: %s", self.request.uri, name)

    # empty methods to be implemented by subclasses to make GET requests more modular
    def get_notebook_data(self, **kwargs):
        """
        First part of any provider's GET method; a no-op in this base class.

        Subclasses receive as kwargs whatever the provider needs to locate
        the notebook (e.g. path for LocalHandler, user and repo for GitHub)
        and return the values the provider needs to find and load it. The
        caller then either runs custom logic in GET or passes the output of
        get_notebook_data straight to deliver_notebook.

        Custom logic, if applicable, is the middle part of a provider's GET
        method and is usually implemented or overridden in subclasses, while
        get_notebook_data and deliver_notebook will often remain unchanged
        from the parent class (e.g. for a custom GitHub provider).
        """
        pass

    def deliver_notebook(self, **kwargs):
        """
        Last part of any provider's GET method; a no-op in this base class.

        Subclasses receive as kwargs the return values of get_notebook_data,
        get the JSON data from the provider to render the notebook, and
        finish with a call to self.finish_notebook.
        """
        pass

    # Wrappers to facilitate custom rendering in subclasses without having to rewrite entire GET methods
    # This would seem to mostly involve creating different template namespaces to enable custom logic in
    # extended templates, but there might be other possibilities
    def render_notebook_template(
        self, body, nb, download_url, json_notebook, **namespace
    ):
        return self.render_template(
            "formats/%s.html" % self.format,
            body=body,
            nb=nb,
            download_url=download_url,
            format=self.format,
            default_format=self.default_format,
            format_prefix=self.format_prefix,
            formats=dict(self.filter_formats(nb, json_notebook)),
            format_base=self.request.uri.replace(self.format_prefix, "").replace(
                self.base_url, "/"
            ),
            date=datetime.utcnow().strftime(self.date_fmt),
            **namespace
        )

    async def finish_notebook(
        self, json_notebook, download_url, msg=None, public=False, **namespace
    ):
        """Renders a notebook from its JSON body.

        The notebook is parsed, rendered in the executor pool, wrapped in
        the page template, sent through ``cache_and_finish`` and finally
        passed to the index.

        Parameters
        ----------
        json_notebook: str
            Notebook document in JSON format
        download_url: str
            URL to download the notebook document
        msg: str, optional
            Extra information to log when rendering fails
            (defaults to ``download_url``)
        public: bool, optional
            True if the notebook is public and its access indexed, False if not
        namespace:
            Extra keywords forwarded to ``render_notebook_template``

        Raises
        ------
        web.HTTPError
            400 when the JSON cannot be read or when rendering fails.
        """

        if msg is None:
            msg = download_url

        # Step 1: parse the JSON document (timed via statsd)
        try:
            parse_time = self.statsd.timer("rendering.parsing.time").start()
            nb = reads(json_notebook, current_nbformat)
            parse_time.stop()
        except ValueError:
            self.log.error("Failed to render %s", msg, exc_info=True)
            self.statsd.incr("rendering.parsing.fail")
            raise web.HTTPError(400, "Error reading JSON notebook")

        # Step 2: render the notebook body off the event loop, in self.pool
        try:
            self.log.debug("Requesting render of %s", download_url)
            with time_block(
                "Rendered %s" % download_url, logger=self.log, debug_limit=0
            ):
                self.log.info(
                    "Rendering %d B notebook from %s", len(json_notebook), download_url
                )
                render_time = self.statsd.timer("rendering.nbrender.time").start()
                loop = asyncio.get_event_loop()
                nbhtml, config = await loop.run_in_executor(
                    self.pool,
                    render_notebook,
                    self.formats[self.format],
                    nb,
                    download_url,
                    self.config,
                )
                render_time.stop()
        except NbFormatError as e:
            # invalid notebook content: logged without a traceback
            self.statsd.incr("rendering.nbrender.fail", 1)
            self.log.error("Invalid notebook %s: %s", msg, e)
            raise web.HTTPError(400, str(e))
        except Exception as e:
            # any other render failure: logged with the traceback
            self.statsd.incr("rendering.nbrender.fail", 1)
            self.log.error("Failed to render %s", msg, exc_info=True)
            raise web.HTTPError(400, str(e))
        else:
            self.statsd.incr("rendering.nbrender.success", 1)
            self.log.debug("Finished render of %s", download_url)

        # Step 3: wrap the rendered body in the page template (timed)
        html_time = self.statsd.timer("rendering.html.time").start()
        html = self.render_notebook_template(
            body=nbhtml,
            nb=nb,
            download_url=download_url,
            json_notebook=json_notebook,
            **namespace
        )
        html_time.stop()

        # Step 4: send the response; a format may declare its own content type
        if "content_type" in self.formats[self.format]:
            self.set_header("Content-Type", self.formats[self.format]["content_type"])
        await self.cache_and_finish(html)

        # Index notebook
        self.index.index_notebook(download_url, nb, public)


class FilesRedirectHandler(BaseHandler):
    """Redirect files URLs to the same URL without the files prefix.

    matches behavior of old app, currently unused.
    """

    def get(self, before_files, after_files):
        """Log the redirect, then send the client to ``before/after``."""
        self.log.info("Redirecting %s to %s", before_files, after_files)
        target = "%s/%s" % (before_files, after_files)
        self.redirect(target)


class AddSlashHandler(BaseHandler):
    """Redirector for URLs that should always have a trailing slash."""

    def get(self, *args, **kwargs):
        """Redirect to the request path plus "/", keeping any query string."""
        request = self.request
        target = request.path + "/"
        query = request.query
        if query:
            target = "%s?%s" % (target, query)
        self.redirect(target)


class RemoveSlashHandler(BaseHandler):
    """Redirector for URLs that should never have a trailing slash."""

    def get(self, *args, **kwargs):
        """Redirect to the path with trailing "/" stripped, keeping any query."""
        request = self.request
        target = request.path.rstrip("/")
        query = request.query
        if query:
            target = "%s?%s" % (target, query)
        self.redirect(target)