alnoda-workspaces/workspaces/notebook-old-workspace/nbviewer/nbviewer/providers/url/handlers.py

# -----------------------------------------------------------------------------
#  Copyright (C) Jupyter Development Team
#
#  Distributed under the terms of the BSD License.  The full license is in
#  the file COPYING, distributed as part of this software.
# -----------------------------------------------------------------------------
from urllib import robotparser
from urllib.parse import urlparse

from tornado import httpclient
from tornado import web
from tornado.escape import url_unescape

from .. import _load_handler_from_location
from ...utils import quote
from ...utils import response_text
from ..base import cached
from ..base import RenderingHandler


class URLHandler(RenderingHandler):
    """Renderer for /url or /urls

    Resolves the request to a remote http(s) URL, consults the remote host's
    robots.txt to decide whether the notebook may be treated as public, then
    fetches the notebook and hands it to ``finish_notebook`` for rendering.
    """

    async def get_notebook_data(self, secure, netloc, url):
        """Build the remote URL for this request and determine if it is public.

        ``secure`` is appended to ``"http"`` to form the scheme, ``netloc`` is
        the URL-escaped host part and ``url`` is the remaining path, which may
        carry a query string introduced by ``"/?"``.

        Returns a ``(remote_url, public)`` tuple, or ``None`` when the request
        has already been answered with a redirect (a non-``.ipynb`` link
        followed from a page served under ``/url``).
        """
        proto = "http" + secure
        netloc = url_unescape(netloc)

        # Split off the query so that only the path component gets quoted.
        if "/?" in url:
            url, query = url.rsplit("/?", 1)
        else:
            query = None

        remote_url = "{}://{}/{}".format(proto, netloc, quote(url))

        if query:
            remote_url = remote_url + "?" + query
        if not url.endswith(".ipynb"):
            # this is how we handle relative links (files/ URLs) in notebooks
            # if it's not a .ipynb URL and it is a link from a notebook,
            # redirect to the original URL rather than trying to render it as a notebook
            refer_url = self.request.headers.get("Referer", "").split("://")[-1]
            if refer_url.startswith(self.request.host + "/url"):
                self.redirect(remote_url)
                # Nothing left to render; callers must handle the None result.
                return None

        parse_result = urlparse(remote_url)

        robots_url = parse_result.scheme + "://" + parse_result.netloc + "/robots.txt"

        public = False  # Assume non-public

        try:
            robots_response = await self.fetch(robots_url)
            robotstxt = response_text(robots_response)
            rfp = robotparser.RobotFileParser()
            rfp.set_url(robots_url)
            rfp.parse(robotstxt.splitlines())
            public = rfp.can_fetch("*", remote_url)
        except httpclient.HTTPError:
            # No robots.txt could be retrieved: treat the URL as public.
            self.log.debug(
                "Robots.txt not available for %s", remote_url, exc_info=True
            )
            public = True
        except Exception as e:
            # Any other failure leaves the conservative default (non-public).
            self.log.error(e)

        return remote_url, public

    async def deliver_notebook(self, remote_url, public):
        """Fetch the notebook at ``remote_url`` and render it.

        ``public`` is forwarded unchanged to ``finish_notebook``.

        Raises ``tornado.web.HTTPError(400)`` when the response body cannot be
        decoded as UTF-8.
        """
        response = await self.fetch(remote_url)

        try:
            nbjson = response_text(response, encoding="utf-8")
        except UnicodeDecodeError:
            self.log.error("Notebook is not utf8: %s", remote_url, exc_info=True)
            raise web.HTTPError(400)

        await self.finish_notebook(
            nbjson,
            download_url=remote_url,
            msg="file from url: %s" % remote_url,
            public=public,
            request=self.request,
        )

    @cached
    async def get(self, secure, netloc, url):
        """Handle GET: resolve the remote notebook URL, then render it.

        The arguments are forwarded unchanged to ``get_notebook_data``.
        """
        notebook_data = await self.get_notebook_data(secure, netloc, url)
        if notebook_data is None:
            # get_notebook_data already answered with a redirect; unpacking
            # None here would raise TypeError on an already-finished request.
            return

        remote_url, public = notebook_data
        await self.deliver_notebook(remote_url, public)


def default_handlers(handlers=[], **handler_names):
    """Return ``handlers`` extended with the route for the URL provider.

    The handler class is resolved by passing ``handler_names["url_handler"]``
    to ``_load_handler_from_location``. The ``handlers`` argument itself is
    not modified; a new list is returned.
    """
    handler_class = _load_handler_from_location(handler_names["url_handler"])

    # Named groups map onto the (secure, netloc, url) arguments of the handler.
    url_route = (
        r"/url(?P<secure>[s]?)/(?P<netloc>[^/]+)/(?P<url>.*)",
        handler_class,
        {},
    )
    return handlers + [url_route]


def uri_rewrites(rewrites=[]):
    """Return ``rewrites`` extended with the URL provider's rewrite rules.

    Each rule is a ``(regex, template)`` pair. The ``rewrites`` argument is
    not modified; a new list is returned.
    """
    url_rules = [
        # Explicit http(s) URL: keep the optional "s" and the rest of the URL.
        ("^http(s?)://(.*)$", "/url{0}/{1}"),
        # Catch-all: anything else is placed under plain /url/.
        ("^(.*)$", "/url/{0}"),
    ]
    return rewrites + url_rules