alnoda-workspaces/workspaces/notebook-old-workspace/nbviewer/nbviewer/providers/url/handlers.py
2022-05-30 07:24:06 +00:00

105 lines
3.4 KiB
Python

# -----------------------------------------------------------------------------
# Copyright (C) Jupyter Development Team
#
# Distributed under the terms of the BSD License. The full license is in
# the file COPYING, distributed as part of this software.
# -----------------------------------------------------------------------------
from urllib import robotparser
from urllib.parse import urlparse
from tornado import httpclient
from tornado import web
from tornado.escape import url_unescape
from .. import _load_handler_from_location
from ...utils import quote
from ...utils import response_text
from ..base import cached
from ..base import RenderingHandler
class URLHandler(RenderingHandler):
    """Renderer for /url or /urls"""

    async def get_notebook_data(self, secure, netloc, url):
        """Build the remote URL for a request and decide whether it is public.

        Args:
            secure: Suffix appended to ``"http"`` to form the scheme
                (``""`` gives ``http``, ``"s"`` gives ``https``).
            netloc: URL-escaped network location of the remote host.
            url: Path on the remote host; anything after a trailing ``"/?"``
                is treated as the query string.

        Returns:
            A ``(remote_url, public)`` tuple, or ``None`` when the request
            was already answered with a redirect (non-``.ipynb`` target
            reached via a link from a page served under ``/url``).
        """
        proto = "http" + secure
        netloc = url_unescape(netloc)

        # Split off the query so only the path part goes through quote().
        if "/?" in url:
            url, query = url.rsplit("/?", 1)
        else:
            query = None

        remote_url = "{}://{}/{}".format(proto, netloc, quote(url))
        if query:
            remote_url = remote_url + "?" + query

        if not url.endswith(".ipynb"):
            # this is how we handle relative links (files/ URLs) in notebooks
            # if it's not a .ipynb URL and it is a link from a notebook,
            # redirect to the original URL rather than trying to render it as a notebook
            refer_url = self.request.headers.get("Referer", "").split("://")[-1]
            if refer_url.startswith(self.request.host + "/url"):
                self.redirect(remote_url)
                # Callers must treat None as "response already sent".
                return None

        parse_result = urlparse(remote_url)
        robots_url = parse_result.scheme + "://" + parse_result.netloc + "/robots.txt"

        public = False  # Assume non-public

        try:
            robots_response = await self.fetch(robots_url)
            robotstxt = response_text(robots_response)
            rfp = robotparser.RobotFileParser()
            rfp.set_url(robots_url)
            rfp.parse(robotstxt.splitlines())
            public = rfp.can_fetch("*", remote_url)
        except httpclient.HTTPError:
            # No retrievable robots.txt: nothing forbids crawling, treat as public.
            self.log.debug(
                "Robots.txt not available for %s", remote_url, exc_info=True
            )
            public = True
        except Exception as e:
            # Best effort: any other failure keeps the conservative
            # non-public default instead of failing the whole request.
            self.log.error(e)

        return remote_url, public

    async def deliver_notebook(self, remote_url, public):
        """Fetch the notebook at ``remote_url`` and hand it to ``finish_notebook``.

        Args:
            remote_url: Absolute URL of the notebook to fetch.
            public: Value forwarded unchanged as ``finish_notebook(public=...)``.

        Raises:
            tornado.web.HTTPError: 400 when the response body is not valid UTF-8.
        """
        response = await self.fetch(remote_url)

        try:
            nbjson = response_text(response, encoding="utf-8")
        except UnicodeDecodeError:
            self.log.error("Notebook is not utf8: %s", remote_url, exc_info=True)
            raise web.HTTPError(400)

        await self.finish_notebook(
            nbjson,
            download_url=remote_url,
            msg="file from url: %s" % remote_url,
            public=public,
            request=self.request,
        )

    @cached
    async def get(self, secure, netloc, url):
        """Handle GET: resolve the remote notebook URL, then render it.

        Stops early when ``get_notebook_data`` has already redirected the
        client, in which case it returns ``None`` instead of a tuple.
        """
        notebook_data = await self.get_notebook_data(secure, netloc, url)
        if notebook_data is None:
            # The request was answered with a redirect; unpacking None here
            # would raise TypeError after the response has been sent.
            return

        remote_url, public = notebook_data
        await self.deliver_notebook(remote_url, public)
def default_handlers(handlers=None, **handler_names):
    """Tornado handlers

    Return a new list made of ``handlers`` followed by the route for the
    ``/url`` and ``/urls`` endpoints.

    Args:
        handlers: Existing Tornado route specs to extend; ``None`` (the
            default) means start from an empty list. The argument itself is
            never modified.
        **handler_names: Must contain ``"url_handler"``, the location passed
            to ``_load_handler_from_location`` to obtain the handler class.

    Returns:
        A new list of route tuples.

    Raises:
        KeyError: If ``"url_handler"`` is not supplied.
    """
    url_handler = _load_handler_from_location(handler_names["url_handler"])

    # Copy so neither the caller's list nor a shared default is ever mutated.
    routes = [] if handlers is None else list(handlers)
    routes.append(
        (r"/url(?P<secure>[s]?)/(?P<netloc>[^/]+)/(?P<url>.*)", url_handler, {})
    )
    return routes
def uri_rewrites(rewrites=None):
    """Return ``rewrites`` extended with the URL provider's rewrite rules.

    Each rule is a ``(regex pattern, format template)`` pair. The first rule
    matches ``http``/``https`` URLs and the second is a catch-all.
    NOTE(review): the ``{0}``/``{1}`` placeholders are presumably filled from
    the pattern's capture groups by the caller; confirm against the consumer.

    Args:
        rewrites: Existing rules to extend; ``None`` (the default) means
            start from an empty list. The argument itself is never modified.

    Returns:
        A new list with the existing rules followed by this provider's rules.
    """
    # Copy so neither the caller's list nor a shared default is ever mutated.
    rules = [] if rewrites is None else list(rewrites)
    rules.append(("^http(s?)://(.*)$", "/url{0}/{1}"))
    # Catch-all must stay last so it does not shadow the more specific rule.
    rules.append(("^(.*)$", "/url/{0}"))
    return rules