Differentiate between safe and unsafe URLs where possible

* Links and HTMLPages know if they are "trusted"
* File Links know if they are safe or not
* A "Safe" file Link comes from a trusted Link/HTMLPage and has
  a hash allowing verification of the download
* Adds a --allow-unsafe PACKAGE argument to allow unsafe files on
  a per-package basis (sketched below)
* Optimizes scraping external sites by short-circuiting if
  the current trust rules won't allow using its files anyway
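Taken together, these rules amount to a small decision procedure. A minimal sketch with illustrative names only (the real logic lives in PackageFinder and Link in the diff below):

    def usable(link, req_name, allow_unsafe_names):
        # Illustrative only: mirrors the trust rules described above.
        if link.verifiable:           # trusted origin, API v2 page, and a hash
            return True
        if link.verifiable is False:  # known-unverifiable: needs a per-package opt-in
            return req_name.lower() in allow_unsafe_names
        return True                   # None: no claim can be made, so the file is allowed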
Donald Stufft 2013-06-02 18:28:51 -04:00
parent 9ccd5f0bb3
commit cc46cdc8b3
4 changed files with 126 additions and 12 deletions

View File

@@ -79,6 +79,16 @@ no_allow_external = make_option(
help="Disallow the installation of externally hosted files",
)
allow_unsafe = make_option(
"--allow-unsafe",
dest="allow_unsafe",
action="append",
default=[],
metavar="PACKAGE",
help="Allow the installation of unsafe and unverifiable files for a "
"package",
)
requirements = make_option(
'-r', '--requirement',
dest='requirements',
@@ -157,5 +167,6 @@ index_group = {
mirrors,
allow_external,
no_allow_external,
allow_unsafe,
]
}
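Since the option uses action="append", the flag can be repeated, once per package the user wants to opt in. A hypothetical invocation (the package name is a placeholder):

    pip install --allow-unsafe example-package example-package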

View File

@@ -164,6 +164,7 @@ class InstallCommand(Command):
mirrors=options.mirrors,
use_wheel=options.use_wheel,
allow_external=options.allow_external,
allow_unsafe=options.allow_unsafe,
)
def run(self, options, args):
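The wiring above amounts to constructing the finder with the names collected from the flag. A minimal sketch, with a placeholder index URL and package name:

    finder = PackageFinder(
        find_links=[],
        index_urls=['https://pypi.python.org/simple/'],
        allow_unsafe=['example-package'],  # from repeated --allow-unsafe flags
    )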

View File

@@ -48,7 +48,7 @@ class PackageFinder(object):
def __init__(self, find_links, index_urls,
use_mirrors=False, mirrors=None, main_mirror_url=None,
-use_wheel=False, allow_external=False):
+use_wheel=False, allow_external=False, allow_unsafe=[]):
self.find_links = find_links
self.index_urls = index_urls
self.dependency_links = []
@@ -62,12 +62,23 @@ class PackageFinder(object):
self.mirror_urls = []
self.use_wheel = use_wheel
# Do we allow (safe and verifiable) externally hosted files?
self.allow_external = allow_external
# Which names are allowed to install unsafe and unverifiable files?
self.allow_unsafe = set(normalize_name(n) for n in allow_unsafe)
# Do we allow unsafe and unverifiable files?
self.allow_all_unsafe = False
# Stores whether we ignored any external links so that we can instruct
# end users how to install them if no distributions are available
self.need_warn_external = False
# Stores whether we ignored any unsafe links so that we can instruct
# end users how to install them if no distributions are available
self.need_warn_unsafe = False
@property
def use_wheel(self):
return self._use_wheel
@@ -177,11 +188,11 @@ class PackageFinder(object):
main_index_url = None
if self.index_urls:
# Check that we have the url_name correctly spelled:
-main_index_url = Link(mkurl_pypi_url(self.index_urls[0]))
+main_index_url = Link(mkurl_pypi_url(self.index_urls[0]), trusted=True)
# This will also cache the page, so it's okay that we get it again later:
page = self._get_page(main_index_url, req)
if page is None:
-url_name = self._find_url_name(Link(self.index_urls[0]), url_name, req) or req.url_name
+url_name = self._find_url_name(Link(self.index_urls[0], trusted=True), url_name, req) or req.url_name
# Combine index URLs with mirror URLs here to allow
# adding more index URLs from requirements files
@@ -193,22 +204,31 @@ class PackageFinder(object):
for url in all_index_urls] + self.find_links
else:
locations = list(self.find_links)
-locations.extend(self.dependency_links)
for version in req.absolute_versions:
if url_name is not None and main_index_url is not None:
locations = [
posixpath.join(main_index_url.url, version)] + locations
file_locations, url_locations = self._sort_locations(locations)
+_flocations, _ulocations = self._sort_locations(self.dependency_links)
+file_locations.extend(_flocations)
-locations = [Link(url) for url in url_locations]
+# We trust every url that the user has given us, whether it was given
+#   via --index-url, --use-mirrors/--mirror, or --find-links, or a
+#   default option thereof
+locations = [Link(url, trusted=True) for url in url_locations]
+# We explicitly do not trust links that came from dependency_links
+locations.extend([Link(url) for url in _ulocations])
logger.debug('URLs to search for versions for %s:' % req)
for location in locations:
logger.debug('* %s' % location)
found_versions = []
found_versions.extend(
self._package_versions(
-[Link(url, '-f') for url in self.find_links], req.name.lower()))
+# We trust every directly linked archive in find_links
+[Link(url, '-f', trusted=True) for url in self.find_links], req.name.lower()))
page_versions = []
for page in self._get_pages(locations, req):
logger.debug('Analyzing links from page %s' % page.url)
@@ -230,6 +250,10 @@ class PackageFinder(object):
logger.warn("Some externally hosted files were ignored (use "
"--allow-external to allow).")
if self.need_warn_unsafe:
logger.warn("Some unsafe and unverifiable files were ignored "
"(use --allow-unsafe %s to allow)." % req.name)
raise DistributionNotFound('No distributions at all found for %s' % req)
installed_version = []
if req.satisfied_by is not None:
@@ -267,6 +291,10 @@ class PackageFinder(object):
logger.warn("Some externally hosted files were ignored (use "
"--allow-external to allow).")
if self.need_warn_unsafe:
logger.warn("Some unsafe and unverifiable files were ignored "
"(use --allow-unsafe %s to allow)." % req.name)
raise DistributionNotFound('No distributions matching the version for %s' % req)
if applicable_versions[0][1] is InfLink:
# We have an existing version, and it's the best version
@@ -286,6 +314,12 @@ class PackageFinder(object):
"versions of pip will default to disallowing "
"externally hosted files.")
if (selected_version.verifiable is not None
and not selected_version.verifiable):
logger.warn("You are installing a potentially unsafe and "
"unverifiable file. Future versions of pip will "
"default to disallowing unsafe files.")
return selected_version
@@ -342,6 +376,22 @@ class PackageFinder(object):
continue
done.append(page)
for link in page.rel_links():
if not self.allow_external:
self.need_warn_external = True
logger.debug("Not searching %s for files because external "
"urls are disallowed." % link)
continue
if (link.trusted is not None
and not link.trusted
and normalize_name(req.name).lower() not in self.allow_unsafe
and not self.allow_all_unsafe):  # TODO: Remove after release
logger.debug("Not searching %s for urls, it is an "
"untrusted link and cannot produce safe or "
"verifiable files." % link)
self.need_warn_unsafe = True
continue
pending_queue.put(link)
_egg_fragment_re = re.compile(r'#egg=([^&]*)')
@@ -424,6 +474,18 @@ class PackageFinder(object):
self.need_warn_external = True
return []
if (link.verifiable is not None
and not link.verifiable
and normalize_name(search_name).lower() not in self.allow_unsafe
and not self.allow_all_unsafe):  # TODO: Remove after release
# We have a link whose integrity we are sure we cannot verify,
# so we should skip it unless we are allowing unsafe installs
# for this requirement.
logger.debug("Skipping %s because it is an unsafe and unverifiable"
" file." % link)
self.need_warn_unsafe = True
return []
match = self._py_version_re.search(version)
if match:
version = version[:match.start()]
@@ -514,11 +576,12 @@ class HTMLPage(object):
_download_re = re.compile(r'<th>\s*download\s+url', re.I)
_href_re = re.compile('href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))', re.I|re.S)
-def __init__(self, content, url, headers=None):
+def __init__(self, content, url, headers=None, trusted=None):
self.content = content
self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
self.url = url
self.headers = headers
self.trusted = trusted
def __str__(self):
return self.url
@@ -595,7 +658,7 @@ class HTMLPage(object):
cache.set_is_archive(url)
return None
-inst = cls(u(contents), real_url, headers)
+inst = cls(u(contents), real_url, headers, trusted=link.trusted)
except (HTTPError, URLError, socket.timeout, socket.error, OSError, WindowsError):
e = sys.exc_info()[1]
desc = str(e)
@@ -712,7 +775,7 @@ class HTMLPage(object):
if found_rels & rels:
href = anchor.get("href")
url = self.clean_link(urlparse.urljoin(self.base_url, href))
-yield Link(url, self)
+yield Link(url, self, trusted=False)
def scraped_rel_links(self):
# Can we get rid of this horrible horrible method?
@@ -727,7 +790,7 @@ class HTMLPage(object):
if not url:
continue
url = self.clean_link(urlparse.urljoin(self.base_url, url))
-yield Link(url, self)
+yield Link(url, self, trusted=False)
_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
@@ -741,10 +804,11 @@
class Link(object):
-def __init__(self, url, comes_from=None, internal=None):
+def __init__(self, url, comes_from=None, internal=None, trusted=None):
self.url = url
self.comes_from = comes_from
self.internal = internal
self.trusted = trusted
# Set whether it's a wheel
self.wheel = None
@@ -833,6 +897,41 @@ class Link(object):
def show_url(self):
return posixpath.basename(self.url.split('#', 1)[0].split('?', 1)[0])
@property
def verifiable(self):
"""
Returns True if this link can be verified after download, False if it
cannot, and None if we cannot determine.
"""
trusted = self.trusted or getattr(self.comes_from, "trusted", None)
if trusted is not None and trusted:
# This link came from a trusted source. It *may* be verifiable but
# first we need to see if this page is operating under the new
# API version.
try:
api_version = getattr(self.comes_from, "api_version", None)
api_version = int(api_version)
except (ValueError, TypeError):
api_version = None
if api_version is None or api_version <= 1:
# This link is either trusted or came from a trusted source;
# however, it is not operating under API version 2, so we
# can't make any claims about whether it's safe or not
return
if self.hash:
# This link came from a trusted source and it has a hash, so we
# can consider it safe.
return True
else:
# This link came from a trusted source, using the new API
# version, and it does not have a hash. It is NOT verifiable
return False
elif trusted is not None:
# This link came from an untrusted source and we cannot trust it
return False
#An "Infinite Link" that compares greater than other links
InfLink = Link(Inf) #this object is not currently used as a sortable
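A few illustrative cases for Link.verifiable (placeholder URLs; assumes the HTMLPage and Link classes above, and that neither page advertises an api_version of 2 or higher):

    index_page = HTMLPage('<html></html>', 'https://pypi.python.org/simple/example/', trusted=True)
    scraped_page = HTMLPage('<html></html>', 'https://example.com/', trusted=False)

    Link('https://example.com/example-1.0.tar.gz#md5=abad1dea', comes_from=index_page).verifiable
    # None: trusted origin, but without API version >= 2 no claim is made
    Link('https://example.com/example-1.0.tar.gz', comes_from=scraped_page).verifiable
    # False: the link came from an untrusted source

Under an API version 2 page, the first link (with its hash) would be True, and the same link without a hash would be False.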

View File

@@ -21,7 +21,7 @@ from pip.util import (display_path, rmtree, ask, ask_path_exists, backup_dir,
is_installable_dir, is_local, dist_is_local,
dist_in_usersite, dist_in_site_packages, renames,
normalize_path, egg_link_path, make_path_relative,
-call_subprocess, is_prerelease)
+call_subprocess, is_prerelease, normalize_name)
from pip.backwardcompat import (urlparse, urllib, uses_pycache,
ConfigParser, string_types, HTTPError,
get_python_version, b)
@@ -1399,6 +1399,9 @@ def parse_requirements(filename, finder=None, comes_from=None, options=None):
finder.allow_external = True
elif line.startswith("--no-allow-external"):
finder.allow_external = False
elif line.startswith("--allow-unsafe"):
line = line[len("--allow-unsafe"):].strip().lstrip("=")
finder.allow_unsafe |= set([normalize_name(line).lower()])
else:
comes_from = '-r %s (line %s)' % (filename, line_number)
if line.startswith('-e') or line.startswith('--editable'):
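The same opt-in works from a requirements file; parse_requirements() above strips the flag and adds the normalized name to finder.allow_unsafe. A hypothetical requirements file (the name is a placeholder):

    --allow-unsafe=example-package
    example-package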