[svn r21196] quote unsafe characters in links

This commit is contained in:
Ian Bicking 2008-11-04 11:10:56 -06:00
parent 5c7b318f88
commit c1c84a4c97
2 changed files with 17 additions and 3 deletions

View File

@ -13,6 +13,9 @@ svn trunk
* Fixed Windows problem with putting the install record in the right
place, and generating the ``pip`` script with Setuptools.
* Fixed handling of download links that include embedded spaces or
other unsafe characters (those characters now get %-encoded).
0.2
---

17
pip.py
View File

@ -2071,7 +2071,8 @@ class HTMLPage(object):
"""Yields all links in the page"""
for match in self._href_re.finditer(self.content):
url = match.group(1) or match.group(2) or match.group(3)
yield Link(urlparse.urljoin(self.url, url), self)
url = self.clean_link(urlparse.urljoin(self.url, url))
yield Link(url, self)
def rel_links(self):
for url in self.explicit_rel_links():
@ -2092,7 +2093,8 @@ class HTMLPage(object):
if not match:
continue
url = match.group(1) or match.group(2) or match.group(3)
yield Link(urlparse.urljoin(self.url, url), self)
url = self.clean_link(urlparse.urljoin(self.url, url))
yield Link(url, self)
def scraped_rel_links(self):
for regex in (self._homepage_re, self._download_re):
@ -2105,9 +2107,18 @@ class HTMLPage(object):
url = match.group(1) or match.group(2) or match.group(3)
if not url:
continue
url = urlparse.urljoin(self.url, url)
url = self.clean_link(urlparse.urljoin(self.url, url))
yield Link(url, self)
_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
def clean_link(self, url):
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
return self._clean_re.sub(
lambda match: '%%%2x' % ord(match.group(0)), url)
class PageCache(object):
"""Cache of HTML pages"""