From c1c84a4c97b4e60743a9851229f0bf87ce89680f Mon Sep 17 00:00:00 2001
From: Ian Bicking
Date: Tue, 4 Nov 2008 11:10:56 -0600
Subject: [PATCH] [svn r21196] quote unsafe characters in links

---
 docs/news.txt |  3 +++
 pip.py        | 17 ++++++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/docs/news.txt b/docs/news.txt
index 361337f1c..7c57e5e38 100644
--- a/docs/news.txt
+++ b/docs/news.txt
@@ -13,6 +13,9 @@ svn trunk
 * Fixed Windows problem with putting the install record in the right
   place, and generating the ``pip`` script with Setuptools.
 
+* Download links that include embedded spaces or other unsafe
+  characters now have those characters %-encoded.
+
 0.2
 ---
 
diff --git a/pip.py b/pip.py
index 7b00895c6..9733775e8 100755
--- a/pip.py
+++ b/pip.py
@@ -2071,7 +2071,8 @@ class HTMLPage(object):
         """Yields all links in the page"""
         for match in self._href_re.finditer(self.content):
             url = match.group(1) or match.group(2) or match.group(3)
-            yield Link(urlparse.urljoin(self.url, url), self)
+            url = self.clean_link(urlparse.urljoin(self.url, url))
+            yield Link(url, self)
 
     def rel_links(self):
         for url in self.explicit_rel_links():
@@ -2092,7 +2093,8 @@ class HTMLPage(object):
             if not match:
                 continue
             url = match.group(1) or match.group(2) or match.group(3)
-            yield Link(urlparse.urljoin(self.url, url), self)
+            url = self.clean_link(urlparse.urljoin(self.url, url))
+            yield Link(url, self)
 
     def scraped_rel_links(self):
         for regex in (self._homepage_re, self._download_re):
@@ -2105,9 +2107,18 @@ class HTMLPage(object):
             url = match.group(1) or match.group(2) or match.group(3)
             if not url:
                 continue
-            url = urlparse.urljoin(self.url, url)
+            url = self.clean_link(urlparse.urljoin(self.url, url))
             yield Link(url, self)
 
+    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
+
+    def clean_link(self, url):
+        """Makes sure a link is fully encoded. That is, if a ' ' shows up in
+        the link, it will be rewritten to %20 (while not over-quoting
+        % or other characters)."""
+        return self._clean_re.sub(
+            lambda match: '%%%02x' % ord(match.group(0)), url)
+
 
 class PageCache(object):
     """Cache of HTML pages"""
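
For reference, a minimal standalone sketch of the behaviour the new ``clean_link`` helper adds (not part of the patch itself; the regex is copied from ``_clean_re`` above, and the example URL is made up for illustration):

    import re

    # Anything outside this whitelist of URL-safe characters gets %-encoded.
    # '%' itself is whitelisted, so already-encoded URLs are not double-quoted.
    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    def clean_link(url):
        # Replace each unsafe character with its zero-padded hex escape.
        return _clean_re.sub(
            lambda match: '%%%02x' % ord(match.group(0)), url)

    print(clean_link('http://example.com/dists/My Package-1.0.tar.gz'))
    # -> http://example.com/dists/My%20Package-1.0.tar.gz

Only characters outside the whitelist are rewritten, which is why an embedded space becomes %20 while existing %-escapes and ordinary URL punctuation pass through untouched.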