import codecs
import locale


# Byte-order marks paired with the codec used to decode the bytes that
# follow them.  Order matters: the UTF-32 little-endian BOM (FF FE 00 00)
# begins with the UTF-16 little-endian BOM (FF FE), so the 4-byte UTF-32
# marks must be tested before the 2-byte UTF-16 ones.
BOMS = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32, 'utf-32'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16, 'utf-16'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
]


def auto_decode(data):
    """Decode a bytes string, using a leading BOM to detect the encoding.

    The first entry of ``BOMS`` whose mark prefixes ``data`` selects the
    codec; the mark itself is stripped before decoding.

    Fallback to locale.getpreferredencoding(False) like open() on Python3
    when ``data`` does not start with any known BOM.

    :param data: the raw bytes to decode.
    :return: the decoded text.
    :raises UnicodeDecodeError: if ``data`` is not valid in the encoding
        that was selected.
    """
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return data[len(bom):].decode(encoding)
    return data.decode(locale.getpreferredencoding(False))
class TestEncoding(object):
    """Tests for pip.utils.encoding"""

    def test_auto_decode_utf16_le(self):
        """Input starting with a UTF-16 LE BOM decodes to the bare text."""
        data = (
            b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
            b'=\x001\x00.\x004\x00.\x002\x00'
        )
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_utf8_no_bom(self):
        """BOM-less input is decoded with the preferred locale encoding."""
        data = u"Fort de café"
        # Pin the preferred encoding so the outcome does not depend on the
        # locale of the machine running the tests (e.g. C or cp1252).
        with patch('locale.getpreferredencoding', return_value='utf-8'):
            assert auto_decode(data.encode('utf8')) == data