utils: decode requirement files according to their BOM if present

This commit is contained in:
Xavier Fernandez 2016-02-12 23:41:21 +01:00
parent 0c73957b6d
commit e2889268bd
3 changed files with 44 additions and 2 deletions

View File

@ -29,6 +29,7 @@ from pip.models import PyPI
from pip.utils import (splitext, rmtree, format_size, display_path,
backup_dir, ask_path_exists, unpack_file,
ARCHIVE_EXTENSIONS, consume, call_subprocess)
from pip.utils.encoding import auto_decode
from pip.utils.filesystem import check_path_owner
from pip.utils.logging import indent_log
from pip.utils.setuptools_build import SETUPTOOLS_SHIM
@ -413,8 +414,8 @@ def get_file_content(url, comes_from=None, session=None):
else:
return resp.url, resp.content
try:
with open(url) as f:
content = f.read()
with open(url, 'rb') as f:
content = auto_decode(f.read())
except IOError as exc:
raise InstallationError(
'Could not open requirements file: %s' % str(exc)

23
pip/utils/encoding.py Normal file
View File

@ -0,0 +1,23 @@
import codecs
import locale
# Byte-order marks paired with the codec used to decode the bytes that
# follow them.
#
# auto_decode() uses the FIRST entry whose BOM prefixes the data, so the
# order matters: the UTF-32 little-endian BOM (FF FE 00 00) begins with the
# UTF-16 little-endian BOM (FF FE).  The UTF-32 entries must therefore be
# listed before the UTF-16 ones, otherwise UTF-32 input is decoded as UTF-16.
#
# Codec names use the hyphenated spelling: shortened forms such as
# 'utf16-be' or 'utf32-le' are not registered codec aliases and make
# bytes.decode() raise LookupError.
BOMS = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32, 'utf-32'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16, 'utf-16'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
]
def auto_decode(data):
    """Decode a byte string, honouring a leading byte-order mark.

    The entries of BOMS are tried in order; for the first one whose BOM
    prefixes ``data``, the BOM is stripped and the remainder is decoded with
    the paired codec.  When no BOM matches, the data is decoded with
    locale.getpreferredencoding(False), like open() on Python3.
    """
    # First (BOM length, codec) pair whose BOM starts the data, if any.
    detected = next(
        ((len(mark), codec) for mark, codec in BOMS
         if data.startswith(mark)),
        None,
    )
    if detected is None:
        return data.decode(locale.getpreferredencoding(False))
    skip, codec = detected
    return data[skip:].decode(codec)

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
"""
util tests
@ -15,6 +17,7 @@ from mock import Mock, patch
from pip.exceptions import HashMismatch, HashMissing, InstallationError
from pip.utils import (egg_link_path, get_installed_distributions,
untar_file, unzip_file, rmtree, normalize_path)
from pip.utils.encoding import auto_decode
from pip.utils.hashes import Hashes, MissingHashes
from pip._vendor.six import BytesIO
@ -447,3 +450,18 @@ class TestHashes(object):
assert Hashes({'sha256': 'dummy'})
assert not Hashes()
assert not Hashes({})
class TestEncoding(object):
    """Tests for pip.utils.encoding"""

    def test_auto_decode_utf16_le(self):
        # UTF-16 little-endian BOM (FF FE) followed by the encoded text.
        data = (
            b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
            b'=\x001\x00.\x004\x00.\x002\x00'
        )
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_utf8_no_bom(self):
        data = u"Fort de café"
        # Without a BOM auto_decode falls back to the locale's preferred
        # encoding.  Pin it to UTF-8 so the outcome does not depend on the
        # environment the test suite runs in (e.g. LC_ALL=C).
        with patch('locale.getpreferredencoding', return_value='utf8'):
            assert auto_decode(data.encode('utf8')) == data