diff --git a/news/6054.bugfix b/news/6054.bugfix new file mode 100644 index 000000000..feb76ef41 --- /dev/null +++ b/news/6054.bugfix @@ -0,0 +1,4 @@ +Fix ``utils.encoding.auto_decode()`` ``LookupError`` with invalid encodings. +``utils.encoding.auto_decode()`` was broken when decoding Big Endian BOM +byte-strings on Little Endian or vice versa. + diff --git a/src/pip/_internal/utils/encoding.py b/src/pip/_internal/utils/encoding.py index 9861530c9..30139f2e5 100644 --- a/src/pip/_internal/utils/encoding.py +++ b/src/pip/_internal/utils/encoding.py @@ -9,13 +9,13 @@ if MYPY_CHECK_RUNNING: from typing import List, Tuple, Text BOMS = [ - (codecs.BOM_UTF8, 'utf8'), - (codecs.BOM_UTF16, 'utf16'), - (codecs.BOM_UTF16_BE, 'utf16-be'), - (codecs.BOM_UTF16_LE, 'utf16-le'), - (codecs.BOM_UTF32, 'utf32'), - (codecs.BOM_UTF32_BE, 'utf32-be'), - (codecs.BOM_UTF32_LE, 'utf32-le'), + (codecs.BOM_UTF8, 'utf-8'), + (codecs.BOM_UTF16, 'utf-16'), + (codecs.BOM_UTF16_BE, 'utf-16-be'), + (codecs.BOM_UTF16_LE, 'utf-16-le'), + (codecs.BOM_UTF32, 'utf-32'), + (codecs.BOM_UTF32_BE, 'utf-32-be'), + (codecs.BOM_UTF32_LE, 'utf-32-le'), ] # type: List[Tuple[bytes, Text]] ENCODING_RE = re.compile(br'coding[:=]\s*([-\w.]+)') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 9348dd771..a5d31c238 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -4,6 +4,7 @@ util tests """ +import codecs import itertools import os import shutil @@ -20,7 +21,7 @@ from mock import Mock, patch from pip._internal.exceptions import ( HashMismatch, HashMissing, InstallationError, UnsupportedPythonVersion, ) -from pip._internal.utils.encoding import auto_decode +from pip._internal.utils.encoding import BOMS, auto_decode from pip._internal.utils.glibc import check_glibc_version from pip._internal.utils.hashes import Hashes, MissingHashes from pip._internal.utils.misc import ( @@ -462,11 +463,20 @@ class TestHashes(object): class TestEncoding(object): """Tests for pip._internal.utils.encoding""" - def test_auto_decode_utf16_le(self): + def test_auto_decode_utf_16_le(self): data = ( b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00' b'=\x001\x00.\x004\x00.\x002\x00' ) + assert data.startswith(codecs.BOM_UTF16_LE) + assert auto_decode(data) == "Django==1.4.2" + + def test_auto_decode_utf_16_be(self): + data = ( + b'\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00=' + b'\x00=\x001\x00.\x004\x00.\x002' + ) + assert data.startswith(codecs.BOM_UTF16_BE) assert auto_decode(data) == "Django==1.4.2" def test_auto_decode_no_bom(self): @@ -486,6 +496,11 @@ class TestEncoding(object): ret = auto_decode(data.encode(sys.getdefaultencoding())) assert ret == data + @pytest.mark.parametrize('encoding', [encoding for bom, encoding in BOMS]) + def test_all_encodings_are_valid(self, encoding): + # we really only care that there is no LookupError + assert ''.encode(encoding).decode(encoding) == '' + class TestTempDirectory(object):