
Fix utils.encoding.auto_decode() LookupError with invalid encodings

utils.encoding.auto_decode() was broken when decoding Big Endian BOM
byte-strings on Little Endian or vice versa.
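
For illustration (not part of the commit), the failure is easy to reproduce
with codecs.lookup(): generic spellings such as 'utf16' happen to be accepted
aliases, but the endian-specific ones like 'utf16-be' are not, so a file whose
BOM did not match the host byte order hit an unknown codec name:

import codecs

# Illustrative check only: the generic aliases resolve, the old
# endian-specific spellings do not.
for name in ('utf8', 'utf16', 'utf16-be', 'utf-16-be'):
    try:
        codecs.lookup(name)
        print(name, '-> OK')
    except LookupError:
        print(name, '-> LookupError')

# On CPython this prints:
#   utf8 -> OK
#   utf16 -> OK
#   utf16-be -> LookupError
#   utf-16-be -> OK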

The TestEncoding.test_auto_decode_utf_16_le test was failing on Big Endian
systems, such as Fedora's s390x builders. A similar test with a BE BOM,
test_auto_decode_utf_16_be, was added in order to reproduce the problem on a
Little Endian system (which is much easier to come by).

A regression test was added to check that all encodings listed in
utils.encoding.BOMS are valid.

Fixes https://github.com/pypa/pip/issues/6054
Miro Hrončok 2019-03-01 15:49:24 +01:00
parent 729404d4c5
commit d48475d008
3 changed files with 28 additions and 9 deletions

news/6054.bugfix Normal file

@@ -0,0 +1,4 @@
+Fix ``utils.encoding.auto_decode()`` ``LookupError`` with invalid encodings.
+``utils.encoding.auto_decode()`` was broken when decoding Big Endian BOM
+byte-strings on Little Endian or vice versa.

src/pip/_internal/utils/encoding.py

@@ -9,13 +9,13 @@ if MYPY_CHECK_RUNNING:
     from typing import List, Tuple, Text
 
 BOMS = [
-    (codecs.BOM_UTF8, 'utf8'),
-    (codecs.BOM_UTF16, 'utf16'),
-    (codecs.BOM_UTF16_BE, 'utf16-be'),
-    (codecs.BOM_UTF16_LE, 'utf16-le'),
-    (codecs.BOM_UTF32, 'utf32'),
-    (codecs.BOM_UTF32_BE, 'utf32-be'),
-    (codecs.BOM_UTF32_LE, 'utf32-le'),
+    (codecs.BOM_UTF8, 'utf-8'),
+    (codecs.BOM_UTF16, 'utf-16'),
+    (codecs.BOM_UTF16_BE, 'utf-16-be'),
+    (codecs.BOM_UTF16_LE, 'utf-16-le'),
+    (codecs.BOM_UTF32, 'utf-32'),
+    (codecs.BOM_UTF32_BE, 'utf-32-be'),
+    (codecs.BOM_UTF32_LE, 'utf-32-le'),
 ]  # type: List[Tuple[bytes, Text]]
 
 ENCODING_RE = re.compile(br'coding[:=]\s*([-\w.]+)')
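
The hunk above only corrects the codec names in the BOM table; auto_decode()
itself is not shown. As a rough sketch of how such a table is typically
consumed (assumed shape, not pip's verbatim implementation), which is where an
unresolvable name surfaced as a LookupError:

import locale

# Hypothetical sketch; the function name and the fallback are assumptions.
def auto_decode_sketch(data, boms):
    for bom, encoding in boms:
        if data.startswith(bom):
            # With the corrected names, every entry can be looked up
            # regardless of the host byte order.
            return data[len(bom):].decode(encoding)
    # No BOM: fall back to the locale encoding (the module's ENCODING_RE
    # suggests a PEP 263 coding-declaration check as well; omitted here).
    return data.decode(locale.getpreferredencoding(False))

With the old table, a big-endian UTF-16 file read on a little-endian host
matched only the 'utf16-be' entry, so the decode() call raised LookupError
before a single byte was decoded.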

tests/unit/test_utils.py

@@ -4,6 +4,7 @@
 util tests
 
 """
+import codecs
 import itertools
 import os
 import shutil
@@ -20,7 +21,7 @@ from mock import Mock, patch
 from pip._internal.exceptions import (
     HashMismatch, HashMissing, InstallationError, UnsupportedPythonVersion,
 )
-from pip._internal.utils.encoding import auto_decode
+from pip._internal.utils.encoding import BOMS, auto_decode
 from pip._internal.utils.glibc import check_glibc_version
 from pip._internal.utils.hashes import Hashes, MissingHashes
 from pip._internal.utils.misc import (
@@ -462,11 +463,20 @@ class TestHashes(object):
 class TestEncoding(object):
     """Tests for pip._internal.utils.encoding"""
 
-    def test_auto_decode_utf16_le(self):
+    def test_auto_decode_utf_16_le(self):
         data = (
             b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
             b'=\x001\x00.\x004\x00.\x002\x00'
         )
+        assert data.startswith(codecs.BOM_UTF16_LE)
         assert auto_decode(data) == "Django==1.4.2"
 
+    def test_auto_decode_utf_16_be(self):
+        data = (
+            b'\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00='
+            b'\x00=\x001\x00.\x004\x00.\x002'
+        )
+        assert data.startswith(codecs.BOM_UTF16_BE)
+        assert auto_decode(data) == "Django==1.4.2"
+
     def test_auto_decode_no_bom(self):
@@ -486,6 +496,11 @@ class TestEncoding(object):
         ret = auto_decode(data.encode(sys.getdefaultencoding()))
         assert ret == data
 
+    @pytest.mark.parametrize('encoding', [encoding for bom, encoding in BOMS])
+    def test_all_encodings_are_valid(self, encoding):
+        # we really only care that there is no LookupError
+        assert ''.encode(encoding).decode(encoding) == ''
+
 
 class TestTempDirectory(object):
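
As a side note (not part of the commit), the literal byte strings in the two
UTF-16 tests can be regenerated from the codec names and the BOM constants,
which makes the expected round trip explicit:

import codecs

text = "Django==1.4.2"

# utf-16-le / utf-16-be never emit a BOM, so prepend it explicitly.
le_data = codecs.BOM_UTF16_LE + text.encode('utf-16-le')
be_data = codecs.BOM_UTF16_BE + text.encode('utf-16-be')

assert le_data == (b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
                   b'=\x001\x00.\x004\x00.\x002\x00')
assert be_data == (b'\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00='
                   b'\x00=\x001\x00.\x004\x00.\x002')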