Merge pull request #6671 from cjerdonek/make-subprocess-error-non-ascii-cmd

Handle non-ascii commands in Python 2 in make_subprocess_output_error()
This commit is contained in:
Chris Jerdonek 2019-07-04 15:43:25 -07:00 committed by GitHub
commit 0d5a98390e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 132 additions and 25 deletions

View File

@ -15,7 +15,7 @@ from pip._vendor.urllib3.util import IS_PYOPENSSL
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
if MYPY_CHECK_RUNNING:
from typing import Tuple, Text
from typing import Optional, Text, Tuple, Union
try:
import _ssl # noqa
@ -83,18 +83,29 @@ else:
backslashreplace_decode = "backslashreplace_decode"
def console_to_str(data):
# type: (bytes) -> Text
"""Return a string, safe for output, of subprocess output.
We assume the data is in the locale preferred encoding.
If it won't decode properly, we warn the user but decode as
best we can.
We also ensure that the output can be safely written to
standard output without encoding errors.
def str_to_display(data, desc=None):
# type: (Union[bytes, Text], Optional[str]) -> Text
"""
For display or logging purposes, convert a bytes object (or text) to
text (e.g. unicode in Python 2) safe for output.
:param desc: An optional phrase describing the input data, for use in
the log message if a warning is logged. Defaults to "Bytes object".
This function should never error out and so can take a best effort
approach. It is okay to be lossy if needed since the return value is
just for display.
We assume the data is in the locale preferred encoding. If it won't
decode properly, we warn the user but decode as best we can.
We also ensure that the output can be safely written to standard output
without encoding errors.
"""
if isinstance(data, text_type):
return data
# Otherwise, data is a bytes object (str in Python 2).
# First, get the encoding we assume. This is the preferred
# encoding for the locale, unless that is not found, or
# it is ASCII, in which case assume UTF-8
@ -107,10 +118,10 @@ def console_to_str(data):
try:
decoded_data = data.decode(encoding)
except UnicodeDecodeError:
logger.warning(
"Subprocess output does not appear to be encoded as %s",
encoding,
)
if desc is None:
desc = 'Bytes object'
msg_format = '{} does not appear to be encoded as %s'.format(desc)
logger.warning(msg_format, encoding)
decoded_data = data.decode(encoding, errors=backslashreplace_decode)
# Make sure we can print the output, by encoding it to the output
@ -138,6 +149,13 @@ def console_to_str(data):
return decoded_data
def console_to_str(data):
    # type: (bytes) -> Text
    """
    Convert subprocess output to text that is safe to display.

    This delegates to str_to_display(), describing the data as
    "Subprocess output" for use in any warning that gets logged.

    :param data: The raw bytes produced by the subprocess.
    """
    display_text = str_to_display(data, desc='Subprocess output')
    return display_text
if sys.version_info >= (3,):
def native_str(s, replace=False):
# type: (str, bool) -> str

View File

@ -35,7 +35,7 @@ from pip._internal.locations import (
write_delete_marker_file,
)
from pip._internal.utils.compat import (
WINDOWS, console_to_str, expanduser, stdlib_pkgs,
WINDOWS, console_to_str, expanduser, stdlib_pkgs, str_to_display,
)
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
@ -751,19 +751,25 @@ def make_subprocess_output_error(
:param lines: A list of lines, each ending with a newline.
"""
command = format_command_args(cmd_args)
# Convert `command` to text (unicode in Python 2) so we can use it as
# an argument in the unicode format string below. This avoids
# "UnicodeDecodeError: 'ascii' codec can't decode byte ..." in Python 2
# when the formatted command contains a non-ascii character.
command_display = str_to_display(command, desc='command bytes')
# We know the joined output value ends in a newline.
output = ''.join(lines)
msg = (
# We need to mark this explicitly as a unicode string to avoid
# "UnicodeEncodeError: 'ascii' codec can't encode character ..."
# errors in Python 2 since e.g. `output` is a unicode string.
# Use a unicode string to avoid "UnicodeEncodeError: 'ascii'
# codec can't encode character ..." in Python 2 when a format
# argument (e.g. `output`) has a non-ascii character.
u'Command errored out with exit status {exit_status}:\n'
' command: {command}\n'
' command: {command_display}\n'
' cwd: {cwd}\n'
'Complete output ({line_count} lines):\n{output}{divider}'
).format(
exit_status=exit_status,
command=command,
command_display=command_display,
cwd=cwd,
line_count=len(lines),
output=output,

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import locale
import os
@ -5,7 +7,7 @@ import pytest
import pip._internal.utils.compat as pip_compat
from pip._internal.utils.compat import (
console_to_str, expanduser, get_path_uid, native_str,
console_to_str, expanduser, get_path_uid, native_str, str_to_display,
)
@ -45,6 +47,58 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
get_path_uid(fs)
@pytest.mark.parametrize('data, expected', [
    # Test native str input that is pure ascii.
    ('abc', u'abc'),
    # Test text (unicode in Python 2) input.
    (u'abc', u'abc'),
    # Test text input with non-ascii characters.
    (u'déf', u'déf'),
])
def test_str_to_display(data, expected):
    """
    Test str_to_display() without overriding the preferred encoding.
    """
    displayed = str_to_display(data)
    assert displayed == expected, (
        # Include the encoding in the failure message since it is
        # environment-dependent and helps with troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )
@pytest.mark.parametrize('data, encoding, expected', [
    # Test str input with non-ascii characters.
    ('déf', 'utf-8', u'déf'),
    # Test bytes input with non-ascii characters:
    (u'déf'.encode('utf-8'), 'utf-8', u'déf'),
    # Test a Windows encoding.
    (u'déf'.encode('cp1252'), 'cp1252', u'déf'),
    # Test a Windows encoding with incompatibly encoded text.  The UTF-8
    # bytes b'd\xc3\xa9f' decode without error as cp1252, but each byte
    # becomes its own character, so the result is mojibake rather than
    # the original text.  The expected value is spelled with escapes so
    # that it can't be silently "corrected" by an editor or a re-encoding
    # of this file.
    (u'déf'.encode('utf-8'), 'cp1252', u'd\xc3\xa9f'),
])
def test_str_to_display__encoding(monkeypatch, data, encoding, expected):
    """
    Test str_to_display() with the preferred encoding forced to `encoding`.

    :param data: The str or bytes value to pass to str_to_display().
    :param encoding: The value locale.getpreferredencoding() is patched
        to return for the duration of the test.
    :param expected: The text that str_to_display() should return.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: encoding)
    actual = str_to_display(data)
    assert actual == expected, (
        # Show the encoding for easier troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )
def test_str_to_display__decode_error(monkeypatch, caplog):
    """
    Test bytes that can't be decoded using the preferred encoding.

    The undecodable bytes should come back backslash-escaped, and exactly
    one warning should be logged using the default "Bytes object"
    description.
    """
    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
    # Encode with an incompatible encoding.  The BOM and byte order are
    # spelled out explicitly because plain 'utf-16' encodes using the
    # platform's native byte order, which would make the expected value
    # below wrong on big-endian machines.
    data = b'\xff\xfe' + u'ab'.encode('utf-16-le')
    actual = str_to_display(data)

    # The two BOM bytes aren't valid UTF-8 and so are escaped; the
    # remaining bytes (including the NULs) decode as-is.
    assert actual == u'\\xff\\xfea\x00b\x00', (
        # Show the encoding for easier troubleshooting.
        'encoding: {!r}'.format(locale.getpreferredencoding())
    )

    assert len(caplog.records) == 1
    record = caplog.records[0]
    assert record.levelname == 'WARNING'
    assert record.message == (
        'Bytes object does not appear to be encoded as utf-8'
    )
def test_console_to_str(monkeypatch):
some_bytes = b"a\xE9\xC3\xE9b"
encodings = ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',

View File

@ -6,6 +6,7 @@ util tests
"""
import codecs
import itertools
import locale
import os
import shutil
import stat
@ -767,10 +768,38 @@ def test_make_subprocess_output_error():
assert actual == expected, 'actual: {}'.format(actual)
# This test is mainly important for checking unicode in Python 2.
def test_make_subprocess_output_error__unicode():
def test_make_subprocess_output_error__non_ascii_command_arg(monkeypatch):
"""
Test a line with non-ascii unicode characters.
Test a command argument with a non-ascii character.
"""
cmd_args = ['foo', 'déf']
if sys.version_info[0] == 2:
# Check in Python 2 that the str (bytes object) with the non-ascii
# character has the encoding we expect. (This comes from the source
# code encoding at the top of the file.)
assert cmd_args[1].decode('utf-8') == u'déf'
# We need to monkeypatch so the encoding will be correct on Windows.
monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
actual = make_subprocess_output_error(
cmd_args=cmd_args,
cwd='/path/to/cwd',
lines=[],
exit_status=1,
)
expected = dedent(u"""\
Command errored out with exit status 1:
command: foo 'déf'
cwd: /path/to/cwd
Complete output (0 lines):
----------------------------------------""")
assert actual == expected, u'actual: {}'.format(actual)
# This test is mainly important for checking unicode in Python 2.
def test_make_subprocess_output_error__non_ascii_line():
"""
Test a line with a non-ascii character.
"""
lines = [u'curly-quote: \u2018\n']
actual = make_subprocess_output_error(