Merge pull request #8223 from uranusjr/unicode-wheel

2020-05-19 18:02:06 +05:30 · 2020-05-19 18:02:06 +05:30 · 15f0863a65
parent 993531179c 0a31845007
commit 15f0863a65
6 changed files with 159 additions and 72 deletions
--- a/news/5712.bugfix
+++ b/news/5712.bugfix
@ -0,0 +1,2 @@
+Correctly treat wheels containing non-ASCII file contents so they can be
+installed on Windows.
--- a/src/pip/_internal/operations/install/wheel.py
+++ b/src/pip/_internal/operations/install/wheel.py
@ -1,9 +1,6 @@
 """Support for installing and building the "wheel" binary package format.
 """

-# The following comment should be removed at some point in the future.
-# mypy: strict-optional=False
-
 from __future__ import absolute_import

 import collections
@ -24,7 +21,14 @@ from zipfile import ZipFile
 from pip._vendor import pkg_resources
 from pip._vendor.distlib.scripts import ScriptMaker
 from pip._vendor.distlib.util import get_export_entry
-from pip._vendor.six import StringIO
+from pip._vendor.six import (
+    PY2,
+    StringIO,
+    ensure_str,
+    ensure_text,
+    itervalues,
+    text_type,
+)

 from pip._internal.exceptions import InstallationError
 from pip._internal.locations import get_major_minor_version
@ -43,28 +47,35 @@ if not MYPY_CHECK_RUNNING:
    from pip._internal.utils.typing import cast
 else:
    from email.message import Message
-    import typing  # noqa F401
    from typing import (
-        Dict, List, Optional, Sequence, Tuple, Any,
-        Iterable, Iterator, Callable, Set, IO, cast
+        Any,
+        Callable,
+        Dict,
+        IO,
+        Iterable,
+        Iterator,
+        List,
+        NewType,
+        Optional,
+        Sequence,
+        Set,
+        Tuple,
+        Union,
+        cast,
    )

    from pip._internal.models.scheme import Scheme
    from pip._internal.utils.filesystem import NamedTemporaryFileResult

-    InstalledCSVRow = Tuple[str, ...]
+    RecordPath = NewType('RecordPath', text_type)
+    InstalledCSVRow = Tuple[RecordPath, str, Union[int, str]]


 logger = logging.getLogger(__name__)


-def normpath(src, p):
-    # type: (str, str) -> str
-    return os.path.relpath(src, p).replace(os.path.sep, '/')
-
-
 def rehash(path, blocksize=1 << 20):
-    # type: (str, int) -> Tuple[str, str]
+    # type: (text_type, int) -> Tuple[str, str]
    """Return (encoded_digest, length) for path using hashlib.sha256()"""
    h, length = hash_file(path, blocksize)
    digest = 'sha256=' + urlsafe_b64encode(
@ -79,14 +90,14 @@ def csv_io_kwargs(mode):
    """Return keyword arguments to properly open a CSV file
    in the given mode.
    """
-    if sys.version_info.major < 3:
+    if PY2:
        return {'mode': '{}b'.format(mode)}
    else:
-        return {'mode': mode, 'newline': ''}
+        return {'mode': mode, 'newline': '', 'encoding': 'utf-8'}


 def fix_script(path):
-    # type: (str) -> Optional[bool]
+    # type: (text_type) -> Optional[bool]
    """Replace #!python with #!/path/to/python
    Return True if file was changed.
    """
@ -217,9 +228,12 @@ def message_about_scripts_not_on_PATH(scripts):
    return "\n".join(msg_lines)


-def sorted_outrows(outrows):
-    # type: (Iterable[InstalledCSVRow]) -> List[InstalledCSVRow]
-    """Return the given rows of a RECORD file in sorted order.
+def _normalized_outrows(outrows):
+    # type: (Iterable[InstalledCSVRow]) -> List[Tuple[str, str, str]]
+    """Normalize the given rows of a RECORD file.
+
+    Items in each row are converted into str. Rows are then sorted to make
+    the value more predictable for tests.

    Each row is a 3-tuple (path, hash, size) and corresponds to a record of
    a RECORD file (see PEP 376 and PEP 427 for details).  For the rows
@ -234,13 +248,35 @@ def sorted_outrows(outrows):
    # coerce each element to a string to avoid a TypeError in this case.
    # For additional background, see--
    # https://github.com/pypa/pip/issues/5868
-    return sorted(outrows, key=lambda row: tuple(str(x) for x in row))
+    return sorted(
+        (ensure_str(record_path, encoding='utf-8'), hash_, str(size))
+        for record_path, hash_, size in outrows
+    )
+
+
+def _record_to_fs_path(record_path):
+    # type: (RecordPath) -> text_type
+    return record_path
+
+
+def _fs_to_record_path(path, relative_to=None):
+    # type: (text_type, Optional[text_type]) -> RecordPath
+    if relative_to is not None:
+        path = os.path.relpath(path, relative_to)
+    path = path.replace(os.path.sep, '/')
+    return cast('RecordPath', path)
+
+
+def _parse_record_path(record_column):
+    # type: (str) -> RecordPath
+    p = ensure_text(record_column, encoding='utf-8')
+    return cast('RecordPath', p)


 def get_csv_rows_for_installed(
    old_csv_rows,  # type: Iterable[List[str]]
-    installed,  # type: Dict[str, str]
-    changed,  # type: Set[str]
+    installed,  # type: Dict[RecordPath, RecordPath]
+    changed,  # type: Set[RecordPath]
    generated,  # type: List[str]
    lib_dir,  # type: str
 ):
@ -255,21 +291,20 @@ def get_csv_rows_for_installed(
            logger.warning(
                'RECORD line has more than three elements: {}'.format(row)
            )
-        # Make a copy because we are mutating the row.
-        row = list(row)
-        old_path = row[0]
-        new_path = installed.pop(old_path, old_path)
-        row[0] = new_path
-        if new_path in changed:
-            digest, length = rehash(new_path)
-            row[1] = digest
-            row[2] = length
-        installed_rows.append(tuple(row))
+        old_record_path = _parse_record_path(row[0])
+        new_record_path = installed.pop(old_record_path, old_record_path)
+        if new_record_path in changed:
+            digest, length = rehash(_record_to_fs_path(new_record_path))
+        else:
+            digest = row[1] if len(row) > 1 else ''
+            length = row[2] if len(row) > 2 else ''
+        installed_rows.append((new_record_path, digest, length))
    for f in generated:
+        path = _fs_to_record_path(f, lib_dir)
        digest, length = rehash(f)
-        installed_rows.append((normpath(f, lib_dir), digest, str(length)))
-    for f in installed:
-        installed_rows.append((installed[f], '', ''))
+        installed_rows.append((path, digest, length))
+    for installed_record_path in itervalues(installed):
+        installed_rows.append((installed_record_path, '', ''))
    return installed_rows


@ -338,8 +373,8 @@ def install_unpacked_wheel(
    #   installed = files copied from the wheel to the destination
    #   changed = files changed while installing (scripts #! line typically)
    #   generated = files newly generated during the install (script wrappers)
-    installed = {}  # type: Dict[str, str]
-    changed = set()
+    installed = {}  # type: Dict[RecordPath, RecordPath]
+    changed = set()  # type: Set[RecordPath]
    generated = []  # type: List[str]

    # Compile all of the pyc files that we're going to be installing
@ -351,20 +386,20 @@ def install_unpacked_wheel(
        logger.debug(stdout.getvalue())

    def record_installed(srcfile, destfile, modified=False):
-        # type: (str, str, bool) -> None
+        # type: (text_type, text_type, bool) -> None
        """Map archive RECORD paths to installation RECORD paths."""
-        oldpath = normpath(srcfile, wheeldir)
-        newpath = normpath(destfile, lib_dir)
+        oldpath = _fs_to_record_path(srcfile, wheeldir)
+        newpath = _fs_to_record_path(destfile, lib_dir)
        installed[oldpath] = newpath
        if modified:
-            changed.add(destfile)
+            changed.add(_fs_to_record_path(destfile))

    def clobber(
-            source,  # type: str
-            dest,  # type: str
+            source,  # type: text_type
+            dest,  # type: text_type
            is_base,  # type: bool
-            fixer=None,  # type: Optional[Callable[[str], Any]]
-            filter=None  # type: Optional[Callable[[str], bool]]
+            fixer=None,  # type: Optional[Callable[[text_type], Any]]
+            filter=None  # type: Optional[Callable[[text_type], bool]]
    ):
        # type: (...) -> None
        ensure_dir(dest)  # common for the 'include' path
@ -423,7 +458,11 @@ def install_unpacked_wheel(
                    changed = fixer(destfile)
                record_installed(srcfile, destfile, changed)

-    clobber(source, lib_dir, True)
+    clobber(
+        ensure_text(source, encoding=sys.getfilesystemencoding()),
+        ensure_text(lib_dir, encoding=sys.getfilesystemencoding()),
+        True,
+    )

    dest_info_dir = os.path.join(lib_dir, info_dir)

@ -432,7 +471,7 @@ def install_unpacked_wheel(
    console, gui = get_entrypoints(ep_file)

    def is_entrypoint_wrapper(name):
-        # type: (str) -> bool
+        # type: (text_type) -> bool
        # EP, EP.exe and EP-script.py are scripts generated for
        # entry point EP by setuptools
        if name.lower().endswith('.exe'):
@ -456,7 +495,13 @@ def install_unpacked_wheel(
                filter = is_entrypoint_wrapper
            source = os.path.join(wheeldir, datadir, subdir)
            dest = getattr(scheme, subdir)
-            clobber(source, dest, False, fixer=fixer, filter=filter)
+            clobber(
+                ensure_text(source, encoding=sys.getfilesystemencoding()),
+                ensure_text(dest, encoding=sys.getfilesystemencoding()),
+                False,
+                fixer=fixer,
+                filter=filter,
+            )

    maker = PipScriptMaker(None, scheme.scripts)

@ -606,16 +651,11 @@ def install_unpacked_wheel(
            generated=generated,
            lib_dir=lib_dir)
    with _generate_file(record_path, **csv_io_kwargs('w')) as record_file:
-
-        # The type mypy infers for record_file using reveal_type
-        # is different for Python 3 (typing.IO[Any]) and
-        # Python 2 (typing.BinaryIO), leading us to explicitly
-        # cast to typing.IO[str] as a workaround
-        # for bad Python 2 behaviour
-        record_file_obj = cast('IO[str]', record_file)
-
-        writer = csv.writer(record_file_obj)
-        writer.writerows(sorted_outrows(rows))  # sort to simplify testing
+        # The type mypy infers for record_file is different for Python 3
+        # (typing.IO[Any]) and Python 2 (typing.BinaryIO). We explicitly
+        # cast to typing.IO[str] as a workaround.
+        writer = csv.writer(cast('IO[str]', record_file))
+        writer.writerows(_normalized_outrows(rows))


 def install_wheel(
--- a/src/pip/_internal/utils/misc.py
+++ b/src/pip/_internal/utils/misc.py
@ -131,7 +131,7 @@ def get_prog():
 # Retry every half second for up to 3 seconds
@retry(stop_max_delay=3000, wait_fixed=500)
 def rmtree(dir, ignore_errors=False):
-    # type: (str, bool) -> None
+    # type: (Text, bool) -> None
    shutil.rmtree(dir, ignore_errors=ignore_errors,
                  onerror=rmtree_errorhandler)

@ -876,7 +876,7 @@ def is_console_interactive():


 def hash_file(path, blocksize=1 << 20):
-    # type: (str, int) -> Tuple[Any, int]
+    # type: (Text, int) -> Tuple[Any, int]
    """Return (hash, length) for path using hashlib.sha256()
    """

--- a/src/pip/_internal/utils/temp_dir.py
+++ b/src/pip/_internal/utils/temp_dir.py
@ -8,6 +8,7 @@ import tempfile
 from contextlib import contextmanager

 from pip._vendor.contextlib2 import ExitStack
+from pip._vendor.six import ensure_text

 from pip._internal.utils.misc import enum, rmtree
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
@ -193,7 +194,9 @@ class TempDirectory(object):
        """
        self._deleted = True
        if os.path.exists(self._path):
-            rmtree(self._path)
+            # Make sure to pass unicode on Python 2 to make the contents also
+            # use unicode, ensuring non-ASCII names can be represented.
+            rmtree(ensure_text(self._path))


 class AdjacentTempDirectory(TempDirectory):
--- a/tests/functional/test_install_wheel.py
+++ b/tests/functional/test_install_wheel.py
@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import distutils
 import glob
 import os
@ -125,6 +127,36 @@ def test_basic_install_from_wheel_file(script, data):
                                                        result.stdout)


+# Installation seems to work on Python 2, but scripttest fails to check it.
+# Not worth fixing since Python 2 support is being dropped soon anyway.
+@skip_if_python2
+def test_basic_install_from_unicode_wheel(script, data):
+    """
+    Test installing from a wheel that contains non-ASCII file names
+    """
+    make_wheel(
+        'unicode_package',
+        '1.0',
+        extra_files={
+            'வணக்கம்/__init__.py': b'',
+            'வணக்கம்/નમસ્તે.py': b'',
+        },
+    ).save_to_dir(script.scratch_path)
+
+    result = script.pip(
+        'install', 'unicode_package==1.0', '--no-index',
+        '--find-links', script.scratch_path,
+    )
+    dist_info_folder = script.site_packages / 'unicode_package-1.0.dist-info'
+    assert dist_info_folder in result.files_created, str(result)
+
+    file1 = script.site_packages.joinpath('வணக்கம்', '__init__.py')
+    assert file1 in result.files_created, str(result)
+
+    file2 = script.site_packages.joinpath('வணக்கம்', 'નમસ્તે.py')
+    assert file2 in result.files_created, str(result)
+
+
 def test_install_from_wheel_with_headers(script, data):
    """
    Test installing from a wheel file with headers
--- a/tests/unit/test_wheel.py
+++ b/tests/unit/test_wheel.py
@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 """Tests for wheel binary packages and .dist-info."""
 import csv
 import logging
@ -114,8 +116,8 @@ def test_raise_for_invalid_entrypoint_fail(entrypoint):

@pytest.mark.parametrize("outrows, expected", [
    ([
-        ('', '', 'a'),
-        ('', '', ''),
+        (u'', '', 'a'),
+        (u'', '', ''),
    ], [
        ('', '', ''),
        ('', '', 'a'),
@ -123,15 +125,23 @@ def test_raise_for_invalid_entrypoint_fail(entrypoint):
    ([
        # Include an int to check avoiding the following error:
        # > TypeError: '<' not supported between instances of 'str' and 'int'
-        ('', '', 1),
-        ('', '', ''),
+        (u'', '', 1),
+        (u'', '', ''),
    ], [
        ('', '', ''),
-        ('', '', 1),
+        ('', '', '1'),
+    ]),
+    ([
+        # Test that normalization correctly encodes everything for csv.writer().
+        (u'😉', '', 1),
+        (u'', '', ''),
+    ], [
+        ('', '', ''),
+        ('😉', '', '1'),
    ]),
 ])
-def test_sorted_outrows(outrows, expected):
-    actual = wheel.sorted_outrows(outrows)
+def test_normalized_outrows(outrows, expected):
+    actual = wheel._normalized_outrows(outrows)
    assert actual == expected


@ -141,7 +151,7 @@ def call_get_csv_rows_for_installed(tmpdir, text):

    # Test that an installed file appearing in RECORD has its filename
    # updated in the new RECORD file.
-    installed = {'a': 'z'}
+    installed = {u'a': 'z'}
    changed = set()
    generated = []
    lib_dir = '/lib/dir'
@ -180,9 +190,9 @@ def test_get_csv_rows_for_installed__long_lines(tmpdir, caplog):
    outrows = call_get_csv_rows_for_installed(tmpdir, text)

    expected = [
-        ('z', 'b', 'c', 'd'),
+        ('z', 'b', 'c'),
        ('e', 'f', 'g'),
-        ('h', 'i', 'j', 'k'),
+        ('h', 'i', 'j'),
    ]
    assert outrows == expected