Start parsing the "data-yanked" attribute.

This commit is contained in:
Chris Jerdonek 2019-06-24 08:41:45 -07:00
parent fdde0b483e
commit 8666bb1a5d
2 changed files with 71 additions and 8 deletions

View File

@ -43,15 +43,17 @@ if MYPY_CHECK_RUNNING:
Any, Callable, Iterable, Iterator, List, MutableMapping, Optional,
Sequence, Set, Tuple, Union,
)
import xml.etree.ElementTree
from pip._vendor.packaging.version import _BaseVersion
from pip._vendor.requests import Response
from pip._internal.models.search_scope import SearchScope
from pip._internal.req import InstallRequirement
from pip._internal.download import PipSession
SecureOrigin = Tuple[str, str, Optional[str]]
BuildTag = Tuple[Any, ...] # either empty tuple or Tuple[int, str]
CandidateSortingKey = Tuple[int, _BaseVersion, BuildTag, Optional[int]]
HTMLElement = xml.etree.ElementTree.Element
SecureOrigin = Tuple[str, str, Optional[str]]
__all__ = ['FormatControl', 'FoundCandidates', 'PackageFinder']
@ -1151,6 +1153,37 @@ def _clean_link(url):
return urllib_parse.urlunparse(result._replace(path=path))
def _link_from_element(
anchor, # type: HTMLElement
page_url, # type: str
base_url, # type: str
):
# type: (...) -> Optional[Link]
"""
Convert an anchor element in a simple repository page to a Link.
"""
href = anchor.get("href")
if not href:
return None
url = _clean_link(urllib_parse.urljoin(base_url, href))
pyrequire = anchor.get('data-requires-python')
pyrequire = unescape(pyrequire) if pyrequire else None
yanked_reason = anchor.get('data-yanked')
if yanked_reason:
yanked_reason = unescape(yanked_reason)
link = Link(
url,
comes_from=page_url,
requires_python=pyrequire,
yanked_reason=yanked_reason,
)
return link
class HTMLPage(object):
"""Represents one page, along with its URL"""
@ -1173,12 +1206,14 @@ class HTMLPage(object):
)
base_url = _determine_base_url(document, self.url)
for anchor in document.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
url = _clean_link(urllib_parse.urljoin(base_url, href))
pyrequire = anchor.get('data-requires-python')
pyrequire = unescape(pyrequire) if pyrequire else None
yield Link(url, self.url, requires_python=pyrequire)
link = _link_from_element(
anchor,
page_url=self.url,
base_url=base_url,
)
if link is None:
continue
yield link
Search = namedtuple('Search', 'supplied canonical formats')

View File

@ -7,7 +7,7 @@ from pip._vendor import html5lib, requests
from pip._internal.download import PipSession
from pip._internal.index import (
CandidateEvaluator, Link, PackageFinder, Search,
CandidateEvaluator, HTMLPage, Link, PackageFinder, Search,
_check_link_requires_python, _clean_link, _determine_base_url,
_egg_info_matches, _find_name_version_sep, _get_html_page,
)
@ -521,3 +521,31 @@ def test_clean_link_windows(url, clean_url):
@pytest.mark.skipif("sys.platform == 'win32'")
def test_clean_link_non_windows(url, clean_url):
assert(_clean_link(url) == clean_url)
class TestHTMLPage:
    """Tests for links produced by HTMLPage.iter_links()."""

    @pytest.mark.parametrize(
        'anchor_html, expected',
        [
            # Attribute absent.
            ('<a href="/pkg1-1.0.tar.gz"></a>', None),
            # Attribute present but given no value.
            ('<a href="/pkg2-1.0.tar.gz" data-yanked></a>', ''),
            # Attribute explicitly set to the empty string.
            ('<a href="/pkg3-1.0.tar.gz" data-yanked=""></a>', ''),
            # Attribute set to a non-empty string.
            ('<a href="/pkg4-1.0.tar.gz" data-yanked="error"></a>', 'error'),
            # Attribute value containing an escaped character.
            (
                '<a href="/pkg4-1.0.tar.gz" data-yanked="version &lt 1"></a>',
                'version < 1',
            ),
        ],
    )
    def test_iter_links__yanked_reason(self, anchor_html, expected):
        """
        Wrap a single anchor in a minimal HTML document and check the
        yanked_reason of the one Link that iter_links() yields for it.
        """
        document = '<html><body>{}</body></html>'.format(anchor_html)
        page = HTMLPage(
            document.encode('utf-8'),
            url='https://example.com/simple/',
        )
        # Unpacking fails unless exactly one link was produced.
        [only_link] = list(page.iter_links())
        assert only_link.yanked_reason == expected