From be20a75c108b5db5ca0dc097e6f46a3ebccfd48a Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Sat, 28 Jan 2023 20:41:43 +0000 Subject: [PATCH] Upgrade chardet to 5.1.0 --- news/chardet.vendor.rst | 1 + src/pip/_vendor/chardet.pyi | 1 - src/pip/_vendor/chardet/__init__.py | 36 +++- src/pip/_vendor/chardet/big5prober.py | 6 +- src/pip/_vendor/chardet/chardistribution.py | 54 +++--- src/pip/_vendor/chardet/charsetgroupprober.py | 31 ++-- src/pip/_vendor/chardet/charsetprober.py | 35 ++-- src/pip/_vendor/chardet/cli/chardetect.py | 42 ++++- src/pip/_vendor/chardet/codingstatemachine.py | 16 +- .../_vendor/chardet/codingstatemachinedict.py | 19 ++ src/pip/_vendor/chardet/cp949prober.py | 6 +- src/pip/_vendor/chardet/enums.py | 9 +- src/pip/_vendor/chardet/escprober.py | 26 +-- src/pip/_vendor/chardet/escsm.py | 9 +- src/pip/_vendor/chardet/eucjpprober.py | 19 +- src/pip/_vendor/chardet/euckrprober.py | 6 +- src/pip/_vendor/chardet/euctwprober.py | 6 +- src/pip/_vendor/chardet/gb2312prober.py | 6 +- src/pip/_vendor/chardet/hebrewprober.py | 56 +++--- src/pip/_vendor/chardet/johabprober.py | 6 +- src/pip/_vendor/chardet/jpcntx.py | 31 ++-- src/pip/_vendor/chardet/latin1prober.py | 18 +- src/pip/_vendor/chardet/macromanprober.py | 162 ++++++++++++++++++ src/pip/_vendor/chardet/mbcharsetprober.py | 32 ++-- src/pip/_vendor/chardet/mbcsgroupprober.py | 3 +- src/pip/_vendor/chardet/mbcssm.py | 23 +-- src/pip/_vendor/chardet/metadata/languages.py | 37 ++-- src/pip/_vendor/chardet/py.typed | 0 src/pip/_vendor/chardet/resultdict.py | 16 ++ src/pip/_vendor/chardet/sbcharsetprober.py | 52 +++--- src/pip/_vendor/chardet/sbcsgroupprober.py | 2 +- src/pip/_vendor/chardet/sjisprober.py | 19 +- src/pip/_vendor/chardet/universaldetector.py | 68 ++++++-- src/pip/_vendor/chardet/utf1632prober.py | 32 ++-- src/pip/_vendor/chardet/utf8prober.py | 16 +- src/pip/_vendor/chardet/version.py | 4 +- src/pip/_vendor/vendor.txt | 2 +- 37 files changed, 620 insertions(+), 287 deletions(-) create mode 100644 news/chardet.vendor.rst delete mode 100644 src/pip/_vendor/chardet.pyi create mode 100644 src/pip/_vendor/chardet/codingstatemachinedict.py create mode 100644 src/pip/_vendor/chardet/macromanprober.py create mode 100644 src/pip/_vendor/chardet/py.typed create mode 100644 src/pip/_vendor/chardet/resultdict.py diff --git a/news/chardet.vendor.rst b/news/chardet.vendor.rst new file mode 100644 index 000000000..5aceb6c5e --- /dev/null +++ b/news/chardet.vendor.rst @@ -0,0 +1 @@ +Upgrade chardet to 5.1.0 diff --git a/src/pip/_vendor/chardet.pyi b/src/pip/_vendor/chardet.pyi deleted file mode 100644 index 29e87e331..000000000 --- a/src/pip/_vendor/chardet.pyi +++ /dev/null @@ -1 +0,0 @@ -from chardet import * \ No newline at end of file diff --git a/src/pip/_vendor/chardet/__init__.py b/src/pip/_vendor/chardet/__init__.py index e91ad6182..fe581623d 100644 --- a/src/pip/_vendor/chardet/__init__.py +++ b/src/pip/_vendor/chardet/__init__.py @@ -15,19 +15,29 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + +from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState +from .resultdict import ResultDict from .universaldetector import UniversalDetector from .version import VERSION, __version__ __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"] -def detect(byte_str): +def detect( + byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False +) -> ResultDict: """ Detect the encoding of the given byte string. :param byte_str: The byte sequence to examine. :type byte_str: ``bytes`` or ``bytearray`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): @@ -35,12 +45,16 @@ def detect(byte_str): f"Expected object of type bytes or bytearray, got: {type(byte_str)}" ) byte_str = bytearray(byte_str) - detector = UniversalDetector() + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) return detector.close() -def detect_all(byte_str, ignore_threshold=False): +def detect_all( + byte_str: Union[bytes, bytearray], + ignore_threshold: bool = False, + should_rename_legacy: bool = False, +) -> List[ResultDict]: """ Detect all the possible encodings of the given byte string. @@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False): ``UniversalDetector.MINIMUM_THRESHOLD`` in results. :type ignore_threshold: ``bool`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): @@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False): ) byte_str = bytearray(byte_str) - detector = UniversalDetector() + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) detector.close() if detector.input_state == InputState.HIGH_BYTE: - results = [] - probers = [] + results: List[ResultDict] = [] + probers: List[CharSetProber] = [] for prober in detector.charset_probers: - if hasattr(prober, "probers"): + if isinstance(prober, CharSetGroupProber): probers.extend(p for p in prober.probers) else: probers.append(prober) @@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False): charset_name = detector.ISO_WIN_MAP.get( lower_charset_name, charset_name ) + # Rename legacy encodings with superset encodings if asked + if should_rename_legacy: + charset_name = detector.LEGACY_MAP.get( + charset_name.lower(), charset_name + ) results.append( { "encoding": charset_name, diff --git a/src/pip/_vendor/chardet/big5prober.py b/src/pip/_vendor/chardet/big5prober.py index e4dfa7aa0..ef09c60e3 100644 --- a/src/pip/_vendor/chardet/big5prober.py +++ b/src/pip/_vendor/chardet/big5prober.py @@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL class Big5Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) self.distribution_analyzer = Big5DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "Big5" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/src/pip/_vendor/chardet/chardistribution.py b/src/pip/_vendor/chardet/chardistribution.py index 27b4a2939..176cb9964 100644 --- a/src/pip/_vendor/chardet/chardistribution.py +++ b/src/pip/_vendor/chardet/chardistribution.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Tuple, Union + from .big5freq import ( BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE, @@ -59,22 +61,22 @@ class CharDistributionAnalysis: SURE_NO = 0.01 MINIMUM_DATA_THRESHOLD = 3 - def __init__(self): + def __init__(self) -> None: # Mapping table to get frequency order from char order (get from # GetOrder()) - self._char_to_freq_order = tuple() - self._table_size = None # Size of above table + self._char_to_freq_order: Tuple[int, ...] = tuple() + self._table_size = 0 # Size of above table # This is a constant value which varies from language to language, # used in calculating confidence. See # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html # for further detail. - self.typical_distribution_ratio = None - self._done = None - self._total_chars = None - self._freq_chars = None + self.typical_distribution_ratio = 0.0 + self._done = False + self._total_chars = 0 + self._freq_chars = 0 self.reset() - def reset(self): + def reset(self) -> None: """reset analyser, clear any state""" # If this flag is set to True, detection is done and conclusion has # been made @@ -83,7 +85,7 @@ class CharDistributionAnalysis: # The number of characters whose frequency order is less than 512 self._freq_chars = 0 - def feed(self, char, char_len): + def feed(self, char: Union[bytes, bytearray], char_len: int) -> None: """feed a character with known length""" if char_len == 2: # we only care about 2-bytes character in our distribution analysis @@ -97,7 +99,7 @@ class CharDistributionAnalysis: if 512 > self._char_to_freq_order[order]: self._freq_chars += 1 - def get_confidence(self): + def get_confidence(self) -> float: """return confidence based on existing data""" # if we didn't receive any character in our consideration range, # return negative answer @@ -114,12 +116,12 @@ class CharDistributionAnalysis: # normalize confidence (we don't want to be 100% sure) return self.SURE_YES - def got_enough_data(self): + def got_enough_data(self) -> bool: # It is not necessary to receive all data to draw conclusion. # For charset detection, certain amount of data is enough return self._total_chars > self.ENOUGH_DATA_THRESHOLD - def get_order(self, _): + def get_order(self, _: Union[bytes, bytearray]) -> int: # We do not handle characters based on the original encoding string, # but convert this encoding string to a number, here called order. # This allows multiple encodings of a language to share one frequency @@ -128,13 +130,13 @@ class CharDistributionAnalysis: class EUCTWDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER self._table_size = EUCTW_TABLE_SIZE self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-TW encoding, we are interested # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): class EUCKRDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-KR encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): class JOHABDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: first_char = byte_str[0] if 0x88 <= first_char < 0xD4: code = first_char * 256 + byte_str[1] @@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis): class GB2312DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER self._table_size = GB2312_TABLE_SIZE self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for GB2312 encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): class Big5DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER self._table_size = BIG5_TABLE_SIZE self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for big5 encoding, we are interested # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe @@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): class SJISDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for sjis encoding, we are interested # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe @@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-JP encoding, we are interested # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe diff --git a/src/pip/_vendor/chardet/charsetgroupprober.py b/src/pip/_vendor/chardet/charsetgroupprober.py index 778ff332b..6def56b4a 100644 --- a/src/pip/_vendor/chardet/charsetgroupprober.py +++ b/src/pip/_vendor/chardet/charsetgroupprober.py @@ -25,29 +25,30 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Optional, Union + from .charsetprober import CharSetProber -from .enums import ProbingState +from .enums import LanguageFilter, ProbingState class CharSetGroupProber(CharSetProber): - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self._active_num = 0 - self.probers = [] - self._best_guess_prober = None + self.probers: List[CharSetProber] = [] + self._best_guess_prober: Optional[CharSetProber] = None - def reset(self): + def reset(self) -> None: super().reset() self._active_num = 0 for prober in self.probers: - if prober: - prober.reset() - prober.active = True - self._active_num += 1 + prober.reset() + prober.active = True + self._active_num += 1 self._best_guess_prober = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: @@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber): return self._best_guess_prober.charset_name @property - def language(self): + def language(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: return None return self._best_guess_prober.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for prober in self.probers: - if not prober: - continue if not prober.active: continue state = prober.feed(byte_str) @@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber): return self.state return self.state - def get_confidence(self): + def get_confidence(self) -> float: state = self.state if state == ProbingState.FOUND_IT: return 0.99 @@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber): best_conf = 0.0 self._best_guess_prober = None for prober in self.probers: - if not prober: - continue if not prober.active: self.logger.debug("%s not active", prober.charset_name) continue diff --git a/src/pip/_vendor/chardet/charsetprober.py b/src/pip/_vendor/chardet/charsetprober.py index 9f1afd999..a103ca113 100644 --- a/src/pip/_vendor/chardet/charsetprober.py +++ b/src/pip/_vendor/chardet/charsetprober.py @@ -28,8 +28,9 @@ import logging import re +from typing import Optional, Union -from .enums import ProbingState +from .enums import LanguageFilter, ProbingState INTERNATIONAL_WORDS_PATTERN = re.compile( b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?" @@ -40,35 +41,40 @@ class CharSetProber: SHORTCUT_THRESHOLD = 0.95 - def __init__(self, lang_filter=None): - self._state = None + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + self._state = ProbingState.DETECTING + self.active = True self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - def reset(self): + def reset(self) -> None: self._state = ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return None - def feed(self, byte_str): + @property + def language(self) -> Optional[str]: + raise NotImplementedError + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: raise NotImplementedError @property - def state(self): + def state(self) -> ProbingState: return self._state - def get_confidence(self): + def get_confidence(self) -> float: return 0.0 @staticmethod - def filter_high_byte_only(buf): + def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes: buf = re.sub(b"([\x00-\x7F])+", b" ", buf) return buf @staticmethod - def filter_international_words(buf): + def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray: """ We define three types of bytes: alphabet: english alphabets [a-zA-Z] @@ -102,7 +108,7 @@ class CharSetProber: return filtered @staticmethod - def remove_xml_tags(buf): + def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes: """ Returns a copy of ``buf`` that retains only the sequences of English alphabet and high byte characters that are not between <> characters. @@ -117,10 +123,13 @@ class CharSetProber: for curr, buf_char in enumerate(buf): # Check if we're coming out of or entering an XML tag - if buf_char == b">": + + # https://github.com/python/typeshed/issues/8182 + if buf_char == b">": # type: ignore[comparison-overlap] prev = curr + 1 in_tag = False - elif buf_char == b"<": + # https://github.com/python/typeshed/issues/8182 + elif buf_char == b"<": # type: ignore[comparison-overlap] if curr > prev and not in_tag: # Keep everything after last non-extended-ASCII, # non-alphabetic character diff --git a/src/pip/_vendor/chardet/cli/chardetect.py b/src/pip/_vendor/chardet/cli/chardetect.py index 7926fa37e..43f6e144f 100644 --- a/src/pip/_vendor/chardet/cli/chardetect.py +++ b/src/pip/_vendor/chardet/cli/chardetect.py @@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin. import argparse import sys +from typing import Iterable, List, Optional from .. import __version__ from ..universaldetector import UniversalDetector -def description_of(lines, name="stdin"): +def description_of( + lines: Iterable[bytes], + name: str = "stdin", + minimal: bool = False, + should_rename_legacy: bool = False, +) -> Optional[str]: """ Return a string describing the probable encoding of a file or list of strings. @@ -29,8 +35,11 @@ def description_of(lines, name="stdin"): :type lines: Iterable of bytes :param name: Name of file or collection of lines :type name: str + :param should_rename_legacy: Should we rename legacy encodings to + their more modern equivalents? + :type should_rename_legacy: ``bool`` """ - u = UniversalDetector() + u = UniversalDetector(should_rename_legacy=should_rename_legacy) for line in lines: line = bytearray(line) u.feed(line) @@ -39,12 +48,14 @@ def description_of(lines, name="stdin"): break u.close() result = u.result + if minimal: + return result["encoding"] if result["encoding"]: return f'{name}: {result["encoding"]} with confidence {result["confidence"]}' return f"{name}: no result" -def main(argv=None): +def main(argv: Optional[List[str]] = None) -> None: """ Handles command line arguments and gets things started. @@ -54,17 +65,28 @@ def main(argv=None): """ # Get command line arguments parser = argparse.ArgumentParser( - description="Takes one or more file paths and reports their detected \ - encodings" + description=( + "Takes one or more file paths and reports their detected encodings" + ) ) parser.add_argument( "input", - help="File whose encoding we would like to determine. \ - (default: stdin)", + help="File whose encoding we would like to determine. (default: stdin)", type=argparse.FileType("rb"), nargs="*", default=[sys.stdin.buffer], ) + parser.add_argument( + "--minimal", + help="Print only the encoding to standard output", + action="store_true", + ) + parser.add_argument( + "-l", + "--legacy", + help="Rename legacy encodings to more modern ones.", + action="store_true", + ) parser.add_argument( "--version", action="version", version=f"%(prog)s {__version__}" ) @@ -79,7 +101,11 @@ def main(argv=None): "--help\n", file=sys.stderr, ) - print(description_of(f, f.name)) + print( + description_of( + f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy + ) + ) if __name__ == "__main__": diff --git a/src/pip/_vendor/chardet/codingstatemachine.py b/src/pip/_vendor/chardet/codingstatemachine.py index d3e3e825d..8ed4a8773 100644 --- a/src/pip/_vendor/chardet/codingstatemachine.py +++ b/src/pip/_vendor/chardet/codingstatemachine.py @@ -27,6 +27,7 @@ import logging +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState @@ -53,18 +54,19 @@ class CodingStateMachine: encoding from consideration from here on. """ - def __init__(self, sm): + def __init__(self, sm: CodingStateMachineDict) -> None: self._model = sm self._curr_byte_pos = 0 self._curr_char_len = 0 - self._curr_state = None + self._curr_state = MachineState.START + self.active = True self.logger = logging.getLogger(__name__) self.reset() - def reset(self): + def reset(self) -> None: self._curr_state = MachineState.START - def next_state(self, c): + def next_state(self, c: int) -> int: # for each byte we get its class # if it is first byte, we also get byte length byte_class = self._model["class_table"][c] @@ -77,12 +79,12 @@ class CodingStateMachine: self._curr_byte_pos += 1 return self._curr_state - def get_current_charlen(self): + def get_current_charlen(self) -> int: return self._curr_char_len - def get_coding_state_machine(self): + def get_coding_state_machine(self) -> str: return self._model["name"] @property - def language(self): + def language(self) -> str: return self._model["language"] diff --git a/src/pip/_vendor/chardet/codingstatemachinedict.py b/src/pip/_vendor/chardet/codingstatemachinedict.py new file mode 100644 index 000000000..7a3c4c7e3 --- /dev/null +++ b/src/pip/_vendor/chardet/codingstatemachinedict.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING, Tuple + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class CodingStateMachineDict(TypedDict, total=False): + class_table: Tuple[int, ...] + class_factor: int + state_table: Tuple[int, ...] + char_len_table: Tuple[int, ...] + name: str + language: str # Optional key + +else: + CodingStateMachineDict = dict diff --git a/src/pip/_vendor/chardet/cp949prober.py b/src/pip/_vendor/chardet/cp949prober.py index 28a1f3dbb..fa7307ed8 100644 --- a/src/pip/_vendor/chardet/cp949prober.py +++ b/src/pip/_vendor/chardet/cp949prober.py @@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL class CP949Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(CP949_SM_MODEL) # NOTE: CP949 is a superset of EUC-KR, so the distribution should be @@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber): self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "CP949" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/src/pip/_vendor/chardet/enums.py b/src/pip/_vendor/chardet/enums.py index 32a77e76c..5e3e19823 100644 --- a/src/pip/_vendor/chardet/enums.py +++ b/src/pip/_vendor/chardet/enums.py @@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package. :author: Dan Blanchard (dan.blanchard@gmail.com) """ +from enum import Enum, Flag + class InputState: """ @@ -15,12 +17,13 @@ class InputState: HIGH_BYTE = 2 -class LanguageFilter: +class LanguageFilter(Flag): """ This enum represents the different language filters we can apply to a ``UniversalDetector``. """ + NONE = 0x00 CHINESE_SIMPLIFIED = 0x01 CHINESE_TRADITIONAL = 0x02 JAPANESE = 0x04 @@ -31,7 +34,7 @@ class LanguageFilter: CJK = CHINESE | JAPANESE | KOREAN -class ProbingState: +class ProbingState(Enum): """ This enum represents the different states a prober can be in. """ @@ -62,7 +65,7 @@ class SequenceLikelihood: POSITIVE = 3 @classmethod - def get_num_categories(cls): + def get_num_categories(cls) -> int: """:returns: The number of likelihood categories in the enum.""" return 4 diff --git a/src/pip/_vendor/chardet/escprober.py b/src/pip/_vendor/chardet/escprober.py index d9926115d..fd713830d 100644 --- a/src/pip/_vendor/chardet/escprober.py +++ b/src/pip/_vendor/chardet/escprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine from .enums import LanguageFilter, MachineState, ProbingState @@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber): identify these encodings. """ - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self.coding_sm = [] if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: @@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber): self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) if self.lang_filter & LanguageFilter.KOREAN: self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) - self.active_sm_count = None - self._detected_charset = None - self._detected_language = None - self._state = None + self.active_sm_count = 0 + self._detected_charset: Optional[str] = None + self._detected_language: Optional[str] = None + self._state = ProbingState.DETECTING self.reset() - def reset(self): + def reset(self) -> None: super().reset() for coding_sm in self.coding_sm: - if not coding_sm: - continue coding_sm.active = True coding_sm.reset() self.active_sm_count = len(self.coding_sm) @@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber): self._detected_language = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return self._detected_charset @property - def language(self): + def language(self) -> Optional[str]: return self._detected_language - def get_confidence(self): + def get_confidence(self) -> float: return 0.99 if self._detected_charset else 0.00 - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: for coding_sm in self.coding_sm: - if not coding_sm or not coding_sm.active: + if not coding_sm.active: continue coding_state = coding_sm.next_state(c) if coding_state == MachineState.ERROR: diff --git a/src/pip/_vendor/chardet/escsm.py b/src/pip/_vendor/chardet/escsm.py index 3aa0f4d96..11d4adf77 100644 --- a/src/pip/_vendor/chardet/escsm.py +++ b/src/pip/_vendor/chardet/escsm.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # fmt: off @@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -HZ_SM_MODEL = { +HZ_SM_MODEL: CodingStateMachineDict = { "class_table": HZ_CLS, "class_factor": 6, "state_table": HZ_ST, @@ -134,7 +135,7 @@ ISO2022CN_ST = ( ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022CN_SM_MODEL = { +ISO2022CN_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022CN_CLS, "class_factor": 9, "state_table": ISO2022CN_ST, @@ -194,7 +195,7 @@ ISO2022JP_ST = ( ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022JP_SM_MODEL = { +ISO2022JP_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022JP_CLS, "class_factor": 10, "state_table": ISO2022JP_ST, @@ -250,7 +251,7 @@ ISO2022KR_ST = ( ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -ISO2022KR_SM_MODEL = { +ISO2022KR_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022KR_CLS, "class_factor": 6, "state_table": ISO2022KR_ST, diff --git a/src/pip/_vendor/chardet/eucjpprober.py b/src/pip/_vendor/chardet/eucjpprober.py index abf2e66e2..39487f409 100644 --- a/src/pip/_vendor/chardet/eucjpprober.py +++ b/src/pip/_vendor/chardet/eucjpprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .chardistribution import EUCJPDistributionAnalysis from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL class EUCJPProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) self.distribution_analyzer = EUCJPDistributionAnalysis() self.context_analyzer = EUCJPContextAnalysis() self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-JP" @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + for i, byte in enumerate(byte_str): # PY3K: byte_str is a byte array, so byte is an int, not a byte coding_state = self.coding_sm.next_state(byte) @@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/src/pip/_vendor/chardet/euckrprober.py b/src/pip/_vendor/chardet/euckrprober.py index 154a6d216..1fc5de046 100644 --- a/src/pip/_vendor/chardet/euckrprober.py +++ b/src/pip/_vendor/chardet/euckrprober.py @@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL class EUCKRProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) self.distribution_analyzer = EUCKRDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-KR" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/src/pip/_vendor/chardet/euctwprober.py b/src/pip/_vendor/chardet/euctwprober.py index ca10a23ca..a37ab1899 100644 --- a/src/pip/_vendor/chardet/euctwprober.py +++ b/src/pip/_vendor/chardet/euctwprober.py @@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL class EUCTWProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) self.distribution_analyzer = EUCTWDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-TW" @property - def language(self): + def language(self) -> str: return "Taiwan" diff --git a/src/pip/_vendor/chardet/gb2312prober.py b/src/pip/_vendor/chardet/gb2312prober.py index 251c04295..d423e7311 100644 --- a/src/pip/_vendor/chardet/gb2312prober.py +++ b/src/pip/_vendor/chardet/gb2312prober.py @@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL class GB2312Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) self.distribution_analyzer = GB2312DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "GB2312" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/src/pip/_vendor/chardet/hebrewprober.py b/src/pip/_vendor/chardet/hebrewprober.py index 3ca634bf3..785d0057b 100644 --- a/src/pip/_vendor/chardet/hebrewprober.py +++ b/src/pip/_vendor/chardet/hebrewprober.py @@ -25,8 +25,11 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .enums import ProbingState +from .sbcharsetprober import SingleByteCharSetProber # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -127,6 +130,7 @@ from .enums import ProbingState class HebrewProber(CharSetProber): + SPACE = 0x20 # windows-1255 / ISO-8859-8 code points of interest FINAL_KAF = 0xEA NORMAL_KAF = 0xEB @@ -152,31 +156,35 @@ class HebrewProber(CharSetProber): VISUAL_HEBREW_NAME = "ISO-8859-8" LOGICAL_HEBREW_NAME = "windows-1255" - def __init__(self): + def __init__(self) -> None: super().__init__() - self._final_char_logical_score = None - self._final_char_visual_score = None - self._prev = None - self._before_prev = None - self._logical_prober = None - self._visual_prober = None + self._final_char_logical_score = 0 + self._final_char_visual_score = 0 + self._prev = self.SPACE + self._before_prev = self.SPACE + self._logical_prober: Optional[SingleByteCharSetProber] = None + self._visual_prober: Optional[SingleByteCharSetProber] = None self.reset() - def reset(self): + def reset(self) -> None: self._final_char_logical_score = 0 self._final_char_visual_score = 0 # The two last characters seen in the previous buffer, # mPrev and mBeforePrev are initialized to space in order to simulate # a word delimiter at the beginning of the data - self._prev = " " - self._before_prev = " " + self._prev = self.SPACE + self._before_prev = self.SPACE # These probers are owned by the group prober. - def set_model_probers(self, logical_prober, visual_prober): + def set_model_probers( + self, + logical_prober: SingleByteCharSetProber, + visual_prober: SingleByteCharSetProber, + ) -> None: self._logical_prober = logical_prober self._visual_prober = visual_prober - def is_final(self, c): + def is_final(self, c: int) -> bool: return c in [ self.FINAL_KAF, self.FINAL_MEM, @@ -185,7 +193,7 @@ class HebrewProber(CharSetProber): self.FINAL_TSADI, ] - def is_non_final(self, c): + def is_non_final(self, c: int) -> bool: # The normal Tsadi is not a good Non-Final letter due to words like # 'lechotet' (to chat) containing an apostrophe after the tsadi. This # apostrophe is converted to a space in FilterWithoutEnglishLetters @@ -198,7 +206,7 @@ class HebrewProber(CharSetProber): # since these words are quite rare. return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE] - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # Final letter analysis for logical-visual decision. # Look for evidence that the received buffer is either logical Hebrew # or visual Hebrew. @@ -232,9 +240,9 @@ class HebrewProber(CharSetProber): byte_str = self.filter_high_byte_only(byte_str) for cur in byte_str: - if cur == " ": + if cur == self.SPACE: # We stand on a space - a word just ended - if self._before_prev != " ": + if self._before_prev != self.SPACE: # next-to-last char was not a space so self._prev is not a # 1 letter word if self.is_final(self._prev): @@ -247,9 +255,9 @@ class HebrewProber(CharSetProber): else: # Not standing on a space if ( - (self._before_prev == " ") + (self._before_prev == self.SPACE) and (self.is_final(self._prev)) - and (cur != " ") + and (cur != self.SPACE) ): # case (3) [-2:space][-1:final letter][cur:not space] self._final_char_visual_score += 1 @@ -261,7 +269,10 @@ class HebrewProber(CharSetProber): return ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> str: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Make the decision: is it Logical or Visual? # If the final letter score distance is dominant enough, rely on it. finalsub = self._final_char_logical_score - self._final_char_visual_score @@ -289,11 +300,14 @@ class HebrewProber(CharSetProber): return self.LOGICAL_HEBREW_NAME @property - def language(self): + def language(self) -> str: return "Hebrew" @property - def state(self): + def state(self) -> ProbingState: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Remain active as long as any of the model probers are active. if (self._logical_prober.state == ProbingState.NOT_ME) and ( self._visual_prober.state == ProbingState.NOT_ME diff --git a/src/pip/_vendor/chardet/johabprober.py b/src/pip/_vendor/chardet/johabprober.py index 6f359d193..d7364ba61 100644 --- a/src/pip/_vendor/chardet/johabprober.py +++ b/src/pip/_vendor/chardet/johabprober.py @@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL class JOHABProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL) self.distribution_analyzer = JOHABDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "Johab" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/src/pip/_vendor/chardet/jpcntx.py b/src/pip/_vendor/chardet/jpcntx.py index 7a8e5be06..2f53bdda0 100644 --- a/src/pip/_vendor/chardet/jpcntx.py +++ b/src/pip/_vendor/chardet/jpcntx.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Tuple, Union # This is hiragana 2-char sequence table, the number in each cell represents its frequency category # fmt: off @@ -123,15 +124,15 @@ class JapaneseContextAnalysis: MAX_REL_THRESHOLD = 1000 MINIMUM_DATA_THRESHOLD = 4 - def __init__(self): - self._total_rel = None - self._rel_sample = None - self._need_to_skip_char_num = None - self._last_char_order = None - self._done = None + def __init__(self) -> None: + self._total_rel = 0 + self._rel_sample: List[int] = [] + self._need_to_skip_char_num = 0 + self._last_char_order = -1 + self._done = False self.reset() - def reset(self): + def reset(self) -> None: self._total_rel = 0 # total sequence received # category counters, each integer counts sequence in its category self._rel_sample = [0] * self.NUM_OF_CATEGORY @@ -143,7 +144,7 @@ class JapaneseContextAnalysis: # been made self._done = False - def feed(self, byte_str, num_bytes): + def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None: if self._done: return @@ -172,29 +173,29 @@ class JapaneseContextAnalysis: ] += 1 self._last_char_order = order - def got_enough_data(self): + def got_enough_data(self) -> bool: return self._total_rel > self.ENOUGH_REL_THRESHOLD - def get_confidence(self): + def get_confidence(self) -> float: # This is just one way to calculate confidence. It works well for me. if self._total_rel > self.MINIMUM_DATA_THRESHOLD: return (self._total_rel - self._rel_sample[0]) / self._total_rel return self.DONT_KNOW - def get_order(self, _): + def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]: return -1, 1 class SJISContextAnalysis(JapaneseContextAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._charset_name = "SHIFT_JIS" @property - def charset_name(self): + def charset_name(self) -> str: return self._charset_name - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length @@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis): class EUCJPContextAnalysis(JapaneseContextAnalysis): - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length diff --git a/src/pip/_vendor/chardet/latin1prober.py b/src/pip/_vendor/chardet/latin1prober.py index 241f14ab9..59a01d91b 100644 --- a/src/pip/_vendor/chardet/latin1prober.py +++ b/src/pip/_vendor/chardet/latin1prober.py @@ -26,6 +26,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + from .charsetprober import CharSetProber from .enums import ProbingState @@ -96,26 +98,26 @@ Latin1ClassModel = ( class Latin1Prober(CharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() - self._last_char_class = None - self._freq_counter = None + self._last_char_class = OTH + self._freq_counter: List[int] = [] self.reset() - def reset(self): + def reset(self) -> None: self._last_char_class = OTH self._freq_counter = [0] * FREQ_CAT_NUM super().reset() @property - def charset_name(self): + def charset_name(self) -> str: return "ISO-8859-1" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: byte_str = self.remove_xml_tags(byte_str) for c in byte_str: char_class = Latin1_CharToClass[c] @@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: if self.state == ProbingState.NOT_ME: return 0.01 diff --git a/src/pip/_vendor/chardet/macromanprober.py b/src/pip/_vendor/chardet/macromanprober.py new file mode 100644 index 000000000..1425d10ec --- /dev/null +++ b/src/pip/_vendor/chardet/macromanprober.py @@ -0,0 +1,162 @@ +######################## BEGIN LICENSE BLOCK ######################## +# This code was modified from latin1prober.py by Rob Speer . +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Rob Speer - adapt to MacRoman encoding +# Mark Pilgrim - port to Python +# Shy Shalom - original C code +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from typing import List, Union + +from .charsetprober import CharSetProber +from .enums import ProbingState + +FREQ_CAT_NUM = 4 + +UDF = 0 # undefined +OTH = 1 # other +ASC = 2 # ascii capital letter +ASS = 3 # ascii small letter +ACV = 4 # accent capital vowel +ACO = 5 # accent capital other +ASV = 6 # accent small vowel +ASO = 7 # accent small other +ODD = 8 # character that is unlikely to appear +CLASS_NUM = 9 # total classes + +# The change from Latin1 is that we explicitly look for extended characters +# that are infrequently-occurring symbols, and consider them to always be +# improbable. This should let MacRoman get out of the way of more likely +# encodings in most situations. + +# fmt: off +MacRoman_CharToClass = ( + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F + ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87 + ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97 + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7 + OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 + OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF + OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7 + OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7 + ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF + OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7 + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF + ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7 + ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF +) + +# 0 : illegal +# 1 : very unlikely +# 2 : normal +# 3 : very likely +MacRomanClassModel = ( +# UDF OTH ASC ASS ACV ACO ASV ASO ODD + 0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF + 0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC + 0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS + 0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO + 0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV + 0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO + 0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD +) +# fmt: on + + +class MacRomanProber(CharSetProber): + def __init__(self) -> None: + super().__init__() + self._last_char_class = OTH + self._freq_counter: List[int] = [] + self.reset() + + def reset(self) -> None: + self._last_char_class = OTH + self._freq_counter = [0] * FREQ_CAT_NUM + + # express the prior that MacRoman is a somewhat rare encoding; + # this can be done by starting out in a slightly improbable state + # that must be overcome + self._freq_counter[2] = 10 + + super().reset() + + @property + def charset_name(self) -> str: + return "MacRoman" + + @property + def language(self) -> str: + return "" + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + byte_str = self.remove_xml_tags(byte_str) + for c in byte_str: + char_class = MacRoman_CharToClass[c] + freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class] + if freq == 0: + self._state = ProbingState.NOT_ME + break + self._freq_counter[freq] += 1 + self._last_char_class = char_class + + return self.state + + def get_confidence(self) -> float: + if self.state == ProbingState.NOT_ME: + return 0.01 + + total = sum(self._freq_counter) + confidence = ( + 0.0 + if total < 0.01 + else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total + ) + confidence = max(confidence, 0.0) + # lower the confidence of MacRoman so that other more accurate + # detector can take priority. + confidence *= 0.73 + return confidence diff --git a/src/pip/_vendor/chardet/mbcharsetprober.py b/src/pip/_vendor/chardet/mbcharsetprober.py index bf96ad5d4..666307e8f 100644 --- a/src/pip/_vendor/chardet/mbcharsetprober.py +++ b/src/pip/_vendor/chardet/mbcharsetprober.py @@ -27,8 +27,12 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + +from .chardistribution import CharDistributionAnalysis from .charsetprober import CharSetProber -from .enums import MachineState, ProbingState +from .codingstatemachine import CodingStateMachine +from .enums import LanguageFilter, MachineState, ProbingState class MultiByteCharSetProber(CharSetProber): @@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber): MultiByteCharSetProber """ - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) - self.distribution_analyzer = None - self.coding_sm = None - self._last_char = [0, 0] + self.distribution_analyzer: Optional[CharDistributionAnalysis] = None + self.coding_sm: Optional[CodingStateMachine] = None + self._last_char = bytearray(b"\0\0") - def reset(self): + def reset(self) -> None: super().reset() if self.coding_sm: self.coding_sm.reset() if self.distribution_analyzer: self.distribution_analyzer.reset() - self._last_char = [0, 0] + self._last_char = bytearray(b"\0\0") - @property - def charset_name(self): - raise NotImplementedError + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None - @property - def language(self): - raise NotImplementedError - - def feed(self, byte_str): for i, byte in enumerate(byte_str): coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: @@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None return self.distribution_analyzer.get_confidence() diff --git a/src/pip/_vendor/chardet/mbcsgroupprober.py b/src/pip/_vendor/chardet/mbcsgroupprober.py index 94488360c..6cb9cc7b3 100644 --- a/src/pip/_vendor/chardet/mbcsgroupprober.py +++ b/src/pip/_vendor/chardet/mbcsgroupprober.py @@ -30,6 +30,7 @@ from .big5prober import Big5Prober from .charsetgroupprober import CharSetGroupProber from .cp949prober import CP949Prober +from .enums import LanguageFilter from .eucjpprober import EUCJPProber from .euckrprober import EUCKRProber from .euctwprober import EUCTWProber @@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober class MBCSGroupProber(CharSetGroupProber): - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self.probers = [ UTF8Prober(), diff --git a/src/pip/_vendor/chardet/mbcssm.py b/src/pip/_vendor/chardet/mbcssm.py index d3b9c4b75..7bbe97e66 100644 --- a/src/pip/_vendor/chardet/mbcssm.py +++ b/src/pip/_vendor/chardet/mbcssm.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # BIG5 @@ -74,7 +75,7 @@ BIG5_ST = ( BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) -BIG5_SM_MODEL = { +BIG5_SM_MODEL: CodingStateMachineDict = { "class_table": BIG5_CLS, "class_factor": 5, "state_table": BIG5_ST, @@ -117,7 +118,7 @@ CP949_ST = ( CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) -CP949_SM_MODEL = { +CP949_SM_MODEL: CodingStateMachineDict = { "class_table": CP949_CLS, "class_factor": 10, "state_table": CP949_ST, @@ -173,7 +174,7 @@ EUCJP_ST = ( EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) -EUCJP_SM_MODEL = { +EUCJP_SM_MODEL: CodingStateMachineDict = { "class_table": EUCJP_CLS, "class_factor": 6, "state_table": EUCJP_ST, @@ -226,7 +227,7 @@ EUCKR_ST = ( EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) -EUCKR_SM_MODEL = { +EUCKR_SM_MODEL: CodingStateMachineDict = { "class_table": EUCKR_CLS, "class_factor": 4, "state_table": EUCKR_ST, @@ -283,7 +284,7 @@ JOHAB_ST = ( JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2) -JOHAB_SM_MODEL = { +JOHAB_SM_MODEL: CodingStateMachineDict = { "class_table": JOHAB_CLS, "class_factor": 10, "state_table": JOHAB_ST, @@ -340,7 +341,7 @@ EUCTW_ST = ( EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) -EUCTW_SM_MODEL = { +EUCTW_SM_MODEL: CodingStateMachineDict = { "class_table": EUCTW_CLS, "class_factor": 7, "state_table": EUCTW_ST, @@ -402,7 +403,7 @@ GB2312_ST = ( # 2 here. GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2) -GB2312_SM_MODEL = { +GB2312_SM_MODEL: CodingStateMachineDict = { "class_table": GB2312_CLS, "class_factor": 7, "state_table": GB2312_ST, @@ -458,7 +459,7 @@ SJIS_ST = ( SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) -SJIS_SM_MODEL = { +SJIS_SM_MODEL: CodingStateMachineDict = { "class_table": SJIS_CLS, "class_factor": 6, "state_table": SJIS_ST, @@ -516,7 +517,7 @@ UCS2BE_ST = ( UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) -UCS2BE_SM_MODEL = { +UCS2BE_SM_MODEL: CodingStateMachineDict = { "class_table": UCS2BE_CLS, "class_factor": 6, "state_table": UCS2BE_ST, @@ -574,7 +575,7 @@ UCS2LE_ST = ( UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) -UCS2LE_SM_MODEL = { +UCS2LE_SM_MODEL: CodingStateMachineDict = { "class_table": UCS2LE_CLS, "class_factor": 6, "state_table": UCS2LE_ST, @@ -651,7 +652,7 @@ UTF8_ST = ( UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) -UTF8_SM_MODEL = { +UTF8_SM_MODEL: CodingStateMachineDict = { "class_table": UTF8_CLS, "class_factor": 16, "state_table": UTF8_ST, diff --git a/src/pip/_vendor/chardet/metadata/languages.py b/src/pip/_vendor/chardet/metadata/languages.py index 1d37884c3..eb40c5f0c 100644 --- a/src/pip/_vendor/chardet/metadata/languages.py +++ b/src/pip/_vendor/chardet/metadata/languages.py @@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project. """ from string import ascii_letters +from typing import List, Optional # TODO: Add Ukrainian (KOI8-U) @@ -33,13 +34,13 @@ class Language: def __init__( self, - name=None, - iso_code=None, - use_ascii=True, - charsets=None, - alphabet=None, - wiki_start_pages=None, - ): + name: Optional[str] = None, + iso_code: Optional[str] = None, + use_ascii: bool = True, + charsets: Optional[List[str]] = None, + alphabet: Optional[str] = None, + wiki_start_pages: Optional[List[str]] = None, + ) -> None: super().__init__() self.name = name self.iso_code = iso_code @@ -55,7 +56,7 @@ class Language: self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None self.wiki_start_pages = wiki_start_pages - def __repr__(self): + def __repr__(self) -> str: param_str = ", ".join( f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") ) @@ -103,7 +104,7 @@ LANGUAGES = { name="Danish", iso_code="da", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="æøåÆØÅ", wiki_start_pages=["Forside"], ), @@ -111,8 +112,8 @@ LANGUAGES = { name="German", iso_code="de", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], - alphabet="äöüßÄÖÜ", + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="äöüßẞÄÖÜ", wiki_start_pages=["Wikipedia:Hauptseite"], ), "Greek": Language( @@ -127,7 +128,7 @@ LANGUAGES = { name="English", iso_code="en", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], wiki_start_pages=["Main_Page"], ), "Esperanto": Language( @@ -143,7 +144,7 @@ LANGUAGES = { name="Spanish", iso_code="es", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ñáéíóúüÑÁÉÍÓÚÜ", wiki_start_pages=["Wikipedia:Portada"], ), @@ -161,7 +162,7 @@ LANGUAGES = { name="Finnish", iso_code="fi", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÅÄÖŠŽåäöšž", wiki_start_pages=["Wikipedia:Etusivu"], ), @@ -169,7 +170,7 @@ LANGUAGES = { name="French", iso_code="fr", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ", wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"], ), @@ -203,7 +204,7 @@ LANGUAGES = { name="Italian", iso_code="it", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÀÈÉÌÒÓÙàèéìòóù", wiki_start_pages=["Pagina_principale"], ), @@ -237,7 +238,7 @@ LANGUAGES = { name="Dutch", iso_code="nl", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], wiki_start_pages=["Hoofdpagina"], ), "Polish": Language( @@ -253,7 +254,7 @@ LANGUAGES = { name="Portuguese", iso_code="pt", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú", wiki_start_pages=["Wikipédia:Página_principal"], ), diff --git a/src/pip/_vendor/chardet/py.typed b/src/pip/_vendor/chardet/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/src/pip/_vendor/chardet/resultdict.py b/src/pip/_vendor/chardet/resultdict.py new file mode 100644 index 000000000..7d36e64c4 --- /dev/null +++ b/src/pip/_vendor/chardet/resultdict.py @@ -0,0 +1,16 @@ +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class ResultDict(TypedDict): + encoding: Optional[str] + confidence: float + language: Optional[str] + +else: + ResultDict = dict diff --git a/src/pip/_vendor/chardet/sbcharsetprober.py b/src/pip/_vendor/chardet/sbcharsetprober.py index 31d70e154..0ffbcdd2c 100644 --- a/src/pip/_vendor/chardet/sbcharsetprober.py +++ b/src/pip/_vendor/chardet/sbcharsetprober.py @@ -26,23 +26,20 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from collections import namedtuple +from typing import Dict, List, NamedTuple, Optional, Union from .charsetprober import CharSetProber from .enums import CharacterCategory, ProbingState, SequenceLikelihood -SingleByteCharSetModel = namedtuple( - "SingleByteCharSetModel", - [ - "charset_name", - "language", - "char_to_order_map", - "language_model", - "typical_positive_ratio", - "keep_ascii_letters", - "alphabet", - ], -) + +class SingleByteCharSetModel(NamedTuple): + charset_name: str + language: str + char_to_order_map: Dict[int, int] + language_model: Dict[int, Dict[int, int]] + typical_positive_ratio: float + keep_ascii_letters: bool + alphabet: str class SingleByteCharSetProber(CharSetProber): @@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber): POSITIVE_SHORTCUT_THRESHOLD = 0.95 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 - def __init__(self, model, is_reversed=False, name_prober=None): + def __init__( + self, + model: SingleByteCharSetModel, + is_reversed: bool = False, + name_prober: Optional[CharSetProber] = None, + ) -> None: super().__init__() self._model = model # TRUE if we need to reverse every pair in the model lookup self._reversed = is_reversed # Optional auxiliary prober for name decision self._name_prober = name_prober - self._last_order = None - self._seq_counters = None - self._total_seqs = None - self._total_char = None - self._control_char = None - self._freq_char = None + self._last_order = 255 + self._seq_counters: List[int] = [] + self._total_seqs = 0 + self._total_char = 0 + self._control_char = 0 + self._freq_char = 0 self.reset() - def reset(self): + def reset(self) -> None: super().reset() # char order of last character self._last_order = 255 @@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber): self._freq_char = 0 @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if self._name_prober: return self._name_prober.charset_name return self._model.charset_name @property - def language(self): + def language(self) -> Optional[str]: if self._name_prober: return self._name_prober.language return self._model.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # TODO: Make filter_international_words keep things in self.alphabet if not self._model.keep_ascii_letters: byte_str = self.filter_international_words(byte_str) @@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: r = 0.01 if self._total_seqs > 0: r = ( diff --git a/src/pip/_vendor/chardet/sbcsgroupprober.py b/src/pip/_vendor/chardet/sbcsgroupprober.py index cad001cb1..890ae8465 100644 --- a/src/pip/_vendor/chardet/sbcsgroupprober.py +++ b/src/pip/_vendor/chardet/sbcsgroupprober.py @@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber class SBCSGroupProber(CharSetGroupProber): - def __init__(self): + def __init__(self) -> None: super().__init__() hebrew_prober = HebrewProber() logical_hebrew_prober = SingleByteCharSetProber( diff --git a/src/pip/_vendor/chardet/sjisprober.py b/src/pip/_vendor/chardet/sjisprober.py index 3bcbdb71d..91df07796 100644 --- a/src/pip/_vendor/chardet/sjisprober.py +++ b/src/pip/_vendor/chardet/sjisprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .chardistribution import SJISDistributionAnalysis from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL class SJISProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) self.distribution_analyzer = SJISDistributionAnalysis() self.context_analyzer = SJISContextAnalysis() self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return self.context_analyzer.charset_name @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + for i, byte in enumerate(byte_str): coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: @@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/src/pip/_vendor/chardet/universaldetector.py b/src/pip/_vendor/chardet/universaldetector.py index 22fcf8290..30c441dc2 100644 --- a/src/pip/_vendor/chardet/universaldetector.py +++ b/src/pip/_vendor/chardet/universaldetector.py @@ -39,12 +39,16 @@ class a user of ``chardet`` should use. import codecs import logging import re +from typing import List, Optional, Union from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState, LanguageFilter, ProbingState from .escprober import EscCharSetProber from .latin1prober import Latin1Prober +from .macromanprober import MacRomanProber from .mbcsgroupprober import MBCSGroupProber +from .resultdict import ResultDict from .sbcsgroupprober import SBCSGroupProber from .utf1632prober import UTF1632Prober @@ -80,34 +84,55 @@ class UniversalDetector: "iso-8859-9": "Windows-1254", "iso-8859-13": "Windows-1257", } + # Based on https://encoding.spec.whatwg.org/#names-and-labels + # but altered to match Python names for encodings and remove mappings + # that break tests. + LEGACY_MAP = { + "ascii": "Windows-1252", + "iso-8859-1": "Windows-1252", + "tis-620": "ISO-8859-11", + "iso-8859-9": "Windows-1254", + "gb2312": "GB18030", + "euc-kr": "CP949", + "utf-16le": "UTF-16", + } - def __init__(self, lang_filter=LanguageFilter.ALL): - self._esc_charset_prober = None - self._utf1632_prober = None - self._charset_probers = [] - self.result = None - self.done = None - self._got_data = None - self._input_state = None - self._last_char = None + def __init__( + self, + lang_filter: LanguageFilter = LanguageFilter.ALL, + should_rename_legacy: bool = False, + ) -> None: + self._esc_charset_prober: Optional[EscCharSetProber] = None + self._utf1632_prober: Optional[UTF1632Prober] = None + self._charset_probers: List[CharSetProber] = [] + self.result: ResultDict = { + "encoding": None, + "confidence": 0.0, + "language": None, + } + self.done = False + self._got_data = False + self._input_state = InputState.PURE_ASCII + self._last_char = b"" self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - self._has_win_bytes = None + self._has_win_bytes = False + self.should_rename_legacy = should_rename_legacy self.reset() @property - def input_state(self): + def input_state(self) -> int: return self._input_state @property - def has_win_bytes(self): + def has_win_bytes(self) -> bool: return self._has_win_bytes @property - def charset_probers(self): + def charset_probers(self) -> List[CharSetProber]: return self._charset_probers - def reset(self): + def reset(self) -> None: """ Reset the UniversalDetector and all of its probers back to their initial states. This is called by ``__init__``, so you only need to @@ -126,7 +151,7 @@ class UniversalDetector: for prober in self._charset_probers: prober.reset() - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> None: """ Takes a chunk of a document and feeds it through all of the relevant charset probers. @@ -166,6 +191,7 @@ class UniversalDetector: elif byte_str.startswith(b"\xFE\xFF\x00\x00"): # FE FF 00 00 UCS-4, unusual octet order BOM (3412) self.result = { + # TODO: This encoding is not supported by Python. Should remove? "encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0, "language": "", @@ -173,6 +199,7 @@ class UniversalDetector: elif byte_str.startswith(b"\x00\x00\xFF\xFE"): # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = { + # TODO: This encoding is not supported by Python. Should remove? "encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0, "language": "", @@ -242,6 +269,7 @@ class UniversalDetector: if self.lang_filter & LanguageFilter.NON_CJK: self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(Latin1Prober()) + self._charset_probers.append(MacRomanProber()) for prober in self._charset_probers: if prober.feed(byte_str) == ProbingState.FOUND_IT: self.result = { @@ -254,7 +282,7 @@ class UniversalDetector: if self.WIN_BYTE_DETECTOR.search(byte_str): self._has_win_bytes = True - def close(self): + def close(self) -> ResultDict: """ Stop analyzing the current document and come up with a final prediction. @@ -288,7 +316,8 @@ class UniversalDetector: max_prober = prober if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): charset_name = max_prober.charset_name - lower_charset_name = max_prober.charset_name.lower() + assert charset_name is not None + lower_charset_name = charset_name.lower() confidence = max_prober.get_confidence() # Use Windows encoding name instead of ISO-8859 if we saw any # extra Windows-specific bytes @@ -297,6 +326,11 @@ class UniversalDetector: charset_name = self.ISO_WIN_MAP.get( lower_charset_name, charset_name ) + # Rename legacy encodings with superset encodings if asked + if self.should_rename_legacy: + charset_name = self.LEGACY_MAP.get( + (charset_name or "").lower(), charset_name + ) self.result = { "encoding": charset_name, "confidence": confidence, diff --git a/src/pip/_vendor/chardet/utf1632prober.py b/src/pip/_vendor/chardet/utf1632prober.py index 9fd1580b8..6bdec63d6 100644 --- a/src/pip/_vendor/chardet/utf1632prober.py +++ b/src/pip/_vendor/chardet/utf1632prober.py @@ -18,6 +18,8 @@ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + from .charsetprober import CharSetProber from .enums import ProbingState @@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber): # a fixed constant ratio of expected zeros or non-zeros in modulo-position. EXPECTED_RATIO = 0.94 - def __init__(self): + def __init__(self) -> None: super().__init__() self.position = 0 self.zeros_at_mod = [0] * 4 @@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber): self.first_half_surrogate_pair_detected_16le = False self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.position = 0 self.zeros_at_mod = [0] * 4 @@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber): self.quad = [0, 0, 0, 0] @property - def charset_name(self): + def charset_name(self) -> str: if self.is_likely_utf32be(): return "utf-32be" if self.is_likely_utf32le(): @@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber): return "utf-16" @property - def language(self): + def language(self) -> str: return "" - def approx_32bit_chars(self): + def approx_32bit_chars(self) -> float: return max(1.0, self.position / 4.0) - def approx_16bit_chars(self): + def approx_16bit_chars(self) -> float: return max(1.0, self.position / 2.0) - def is_likely_utf32be(self): + def is_likely_utf32be(self) -> bool: approx_chars = self.approx_32bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO @@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf32be ) - def is_likely_utf32le(self): + def is_likely_utf32le(self) -> bool: approx_chars = self.approx_32bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO @@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf32le ) - def is_likely_utf16be(self): + def is_likely_utf16be(self) -> bool: approx_chars = self.approx_16bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars @@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf16be ) - def is_likely_utf16le(self): + def is_likely_utf16le(self) -> bool: approx_chars = self.approx_16bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars @@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf16le ) - def validate_utf32_characters(self, quad): + def validate_utf32_characters(self, quad: List[int]) -> None: """ Validate if the quad of bytes is valid UTF-32. @@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber): ): self.invalid_utf32le = True - def validate_utf16_characters(self, pair): + def validate_utf16_characters(self, pair: List[int]) -> None: """ Validate if the pair of bytes is valid UTF-16. @@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber): else: self.invalid_utf16le = True - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: mod4 = self.position % 4 self.quad[mod4] = c @@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber): return self.state @property - def state(self): + def state(self) -> ProbingState: if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}: # terminal, decided states return self._state @@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber): self._state = ProbingState.NOT_ME return self._state - def get_confidence(self): + def get_confidence(self) -> float: return ( 0.85 if ( diff --git a/src/pip/_vendor/chardet/utf8prober.py b/src/pip/_vendor/chardet/utf8prober.py index 3aae09e86..d96354d97 100644 --- a/src/pip/_vendor/chardet/utf8prober.py +++ b/src/pip/_vendor/chardet/utf8prober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL class UTF8Prober(CharSetProber): ONE_CHAR_PROB = 0.5 - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) - self._num_mb_chars = None + self._num_mb_chars = 0 self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.coding_sm.reset() self._num_mb_chars = 0 @property - def charset_name(self): + def charset_name(self) -> str: return "utf-8" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: coding_state = self.coding_sm.next_state(c) if coding_state == MachineState.ERROR: @@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: unlike = 0.99 if self._num_mb_chars < 6: unlike *= self.ONE_CHAR_PROB**self._num_mb_chars diff --git a/src/pip/_vendor/chardet/version.py b/src/pip/_vendor/chardet/version.py index a08a06b9a..c5e9d85cd 100644 --- a/src/pip/_vendor/chardet/version.py +++ b/src/pip/_vendor/chardet/version.py @@ -1,9 +1,9 @@ """ This module exists only to simplify retrieving the version number of chardet -from within setup.py and from chardet subpackages. +from within setuptools and from chardet subpackages. :author: Dan Blanchard (dan.blanchard@gmail.com) """ -__version__ = "5.0.0" +__version__ = "5.1.0" VERSION = __version__.split(".") diff --git a/src/pip/_vendor/vendor.txt b/src/pip/_vendor/vendor.txt index cd42578f4..31cda4a80 100644 --- a/src/pip/_vendor/vendor.txt +++ b/src/pip/_vendor/vendor.txt @@ -9,7 +9,7 @@ pyparsing==3.0.9 pyproject-hooks==1.0.0 requests==2.28.2 certifi==2022.12.7 - chardet==5.0.0 + chardet==5.1.0 idna==3.4 urllib3==1.26.12 rich==12.6.0