mirror of https://github.com/pypa/pip
Upgrade chardet to 5.1.0
This commit is contained in:
parent
1c110bede6
commit
be20a75c10
|
@ -0,0 +1 @@
|
|||
Upgrade chardet to 5.1.0
|
|
@ -1 +0,0 @@
|
|||
from chardet import *
|
|
@ -15,19 +15,29 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import InputState
|
||||
from .resultdict import ResultDict
|
||||
from .universaldetector import UniversalDetector
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
|
||||
|
||||
|
||||
def detect(byte_str):
|
||||
def detect(
|
||||
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
|
||||
) -> ResultDict:
|
||||
"""
|
||||
Detect the encoding of the given byte string.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:type byte_str: ``bytes`` or ``bytearray``
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
|
@ -35,12 +45,16 @@ def detect(byte_str):
|
|||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||
)
|
||||
byte_str = bytearray(byte_str)
|
||||
detector = UniversalDetector()
|
||||
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
detector.feed(byte_str)
|
||||
return detector.close()
|
||||
|
||||
|
||||
def detect_all(byte_str, ignore_threshold=False):
|
||||
def detect_all(
|
||||
byte_str: Union[bytes, bytearray],
|
||||
ignore_threshold: bool = False,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> List[ResultDict]:
|
||||
"""
|
||||
Detect all the possible encodings of the given byte string.
|
||||
|
||||
|
@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
|
|||
``UniversalDetector.MINIMUM_THRESHOLD``
|
||||
in results.
|
||||
:type ignore_threshold: ``bool``
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
if not isinstance(byte_str, bytearray):
|
||||
if not isinstance(byte_str, bytes):
|
||||
|
@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
|
|||
)
|
||||
byte_str = bytearray(byte_str)
|
||||
|
||||
detector = UniversalDetector()
|
||||
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
detector.feed(byte_str)
|
||||
detector.close()
|
||||
|
||||
if detector.input_state == InputState.HIGH_BYTE:
|
||||
results = []
|
||||
probers = []
|
||||
results: List[ResultDict] = []
|
||||
probers: List[CharSetProber] = []
|
||||
for prober in detector.charset_probers:
|
||||
if hasattr(prober, "probers"):
|
||||
if isinstance(prober, CharSetGroupProber):
|
||||
probers.extend(p for p in prober.probers)
|
||||
else:
|
||||
probers.append(prober)
|
||||
|
@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
|
|||
charset_name = detector.ISO_WIN_MAP.get(
|
||||
lower_charset_name, charset_name
|
||||
)
|
||||
# Rename legacy encodings with superset encodings if asked
|
||||
if should_rename_legacy:
|
||||
charset_name = detector.LEGACY_MAP.get(
|
||||
charset_name.lower(), charset_name
|
||||
)
|
||||
results.append(
|
||||
{
|
||||
"encoding": charset_name,
|
||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL
|
|||
|
||||
|
||||
class Big5Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
||||
self.distribution_analyzer = Big5DistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "Big5"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Chinese"
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Tuple, Union
|
||||
|
||||
from .big5freq import (
|
||||
BIG5_CHAR_TO_FREQ_ORDER,
|
||||
BIG5_TABLE_SIZE,
|
||||
|
@ -59,22 +61,22 @@ class CharDistributionAnalysis:
|
|||
SURE_NO = 0.01
|
||||
MINIMUM_DATA_THRESHOLD = 3
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
# Mapping table to get frequency order from char order (get from
|
||||
# GetOrder())
|
||||
self._char_to_freq_order = tuple()
|
||||
self._table_size = None # Size of above table
|
||||
self._char_to_freq_order: Tuple[int, ...] = tuple()
|
||||
self._table_size = 0 # Size of above table
|
||||
# This is a constant value which varies from language to language,
|
||||
# used in calculating confidence. See
|
||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||
# for further detail.
|
||||
self.typical_distribution_ratio = None
|
||||
self._done = None
|
||||
self._total_chars = None
|
||||
self._freq_chars = None
|
||||
self.typical_distribution_ratio = 0.0
|
||||
self._done = False
|
||||
self._total_chars = 0
|
||||
self._freq_chars = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
"""reset analyser, clear any state"""
|
||||
# If this flag is set to True, detection is done and a conclusion has
|
||||
# been made
|
||||
|
@ -83,7 +85,7 @@ class CharDistributionAnalysis:
|
|||
# The number of characters whose frequency order is less than 512
|
||||
self._freq_chars = 0
|
||||
|
||||
def feed(self, char, char_len):
|
||||
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
|
||||
"""feed a character with known length"""
|
||||
if char_len == 2:
|
||||
# we only care about 2-byte characters in our distribution analysis
|
||||
|
@ -97,7 +99,7 @@ class CharDistributionAnalysis:
|
|||
if 512 > self._char_to_freq_order[order]:
|
||||
self._freq_chars += 1
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
"""return confidence based on existing data"""
|
||||
# if we didn't receive any character in our consideration range,
|
||||
# return negative answer
|
||||
|
@ -114,12 +116,12 @@ class CharDistributionAnalysis:
|
|||
# normalize confidence (we don't want to be 100% sure)
|
||||
return self.SURE_YES
|
||||
|
||||
def got_enough_data(self):
|
||||
def got_enough_data(self) -> bool:
|
||||
# It is not necessary to receive all data to draw a conclusion.
|
||||
# For charset detection, a certain amount of data is enough
|
||||
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
||||
|
||||
def get_order(self, _):
|
||||
def get_order(self, _: Union[bytes, bytearray]) -> int:
|
||||
# We do not handle characters based on the original encoding string,
|
||||
# but convert this encoding string to a number, here called order.
|
||||
# This allows multiple encodings of a language to share one frequency
|
||||
|
@ -128,13 +130,13 @@ class CharDistributionAnalysis:
|
|||
|
||||
|
||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCTW_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-TW encoding, we are interested
|
||||
# first byte range: 0xc4 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
|
@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCKR_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-KR encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
|
@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = EUCKR_TABLE_SIZE
|
||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
first_char = byte_str[0]
|
||||
if 0x88 <= first_char < 0xD4:
|
||||
code = first_char * 256 + byte_str[1]
|
||||
|
@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = GB2312_TABLE_SIZE
|
||||
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for GB2312 encoding, we are interested
|
||||
# first byte range: 0xb0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
|
@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = BIG5_TABLE_SIZE
|
||||
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for big5 encoding, we are interested
|
||||
# first byte range: 0xa4 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||
|
@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = JIS_TABLE_SIZE
|
||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for sjis encoding, we are interested
|
||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
|
||||
|
@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
|||
|
||||
|
||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||
self._table_size = JIS_TABLE_SIZE
|
||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||
# for euc-JP encoding, we are interested
|
||||
# first byte range: 0xa0 -- 0xfe
|
||||
# second byte range: 0xa1 -- 0xfe
|
||||
|
|
|
@ -25,29 +25,30 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
from .enums import LanguageFilter, ProbingState
|
||||
|
||||
|
||||
class CharSetGroupProber(CharSetProber):
|
||||
def __init__(self, lang_filter=None):
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self._active_num = 0
|
||||
self.probers = []
|
||||
self._best_guess_prober = None
|
||||
self.probers: List[CharSetProber] = []
|
||||
self._best_guess_prober: Optional[CharSetProber] = None
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self._active_num = 0
|
||||
for prober in self.probers:
|
||||
if prober:
|
||||
prober.reset()
|
||||
prober.active = True
|
||||
self._active_num += 1
|
||||
prober.reset()
|
||||
prober.active = True
|
||||
self._active_num += 1
|
||||
self._best_guess_prober = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
|
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
|
|||
return self._best_guess_prober.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> Optional[str]:
|
||||
if not self._best_guess_prober:
|
||||
self.get_confidence()
|
||||
if not self._best_guess_prober:
|
||||
return None
|
||||
return self._best_guess_prober.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
continue
|
||||
state = prober.feed(byte_str)
|
||||
|
@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber):
|
|||
return self.state
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
state = self.state
|
||||
if state == ProbingState.FOUND_IT:
|
||||
return 0.99
|
||||
|
@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber):
|
|||
best_conf = 0.0
|
||||
self._best_guess_prober = None
|
||||
for prober in self.probers:
|
||||
if not prober:
|
||||
continue
|
||||
if not prober.active:
|
||||
self.logger.debug("%s not active", prober.charset_name)
|
||||
continue
|
||||
|
|
|
@ -28,8 +28,9 @@
|
|||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, Union
|
||||
|
||||
from .enums import ProbingState
|
||||
from .enums import LanguageFilter, ProbingState
|
||||
|
||||
INTERNATIONAL_WORDS_PATTERN = re.compile(
|
||||
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
|
||||
|
@ -40,35 +41,40 @@ class CharSetProber:
|
|||
|
||||
SHORTCUT_THRESHOLD = 0.95
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
self._state = None
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
self._state = ProbingState.DETECTING
|
||||
self.active = True
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._state = ProbingState.DETECTING
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
return None
|
||||
|
||||
def feed(self, byte_str):
|
||||
@property
|
||||
def language(self) -> Optional[str]:
|
||||
raise NotImplementedError
|
||||
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
def state(self) -> ProbingState:
|
||||
return self._state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
return 0.0
|
||||
|
||||
@staticmethod
|
||||
def filter_high_byte_only(buf):
|
||||
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
|
||||
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
|
||||
return buf
|
||||
|
||||
@staticmethod
|
||||
def filter_international_words(buf):
|
||||
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
|
||||
"""
|
||||
We define three types of bytes:
|
||||
alphabet: english alphabets [a-zA-Z]
|
||||
|
@ -102,7 +108,7 @@ class CharSetProber:
|
|||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def remove_xml_tags(buf):
|
||||
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
|
||||
"""
|
||||
Returns a copy of ``buf`` that retains only the sequences of English
|
||||
alphabet and high byte characters that are not between <> characters.
|
||||
|
@ -117,10 +123,13 @@ class CharSetProber:
|
|||
|
||||
for curr, buf_char in enumerate(buf):
|
||||
# Check if we're coming out of or entering an XML tag
|
||||
if buf_char == b">":
|
||||
|
||||
# https://github.com/python/typeshed/issues/8182
|
||||
if buf_char == b">": # type: ignore[comparison-overlap]
|
||||
prev = curr + 1
|
||||
in_tag = False
|
||||
elif buf_char == b"<":
|
||||
# https://github.com/python/typeshed/issues/8182
|
||||
elif buf_char == b"<": # type: ignore[comparison-overlap]
|
||||
if curr > prev and not in_tag:
|
||||
# Keep everything after last non-extended-ASCII,
|
||||
# non-alphabetic character
|
||||
|
|
|
@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin.
|
|||
|
||||
import argparse
|
||||
import sys
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
from .. import __version__
|
||||
from ..universaldetector import UniversalDetector
|
||||
|
||||
|
||||
def description_of(lines, name="stdin"):
|
||||
def description_of(
|
||||
lines: Iterable[bytes],
|
||||
name: str = "stdin",
|
||||
minimal: bool = False,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Return a string describing the probable encoding of a file or
|
||||
list of strings.
|
||||
|
@ -29,8 +35,11 @@ def description_of(lines, name="stdin"):
|
|||
:type lines: Iterable of bytes
|
||||
:param name: Name of file or collection of lines
|
||||
:type name: str
|
||||
:param should_rename_legacy: Should we rename legacy encodings to
|
||||
their more modern equivalents?
|
||||
:type should_rename_legacy: ``bool``
|
||||
"""
|
||||
u = UniversalDetector()
|
||||
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||
for line in lines:
|
||||
line = bytearray(line)
|
||||
u.feed(line)
|
||||
|
@ -39,12 +48,14 @@ def description_of(lines, name="stdin"):
|
|||
break
|
||||
u.close()
|
||||
result = u.result
|
||||
if minimal:
|
||||
return result["encoding"]
|
||||
if result["encoding"]:
|
||||
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
|
||||
return f"{name}: no result"
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
def main(argv: Optional[List[str]] = None) -> None:
|
||||
"""
|
||||
Handles command line arguments and gets things started.
|
||||
|
||||
|
@ -54,17 +65,28 @@ def main(argv=None):
|
|||
"""
|
||||
# Get command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Takes one or more file paths and reports their detected \
|
||||
encodings"
|
||||
description=(
|
||||
"Takes one or more file paths and reports their detected encodings"
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"input",
|
||||
help="File whose encoding we would like to determine. \
|
||||
(default: stdin)",
|
||||
help="File whose encoding we would like to determine. (default: stdin)",
|
||||
type=argparse.FileType("rb"),
|
||||
nargs="*",
|
||||
default=[sys.stdin.buffer],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--minimal",
|
||||
help="Print only the encoding to standard output",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-l",
|
||||
"--legacy",
|
||||
help="Rename legacy encodings to more modern ones.",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version", action="version", version=f"%(prog)s {__version__}"
|
||||
)
|
||||
|
@ -79,7 +101,11 @@ def main(argv=None):
|
|||
"--help\n",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(description_of(f, f.name))
|
||||
print(
|
||||
description_of(
|
||||
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
|
||||
import logging
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
|
||||
|
@ -53,18 +54,19 @@ class CodingStateMachine:
|
|||
encoding from consideration from here on.
|
||||
"""
|
||||
|
||||
def __init__(self, sm):
|
||||
def __init__(self, sm: CodingStateMachineDict) -> None:
|
||||
self._model = sm
|
||||
self._curr_byte_pos = 0
|
||||
self._curr_char_len = 0
|
||||
self._curr_state = None
|
||||
self._curr_state = MachineState.START
|
||||
self.active = True
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._curr_state = MachineState.START
|
||||
|
||||
def next_state(self, c):
|
||||
def next_state(self, c: int) -> int:
|
||||
# for each byte we get its class
|
||||
# if it is first byte, we also get byte length
|
||||
byte_class = self._model["class_table"][c]
|
||||
|
@ -77,12 +79,12 @@ class CodingStateMachine:
|
|||
self._curr_byte_pos += 1
|
||||
return self._curr_state
|
||||
|
||||
def get_current_charlen(self):
|
||||
def get_current_charlen(self) -> int:
|
||||
return self._curr_char_len
|
||||
|
||||
def get_coding_state_machine(self):
|
||||
def get_coding_state_machine(self) -> str:
|
||||
return self._model["name"]
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return self._model["language"]
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
from typing import TYPE_CHECKING, Tuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# TypedDict was introduced in Python 3.8.
|
||||
#
|
||||
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
|
||||
# for Python 3.7.
|
||||
from typing import TypedDict
|
||||
|
||||
class CodingStateMachineDict(TypedDict, total=False):
|
||||
class_table: Tuple[int, ...]
|
||||
class_factor: int
|
||||
state_table: Tuple[int, ...]
|
||||
char_len_table: Tuple[int, ...]
|
||||
name: str
|
||||
language: str # Optional key
|
||||
|
||||
else:
|
||||
CodingStateMachineDict = dict
|
|
@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL
|
|||
|
||||
|
||||
class CP949Prober(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||
|
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
|
|||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "CP949"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Korean"
|
||||
|
|
|
@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package.
|
|||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
from enum import Enum, Flag
|
||||
|
||||
|
||||
class InputState:
|
||||
"""
|
||||
|
@ -15,12 +17,13 @@ class InputState:
|
|||
HIGH_BYTE = 2
|
||||
|
||||
|
||||
class LanguageFilter:
|
||||
class LanguageFilter(Flag):
|
||||
"""
|
||||
This enum represents the different language filters we can apply to a
|
||||
``UniversalDetector``.
|
||||
"""
|
||||
|
||||
NONE = 0x00
|
||||
CHINESE_SIMPLIFIED = 0x01
|
||||
CHINESE_TRADITIONAL = 0x02
|
||||
JAPANESE = 0x04
|
||||
|
@ -31,7 +34,7 @@ class LanguageFilter:
|
|||
CJK = CHINESE | JAPANESE | KOREAN
|
||||
|
||||
|
||||
class ProbingState:
|
||||
class ProbingState(Enum):
|
||||
"""
|
||||
This enum represents the different states a prober can be in.
|
||||
"""
|
||||
|
@ -62,7 +65,7 @@ class SequenceLikelihood:
|
|||
POSITIVE = 3
|
||||
|
||||
@classmethod
|
||||
def get_num_categories(cls):
|
||||
def get_num_categories(cls) -> int:
|
||||
""":returns: The number of likelihood categories in the enum."""
|
||||
return 4
|
||||
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import LanguageFilter, MachineState, ProbingState
|
||||
|
@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber):
|
|||
identify these encodings.
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter=None):
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self.coding_sm = []
|
||||
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||
|
@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber):
|
|||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||
if self.lang_filter & LanguageFilter.KOREAN:
|
||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||
self.active_sm_count = None
|
||||
self._detected_charset = None
|
||||
self._detected_language = None
|
||||
self._state = None
|
||||
self.active_sm_count = 0
|
||||
self._detected_charset: Optional[str] = None
|
||||
self._detected_language: Optional[str] = None
|
||||
self._state = ProbingState.DETECTING
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm:
|
||||
continue
|
||||
coding_sm.active = True
|
||||
coding_sm.reset()
|
||||
self.active_sm_count = len(self.coding_sm)
|
||||
|
@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber):
|
|||
self._detected_language = None
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> Optional[str]:
|
||||
return self._detected_charset
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> Optional[str]:
|
||||
return self._detected_language
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
return 0.99 if self._detected_charset else 0.00
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
for coding_sm in self.coding_sm:
|
||||
if not coding_sm or not coding_sm.active:
|
||||
if not coding_sm.active:
|
||||
continue
|
||||
coding_state = coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
# fmt: off
|
||||
|
@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR
|
|||
|
||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
HZ_SM_MODEL = {
|
||||
HZ_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": HZ_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": HZ_ST,
|
||||
|
@ -134,7 +135,7 @@ ISO2022CN_ST = (
|
|||
|
||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022CN_SM_MODEL = {
|
||||
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022CN_CLS,
|
||||
"class_factor": 9,
|
||||
"state_table": ISO2022CN_ST,
|
||||
|
@ -194,7 +195,7 @@ ISO2022JP_ST = (
|
|||
|
||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022JP_SM_MODEL = {
|
||||
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022JP_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": ISO2022JP_ST,
|
||||
|
@ -250,7 +251,7 @@ ISO2022KR_ST = (
|
|||
|
||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||
|
||||
ISO2022KR_SM_MODEL = {
|
||||
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": ISO2022KR_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": ISO2022KR_ST,
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Union
|
||||
|
||||
from .chardistribution import EUCJPDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
|
@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL
|
|||
|
||||
|
||||
class EUCJPProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
||||
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
||||
self.context_analyzer = EUCJPContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-JP"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
for i, byte in enumerate(byte_str):
|
||||
# PY3K: byte_str is a byte array, so byte is an int, not a byte
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
|
@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL
|
|||
|
||||
|
||||
class EUCKRProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
||||
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-KR"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Korean"
|
||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL
|
|||
|
||||
|
||||
class EUCTWProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
||||
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "EUC-TW"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Taiwan"
|
||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL
|
|||
|
||||
|
||||
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober configured with the GB2312 state machine and distribution analysis."""

    def __init__(self) -> None:
        """Attach the GB2312 state machine and distribution analyzer, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
        self.distribution_analyzer = GB2312DistributionAnalysis()
        self.reset()

    @property
    def charset_name(self) -> str:
        """Charset name reported by this prober."""
        return "GB2312"

    @property
    def language(self) -> str:
        """Language reported by this prober."""
        return "Chinese"
|
||||
|
|
|
@ -25,8 +25,11 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
from .sbcharsetprober import SingleByteCharSetProber
|
||||
|
||||
# This prober doesn't actually recognize a language or a charset.
|
||||
# It is a helper prober for the use of the Hebrew model probers
|
||||
|
@ -127,6 +130,7 @@ from .enums import ProbingState
|
|||
|
||||
|
||||
class HebrewProber(CharSetProber):
|
||||
SPACE = 0x20
|
||||
# windows-1255 / ISO-8859-8 code points of interest
|
||||
FINAL_KAF = 0xEA
|
||||
NORMAL_KAF = 0xEB
|
||||
|
@ -152,31 +156,35 @@ class HebrewProber(CharSetProber):
|
|||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||
|
||||
def __init__(self) -> None:
    """Start with zeroed scores, SPACE as both previous bytes, and no model probers."""
    super().__init__()
    # Final-letter evidence counters used for the logical / visual decision.
    self._final_char_logical_score = self._final_char_visual_score = 0
    # The two most recently seen byte values.
    self._prev = self._before_prev = self.SPACE
    # Filled in by set_model_probers().
    self._logical_prober: Optional[SingleByteCharSetProber] = None
    self._visual_prober: Optional[SingleByteCharSetProber] = None
    self.reset()
|
||||
|
||||
def reset(self) -> None:
    """Zero both final-letter scores and set the two remembered bytes to SPACE."""
    self._final_char_logical_score = self._final_char_visual_score = 0
    # Both remembered bytes start out as a space so that the beginning of
    # the data looks like it follows a word delimiter.
    self._prev = self._before_prev = self.SPACE
    # The model probers are not touched here: they are owned by the
    # group prober.
|
||||
|
||||
def set_model_probers(
    self,
    logical_prober: SingleByteCharSetProber,
    visual_prober: SingleByteCharSetProber,
) -> None:
    """Store the logical and visual model probers on this helper.

    :param logical_prober: prober kept as ``_logical_prober``.
    :param visual_prober: prober kept as ``_visual_prober``.
    """
    self._logical_prober, self._visual_prober = logical_prober, visual_prober
|
||||
|
||||
def is_final(self, c):
|
||||
def is_final(self, c: int) -> bool:
|
||||
return c in [
|
||||
self.FINAL_KAF,
|
||||
self.FINAL_MEM,
|
||||
|
@ -185,7 +193,7 @@ class HebrewProber(CharSetProber):
|
|||
self.FINAL_TSADI,
|
||||
]
|
||||
|
||||
def is_non_final(self, c):
|
||||
def is_non_final(self, c: int) -> bool:
|
||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||
|
@ -198,7 +206,7 @@ class HebrewProber(CharSetProber):
|
|||
# since these words are quite rare.
|
||||
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
# Final letter analysis for logical-visual decision.
|
||||
# Look for evidence that the received buffer is either logical Hebrew
|
||||
# or visual Hebrew.
|
||||
|
@ -232,9 +240,9 @@ class HebrewProber(CharSetProber):
|
|||
byte_str = self.filter_high_byte_only(byte_str)
|
||||
|
||||
for cur in byte_str:
|
||||
if cur == " ":
|
||||
if cur == self.SPACE:
|
||||
# We stand on a space - a word just ended
|
||||
if self._before_prev != " ":
|
||||
if self._before_prev != self.SPACE:
|
||||
# next-to-last char was not a space so self._prev is not a
|
||||
# 1 letter word
|
||||
if self.is_final(self._prev):
|
||||
|
@ -247,9 +255,9 @@ class HebrewProber(CharSetProber):
|
|||
else:
|
||||
# Not standing on a space
|
||||
if (
|
||||
(self._before_prev == " ")
|
||||
(self._before_prev == self.SPACE)
|
||||
and (self.is_final(self._prev))
|
||||
and (cur != " ")
|
||||
and (cur != self.SPACE)
|
||||
):
|
||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||
self._final_char_visual_score += 1
|
||||
|
@ -261,7 +269,10 @@ class HebrewProber(CharSetProber):
|
|||
return ProbingState.DETECTING
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
assert self._logical_prober is not None
|
||||
assert self._visual_prober is not None
|
||||
|
||||
# Make the decision: is it Logical or Visual?
|
||||
# If the final letter score distance is dominant enough, rely on it.
|
||||
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
||||
|
@ -289,11 +300,14 @@ class HebrewProber(CharSetProber):
|
|||
return self.LOGICAL_HEBREW_NAME
|
||||
|
||||
@property
def language(self) -> str:
    """Language reported by this prober: always ``"Hebrew"``."""
    return "Hebrew"
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
def state(self) -> ProbingState:
|
||||
assert self._logical_prober is not None
|
||||
assert self._visual_prober is not None
|
||||
|
||||
# Remain active as long as any of the model probers are active.
|
||||
if (self._logical_prober.state == ProbingState.NOT_ME) and (
|
||||
self._visual_prober.state == ProbingState.NOT_ME
|
||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL
|
|||
|
||||
|
||||
class JOHABProber(MultiByteCharSetProber):
    """Multi-byte prober configured with the Johab state machine and distribution analysis."""

    def __init__(self) -> None:
        """Attach the Johab state machine and distribution analyzer, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
        self.distribution_analyzer = JOHABDistributionAnalysis()
        self.reset()

    @property
    def charset_name(self) -> str:
        """Charset name reported by this prober."""
        return "Johab"

    @property
    def language(self) -> str:
        """Language reported by this prober."""
        return "Korean"
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
# fmt: off
|
||||
|
@ -123,15 +124,15 @@ class JapaneseContextAnalysis:
|
|||
MAX_REL_THRESHOLD = 1000
|
||||
MINIMUM_DATA_THRESHOLD = 4
|
||||
|
||||
def __init__(self) -> None:
    """Give every counter a typed starting value, then delegate to reset()."""
    # Concrete (non-None) defaults; reset() is called right after.
    self._total_rel = 0
    self._rel_sample: List[int] = []
    self._need_to_skip_char_num = 0
    self._last_char_order = -1
    self._done = False
    self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
self._total_rel = 0 # total sequence received
|
||||
# category counters, each integer counts sequence in its category
|
||||
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
||||
|
@ -143,7 +144,7 @@ class JapaneseContextAnalysis:
|
|||
# been made
|
||||
self._done = False
|
||||
|
||||
def feed(self, byte_str, num_bytes):
|
||||
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
|
||||
if self._done:
|
||||
return
|
||||
|
||||
|
@ -172,29 +173,29 @@ class JapaneseContextAnalysis:
|
|||
] += 1
|
||||
self._last_char_order = order
|
||||
|
||||
def got_enough_data(self) -> bool:
    """Tell whether the sequence total has passed ``ENOUGH_REL_THRESHOLD``."""
    sequences_seen = self._total_rel
    return sequences_seen > self.ENOUGH_REL_THRESHOLD
|
||||
|
||||
def get_confidence(self) -> float:
    """Return the share of counted sequences that fall outside category 0.

    ``DONT_KNOW`` is returned while the sequence total has not passed
    ``MINIMUM_DATA_THRESHOLD``.
    """
    # This is just one way to calculate confidence.
    total = self._total_rel
    if not (total > self.MINIMUM_DATA_THRESHOLD):
        return self.DONT_KNOW
    return (total - self._rel_sample[0]) / total
|
||||
|
||||
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
    """Base implementation: ignore the input and return ``(-1, 1)``."""
    return (-1, 1)
|
||||
|
||||
|
||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
def __init__(self) -> None:
    """Run the base initialisation and start with the name ``"SHIFT_JIS"``."""
    super().__init__()
    self._charset_name = "SHIFT_JIS"
|
||||
|
||||
@property
def charset_name(self) -> str:
    """Charset name currently stored in ``_charset_name``."""
    name = self._charset_name
    return name
|
||||
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||
if not byte_str:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
|
@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
|||
|
||||
|
||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||
def get_order(self, byte_str):
|
||||
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||
if not byte_str:
|
||||
return -1, 1
|
||||
# find out current char's byte length
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
|
||||
|
@ -96,26 +98,26 @@ Latin1ClassModel = (
|
|||
|
||||
|
||||
class Latin1Prober(CharSetProber):
|
||||
def __init__(self) -> None:
    """Give the counters typed starting values, then delegate to reset()."""
    super().__init__()
    # Concrete defaults; reset() is called right after.
    self._last_char_class = OTH
    self._freq_counter: List[int] = []
    self.reset()
|
||||
|
||||
def reset(self) -> None:
    """Set the last class to OTH, zero the frequency counters, reset the base."""
    self._last_char_class = OTH
    # One zeroed counter per frequency category.
    self._freq_counter = [0 for _ in range(FREQ_CAT_NUM)]
    super().reset()
|
||||
|
||||
@property
def charset_name(self) -> str:
    """Charset name reported by this prober: always ``"ISO-8859-1"``."""
    return "ISO-8859-1"
|
||||
|
||||
@property
def language(self) -> str:
    """Language reported by this prober: always the empty string."""
    return ""
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
byte_str = self.remove_xml_tags(byte_str)
|
||||
for c in byte_str:
|
||||
char_class = Latin1_CharToClass[c]
|
||||
|
@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
if self.state == ProbingState.NOT_ME:
|
||||
return 0.01
|
||||
|
||||
|
|
|
@ -0,0 +1,162 @@
|
|||
######################## BEGIN LICENSE BLOCK ########################
|
||||
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
|
||||
# The Original Code is Mozilla Universal charset detector code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is
|
||||
# Netscape Communications Corporation.
|
||||
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||
# the Initial Developer. All Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
# Rob Speer - adapt to MacRoman encoding
|
||||
# Mark Pilgrim - port to Python
|
||||
# Shy Shalom - original C code
|
||||
#
|
||||
# This library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# This library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with this library; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
|
||||
# Number of likelihood categories used by MacRomanClassModel (values 0-3).
FREQ_CAT_NUM = 4

# Character classes. Each value is both an entry of MacRoman_CharToClass
# and a row/column index into MacRomanClassModel.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.

# Byte value -> character class, one entry per byte value (256 in total).
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM table indexed by (previous class * CLASS_NUM) + current class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
# fmt: on
|
||||
# fmt: on
|
||||
|
||||
|
||||
class MacRomanProber(CharSetProber):
    """Prober that scores input against the MacRoman class-transition model."""

    def __init__(self) -> None:
        """Give the counters typed starting values, then delegate to reset()."""
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Restore the initial class and counters, then reset the base prober."""
        self._last_char_class = OTH
        counters = [0] * FREQ_CAT_NUM

        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome
        counters[2] = 10
        self._freq_counter = counters

        super().reset()

    @property
    def charset_name(self) -> str:
        """Charset name reported by this prober."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language reported by this prober: always the empty string."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Count class transitions in *byte_str* and return the prober state.

        The input is first passed through ``remove_xml_tags``. A transition
        rated 0 (illegal) in the model sets the state to NOT_ME and stops
        the scan.
        """
        for byte in self.remove_xml_tags(byte_str):
            current_class = MacRoman_CharToClass[byte]
            likelihood = MacRomanClassModel[
                (self._last_char_class * CLASS_NUM) + current_class
            ]
            if likelihood == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self) -> float:
        """Return a confidence derived from the transition counters.

        A prober in the NOT_ME state reports 0.01. Otherwise the score is
        the "very likely" count minus 20 times the "very unlikely" count,
        divided by the total, floored at 0 and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        counters = self._freq_counter
        total = sum(counters)
        if total < 0.01:
            confidence = 0.0
        else:
            confidence = (counters[3] - counters[1] * 20.0) / total
        confidence = max(confidence, 0.0)
        # lower the confidence of MacRoman so that other more accurate
        # detector can take priority.
        confidence *= 0.73
        return confidence
|
|
@ -27,8 +27,12 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from .chardistribution import CharDistributionAnalysis
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import MachineState, ProbingState
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import LanguageFilter, MachineState, ProbingState
|
||||
|
||||
|
||||
class MultiByteCharSetProber(CharSetProber):
|
||||
|
@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber):
|
|||
MultiByteCharSetProber
|
||||
"""
|
||||
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
    """Initialise the base prober; analyzer and state machine start as None.

    :param lang_filter: forwarded unchanged to the base-class initialiser.
    """
    super().__init__(lang_filter=lang_filter)
    # Both start as None; subclasses visible in this package assign real
    # objects in their own __init__.
    self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
    self.coding_sm: Optional[CodingStateMachine] = None
    # Two-byte buffer, initially zeroed.
    self._last_char = bytearray(b"\0\0")
|
||||
|
||||
def reset(self) -> None:
    """Reset the base prober, any attached helpers, and the two-byte buffer."""
    super().reset()
    state_machine = self.coding_sm
    if state_machine:
        state_machine.reset()
    analyzer = self.distribution_analyzer
    if analyzer:
        analyzer.reset()
    self._last_char = bytearray(b"\0\0")
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
raise NotImplementedError
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def feed(self, byte_str):
|
||||
for i, byte in enumerate(byte_str):
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
if coding_state == MachineState.ERROR:
|
||||
|
@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self) -> float:
    """Return the confidence reported by the distribution analyzer."""
    analyzer = self.distribution_analyzer
    assert analyzer is not None
    return analyzer.get_confidence()
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
from .big5prober import Big5Prober
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .cp949prober import CP949Prober
|
||||
from .enums import LanguageFilter
|
||||
from .eucjpprober import EUCJPProber
|
||||
from .euckrprober import EUCKRProber
|
||||
from .euctwprober import EUCTWProber
|
||||
|
@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober
|
|||
|
||||
|
||||
class MBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self, lang_filter=None):
|
||||
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||
super().__init__(lang_filter=lang_filter)
|
||||
self.probers = [
|
||||
UTF8Prober(),
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from .codingstatemachinedict import CodingStateMachineDict
|
||||
from .enums import MachineState
|
||||
|
||||
# BIG5
|
||||
|
@ -74,7 +75,7 @@ BIG5_ST = (
|
|||
|
||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||
|
||||
BIG5_SM_MODEL = {
|
||||
BIG5_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": BIG5_CLS,
|
||||
"class_factor": 5,
|
||||
"state_table": BIG5_ST,
|
||||
|
@ -117,7 +118,7 @@ CP949_ST = (
|
|||
|
||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||
|
||||
CP949_SM_MODEL = {
|
||||
CP949_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": CP949_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": CP949_ST,
|
||||
|
@ -173,7 +174,7 @@ EUCJP_ST = (
|
|||
|
||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||
|
||||
EUCJP_SM_MODEL = {
|
||||
EUCJP_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCJP_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": EUCJP_ST,
|
||||
|
@ -226,7 +227,7 @@ EUCKR_ST = (
|
|||
|
||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||
|
||||
EUCKR_SM_MODEL = {
|
||||
EUCKR_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCKR_CLS,
|
||||
"class_factor": 4,
|
||||
"state_table": EUCKR_ST,
|
||||
|
@ -283,7 +284,7 @@ JOHAB_ST = (
|
|||
|
||||
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
|
||||
|
||||
JOHAB_SM_MODEL = {
|
||||
JOHAB_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": JOHAB_CLS,
|
||||
"class_factor": 10,
|
||||
"state_table": JOHAB_ST,
|
||||
|
@ -340,7 +341,7 @@ EUCTW_ST = (
|
|||
|
||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||
|
||||
EUCTW_SM_MODEL = {
|
||||
EUCTW_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": EUCTW_CLS,
|
||||
"class_factor": 7,
|
||||
"state_table": EUCTW_ST,
|
||||
|
@ -402,7 +403,7 @@ GB2312_ST = (
|
|||
# 2 here.
|
||||
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
||||
|
||||
GB2312_SM_MODEL = {
|
||||
GB2312_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": GB2312_CLS,
|
||||
"class_factor": 7,
|
||||
"state_table": GB2312_ST,
|
||||
|
@ -458,7 +459,7 @@ SJIS_ST = (
|
|||
|
||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||
|
||||
SJIS_SM_MODEL = {
|
||||
SJIS_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": SJIS_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": SJIS_ST,
|
||||
|
@ -516,7 +517,7 @@ UCS2BE_ST = (
|
|||
|
||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||
|
||||
UCS2BE_SM_MODEL = {
|
||||
UCS2BE_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UCS2BE_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": UCS2BE_ST,
|
||||
|
@ -574,7 +575,7 @@ UCS2LE_ST = (
|
|||
|
||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||
|
||||
UCS2LE_SM_MODEL = {
|
||||
UCS2LE_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UCS2LE_CLS,
|
||||
"class_factor": 6,
|
||||
"state_table": UCS2LE_ST,
|
||||
|
@ -651,7 +652,7 @@ UTF8_ST = (
|
|||
|
||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||
|
||||
UTF8_SM_MODEL = {
|
||||
UTF8_SM_MODEL: CodingStateMachineDict = {
|
||||
"class_table": UTF8_CLS,
|
||||
"class_factor": 16,
|
||||
"state_table": UTF8_ST,
|
||||
|
|
|
@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project.
|
|||
"""
|
||||
|
||||
from string import ascii_letters
|
||||
from typing import List, Optional
|
||||
|
||||
# TODO: Add Ukrainian (KOI8-U)
|
||||
|
||||
|
@ -33,13 +34,13 @@ class Language:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
name=None,
|
||||
iso_code=None,
|
||||
use_ascii=True,
|
||||
charsets=None,
|
||||
alphabet=None,
|
||||
wiki_start_pages=None,
|
||||
):
|
||||
name: Optional[str] = None,
|
||||
iso_code: Optional[str] = None,
|
||||
use_ascii: bool = True,
|
||||
charsets: Optional[List[str]] = None,
|
||||
alphabet: Optional[str] = None,
|
||||
wiki_start_pages: Optional[List[str]] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.name = name
|
||||
self.iso_code = iso_code
|
||||
|
@ -55,7 +56,7 @@ class Language:
|
|||
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||
self.wiki_start_pages = wiki_start_pages
|
||||
|
||||
def __repr__(self):
|
||||
def __repr__(self) -> str:
|
||||
param_str = ", ".join(
|
||||
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||
)
|
||||
|
@ -103,7 +104,7 @@ LANGUAGES = {
|
|||
name="Danish",
|
||||
iso_code="da",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="æøåÆØÅ",
|
||||
wiki_start_pages=["Forside"],
|
||||
),
|
||||
|
@ -111,8 +112,8 @@ LANGUAGES = {
|
|||
name="German",
|
||||
iso_code="de",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||
alphabet="äöüßÄÖÜ",
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="äöüßẞÄÖÜ",
|
||||
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||
),
|
||||
"Greek": Language(
|
||||
|
@ -127,7 +128,7 @@ LANGUAGES = {
|
|||
name="English",
|
||||
iso_code="en",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Main_Page"],
|
||||
),
|
||||
"Esperanto": Language(
|
||||
|
@ -143,7 +144,7 @@ LANGUAGES = {
|
|||
name="Spanish",
|
||||
iso_code="es",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||
wiki_start_pages=["Wikipedia:Portada"],
|
||||
),
|
||||
|
@ -161,7 +162,7 @@ LANGUAGES = {
|
|||
name="Finnish",
|
||||
iso_code="fi",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÅÄÖŠŽåäöšž",
|
||||
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||
),
|
||||
|
@ -169,7 +170,7 @@ LANGUAGES = {
|
|||
name="French",
|
||||
iso_code="fr",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||
),
|
||||
|
@ -203,7 +204,7 @@ LANGUAGES = {
|
|||
name="Italian",
|
||||
iso_code="it",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||
wiki_start_pages=["Pagina_principale"],
|
||||
),
|
||||
|
@ -237,7 +238,7 @@ LANGUAGES = {
|
|||
name="Dutch",
|
||||
iso_code="nl",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||
wiki_start_pages=["Hoofdpagina"],
|
||||
),
|
||||
"Polish": Language(
|
||||
|
@ -253,7 +254,7 @@ LANGUAGES = {
|
|||
name="Portuguese",
|
||||
iso_code="pt",
|
||||
use_ascii=True,
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
||||
wiki_start_pages=["Wikipédia:Página_principal"],
|
||||
),
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
if TYPE_CHECKING:
    # Only type checkers see this branch. TypedDict arrived in
    # Python 3.8, so it is not imported at runtime.
    #
    # TODO: Remove the else block and TYPE_CHECKING check when dropping support
    # for Python 3.7.
    from typing import TypedDict

    class ResultDict(TypedDict):
        encoding: Optional[str]
        confidence: float
        language: Optional[str]

else:
    # At runtime the name is simply an alias for the built-in dict.
    ResultDict = dict
|
|
@ -26,23 +26,20 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from collections import namedtuple
|
||||
from typing import Dict, List, NamedTuple, Optional, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||
|
||||
class SingleByteCharSetModel(NamedTuple):
    """Immutable record holding one single-byte charset/language model."""

    charset_name: str
    language: str
    # NOTE(review): presumably byte value -> character order; confirm
    # against the generated language model modules.
    char_to_order_map: Dict[int, int]
    # NOTE(review): presumably order -> order -> sequence likelihood; confirm.
    language_model: Dict[int, Dict[int, int]]
    typical_positive_ratio: float
    keep_ascii_letters: bool
    alphabet: str
|
||||
|
||||
|
||||
class SingleByteCharSetProber(CharSetProber):
|
||||
|
@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||
|
||||
def __init__(
    self,
    model: SingleByteCharSetModel,
    is_reversed: bool = False,
    name_prober: Optional[CharSetProber] = None,
) -> None:
    """Store the model and options, give the counters typed defaults, then reset.

    :param model: the charset/language model this prober reads from.
    :param is_reversed: whether every pair is reversed in the model lookup.
    :param name_prober: optional auxiliary prober consulted for the
        reported charset name and language.
    """
    super().__init__()
    self._model = model
    # TRUE if we need to reverse every pair in the model lookup
    self._reversed = is_reversed
    # Optional auxiliary prober for name decision
    self._name_prober = name_prober
    # Concrete defaults; reset() is called right after.
    self._last_order = 255
    self._seq_counters: List[int] = []
    self._total_seqs = self._total_char = 0
    self._control_char = self._freq_char = 0
    self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
# char order of last character
|
||||
self._last_order = 255
|
||||
|
@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
self._freq_char = 0
|
||||
|
||||
@property
def charset_name(self) -> Optional[str]:
    """Charset name from the name prober when one is set, else from the model."""
    source = self._name_prober or self._model
    return source.charset_name
|
||||
|
||||
@property
def language(self) -> Optional[str]:
    """Language from the name prober when one is set, else from the model."""
    source = self._name_prober or self._model
    return source.language
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
# TODO: Make filter_international_words keep things in self.alphabet
|
||||
if not self._model.keep_ascii_letters:
|
||||
byte_str = self.filter_international_words(byte_str)
|
||||
|
@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
r = 0.01
|
||||
if self._total_seqs > 0:
|
||||
r = (
|
||||
|
|
|
@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber
|
|||
|
||||
|
||||
class SBCSGroupProber(CharSetGroupProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
hebrew_prober = HebrewProber()
|
||||
logical_hebrew_prober = SingleByteCharSetProber(
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Union
|
||||
|
||||
from .chardistribution import SJISDistributionAnalysis
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
|
@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL
|
|||
|
||||
|
||||
class SJISProber(MultiByteCharSetProber):
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
||||
self.distribution_analyzer = SJISDistributionAnalysis()
|
||||
self.context_analyzer = SJISContextAnalysis()
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.context_analyzer.reset()
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return self.context_analyzer.charset_name
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return "Japanese"
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
assert self.coding_sm is not None
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
for i, byte in enumerate(byte_str):
|
||||
coding_state = self.coding_sm.next_state(byte)
|
||||
if coding_state == MachineState.ERROR:
|
||||
|
@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
assert self.distribution_analyzer is not None
|
||||
|
||||
context_conf = self.context_analyzer.get_confidence()
|
||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||
return max(context_conf, distrib_conf)
|
||||
|
|
|
@ -39,12 +39,16 @@ class a user of ``chardet`` should use.
|
|||
import codecs
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from .charsetgroupprober import CharSetGroupProber
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import InputState, LanguageFilter, ProbingState
|
||||
from .escprober import EscCharSetProber
|
||||
from .latin1prober import Latin1Prober
|
||||
from .macromanprober import MacRomanProber
|
||||
from .mbcsgroupprober import MBCSGroupProber
|
||||
from .resultdict import ResultDict
|
||||
from .sbcsgroupprober import SBCSGroupProber
|
||||
from .utf1632prober import UTF1632Prober
|
||||
|
||||
|
@ -80,34 +84,55 @@ class UniversalDetector:
|
|||
"iso-8859-9": "Windows-1254",
|
||||
"iso-8859-13": "Windows-1257",
|
||||
}
|
||||
# Based on https://encoding.spec.whatwg.org/#names-and-labels
|
||||
# but altered to match Python names for encodings and remove mappings
|
||||
# that break tests.
|
||||
LEGACY_MAP = {
|
||||
"ascii": "Windows-1252",
|
||||
"iso-8859-1": "Windows-1252",
|
||||
"tis-620": "ISO-8859-11",
|
||||
"iso-8859-9": "Windows-1254",
|
||||
"gb2312": "GB18030",
|
||||
"euc-kr": "CP949",
|
||||
"utf-16le": "UTF-16",
|
||||
}
|
||||
|
||||
def __init__(self, lang_filter=LanguageFilter.ALL):
|
||||
self._esc_charset_prober = None
|
||||
self._utf1632_prober = None
|
||||
self._charset_probers = []
|
||||
self.result = None
|
||||
self.done = None
|
||||
self._got_data = None
|
||||
self._input_state = None
|
||||
self._last_char = None
|
||||
def __init__(
|
||||
self,
|
||||
lang_filter: LanguageFilter = LanguageFilter.ALL,
|
||||
should_rename_legacy: bool = False,
|
||||
) -> None:
|
||||
self._esc_charset_prober: Optional[EscCharSetProber] = None
|
||||
self._utf1632_prober: Optional[UTF1632Prober] = None
|
||||
self._charset_probers: List[CharSetProber] = []
|
||||
self.result: ResultDict = {
|
||||
"encoding": None,
|
||||
"confidence": 0.0,
|
||||
"language": None,
|
||||
}
|
||||
self.done = False
|
||||
self._got_data = False
|
||||
self._input_state = InputState.PURE_ASCII
|
||||
self._last_char = b""
|
||||
self.lang_filter = lang_filter
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._has_win_bytes = None
|
||||
self._has_win_bytes = False
|
||||
self.should_rename_legacy = should_rename_legacy
|
||||
self.reset()
|
||||
|
||||
@property
|
||||
def input_state(self):
|
||||
def input_state(self) -> int:
|
||||
return self._input_state
|
||||
|
||||
@property
|
||||
def has_win_bytes(self):
|
||||
def has_win_bytes(self) -> bool:
|
||||
return self._has_win_bytes
|
||||
|
||||
@property
|
||||
def charset_probers(self):
|
||||
def charset_probers(self) -> List[CharSetProber]:
|
||||
return self._charset_probers
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
"""
|
||||
Reset the UniversalDetector and all of its probers back to their
|
||||
initial states. This is called by ``__init__``, so you only need to
|
||||
|
@ -126,7 +151,7 @@ class UniversalDetector:
|
|||
for prober in self._charset_probers:
|
||||
prober.reset()
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
|
||||
"""
|
||||
Takes a chunk of a document and feeds it through all of the relevant
|
||||
charset probers.
|
||||
|
@ -166,6 +191,7 @@ class UniversalDetector:
|
|||
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
|
||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||
self.result = {
|
||||
# TODO: This encoding is not supported by Python. Should remove?
|
||||
"encoding": "X-ISO-10646-UCS-4-3412",
|
||||
"confidence": 1.0,
|
||||
"language": "",
|
||||
|
@ -173,6 +199,7 @@ class UniversalDetector:
|
|||
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
|
||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
self.result = {
|
||||
# TODO: This encoding is not supported by Python. Should remove?
|
||||
"encoding": "X-ISO-10646-UCS-4-2143",
|
||||
"confidence": 1.0,
|
||||
"language": "",
|
||||
|
@ -242,6 +269,7 @@ class UniversalDetector:
|
|||
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||
self._charset_probers.append(SBCSGroupProber())
|
||||
self._charset_probers.append(Latin1Prober())
|
||||
self._charset_probers.append(MacRomanProber())
|
||||
for prober in self._charset_probers:
|
||||
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||
self.result = {
|
||||
|
@ -254,7 +282,7 @@ class UniversalDetector:
|
|||
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||
self._has_win_bytes = True
|
||||
|
||||
def close(self):
|
||||
def close(self) -> ResultDict:
|
||||
"""
|
||||
Stop analyzing the current document and come up with a final
|
||||
prediction.
|
||||
|
@ -288,7 +316,8 @@ class UniversalDetector:
|
|||
max_prober = prober
|
||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||
charset_name = max_prober.charset_name
|
||||
lower_charset_name = max_prober.charset_name.lower()
|
||||
assert charset_name is not None
|
||||
lower_charset_name = charset_name.lower()
|
||||
confidence = max_prober.get_confidence()
|
||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||
# extra Windows-specific bytes
|
||||
|
@ -297,6 +326,11 @@ class UniversalDetector:
|
|||
charset_name = self.ISO_WIN_MAP.get(
|
||||
lower_charset_name, charset_name
|
||||
)
|
||||
# Rename legacy encodings with superset encodings if asked
|
||||
if self.should_rename_legacy:
|
||||
charset_name = self.LEGACY_MAP.get(
|
||||
(charset_name or "").lower(), charset_name
|
||||
)
|
||||
self.result = {
|
||||
"encoding": charset_name,
|
||||
"confidence": confidence,
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
from typing import List, Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .enums import ProbingState
|
||||
|
||||
|
@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber):
|
|||
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
|
||||
EXPECTED_RATIO = 0.94
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.position = 0
|
||||
self.zeros_at_mod = [0] * 4
|
||||
|
@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber):
|
|||
self.first_half_surrogate_pair_detected_16le = False
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.position = 0
|
||||
self.zeros_at_mod = [0] * 4
|
||||
|
@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber):
|
|||
self.quad = [0, 0, 0, 0]
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
if self.is_likely_utf32be():
|
||||
return "utf-32be"
|
||||
if self.is_likely_utf32le():
|
||||
|
@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber):
|
|||
return "utf-16"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return ""
|
||||
|
||||
def approx_32bit_chars(self):
|
||||
def approx_32bit_chars(self) -> float:
|
||||
return max(1.0, self.position / 4.0)
|
||||
|
||||
def approx_16bit_chars(self):
|
||||
def approx_16bit_chars(self) -> float:
|
||||
return max(1.0, self.position / 2.0)
|
||||
|
||||
def is_likely_utf32be(self):
|
||||
def is_likely_utf32be(self) -> bool:
|
||||
approx_chars = self.approx_32bit_chars()
|
||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||
|
@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber):
|
|||
and not self.invalid_utf32be
|
||||
)
|
||||
|
||||
def is_likely_utf32le(self):
|
||||
def is_likely_utf32le(self) -> bool:
|
||||
approx_chars = self.approx_32bit_chars()
|
||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||
|
@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber):
|
|||
and not self.invalid_utf32le
|
||||
)
|
||||
|
||||
def is_likely_utf16be(self):
|
||||
def is_likely_utf16be(self) -> bool:
|
||||
approx_chars = self.approx_16bit_chars()
|
||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
|
||||
|
@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber):
|
|||
and not self.invalid_utf16be
|
||||
)
|
||||
|
||||
def is_likely_utf16le(self):
|
||||
def is_likely_utf16le(self) -> bool:
|
||||
approx_chars = self.approx_16bit_chars()
|
||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
|
||||
|
@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber):
|
|||
and not self.invalid_utf16le
|
||||
)
|
||||
|
||||
def validate_utf32_characters(self, quad):
|
||||
def validate_utf32_characters(self, quad: List[int]) -> None:
|
||||
"""
|
||||
Validate if the quad of bytes is valid UTF-32.
|
||||
|
||||
|
@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber):
|
|||
):
|
||||
self.invalid_utf32le = True
|
||||
|
||||
def validate_utf16_characters(self, pair):
|
||||
def validate_utf16_characters(self, pair: List[int]) -> None:
|
||||
"""
|
||||
Validate if the pair of bytes is valid UTF-16.
|
||||
|
||||
|
@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber):
|
|||
else:
|
||||
self.invalid_utf16le = True
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
mod4 = self.position % 4
|
||||
self.quad[mod4] = c
|
||||
|
@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber):
|
|||
return self.state
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
def state(self) -> ProbingState:
|
||||
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
|
||||
# terminal, decided states
|
||||
return self._state
|
||||
|
@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber):
|
|||
self._state = ProbingState.NOT_ME
|
||||
return self._state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
return (
|
||||
0.85
|
||||
if (
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
from typing import Union
|
||||
|
||||
from .charsetprober import CharSetProber
|
||||
from .codingstatemachine import CodingStateMachine
|
||||
from .enums import MachineState, ProbingState
|
||||
|
@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL
|
|||
class UTF8Prober(CharSetProber):
|
||||
ONE_CHAR_PROB = 0.5
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
||||
self._num_mb_chars = None
|
||||
self._num_mb_chars = 0
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
def reset(self) -> None:
|
||||
super().reset()
|
||||
self.coding_sm.reset()
|
||||
self._num_mb_chars = 0
|
||||
|
||||
@property
|
||||
def charset_name(self):
|
||||
def charset_name(self) -> str:
|
||||
return "utf-8"
|
||||
|
||||
@property
|
||||
def language(self):
|
||||
def language(self) -> str:
|
||||
return ""
|
||||
|
||||
def feed(self, byte_str):
|
||||
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||
for c in byte_str:
|
||||
coding_state = self.coding_sm.next_state(c)
|
||||
if coding_state == MachineState.ERROR:
|
||||
|
@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber):
|
|||
|
||||
return self.state
|
||||
|
||||
def get_confidence(self):
|
||||
def get_confidence(self) -> float:
|
||||
unlike = 0.99
|
||||
if self._num_mb_chars < 6:
|
||||
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
"""
|
||||
This module exists only to simplify retrieving the version number of chardet
|
||||
from within setup.py and from chardet subpackages.
|
||||
from within setuptools and from chardet subpackages.
|
||||
|
||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||
"""
|
||||
|
||||
__version__ = "5.0.0"
|
||||
__version__ = "5.1.0"
|
||||
VERSION = __version__.split(".")
|
||||
|
|
|
@ -9,7 +9,7 @@ pyparsing==3.0.9
|
|||
pyproject-hooks==1.0.0
|
||||
requests==2.28.2
|
||||
certifi==2022.12.7
|
||||
chardet==5.0.0
|
||||
chardet==5.1.0
|
||||
idna==3.4
|
||||
urllib3==1.26.12
|
||||
rich==12.6.0
|
||||
|
|
Loading…
Reference in New Issue