Upgrade chardet to 5.1.0

This commit is contained in:
Pradyun Gedam 2023-01-28 20:41:43 +00:00
parent 1c110bede6
commit be20a75c10
No known key found for this signature in database
GPG Key ID: FF99710C4332258E
37 changed files with 620 additions and 287 deletions

1
news/chardet.vendor.rst Normal file
View File

@ -0,0 +1 @@
Upgrade chardet to 5.1.0

View File

@ -1 +0,0 @@
from chardet import *

View File

@ -15,19 +15,29 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState from .enums import InputState
from .resultdict import ResultDict
from .universaldetector import UniversalDetector from .universaldetector import UniversalDetector
from .version import VERSION, __version__ from .version import VERSION, __version__
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"] __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
def detect(byte_str): def detect(
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
) -> ResultDict:
""" """
Detect the encoding of the given byte string. Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine. :param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray`` :type byte_str: ``bytes`` or ``bytearray``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
""" """
if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes): if not isinstance(byte_str, bytes):
@ -35,12 +45,16 @@ def detect(byte_str):
f"Expected object of type bytes or bytearray, got: {type(byte_str)}" f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
) )
byte_str = bytearray(byte_str) byte_str = bytearray(byte_str)
detector = UniversalDetector() detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str) detector.feed(byte_str)
return detector.close() return detector.close()
def detect_all(byte_str, ignore_threshold=False): def detect_all(
byte_str: Union[bytes, bytearray],
ignore_threshold: bool = False,
should_rename_legacy: bool = False,
) -> List[ResultDict]:
""" """
Detect all the possible encodings of the given byte string. Detect all the possible encodings of the given byte string.
@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
``UniversalDetector.MINIMUM_THRESHOLD`` ``UniversalDetector.MINIMUM_THRESHOLD``
in results. in results.
:type ignore_threshold: ``bool`` :type ignore_threshold: ``bool``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
""" """
if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes): if not isinstance(byte_str, bytes):
@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
) )
byte_str = bytearray(byte_str) byte_str = bytearray(byte_str)
detector = UniversalDetector() detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str) detector.feed(byte_str)
detector.close() detector.close()
if detector.input_state == InputState.HIGH_BYTE: if detector.input_state == InputState.HIGH_BYTE:
results = [] results: List[ResultDict] = []
probers = [] probers: List[CharSetProber] = []
for prober in detector.charset_probers: for prober in detector.charset_probers:
if hasattr(prober, "probers"): if isinstance(prober, CharSetGroupProber):
probers.extend(p for p in prober.probers) probers.extend(p for p in prober.probers)
else: else:
probers.append(prober) probers.append(prober)
@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
charset_name = detector.ISO_WIN_MAP.get( charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name lower_charset_name, charset_name
) )
# Rename legacy encodings with superset encodings if asked
if should_rename_legacy:
charset_name = detector.LEGACY_MAP.get(
charset_name.lower(), charset_name
)
results.append( results.append(
{ {
"encoding": charset_name, "encoding": charset_name,

View File

@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL
class Big5Prober(MultiByteCharSetProber): class Big5Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis() self.distribution_analyzer = Big5DistributionAnalysis()
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "Big5" return "Big5"
@property @property
def language(self): def language(self) -> str:
return "Chinese" return "Chinese"

View File

@ -25,6 +25,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Tuple, Union
from .big5freq import ( from .big5freq import (
BIG5_CHAR_TO_FREQ_ORDER, BIG5_CHAR_TO_FREQ_ORDER,
BIG5_TABLE_SIZE, BIG5_TABLE_SIZE,
@ -59,22 +61,22 @@ class CharDistributionAnalysis:
SURE_NO = 0.01 SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3 MINIMUM_DATA_THRESHOLD = 3
def __init__(self): def __init__(self) -> None:
# Mapping table to get frequency order from char order (get from # Mapping table to get frequency order from char order (get from
# GetOrder()) # GetOrder())
self._char_to_freq_order = tuple() self._char_to_freq_order: Tuple[int, ...] = tuple()
self._table_size = None # Size of above table self._table_size = 0 # Size of above table
# This is a constant value which varies from language to language, # This is a constant value which varies from language to language,
# used in calculating confidence. See # used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail. # for further detail.
self.typical_distribution_ratio = None self.typical_distribution_ratio = 0.0
self._done = None self._done = False
self._total_chars = None self._total_chars = 0
self._freq_chars = None self._freq_chars = 0
self.reset() self.reset()
def reset(self): def reset(self) -> None:
"""reset analyser, clear any state""" """reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has # If this flag is set to True, detection is done and conclusion has
# been made # been made
@ -83,7 +85,7 @@ class CharDistributionAnalysis:
# The number of characters whose frequency order is less than 512 # The number of characters whose frequency order is less than 512
self._freq_chars = 0 self._freq_chars = 0
def feed(self, char, char_len): def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
"""feed a character with known length""" """feed a character with known length"""
if char_len == 2: if char_len == 2:
# we only care about 2-bytes character in our distribution analysis # we only care about 2-bytes character in our distribution analysis
@ -97,7 +99,7 @@ class CharDistributionAnalysis:
if 512 > self._char_to_freq_order[order]: if 512 > self._char_to_freq_order[order]:
self._freq_chars += 1 self._freq_chars += 1
def get_confidence(self): def get_confidence(self) -> float:
"""return confidence based on existing data""" """return confidence based on existing data"""
# if we didn't receive any character in our consideration range, # if we didn't receive any character in our consideration range,
# return negative answer # return negative answer
@ -114,12 +116,12 @@ class CharDistributionAnalysis:
# normalize confidence (we don't want to be 100% sure) # normalize confidence (we don't want to be 100% sure)
return self.SURE_YES return self.SURE_YES
def got_enough_data(self): def got_enough_data(self) -> bool:
# It is not necessary to receive all data to draw conclusion. # It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough # For charset detection, certain amount of data is enough
return self._total_chars > self.ENOUGH_DATA_THRESHOLD return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_order(self, _): def get_order(self, _: Union[bytes, bytearray]) -> int:
# We do not handle characters based on the original encoding string, # We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order. # but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency # This allows multiple encodings of a language to share one frequency
@ -128,13 +130,13 @@ class CharDistributionAnalysis:
class EUCTWDistributionAnalysis(CharDistributionAnalysis): class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
self._table_size = EUCTW_TABLE_SIZE self._table_size = EUCTW_TABLE_SIZE
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-TW encoding, we are interested # for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe # first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
class EUCKRDistributionAnalysis(CharDistributionAnalysis): class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-KR encoding, we are interested # for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
class JOHABDistributionAnalysis(CharDistributionAnalysis): class JOHABDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
first_char = byte_str[0] first_char = byte_str[0]
if 0x88 <= first_char < 0xD4: if 0x88 <= first_char < 0xD4:
code = first_char * 256 + byte_str[1] code = first_char * 256 + byte_str[1]
@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis):
class GB2312DistributionAnalysis(CharDistributionAnalysis): class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
self._table_size = GB2312_TABLE_SIZE self._table_size = GB2312_TABLE_SIZE
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for GB2312 encoding, we are interested # for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe # first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe
@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
class Big5DistributionAnalysis(CharDistributionAnalysis): class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
self._table_size = BIG5_TABLE_SIZE self._table_size = BIG5_TABLE_SIZE
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for big5 encoding, we are interested # for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe # first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
class SJISDistributionAnalysis(CharDistributionAnalysis): class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for sjis encoding, we are interested # for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
class EUCJPDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-JP encoding, we are interested # for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe # first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe # second byte range: 0xa1 -- 0xfe

View File

@ -25,29 +25,30 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import List, Optional, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import ProbingState from .enums import LanguageFilter, ProbingState
class CharSetGroupProber(CharSetProber): class CharSetGroupProber(CharSetProber):
def __init__(self, lang_filter=None): def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter) super().__init__(lang_filter=lang_filter)
self._active_num = 0 self._active_num = 0
self.probers = [] self.probers: List[CharSetProber] = []
self._best_guess_prober = None self._best_guess_prober: Optional[CharSetProber] = None
def reset(self): def reset(self) -> None:
super().reset() super().reset()
self._active_num = 0 self._active_num = 0
for prober in self.probers: for prober in self.probers:
if prober: prober.reset()
prober.reset() prober.active = True
prober.active = True self._active_num += 1
self._active_num += 1
self._best_guess_prober = None self._best_guess_prober = None
@property @property
def charset_name(self): def charset_name(self) -> Optional[str]:
if not self._best_guess_prober: if not self._best_guess_prober:
self.get_confidence() self.get_confidence()
if not self._best_guess_prober: if not self._best_guess_prober:
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
return self._best_guess_prober.charset_name return self._best_guess_prober.charset_name
@property @property
def language(self): def language(self) -> Optional[str]:
if not self._best_guess_prober: if not self._best_guess_prober:
self.get_confidence() self.get_confidence()
if not self._best_guess_prober: if not self._best_guess_prober:
return None return None
return self._best_guess_prober.language return self._best_guess_prober.language
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for prober in self.probers: for prober in self.probers:
if not prober:
continue
if not prober.active: if not prober.active:
continue continue
state = prober.feed(byte_str) state = prober.feed(byte_str)
@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber):
return self.state return self.state
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
state = self.state state = self.state
if state == ProbingState.FOUND_IT: if state == ProbingState.FOUND_IT:
return 0.99 return 0.99
@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber):
best_conf = 0.0 best_conf = 0.0
self._best_guess_prober = None self._best_guess_prober = None
for prober in self.probers: for prober in self.probers:
if not prober:
continue
if not prober.active: if not prober.active:
self.logger.debug("%s not active", prober.charset_name) self.logger.debug("%s not active", prober.charset_name)
continue continue

View File

@ -28,8 +28,9 @@
import logging import logging
import re import re
from typing import Optional, Union
from .enums import ProbingState from .enums import LanguageFilter, ProbingState
INTERNATIONAL_WORDS_PATTERN = re.compile( INTERNATIONAL_WORDS_PATTERN = re.compile(
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?" b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
@ -40,35 +41,40 @@ class CharSetProber:
SHORTCUT_THRESHOLD = 0.95 SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None): def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
self._state = None self._state = ProbingState.DETECTING
self.active = True
self.lang_filter = lang_filter self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
def reset(self): def reset(self) -> None:
self._state = ProbingState.DETECTING self._state = ProbingState.DETECTING
@property @property
def charset_name(self): def charset_name(self) -> Optional[str]:
return None return None
def feed(self, byte_str): @property
def language(self) -> Optional[str]:
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
raise NotImplementedError raise NotImplementedError
@property @property
def state(self): def state(self) -> ProbingState:
return self._state return self._state
def get_confidence(self): def get_confidence(self) -> float:
return 0.0 return 0.0
@staticmethod @staticmethod
def filter_high_byte_only(buf): def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
buf = re.sub(b"([\x00-\x7F])+", b" ", buf) buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
return buf return buf
@staticmethod @staticmethod
def filter_international_words(buf): def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
""" """
We define three types of bytes: We define three types of bytes:
alphabet: english alphabets [a-zA-Z] alphabet: english alphabets [a-zA-Z]
@ -102,7 +108,7 @@ class CharSetProber:
return filtered return filtered
@staticmethod @staticmethod
def remove_xml_tags(buf): def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
""" """
Returns a copy of ``buf`` that retains only the sequences of English Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters. alphabet and high byte characters that are not between <> characters.
@ -117,10 +123,13 @@ class CharSetProber:
for curr, buf_char in enumerate(buf): for curr, buf_char in enumerate(buf):
# Check if we're coming out of or entering an XML tag # Check if we're coming out of or entering an XML tag
if buf_char == b">":
# https://github.com/python/typeshed/issues/8182
if buf_char == b">": # type: ignore[comparison-overlap]
prev = curr + 1 prev = curr + 1
in_tag = False in_tag = False
elif buf_char == b"<": # https://github.com/python/typeshed/issues/8182
elif buf_char == b"<": # type: ignore[comparison-overlap]
if curr > prev and not in_tag: if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII, # Keep everything after last non-extended-ASCII,
# non-alphabetic character # non-alphabetic character

View File

@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin.
import argparse import argparse
import sys import sys
from typing import Iterable, List, Optional
from .. import __version__ from .. import __version__
from ..universaldetector import UniversalDetector from ..universaldetector import UniversalDetector
def description_of(lines, name="stdin"): def description_of(
lines: Iterable[bytes],
name: str = "stdin",
minimal: bool = False,
should_rename_legacy: bool = False,
) -> Optional[str]:
""" """
Return a string describing the probable encoding of a file or Return a string describing the probable encoding of a file or
list of strings. list of strings.
@ -29,8 +35,11 @@ def description_of(lines, name="stdin"):
:type lines: Iterable of bytes :type lines: Iterable of bytes
:param name: Name of file or collection of lines :param name: Name of file or collection of lines
:type name: str :type name: str
:param should_rename_legacy: Should we rename legacy encodings to
their more modern equivalents?
:type should_rename_legacy: ``bool``
""" """
u = UniversalDetector() u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines: for line in lines:
line = bytearray(line) line = bytearray(line)
u.feed(line) u.feed(line)
@ -39,12 +48,14 @@ def description_of(lines, name="stdin"):
break break
u.close() u.close()
result = u.result result = u.result
if minimal:
return result["encoding"]
if result["encoding"]: if result["encoding"]:
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}' return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
return f"{name}: no result" return f"{name}: no result"
def main(argv=None): def main(argv: Optional[List[str]] = None) -> None:
""" """
Handles command line arguments and gets things started. Handles command line arguments and gets things started.
@ -54,17 +65,28 @@ def main(argv=None):
""" """
# Get command line arguments # Get command line arguments
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \ description=(
encodings" "Takes one or more file paths and reports their detected encodings"
)
) )
parser.add_argument( parser.add_argument(
"input", "input",
help="File whose encoding we would like to determine. \ help="File whose encoding we would like to determine. (default: stdin)",
(default: stdin)",
type=argparse.FileType("rb"), type=argparse.FileType("rb"),
nargs="*", nargs="*",
default=[sys.stdin.buffer], default=[sys.stdin.buffer],
) )
parser.add_argument(
"--minimal",
help="Print only the encoding to standard output",
action="store_true",
)
parser.add_argument(
"-l",
"--legacy",
help="Rename legacy encodings to more modern ones.",
action="store_true",
)
parser.add_argument( parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}" "--version", action="version", version=f"%(prog)s {__version__}"
) )
@ -79,7 +101,11 @@ def main(argv=None):
"--help\n", "--help\n",
file=sys.stderr, file=sys.stderr,
) )
print(description_of(f, f.name)) print(
description_of(
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
)
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -27,6 +27,7 @@
import logging import logging
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState from .enums import MachineState
@ -53,18 +54,19 @@ class CodingStateMachine:
encoding from consideration from here on. encoding from consideration from here on.
""" """
def __init__(self, sm): def __init__(self, sm: CodingStateMachineDict) -> None:
self._model = sm self._model = sm
self._curr_byte_pos = 0 self._curr_byte_pos = 0
self._curr_char_len = 0 self._curr_char_len = 0
self._curr_state = None self._curr_state = MachineState.START
self.active = True
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self.reset() self.reset()
def reset(self): def reset(self) -> None:
self._curr_state = MachineState.START self._curr_state = MachineState.START
def next_state(self, c): def next_state(self, c: int) -> int:
# for each byte we get its class # for each byte we get its class
# if it is first byte, we also get byte length # if it is first byte, we also get byte length
byte_class = self._model["class_table"][c] byte_class = self._model["class_table"][c]
@ -77,12 +79,12 @@ class CodingStateMachine:
self._curr_byte_pos += 1 self._curr_byte_pos += 1
return self._curr_state return self._curr_state
def get_current_charlen(self): def get_current_charlen(self) -> int:
return self._curr_char_len return self._curr_char_len
def get_coding_state_machine(self): def get_coding_state_machine(self) -> str:
return self._model["name"] return self._model["name"]
@property @property
def language(self): def language(self) -> str:
return self._model["language"] return self._model["language"]

View File

@ -0,0 +1,19 @@
from typing import TYPE_CHECKING, Tuple
if TYPE_CHECKING:
# TypedDict was introduced in Python 3.8.
#
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
# for Python 3.7.
from typing import TypedDict
class CodingStateMachineDict(TypedDict, total=False):
class_table: Tuple[int, ...]
class_factor: int
state_table: Tuple[int, ...]
char_len_table: Tuple[int, ...]
name: str
language: str # Optional key
else:
CodingStateMachineDict = dict

View File

@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL
class CP949Prober(MultiByteCharSetProber): class CP949Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL) self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be # NOTE: CP949 is a superset of EUC-KR, so the distribution should be
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "CP949" return "CP949"
@property @property
def language(self): def language(self) -> str:
return "Korean" return "Korean"

View File

@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dan.blanchard@gmail.com) :author: Dan Blanchard (dan.blanchard@gmail.com)
""" """
from enum import Enum, Flag
class InputState: class InputState:
""" """
@ -15,12 +17,13 @@ class InputState:
HIGH_BYTE = 2 HIGH_BYTE = 2
class LanguageFilter: class LanguageFilter(Flag):
""" """
This enum represents the different language filters we can apply to a This enum represents the different language filters we can apply to a
``UniversalDetector``. ``UniversalDetector``.
""" """
NONE = 0x00
CHINESE_SIMPLIFIED = 0x01 CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02 CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04 JAPANESE = 0x04
@ -31,7 +34,7 @@ class LanguageFilter:
CJK = CHINESE | JAPANESE | KOREAN CJK = CHINESE | JAPANESE | KOREAN
class ProbingState: class ProbingState(Enum):
""" """
This enum represents the different states a prober can be in. This enum represents the different states a prober can be in.
""" """
@ -62,7 +65,7 @@ class SequenceLikelihood:
POSITIVE = 3 POSITIVE = 3
@classmethod @classmethod
def get_num_categories(cls): def get_num_categories(cls) -> int:
""":returns: The number of likelihood categories in the enum.""" """:returns: The number of likelihood categories in the enum."""
return 4 return 4

View File

@ -25,6 +25,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, MachineState, ProbingState from .enums import LanguageFilter, MachineState, ProbingState
@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber):
identify these encodings. identify these encodings.
""" """
def __init__(self, lang_filter=None): def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter) super().__init__(lang_filter=lang_filter)
self.coding_sm = [] self.coding_sm = []
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber):
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.KOREAN: if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None self.active_sm_count = 0
self._detected_charset = None self._detected_charset: Optional[str] = None
self._detected_language = None self._detected_language: Optional[str] = None
self._state = None self._state = ProbingState.DETECTING
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
for coding_sm in self.coding_sm: for coding_sm in self.coding_sm:
if not coding_sm:
continue
coding_sm.active = True coding_sm.active = True
coding_sm.reset() coding_sm.reset()
self.active_sm_count = len(self.coding_sm) self.active_sm_count = len(self.coding_sm)
@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber):
self._detected_language = None self._detected_language = None
@property @property
def charset_name(self): def charset_name(self) -> Optional[str]:
return self._detected_charset return self._detected_charset
@property @property
def language(self): def language(self) -> Optional[str]:
return self._detected_language return self._detected_language
def get_confidence(self): def get_confidence(self) -> float:
return 0.99 if self._detected_charset else 0.00 return 0.99 if self._detected_charset else 0.00
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str: for c in byte_str:
for coding_sm in self.coding_sm: for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active: if not coding_sm.active:
continue continue
coding_state = coding_sm.next_state(c) coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR: if coding_state == MachineState.ERROR:

View File

@ -25,6 +25,7 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState from .enums import MachineState
# fmt: off # fmt: off
@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
HZ_SM_MODEL = { HZ_SM_MODEL: CodingStateMachineDict = {
"class_table": HZ_CLS, "class_table": HZ_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": HZ_ST, "state_table": HZ_ST,
@ -134,7 +135,7 @@ ISO2022CN_ST = (
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CN_SM_MODEL = { ISO2022CN_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022CN_CLS, "class_table": ISO2022CN_CLS,
"class_factor": 9, "class_factor": 9,
"state_table": ISO2022CN_ST, "state_table": ISO2022CN_ST,
@ -194,7 +195,7 @@ ISO2022JP_ST = (
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JP_SM_MODEL = { ISO2022JP_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022JP_CLS, "class_table": ISO2022JP_CLS,
"class_factor": 10, "class_factor": 10,
"state_table": ISO2022JP_ST, "state_table": ISO2022JP_ST,
@ -250,7 +251,7 @@ ISO2022KR_ST = (
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
ISO2022KR_SM_MODEL = { ISO2022KR_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022KR_CLS, "class_table": ISO2022KR_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": ISO2022KR_ST, "state_table": ISO2022KR_ST,

View File

@ -25,6 +25,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Union
from .chardistribution import EUCJPDistributionAnalysis from .chardistribution import EUCJPDistributionAnalysis
from .codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState from .enums import MachineState, ProbingState
@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber): class EUCJPProber(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis() self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis() self.context_analyzer = EUCJPContextAnalysis()
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
self.context_analyzer.reset() self.context_analyzer.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "EUC-JP" return "EUC-JP"
@property @property
def language(self): def language(self) -> str:
return "Japanese" return "Japanese"
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str): for i, byte in enumerate(byte_str):
# PY3K: byte_str is a byte array, so byte is an int, not a byte # PY3K: byte_str is a byte array, so byte is an int, not a byte
coding_state = self.coding_sm.next_state(byte) coding_state = self.coding_sm.next_state(byte)
@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence() context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf) return max(context_conf, distrib_conf)

View File

@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL
class EUCKRProber(MultiByteCharSetProber): class EUCKRProber(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis() self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "EUC-KR" return "EUC-KR"
@property @property
def language(self): def language(self) -> str:
return "Korean" return "Korean"

View File

@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL
class EUCTWProber(MultiByteCharSetProber): class EUCTWProber(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis() self.distribution_analyzer = EUCTWDistributionAnalysis()
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "EUC-TW" return "EUC-TW"
@property @property
def language(self): def language(self) -> str:
return "Taiwan" return "Taiwan"

View File

@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL
class GB2312Prober(MultiByteCharSetProber): class GB2312Prober(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
self.distribution_analyzer = GB2312DistributionAnalysis() self.distribution_analyzer = GB2312DistributionAnalysis()
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "GB2312" return "GB2312"
@property @property
def language(self): def language(self) -> str:
return "Chinese" return "Chinese"

View File

@ -25,8 +25,11 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import ProbingState from .enums import ProbingState
from .sbcharsetprober import SingleByteCharSetProber
# This prober doesn't actually recognize a language or a charset. # This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers # It is a helper prober for the use of the Hebrew model probers
@ -127,6 +130,7 @@ from .enums import ProbingState
class HebrewProber(CharSetProber): class HebrewProber(CharSetProber):
SPACE = 0x20
# windows-1255 / ISO-8859-8 code points of interest # windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xEA FINAL_KAF = 0xEA
NORMAL_KAF = 0xEB NORMAL_KAF = 0xEB
@ -152,31 +156,35 @@ class HebrewProber(CharSetProber):
VISUAL_HEBREW_NAME = "ISO-8859-8" VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255" LOGICAL_HEBREW_NAME = "windows-1255"
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._final_char_logical_score = None self._final_char_logical_score = 0
self._final_char_visual_score = None self._final_char_visual_score = 0
self._prev = None self._prev = self.SPACE
self._before_prev = None self._before_prev = self.SPACE
self._logical_prober = None self._logical_prober: Optional[SingleByteCharSetProber] = None
self._visual_prober = None self._visual_prober: Optional[SingleByteCharSetProber] = None
self.reset() self.reset()
def reset(self): def reset(self) -> None:
self._final_char_logical_score = 0 self._final_char_logical_score = 0
self._final_char_visual_score = 0 self._final_char_visual_score = 0
# The two last characters seen in the previous buffer, # The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate # mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data # a word delimiter at the beginning of the data
self._prev = " " self._prev = self.SPACE
self._before_prev = " " self._before_prev = self.SPACE
# These probers are owned by the group prober. # These probers are owned by the group prober.
def set_model_probers(self, logical_prober, visual_prober): def set_model_probers(
self,
logical_prober: SingleByteCharSetProber,
visual_prober: SingleByteCharSetProber,
) -> None:
self._logical_prober = logical_prober self._logical_prober = logical_prober
self._visual_prober = visual_prober self._visual_prober = visual_prober
def is_final(self, c): def is_final(self, c: int) -> bool:
return c in [ return c in [
self.FINAL_KAF, self.FINAL_KAF,
self.FINAL_MEM, self.FINAL_MEM,
@ -185,7 +193,7 @@ class HebrewProber(CharSetProber):
self.FINAL_TSADI, self.FINAL_TSADI,
] ]
def is_non_final(self, c): def is_non_final(self, c: int) -> bool:
# The normal Tsadi is not a good Non-Final letter due to words like # The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters # apostrophe is converted to a space in FilterWithoutEnglishLetters
@ -198,7 +206,7 @@ class HebrewProber(CharSetProber):
# since these words are quite rare. # since these words are quite rare.
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE] return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# Final letter analysis for logical-visual decision. # Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew # Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew. # or visual Hebrew.
@ -232,9 +240,9 @@ class HebrewProber(CharSetProber):
byte_str = self.filter_high_byte_only(byte_str) byte_str = self.filter_high_byte_only(byte_str)
for cur in byte_str: for cur in byte_str:
if cur == " ": if cur == self.SPACE:
# We stand on a space - a word just ended # We stand on a space - a word just ended
if self._before_prev != " ": if self._before_prev != self.SPACE:
# next-to-last char was not a space so self._prev is not a # next-to-last char was not a space so self._prev is not a
# 1 letter word # 1 letter word
if self.is_final(self._prev): if self.is_final(self._prev):
@ -247,9 +255,9 @@ class HebrewProber(CharSetProber):
else: else:
# Not standing on a space # Not standing on a space
if ( if (
(self._before_prev == " ") (self._before_prev == self.SPACE)
and (self.is_final(self._prev)) and (self.is_final(self._prev))
and (cur != " ") and (cur != self.SPACE)
): ):
# case (3) [-2:space][-1:final letter][cur:not space] # case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1 self._final_char_visual_score += 1
@ -261,7 +269,10 @@ class HebrewProber(CharSetProber):
return ProbingState.DETECTING return ProbingState.DETECTING
@property @property
def charset_name(self): def charset_name(self) -> str:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Make the decision: is it Logical or Visual? # Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it. # If the final letter score distance is dominant enough, rely on it.
finalsub = self._final_char_logical_score - self._final_char_visual_score finalsub = self._final_char_logical_score - self._final_char_visual_score
@ -289,11 +300,14 @@ class HebrewProber(CharSetProber):
return self.LOGICAL_HEBREW_NAME return self.LOGICAL_HEBREW_NAME
@property @property
def language(self): def language(self) -> str:
return "Hebrew" return "Hebrew"
@property @property
def state(self): def state(self) -> ProbingState:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Remain active as long as any of the model probers are active. # Remain active as long as any of the model probers are active.
if (self._logical_prober.state == ProbingState.NOT_ME) and ( if (self._logical_prober.state == ProbingState.NOT_ME) and (
self._visual_prober.state == ProbingState.NOT_ME self._visual_prober.state == ProbingState.NOT_ME

View File

@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL
class JOHABProber(MultiByteCharSetProber): class JOHABProber(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL) self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
self.distribution_analyzer = JOHABDistributionAnalysis() self.distribution_analyzer = JOHABDistributionAnalysis()
self.reset() self.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "Johab" return "Johab"
@property @property
def language(self): def language(self) -> str:
return "Korean" return "Korean"

View File

@ -25,6 +25,7 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import List, Tuple, Union
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
# fmt: off # fmt: off
@ -123,15 +124,15 @@ class JapaneseContextAnalysis:
MAX_REL_THRESHOLD = 1000 MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4 MINIMUM_DATA_THRESHOLD = 4
def __init__(self): def __init__(self) -> None:
self._total_rel = None self._total_rel = 0
self._rel_sample = None self._rel_sample: List[int] = []
self._need_to_skip_char_num = None self._need_to_skip_char_num = 0
self._last_char_order = None self._last_char_order = -1
self._done = None self._done = False
self.reset() self.reset()
def reset(self): def reset(self) -> None:
self._total_rel = 0 # total sequence received self._total_rel = 0 # total sequence received
# category counters, each integer counts sequence in its category # category counters, each integer counts sequence in its category
self._rel_sample = [0] * self.NUM_OF_CATEGORY self._rel_sample = [0] * self.NUM_OF_CATEGORY
@ -143,7 +144,7 @@ class JapaneseContextAnalysis:
# been made # been made
self._done = False self._done = False
def feed(self, byte_str, num_bytes): def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
if self._done: if self._done:
return return
@ -172,29 +173,29 @@ class JapaneseContextAnalysis:
] += 1 ] += 1
self._last_char_order = order self._last_char_order = order
def got_enough_data(self): def got_enough_data(self) -> bool:
return self._total_rel > self.ENOUGH_REL_THRESHOLD return self._total_rel > self.ENOUGH_REL_THRESHOLD
def get_confidence(self): def get_confidence(self) -> float:
# This is just one way to calculate confidence. It works well for me. # This is just one way to calculate confidence. It works well for me.
if self._total_rel > self.MINIMUM_DATA_THRESHOLD: if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
return (self._total_rel - self._rel_sample[0]) / self._total_rel return (self._total_rel - self._rel_sample[0]) / self._total_rel
return self.DONT_KNOW return self.DONT_KNOW
def get_order(self, _): def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
return -1, 1 return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis): class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._charset_name = "SHIFT_JIS" self._charset_name = "SHIFT_JIS"
@property @property
def charset_name(self): def charset_name(self) -> str:
return self._charset_name return self._charset_name
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str: if not byte_str:
return -1, 1 return -1, 1
# find out current char's byte length # find out current char's byte length
@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
class EUCJPContextAnalysis(JapaneseContextAnalysis): class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, byte_str): def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str: if not byte_str:
return -1, 1 return -1, 1
# find out current char's byte length # find out current char's byte length

View File

@ -26,6 +26,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import ProbingState from .enums import ProbingState
@ -96,26 +98,26 @@ Latin1ClassModel = (
class Latin1Prober(CharSetProber): class Latin1Prober(CharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self._last_char_class = None self._last_char_class = OTH
self._freq_counter = None self._freq_counter: List[int] = []
self.reset() self.reset()
def reset(self): def reset(self) -> None:
self._last_char_class = OTH self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM self._freq_counter = [0] * FREQ_CAT_NUM
super().reset() super().reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return "ISO-8859-1" return "ISO-8859-1"
@property @property
def language(self): def language(self) -> str:
return "" return ""
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
byte_str = self.remove_xml_tags(byte_str) byte_str = self.remove_xml_tags(byte_str)
for c in byte_str: for c in byte_str:
char_class = Latin1_CharToClass[c] char_class = Latin1_CharToClass[c]
@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
if self.state == ProbingState.NOT_ME: if self.state == ProbingState.NOT_ME:
return 0.01 return 0.01

View File

@ -0,0 +1,162 @@
######################## BEGIN LICENSE BLOCK ########################
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Rob Speer - adapt to MacRoman encoding
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
# Number of frequency categories tallied by the prober (indices 0-3 of
# MacRomanProber._freq_counter).
FREQ_CAT_NUM = 4

# Character classes for single bytes under the MacRoman encoding.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.

# Maps each of the 256 byte values to one of the classes above; indexed
# directly by the byte (0x00-0xFF).
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Bigram frequency model: likelihood category of seeing class `col` after
# class `row`, flattened row-major (index = row * CLASS_NUM + col).
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0, 0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3, 1,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2, 1,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3, 1,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3, 1,  # ASO
    0, 1, 1, 1, 1, 1, 1, 1, 1,  # ODD
)
# fmt: on
class MacRomanProber(CharSetProber):
    """Heuristic prober for the Mac OS Roman single-byte encoding.

    Scores byte bigrams against ``MacRomanClassModel`` and reports a
    deliberately dampened confidence so that more precise detectors win
    whenever they also match.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome
        self._freq_counter[2] = 10
        super().reset()

    @property
    def charset_name(self) -> str:
        return "MacRoman"

    @property
    def language(self) -> str:
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input and update the bigram tallies.

        Hitting an "illegal" bigram (model category 0) immediately marks
        the prober NOT_ME; otherwise the category counter is bumped.
        """
        stripped = self.remove_xml_tags(byte_str)
        for code_point in stripped:
            cur_class = MacRoman_CharToClass[code_point]
            category = MacRomanClassModel[
                self._last_char_class * CLASS_NUM + cur_class
            ]
            if category == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[category] += 1
            self._last_char_class = cur_class
        return self.state

    def get_confidence(self) -> float:
        """Return a confidence in [0, ~0.73] based on the tallied bigrams."""
        if self.state == ProbingState.NOT_ME:
            return 0.01
        total = sum(self._freq_counter)
        if total < 0.01:
            confidence = 0.0
        else:
            confidence = (
                self._freq_counter[3] - self._freq_counter[1] * 20.0
            ) / total
        if confidence < 0.0:
            confidence = 0.0
        # lower the confidence of MacRoman so that other more accurate
        # detector can take priority.
        return confidence * 0.73

View File

@ -27,8 +27,12 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .chardistribution import CharDistributionAnalysis
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import MachineState, ProbingState from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, MachineState, ProbingState
class MultiByteCharSetProber(CharSetProber): class MultiByteCharSetProber(CharSetProber):
@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber):
MultiByteCharSetProber MultiByteCharSetProber
""" """
def __init__(self, lang_filter=None): def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter) super().__init__(lang_filter=lang_filter)
self.distribution_analyzer = None self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
self.coding_sm = None self.coding_sm: Optional[CodingStateMachine] = None
self._last_char = [0, 0] self._last_char = bytearray(b"\0\0")
def reset(self): def reset(self) -> None:
super().reset() super().reset()
if self.coding_sm: if self.coding_sm:
self.coding_sm.reset() self.coding_sm.reset()
if self.distribution_analyzer: if self.distribution_analyzer:
self.distribution_analyzer.reset() self.distribution_analyzer.reset()
self._last_char = [0, 0] self._last_char = bytearray(b"\0\0")
@property def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
def charset_name(self): assert self.coding_sm is not None
raise NotImplementedError assert self.distribution_analyzer is not None
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str):
for i, byte in enumerate(byte_str): for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte) coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR: if coding_state == MachineState.ERROR:
@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
return self.distribution_analyzer.get_confidence() return self.distribution_analyzer.get_confidence()

View File

@ -30,6 +30,7 @@
from .big5prober import Big5Prober from .big5prober import Big5Prober
from .charsetgroupprober import CharSetGroupProber from .charsetgroupprober import CharSetGroupProber
from .cp949prober import CP949Prober from .cp949prober import CP949Prober
from .enums import LanguageFilter
from .eucjpprober import EUCJPProber from .eucjpprober import EUCJPProber
from .euckrprober import EUCKRProber from .euckrprober import EUCKRProber
from .euctwprober import EUCTWProber from .euctwprober import EUCTWProber
@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober
class MBCSGroupProber(CharSetGroupProber): class MBCSGroupProber(CharSetGroupProber):
def __init__(self, lang_filter=None): def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter) super().__init__(lang_filter=lang_filter)
self.probers = [ self.probers = [
UTF8Prober(), UTF8Prober(),

View File

@ -25,6 +25,7 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState from .enums import MachineState
# BIG5 # BIG5
@ -74,7 +75,7 @@ BIG5_ST = (
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
BIG5_SM_MODEL = { BIG5_SM_MODEL: CodingStateMachineDict = {
"class_table": BIG5_CLS, "class_table": BIG5_CLS,
"class_factor": 5, "class_factor": 5,
"state_table": BIG5_ST, "state_table": BIG5_ST,
@ -117,7 +118,7 @@ CP949_ST = (
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949_SM_MODEL = { CP949_SM_MODEL: CodingStateMachineDict = {
"class_table": CP949_CLS, "class_table": CP949_CLS,
"class_factor": 10, "class_factor": 10,
"state_table": CP949_ST, "state_table": CP949_ST,
@ -173,7 +174,7 @@ EUCJP_ST = (
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
EUCJP_SM_MODEL = { EUCJP_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCJP_CLS, "class_table": EUCJP_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": EUCJP_ST, "state_table": EUCJP_ST,
@ -226,7 +227,7 @@ EUCKR_ST = (
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
EUCKR_SM_MODEL = { EUCKR_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCKR_CLS, "class_table": EUCKR_CLS,
"class_factor": 4, "class_factor": 4,
"state_table": EUCKR_ST, "state_table": EUCKR_ST,
@ -283,7 +284,7 @@ JOHAB_ST = (
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2) JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
JOHAB_SM_MODEL = { JOHAB_SM_MODEL: CodingStateMachineDict = {
"class_table": JOHAB_CLS, "class_table": JOHAB_CLS,
"class_factor": 10, "class_factor": 10,
"state_table": JOHAB_ST, "state_table": JOHAB_ST,
@ -340,7 +341,7 @@ EUCTW_ST = (
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
EUCTW_SM_MODEL = { EUCTW_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCTW_CLS, "class_table": EUCTW_CLS,
"class_factor": 7, "class_factor": 7,
"state_table": EUCTW_ST, "state_table": EUCTW_ST,
@ -402,7 +403,7 @@ GB2312_ST = (
# 2 here. # 2 here.
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2) GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
GB2312_SM_MODEL = { GB2312_SM_MODEL: CodingStateMachineDict = {
"class_table": GB2312_CLS, "class_table": GB2312_CLS,
"class_factor": 7, "class_factor": 7,
"state_table": GB2312_ST, "state_table": GB2312_ST,
@ -458,7 +459,7 @@ SJIS_ST = (
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
SJIS_SM_MODEL = { SJIS_SM_MODEL: CodingStateMachineDict = {
"class_table": SJIS_CLS, "class_table": SJIS_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": SJIS_ST, "state_table": SJIS_ST,
@ -516,7 +517,7 @@ UCS2BE_ST = (
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
UCS2BE_SM_MODEL = { UCS2BE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2BE_CLS, "class_table": UCS2BE_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": UCS2BE_ST, "state_table": UCS2BE_ST,
@ -574,7 +575,7 @@ UCS2LE_ST = (
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
UCS2LE_SM_MODEL = { UCS2LE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2LE_CLS, "class_table": UCS2LE_CLS,
"class_factor": 6, "class_factor": 6,
"state_table": UCS2LE_ST, "state_table": UCS2LE_ST,
@ -651,7 +652,7 @@ UTF8_ST = (
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
UTF8_SM_MODEL = { UTF8_SM_MODEL: CodingStateMachineDict = {
"class_table": UTF8_CLS, "class_table": UTF8_CLS,
"class_factor": 16, "class_factor": 16,
"state_table": UTF8_ST, "state_table": UTF8_ST,

View File

@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project.
""" """
from string import ascii_letters from string import ascii_letters
from typing import List, Optional
# TODO: Add Ukrainian (KOI8-U) # TODO: Add Ukrainian (KOI8-U)
@ -33,13 +34,13 @@ class Language:
def __init__( def __init__(
self, self,
name=None, name: Optional[str] = None,
iso_code=None, iso_code: Optional[str] = None,
use_ascii=True, use_ascii: bool = True,
charsets=None, charsets: Optional[List[str]] = None,
alphabet=None, alphabet: Optional[str] = None,
wiki_start_pages=None, wiki_start_pages: Optional[List[str]] = None,
): ) -> None:
super().__init__() super().__init__()
self.name = name self.name = name
self.iso_code = iso_code self.iso_code = iso_code
@ -55,7 +56,7 @@ class Language:
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
self.wiki_start_pages = wiki_start_pages self.wiki_start_pages = wiki_start_pages
def __repr__(self): def __repr__(self) -> str:
param_str = ", ".join( param_str = ", ".join(
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
) )
@ -103,7 +104,7 @@ LANGUAGES = {
name="Danish", name="Danish",
iso_code="da", iso_code="da",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="æøåÆØÅ", alphabet="æøåÆØÅ",
wiki_start_pages=["Forside"], wiki_start_pages=["Forside"],
), ),
@ -111,8 +112,8 @@ LANGUAGES = {
name="German", name="German",
iso_code="de", iso_code="de",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="äöüßÄÖÜ", alphabet="äöüßÄÖÜ",
wiki_start_pages=["Wikipedia:Hauptseite"], wiki_start_pages=["Wikipedia:Hauptseite"],
), ),
"Greek": Language( "Greek": Language(
@ -127,7 +128,7 @@ LANGUAGES = {
name="English", name="English",
iso_code="en", iso_code="en",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"], charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Main_Page"], wiki_start_pages=["Main_Page"],
), ),
"Esperanto": Language( "Esperanto": Language(
@ -143,7 +144,7 @@ LANGUAGES = {
name="Spanish", name="Spanish",
iso_code="es", iso_code="es",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ñáéíóúüÑÁÉÍÓÚÜ", alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
wiki_start_pages=["Wikipedia:Portada"], wiki_start_pages=["Wikipedia:Portada"],
), ),
@ -161,7 +162,7 @@ LANGUAGES = {
name="Finnish", name="Finnish",
iso_code="fi", iso_code="fi",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÅÄÖŠŽåäöšž", alphabet="ÅÄÖŠŽåäöšž",
wiki_start_pages=["Wikipedia:Etusivu"], wiki_start_pages=["Wikipedia:Etusivu"],
), ),
@ -169,7 +170,7 @@ LANGUAGES = {
name="French", name="French",
iso_code="fr", iso_code="fr",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ", alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"], wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
), ),
@ -203,7 +204,7 @@ LANGUAGES = {
name="Italian", name="Italian",
iso_code="it", iso_code="it",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÀÈÉÌÒÓÙàèéìòóù", alphabet="ÀÈÉÌÒÓÙàèéìòóù",
wiki_start_pages=["Pagina_principale"], wiki_start_pages=["Pagina_principale"],
), ),
@ -237,7 +238,7 @@ LANGUAGES = {
name="Dutch", name="Dutch",
iso_code="nl", iso_code="nl",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"], charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Hoofdpagina"], wiki_start_pages=["Hoofdpagina"],
), ),
"Polish": Language( "Polish": Language(
@ -253,7 +254,7 @@ LANGUAGES = {
name="Portuguese", name="Portuguese",
iso_code="pt", iso_code="pt",
use_ascii=True, use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú", alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
wiki_start_pages=["Wikipédia:Página_principal"], wiki_start_pages=["Wikipédia:Página_principal"],
), ),

View File

View File

@ -0,0 +1,16 @@
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
# TypedDict was introduced in Python 3.8.
#
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
# for Python 3.7.
from typing import TypedDict
class ResultDict(TypedDict):
encoding: Optional[str]
confidence: float
language: Optional[str]
else:
ResultDict = dict

View File

@ -26,23 +26,20 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from collections import namedtuple from typing import Dict, List, NamedTuple, Optional, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood from .enums import CharacterCategory, ProbingState, SequenceLikelihood
SingleByteCharSetModel = namedtuple(
"SingleByteCharSetModel", class SingleByteCharSetModel(NamedTuple):
[ charset_name: str
"charset_name", language: str
"language", char_to_order_map: Dict[int, int]
"char_to_order_map", language_model: Dict[int, Dict[int, int]]
"language_model", typical_positive_ratio: float
"typical_positive_ratio", keep_ascii_letters: bool
"keep_ascii_letters", alphabet: str
"alphabet",
],
)
class SingleByteCharSetProber(CharSetProber): class SingleByteCharSetProber(CharSetProber):
@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber):
POSITIVE_SHORTCUT_THRESHOLD = 0.95 POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05 NEGATIVE_SHORTCUT_THRESHOLD = 0.05
def __init__(self, model, is_reversed=False, name_prober=None): def __init__(
self,
model: SingleByteCharSetModel,
is_reversed: bool = False,
name_prober: Optional[CharSetProber] = None,
) -> None:
super().__init__() super().__init__()
self._model = model self._model = model
# TRUE if we need to reverse every pair in the model lookup # TRUE if we need to reverse every pair in the model lookup
self._reversed = is_reversed self._reversed = is_reversed
# Optional auxiliary prober for name decision # Optional auxiliary prober for name decision
self._name_prober = name_prober self._name_prober = name_prober
self._last_order = None self._last_order = 255
self._seq_counters = None self._seq_counters: List[int] = []
self._total_seqs = None self._total_seqs = 0
self._total_char = None self._total_char = 0
self._control_char = None self._control_char = 0
self._freq_char = None self._freq_char = 0
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
# char order of last character # char order of last character
self._last_order = 255 self._last_order = 255
@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber):
self._freq_char = 0 self._freq_char = 0
@property @property
def charset_name(self): def charset_name(self) -> Optional[str]:
if self._name_prober: if self._name_prober:
return self._name_prober.charset_name return self._name_prober.charset_name
return self._model.charset_name return self._model.charset_name
@property @property
def language(self): def language(self) -> Optional[str]:
if self._name_prober: if self._name_prober:
return self._name_prober.language return self._name_prober.language
return self._model.language return self._model.language
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# TODO: Make filter_international_words keep things in self.alphabet # TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters: if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str) byte_str = self.filter_international_words(byte_str)
@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
r = 0.01 r = 0.01
if self._total_seqs > 0: if self._total_seqs > 0:
r = ( r = (

View File

@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber
class SBCSGroupProber(CharSetGroupProber): class SBCSGroupProber(CharSetGroupProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
hebrew_prober = HebrewProber() hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber( logical_hebrew_prober = SingleByteCharSetProber(

View File

@ -25,6 +25,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Union
from .chardistribution import SJISDistributionAnalysis from .chardistribution import SJISDistributionAnalysis
from .codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState from .enums import MachineState, ProbingState
@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL
class SJISProber(MultiByteCharSetProber): class SJISProber(MultiByteCharSetProber):
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
self.distribution_analyzer = SJISDistributionAnalysis() self.distribution_analyzer = SJISDistributionAnalysis()
self.context_analyzer = SJISContextAnalysis() self.context_analyzer = SJISContextAnalysis()
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
self.context_analyzer.reset() self.context_analyzer.reset()
@property @property
def charset_name(self): def charset_name(self) -> str:
return self.context_analyzer.charset_name return self.context_analyzer.charset_name
@property @property
def language(self): def language(self) -> str:
return "Japanese" return "Japanese"
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str): for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte) coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR: if coding_state == MachineState.ERROR:
@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence() context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf) return max(context_conf, distrib_conf)

View File

@ -39,12 +39,16 @@ class a user of ``chardet`` should use.
import codecs import codecs
import logging import logging
import re import re
from typing import List, Optional, Union
from .charsetgroupprober import CharSetGroupProber from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober from .utf1632prober import UTF1632Prober
@ -80,34 +84,55 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254", "iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257", "iso-8859-13": "Windows-1257",
} }
# Based on https://encoding.spec.whatwg.org/#names-and-labels
# but altered to match Python names for encodings and remove mappings
# that break tests.
LEGACY_MAP = {
"ascii": "Windows-1252",
"iso-8859-1": "Windows-1252",
"tis-620": "ISO-8859-11",
"iso-8859-9": "Windows-1254",
"gb2312": "GB18030",
"euc-kr": "CP949",
"utf-16le": "UTF-16",
}
def __init__(self, lang_filter=LanguageFilter.ALL): def __init__(
self._esc_charset_prober = None self,
self._utf1632_prober = None lang_filter: LanguageFilter = LanguageFilter.ALL,
self._charset_probers = [] should_rename_legacy: bool = False,
self.result = None ) -> None:
self.done = None self._esc_charset_prober: Optional[EscCharSetProber] = None
self._got_data = None self._utf1632_prober: Optional[UTF1632Prober] = None
self._input_state = None self._charset_probers: List[CharSetProber] = []
self._last_char = None self.result: ResultDict = {
"encoding": None,
"confidence": 0.0,
"language": None,
}
self.done = False
self._got_data = False
self._input_state = InputState.PURE_ASCII
self._last_char = b""
self.lang_filter = lang_filter self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
self._has_win_bytes = None self._has_win_bytes = False
self.should_rename_legacy = should_rename_legacy
self.reset() self.reset()
@property @property
def input_state(self): def input_state(self) -> int:
return self._input_state return self._input_state
@property @property
def has_win_bytes(self): def has_win_bytes(self) -> bool:
return self._has_win_bytes return self._has_win_bytes
@property @property
def charset_probers(self): def charset_probers(self) -> List[CharSetProber]:
return self._charset_probers return self._charset_probers
def reset(self): def reset(self) -> None:
""" """
Reset the UniversalDetector and all of its probers back to their Reset the UniversalDetector and all of its probers back to their
initial states. This is called by ``__init__``, so you only need to initial states. This is called by ``__init__``, so you only need to
@ -126,7 +151,7 @@ class UniversalDetector:
for prober in self._charset_probers: for prober in self._charset_probers:
prober.reset() prober.reset()
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> None:
""" """
Takes a chunk of a document and feeds it through all of the relevant Takes a chunk of a document and feeds it through all of the relevant
charset probers. charset probers.
@ -166,6 +191,7 @@ class UniversalDetector:
elif byte_str.startswith(b"\xFE\xFF\x00\x00"): elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412) # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = { self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-3412", "encoding": "X-ISO-10646-UCS-4-3412",
"confidence": 1.0, "confidence": 1.0,
"language": "", "language": "",
@ -173,6 +199,7 @@ class UniversalDetector:
elif byte_str.startswith(b"\x00\x00\xFF\xFE"): elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143) # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = { self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-2143", "encoding": "X-ISO-10646-UCS-4-2143",
"confidence": 1.0, "confidence": 1.0,
"language": "", "language": "",
@ -242,6 +269,7 @@ class UniversalDetector:
if self.lang_filter & LanguageFilter.NON_CJK: if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober()) self._charset_probers.append(Latin1Prober())
self._charset_probers.append(MacRomanProber())
for prober in self._charset_probers: for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.FOUND_IT: if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = { self.result = {
@ -254,7 +282,7 @@ class UniversalDetector:
if self.WIN_BYTE_DETECTOR.search(byte_str): if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True self._has_win_bytes = True
def close(self): def close(self) -> ResultDict:
""" """
Stop analyzing the current document and come up with a final Stop analyzing the current document and come up with a final
prediction. prediction.
@ -288,7 +316,8 @@ class UniversalDetector:
max_prober = prober max_prober = prober
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
charset_name = max_prober.charset_name charset_name = max_prober.charset_name
lower_charset_name = max_prober.charset_name.lower() assert charset_name is not None
lower_charset_name = charset_name.lower()
confidence = max_prober.get_confidence() confidence = max_prober.get_confidence()
# Use Windows encoding name instead of ISO-8859 if we saw any # Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes # extra Windows-specific bytes
@ -297,6 +326,11 @@ class UniversalDetector:
charset_name = self.ISO_WIN_MAP.get( charset_name = self.ISO_WIN_MAP.get(
lower_charset_name, charset_name lower_charset_name, charset_name
) )
# Rename legacy encodings with superset encodings if asked
if self.should_rename_legacy:
charset_name = self.LEGACY_MAP.get(
(charset_name or "").lower(), charset_name
)
self.result = { self.result = {
"encoding": charset_name, "encoding": charset_name,
"confidence": confidence, "confidence": confidence,

View File

@ -18,6 +18,8 @@
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .enums import ProbingState from .enums import ProbingState
@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber):
# a fixed constant ratio of expected zeros or non-zeros in modulo-position. # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
EXPECTED_RATIO = 0.94 EXPECTED_RATIO = 0.94
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.position = 0 self.position = 0
self.zeros_at_mod = [0] * 4 self.zeros_at_mod = [0] * 4
@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber):
self.first_half_surrogate_pair_detected_16le = False self.first_half_surrogate_pair_detected_16le = False
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
self.position = 0 self.position = 0
self.zeros_at_mod = [0] * 4 self.zeros_at_mod = [0] * 4
@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber):
self.quad = [0, 0, 0, 0] self.quad = [0, 0, 0, 0]
@property @property
def charset_name(self): def charset_name(self) -> str:
if self.is_likely_utf32be(): if self.is_likely_utf32be():
return "utf-32be" return "utf-32be"
if self.is_likely_utf32le(): if self.is_likely_utf32le():
@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber):
return "utf-16" return "utf-16"
@property @property
def language(self): def language(self) -> str:
return "" return ""
def approx_32bit_chars(self): def approx_32bit_chars(self) -> float:
return max(1.0, self.position / 4.0) return max(1.0, self.position / 4.0)
def approx_16bit_chars(self): def approx_16bit_chars(self) -> float:
return max(1.0, self.position / 2.0) return max(1.0, self.position / 2.0)
def is_likely_utf32be(self): def is_likely_utf32be(self) -> bool:
approx_chars = self.approx_32bit_chars() approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf32be and not self.invalid_utf32be
) )
def is_likely_utf32le(self): def is_likely_utf32le(self) -> bool:
approx_chars = self.approx_32bit_chars() approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf32le and not self.invalid_utf32le
) )
def is_likely_utf16be(self): def is_likely_utf16be(self) -> bool:
approx_chars = self.approx_16bit_chars() approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf16be and not self.invalid_utf16be
) )
def is_likely_utf16le(self): def is_likely_utf16le(self) -> bool:
approx_chars = self.approx_16bit_chars() approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf16le and not self.invalid_utf16le
) )
def validate_utf32_characters(self, quad): def validate_utf32_characters(self, quad: List[int]) -> None:
""" """
Validate if the quad of bytes is valid UTF-32. Validate if the quad of bytes is valid UTF-32.
@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber):
): ):
self.invalid_utf32le = True self.invalid_utf32le = True
def validate_utf16_characters(self, pair): def validate_utf16_characters(self, pair: List[int]) -> None:
""" """
Validate if the pair of bytes is valid UTF-16. Validate if the pair of bytes is valid UTF-16.
@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber):
else: else:
self.invalid_utf16le = True self.invalid_utf16le = True
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str: for c in byte_str:
mod4 = self.position % 4 mod4 = self.position % 4
self.quad[mod4] = c self.quad[mod4] = c
@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber):
return self.state return self.state
@property @property
def state(self): def state(self) -> ProbingState:
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}: if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
# terminal, decided states # terminal, decided states
return self._state return self._state
@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber):
self._state = ProbingState.NOT_ME self._state = ProbingState.NOT_ME
return self._state return self._state
def get_confidence(self): def get_confidence(self) -> float:
return ( return (
0.85 0.85
if ( if (

View File

@ -25,6 +25,8 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
from typing import Union
from .charsetprober import CharSetProber from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState from .enums import MachineState, ProbingState
@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL
class UTF8Prober(CharSetProber): class UTF8Prober(CharSetProber):
ONE_CHAR_PROB = 0.5 ONE_CHAR_PROB = 0.5
def __init__(self): def __init__(self) -> None:
super().__init__() super().__init__()
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
self._num_mb_chars = None self._num_mb_chars = 0
self.reset() self.reset()
def reset(self): def reset(self) -> None:
super().reset() super().reset()
self.coding_sm.reset() self.coding_sm.reset()
self._num_mb_chars = 0 self._num_mb_chars = 0
@property @property
def charset_name(self): def charset_name(self) -> str:
return "utf-8" return "utf-8"
@property @property
def language(self): def language(self) -> str:
return "" return ""
def feed(self, byte_str): def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str: for c in byte_str:
coding_state = self.coding_sm.next_state(c) coding_state = self.coding_sm.next_state(c)
if coding_state == MachineState.ERROR: if coding_state == MachineState.ERROR:
@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber):
return self.state return self.state
def get_confidence(self): def get_confidence(self) -> float:
unlike = 0.99 unlike = 0.99
if self._num_mb_chars < 6: if self._num_mb_chars < 6:
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars unlike *= self.ONE_CHAR_PROB**self._num_mb_chars

View File

@ -1,9 +1,9 @@
""" """
This module exists only to simplify retrieving the version number of chardet This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages. from within setuptools and from chardet subpackages.
:author: Dan Blanchard (dan.blanchard@gmail.com) :author: Dan Blanchard (dan.blanchard@gmail.com)
""" """
__version__ = "5.0.0" __version__ = "5.1.0"
VERSION = __version__.split(".") VERSION = __version__.split(".")

View File

@ -9,7 +9,7 @@ pyparsing==3.0.9
pyproject-hooks==1.0.0 pyproject-hooks==1.0.0
requests==2.28.2 requests==2.28.2
certifi==2022.12.7 certifi==2022.12.7
chardet==5.0.0 chardet==5.1.0
idna==3.4 idna==3.4
urllib3==1.26.12 urllib3==1.26.12
rich==12.6.0 rich==12.6.0