Upgrade chardet to 5.1.0

Pradyun Gedam 2023-01-28 20:41:43 +00:00
parent 1c110bede6
commit be20a75c10
No known key found for this signature in database
GPG Key ID: FF99710C4332258E
37 changed files with 620 additions and 287 deletions

news/chardet.vendor.rst Normal file
View File

@ -0,0 +1 @@
Upgrade chardet to 5.1.0

View File

@ -1 +0,0 @@
from chardet import *

View File

@ -15,19 +15,29 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState
from .resultdict import ResultDict
from .universaldetector import UniversalDetector
from .version import VERSION, __version__
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
def detect(byte_str):
def detect(
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
) -> ResultDict:
"""
Detect the encoding of the given byte string.
:param byte_str: The byte sequence to examine.
:type byte_str: ``bytes`` or ``bytearray``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
@ -35,12 +45,16 @@ def detect(byte_str):
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
return detector.close()
def detect_all(byte_str, ignore_threshold=False):
def detect_all(
byte_str: Union[bytes, bytearray],
ignore_threshold: bool = False,
should_rename_legacy: bool = False,
) -> List[ResultDict]:
"""
Detect all the possible encodings of the given byte string.
@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
``UniversalDetector.MINIMUM_THRESHOLD``
in results.
:type ignore_threshold: ``bool``
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
)
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
detector.feed(byte_str)
detector.close()
if detector.input_state == InputState.HIGH_BYTE:
results = []
probers = []
results: List[ResultDict] = []
probers: List[CharSetProber] = []
for prober in detector.charset_probers:
if hasattr(prober, "probers"):
if isinstance(prober, CharSetGroupProber):
probers.extend(p for p in prober.probers)
else:
probers.append(prober)
@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if should_rename_legacy:
charset_name = detector.LEGACY_MAP.get(
charset_name.lower(), charset_name
)
results.append(
{
"encoding": charset_name,

View File

@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
self.distribution_analyzer = Big5DistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "Big5"
@property
def language(self):
def language(self) -> str:
return "Chinese"

View File

@ -25,6 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Tuple, Union
from .big5freq import (
BIG5_CHAR_TO_FREQ_ORDER,
BIG5_TABLE_SIZE,
@ -59,22 +61,22 @@ class CharDistributionAnalysis:
SURE_NO = 0.01
MINIMUM_DATA_THRESHOLD = 3
def __init__(self):
def __init__(self) -> None:
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._char_to_freq_order = tuple()
self._table_size = None # Size of above table
self._char_to_freq_order: Tuple[int, ...] = tuple()
self._table_size = 0 # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self.typical_distribution_ratio = None
self._done = None
self._total_chars = None
self._freq_chars = None
self.typical_distribution_ratio = 0.0
self._done = False
self._total_chars = 0
self._freq_chars = 0
self.reset()
def reset(self):
def reset(self) -> None:
"""reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has
# been made
@ -83,7 +85,7 @@ class CharDistributionAnalysis:
# The number of characters whose frequency order is less than 512
self._freq_chars = 0
def feed(self, char, char_len):
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
"""feed a character with known length"""
if char_len == 2:
# we only care about 2-bytes character in our distribution analysis
@ -97,7 +99,7 @@ class CharDistributionAnalysis:
if 512 > self._char_to_freq_order[order]:
self._freq_chars += 1
def get_confidence(self):
def get_confidence(self) -> float:
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
@ -114,12 +116,12 @@ class CharDistributionAnalysis:
# normalize confidence (we don't want to be 100% sure)
return self.SURE_YES
def got_enough_data(self):
def got_enough_data(self) -> bool:
# It is not necessary to receive all data to draw conclusion.
# For charset detection, a certain amount of data is enough
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
def get_order(self, _):
def get_order(self, _: Union[bytes, bytearray]) -> int:
# We do not handle characters based on the original encoding string,
# but convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency
@ -128,13 +130,13 @@ class CharDistributionAnalysis:
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
self._table_size = EUCTW_TABLE_SIZE
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-TW encoding, we are interested
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-KR encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
class JOHABDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
self._table_size = EUCKR_TABLE_SIZE
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
first_char = byte_str[0]
if 0x88 <= first_char < 0xD4:
code = first_char * 256 + byte_str[1]
@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis):
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
self._table_size = GB2312_TABLE_SIZE
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for GB2312 encoding, we are interested
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
self._table_size = BIG5_TABLE_SIZE
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for big5 encoding, we are interested
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for sjis encoding, we are interested
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
self._table_size = JIS_TABLE_SIZE
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
# for euc-JP encoding, we are interested
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe

View File

@ -25,29 +25,30 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Optional, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
from .enums import LanguageFilter, ProbingState
class CharSetGroupProber(CharSetProber):
def __init__(self, lang_filter=None):
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self._active_num = 0
self.probers = []
self._best_guess_prober = None
self.probers: List[CharSetProber] = []
self._best_guess_prober: Optional[CharSetProber] = None
def reset(self):
def reset(self) -> None:
super().reset()
self._active_num = 0
for prober in self.probers:
if prober:
prober.reset()
prober.active = True
self._active_num += 1
prober.reset()
prober.active = True
self._active_num += 1
self._best_guess_prober = None
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
return self._best_guess_prober.charset_name
@property
def language(self):
def language(self) -> Optional[str]:
if not self._best_guess_prober:
self.get_confidence()
if not self._best_guess_prober:
return None
return self._best_guess_prober.language
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for prober in self.probers:
if not prober:
continue
if not prober.active:
continue
state = prober.feed(byte_str)
@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber):
return self.state
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
state = self.state
if state == ProbingState.FOUND_IT:
return 0.99
@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber):
best_conf = 0.0
self._best_guess_prober = None
for prober in self.probers:
if not prober:
continue
if not prober.active:
self.logger.debug("%s not active", prober.charset_name)
continue

View File

@ -28,8 +28,9 @@
import logging
import re
from typing import Optional, Union
from .enums import ProbingState
from .enums import LanguageFilter, ProbingState
INTERNATIONAL_WORDS_PATTERN = re.compile(
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
@ -40,35 +41,40 @@ class CharSetProber:
SHORTCUT_THRESHOLD = 0.95
def __init__(self, lang_filter=None):
self._state = None
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
self._state = ProbingState.DETECTING
self.active = True
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
def reset(self):
def reset(self) -> None:
self._state = ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
return None
def feed(self, byte_str):
@property
def language(self) -> Optional[str]:
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
raise NotImplementedError
@property
def state(self):
def state(self) -> ProbingState:
return self._state
def get_confidence(self):
def get_confidence(self) -> float:
return 0.0
@staticmethod
def filter_high_byte_only(buf):
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
return buf
@staticmethod
def filter_international_words(buf):
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
"""
We define three types of bytes:
alphabet: English letters [a-zA-Z]
@ -102,7 +108,7 @@ class CharSetProber:
return filtered
@staticmethod
def remove_xml_tags(buf):
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
@ -117,10 +123,13 @@ class CharSetProber:
for curr, buf_char in enumerate(buf):
# Check if we're coming out of or entering an XML tag
if buf_char == b">":
# https://github.com/python/typeshed/issues/8182
if buf_char == b">": # type: ignore[comparison-overlap]
prev = curr + 1
in_tag = False
elif buf_char == b"<":
# https://github.com/python/typeshed/issues/8182
elif buf_char == b"<": # type: ignore[comparison-overlap]
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character

View File

@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin.
import argparse
import sys
from typing import Iterable, List, Optional
from .. import __version__
from ..universaldetector import UniversalDetector
def description_of(lines, name="stdin"):
def description_of(
lines: Iterable[bytes],
name: str = "stdin",
minimal: bool = False,
should_rename_legacy: bool = False,
) -> Optional[str]:
"""
Return a string describing the probable encoding of a file or
list of strings.
@ -29,8 +35,11 @@ def description_of(lines, name="stdin"):
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
:param should_rename_legacy: Should we rename legacy encodings to
their more modern equivalents?
:type should_rename_legacy: ``bool``
"""
u = UniversalDetector()
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
for line in lines:
line = bytearray(line)
u.feed(line)
@ -39,12 +48,14 @@ def description_of(lines, name="stdin"):
break
u.close()
result = u.result
if minimal:
return result["encoding"]
if result["encoding"]:
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
return f"{name}: no result"
def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
"""
Handles command line arguments and gets things started.
@ -54,17 +65,28 @@ def main(argv=None):
"""
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings"
description=(
"Takes one or more file paths and reports their detected encodings"
)
)
parser.add_argument(
"input",
help="File whose encoding we would like to determine. \
(default: stdin)",
help="File whose encoding we would like to determine. (default: stdin)",
type=argparse.FileType("rb"),
nargs="*",
default=[sys.stdin.buffer],
)
parser.add_argument(
"--minimal",
help="Print only the encoding to standard output",
action="store_true",
)
parser.add_argument(
"-l",
"--legacy",
help="Rename legacy encodings to more modern ones.",
action="store_true",
)
parser.add_argument(
"--version", action="version", version=f"%(prog)s {__version__}"
)
@ -79,7 +101,11 @@ def main(argv=None):
"--help\n",
file=sys.stderr,
)
print(description_of(f, f.name))
print(
description_of(
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
)
)
if __name__ == "__main__":
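
A short sketch of how the two new options surface in practice, driving the module directly instead of the console script (the input file name is hypothetical):

from chardet.cli.chardetect import description_of, main

# Programmatic equivalent of `chardetect --minimal --legacy FILE`:
with open("some_file.txt", "rb") as f:  # hypothetical input file
    print(description_of(f, f.name, minimal=True, should_rename_legacy=True))

# Or hand an argv list straight to main(); it falls back to sys.argv
# when called with no arguments.
main(["--minimal", "--legacy", "some_file.txt"])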

View File

@ -27,6 +27,7 @@
import logging
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
@ -53,18 +54,19 @@ class CodingStateMachine:
encoding from consideration from here on.
"""
def __init__(self, sm):
def __init__(self, sm: CodingStateMachineDict) -> None:
self._model = sm
self._curr_byte_pos = 0
self._curr_char_len = 0
self._curr_state = None
self._curr_state = MachineState.START
self.active = True
self.logger = logging.getLogger(__name__)
self.reset()
def reset(self):
def reset(self) -> None:
self._curr_state = MachineState.START
def next_state(self, c):
def next_state(self, c: int) -> int:
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model["class_table"][c]
@ -77,12 +79,12 @@ class CodingStateMachine:
self._curr_byte_pos += 1
return self._curr_state
def get_current_charlen(self):
def get_current_charlen(self) -> int:
return self._curr_char_len
def get_coding_state_machine(self):
def get_coding_state_machine(self) -> str:
return self._model["name"]
@property
def language(self):
def language(self) -> str:
return self._model["language"]

View File

@ -0,0 +1,19 @@
from typing import TYPE_CHECKING, Tuple
if TYPE_CHECKING:
# TypedDict was introduced in Python 3.8.
#
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
# for Python 3.7.
from typing import TypedDict
class CodingStateMachineDict(TypedDict, total=False):
class_table: Tuple[int, ...]
class_factor: int
state_table: Tuple[int, ...]
char_len_table: Tuple[int, ...]
name: str
language: str # Optional key
else:
CodingStateMachineDict = dict

View File

@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL
class CP949Prober(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "CP949"
@property
def language(self):
def language(self) -> str:
return "Korean"

View File

@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
from enum import Enum, Flag
class InputState:
"""
@ -15,12 +17,13 @@ class InputState:
HIGH_BYTE = 2
class LanguageFilter:
class LanguageFilter(Flag):
"""
This enum represents the different language filters we can apply to a
``UniversalDetector``.
"""
NONE = 0x00
CHINESE_SIMPLIFIED = 0x01
CHINESE_TRADITIONAL = 0x02
JAPANESE = 0x04
@ -31,7 +34,7 @@ class LanguageFilter:
CJK = CHINESE | JAPANESE | KOREAN
class ProbingState:
class ProbingState(Enum):
"""
This enum represents the different states a prober can be in.
"""
@ -62,7 +65,7 @@ class SequenceLikelihood:
POSITIVE = 3
@classmethod
def get_num_categories(cls):
def get_num_categories(cls) -> int:
""":returns: The number of likelihood categories in the enum."""
return 4
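
Because LanguageFilter is now a real enum.Flag, the bitwise composition and membership tests used throughout the probers keep their meaning while gaining type safety. A short sketch:

from chardet.enums import LanguageFilter, ProbingState

# Flags compose with |, exactly as the CJK alias above is built.
wanted = LanguageFilter.CHINESE_SIMPLIFIED | LanguageFilter.JAPANESE

# Membership checks use &, as in EscCharSetProber.__init__ below.
if wanted & LanguageFilter.JAPANESE:
    print("Japanese probers enabled")

# ProbingState is a plain Enum, so states compare by identity.
assert ProbingState.DETECTING is not ProbingState.FOUND_IT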

View File

@ -25,6 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, MachineState, ProbingState
@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber):
identify these encodings.
"""
def __init__(self, lang_filter=None):
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.coding_sm = []
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber):
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
if self.lang_filter & LanguageFilter.KOREAN:
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
self.active_sm_count = None
self._detected_charset = None
self._detected_language = None
self._state = None
self.active_sm_count = 0
self._detected_charset: Optional[str] = None
self._detected_language: Optional[str] = None
self._state = ProbingState.DETECTING
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
for coding_sm in self.coding_sm:
if not coding_sm:
continue
coding_sm.active = True
coding_sm.reset()
self.active_sm_count = len(self.coding_sm)
@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber):
self._detected_language = None
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
return self._detected_charset
@property
def language(self):
def language(self) -> Optional[str]:
return self._detected_language
def get_confidence(self):
def get_confidence(self) -> float:
return 0.99 if self._detected_charset else 0.00
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
if not coding_sm.active:
continue
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.ERROR:

View File

@ -25,6 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
# fmt: off
@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
HZ_SM_MODEL = {
HZ_SM_MODEL: CodingStateMachineDict = {
"class_table": HZ_CLS,
"class_factor": 6,
"state_table": HZ_ST,
@ -134,7 +135,7 @@ ISO2022CN_ST = (
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022CN_SM_MODEL = {
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022CN_CLS,
"class_factor": 9,
"state_table": ISO2022CN_ST,
@ -194,7 +195,7 @@ ISO2022JP_ST = (
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
ISO2022JP_SM_MODEL = {
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022JP_CLS,
"class_factor": 10,
"state_table": ISO2022JP_ST,
@ -250,7 +251,7 @@ ISO2022KR_ST = (
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
ISO2022KR_SM_MODEL = {
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
"class_table": ISO2022KR_CLS,
"class_factor": 6,
"state_table": ISO2022KR_ST,

View File

@ -25,6 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Union
from .chardistribution import EUCJPDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
self.distribution_analyzer = EUCJPDistributionAnalysis()
self.context_analyzer = EUCJPContextAnalysis()
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
self.context_analyzer.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-JP"
@property
def language(self):
def language(self) -> str:
return "Japanese"
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str):
# PY3K: byte_str is a byte array, so byte is an int, not a byte
coding_state = self.coding_sm.next_state(byte)
@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View File

@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL
class EUCKRProber(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
self.distribution_analyzer = EUCKRDistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-KR"
@property
def language(self):
def language(self) -> str:
return "Korean"

View File

@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL
class EUCTWProber(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
self.distribution_analyzer = EUCTWDistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "EUC-TW"
@property
def language(self):
def language(self) -> str:
return "Taiwan"

View File

@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL
class GB2312Prober(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
self.distribution_analyzer = GB2312DistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "GB2312"
@property
def language(self):
def language(self) -> str:
return "Chinese"

View File

@ -25,8 +25,11 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
from .sbcharsetprober import SingleByteCharSetProber
# This prober doesn't actually recognize a language or a charset.
# It is a helper prober used by the Hebrew model probers
@ -127,6 +130,7 @@ from .enums import ProbingState
class HebrewProber(CharSetProber):
SPACE = 0x20
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xEA
NORMAL_KAF = 0xEB
@ -152,31 +156,35 @@ class HebrewProber(CharSetProber):
VISUAL_HEBREW_NAME = "ISO-8859-8"
LOGICAL_HEBREW_NAME = "windows-1255"
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._final_char_logical_score = None
self._final_char_visual_score = None
self._prev = None
self._before_prev = None
self._logical_prober = None
self._visual_prober = None
self._final_char_logical_score = 0
self._final_char_visual_score = 0
self._prev = self.SPACE
self._before_prev = self.SPACE
self._logical_prober: Optional[SingleByteCharSetProber] = None
self._visual_prober: Optional[SingleByteCharSetProber] = None
self.reset()
def reset(self):
def reset(self) -> None:
self._final_char_logical_score = 0
self._final_char_visual_score = 0
# The two last characters seen in the previous buffer,
# mPrev and mBeforePrev are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._prev = " "
self._before_prev = " "
self._prev = self.SPACE
self._before_prev = self.SPACE
# These probers are owned by the group prober.
def set_model_probers(self, logical_prober, visual_prober):
def set_model_probers(
self,
logical_prober: SingleByteCharSetProber,
visual_prober: SingleByteCharSetProber,
) -> None:
self._logical_prober = logical_prober
self._visual_prober = visual_prober
def is_final(self, c):
def is_final(self, c: int) -> bool:
return c in [
self.FINAL_KAF,
self.FINAL_MEM,
@ -185,7 +193,7 @@ class HebrewProber(CharSetProber):
self.FINAL_TSADI,
]
def is_non_final(self, c):
def is_non_final(self, c: int) -> bool:
# The normal Tsadi is not a good Non-Final letter due to words like
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
# apostrophe is converted to a space in FilterWithoutEnglishLetters
@ -198,7 +206,7 @@ class HebrewProber(CharSetProber):
# since these words are quite rare.
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# Final letter analysis for logical-visual decision.
# Look for evidence that the received buffer is either logical Hebrew
# or visual Hebrew.
@ -232,9 +240,9 @@ class HebrewProber(CharSetProber):
byte_str = self.filter_high_byte_only(byte_str)
for cur in byte_str:
if cur == " ":
if cur == self.SPACE:
# We stand on a space - a word just ended
if self._before_prev != " ":
if self._before_prev != self.SPACE:
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._prev):
@ -247,9 +255,9 @@ class HebrewProber(CharSetProber):
else:
# Not standing on a space
if (
(self._before_prev == " ")
(self._before_prev == self.SPACE)
and (self.is_final(self._prev))
and (cur != " ")
and (cur != self.SPACE)
):
# case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1
@ -261,7 +269,10 @@ class HebrewProber(CharSetProber):
return ProbingState.DETECTING
@property
def charset_name(self):
def charset_name(self) -> str:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Make the decision: is it Logical or Visual?
# If the final letter score distance is dominant enough, rely on it.
finalsub = self._final_char_logical_score - self._final_char_visual_score
@ -289,11 +300,14 @@ class HebrewProber(CharSetProber):
return self.LOGICAL_HEBREW_NAME
@property
def language(self):
def language(self) -> str:
return "Hebrew"
@property
def state(self):
def state(self) -> ProbingState:
assert self._logical_prober is not None
assert self._visual_prober is not None
# Remain active as long as any of the model probers are active.
if (self._logical_prober.state == ProbingState.NOT_ME) and (
self._visual_prober.state == ProbingState.NOT_ME

View File

@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL
class JOHABProber(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
self.distribution_analyzer = JOHABDistributionAnalysis()
self.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "Johab"
@property
def language(self):
def language(self) -> str:
return "Korean"

View File

@ -25,6 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Tuple, Union
# This is the hiragana 2-char sequence table; the number in each cell represents its frequency category
# fmt: off
@ -123,15 +124,15 @@ class JapaneseContextAnalysis:
MAX_REL_THRESHOLD = 1000
MINIMUM_DATA_THRESHOLD = 4
def __init__(self):
self._total_rel = None
self._rel_sample = None
self._need_to_skip_char_num = None
self._last_char_order = None
self._done = None
def __init__(self) -> None:
self._total_rel = 0
self._rel_sample: List[int] = []
self._need_to_skip_char_num = 0
self._last_char_order = -1
self._done = False
self.reset()
def reset(self):
def reset(self) -> None:
self._total_rel = 0 # total sequence received
# category counters, each integer counts sequence in its category
self._rel_sample = [0] * self.NUM_OF_CATEGORY
@ -143,7 +144,7 @@ class JapaneseContextAnalysis:
# been made
self._done = False
def feed(self, byte_str, num_bytes):
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
if self._done:
return
@ -172,29 +173,29 @@ class JapaneseContextAnalysis:
] += 1
self._last_char_order = order
def got_enough_data(self):
def got_enough_data(self) -> bool:
return self._total_rel > self.ENOUGH_REL_THRESHOLD
def get_confidence(self):
def get_confidence(self) -> float:
# This is just one way to calculate confidence. It works well for me.
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
return (self._total_rel - self._rel_sample[0]) / self._total_rel
return self.DONT_KNOW
def get_order(self, _):
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._charset_name = "SHIFT_JIS"
@property
def charset_name(self):
def charset_name(self) -> str:
return self._charset_name
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str:
return -1, 1
# find out current char's byte length
@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
class EUCJPContextAnalysis(JapaneseContextAnalysis):
def get_order(self, byte_str):
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
if not byte_str:
return -1, 1
# find out current char's byte length

View File

@ -26,6 +26,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
@ -96,26 +98,26 @@ Latin1ClassModel = (
class Latin1Prober(CharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self._last_char_class = None
self._freq_counter = None
self._last_char_class = OTH
self._freq_counter: List[int] = []
self.reset()
def reset(self):
def reset(self) -> None:
self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM
super().reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return "ISO-8859-1"
@property
def language(self):
def language(self) -> str:
return ""
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
byte_str = self.remove_xml_tags(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[c]
@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
if self.state == ProbingState.NOT_ME:
return 0.01

View File

@ -0,0 +1,162 @@
######################## BEGIN LICENSE BLOCK ########################
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Rob Speer - adapt to MacRoman encoding
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
FREQ_CAT_NUM = 4
UDF = 0 # undefined
OTH = 1 # other
ASC = 2 # ascii capital letter
ASS = 3 # ascii small letter
ACV = 4 # accent capital vowel
ACO = 5 # accent capital other
ASV = 6 # accent small vowel
ASO = 7 # accent small other
ODD = 8 # character that is unlikely to appear
CLASS_NUM = 9 # total classes
# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.
# fmt: off
MacRoman_CharToClass = (
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87
ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7
OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF
OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7
OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7
ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF
OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF
ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7
ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF
)
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF
0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH
0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC
0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS
0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV
0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO
0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV
0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO
0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD
)
# fmt: on
class MacRomanProber(CharSetProber):
def __init__(self) -> None:
super().__init__()
self._last_char_class = OTH
self._freq_counter: List[int] = []
self.reset()
def reset(self) -> None:
self._last_char_class = OTH
self._freq_counter = [0] * FREQ_CAT_NUM
# express the prior that MacRoman is a somewhat rare encoding;
# this can be done by starting out in a slightly improbable state
# that must be overcome
self._freq_counter[2] = 10
super().reset()
@property
def charset_name(self) -> str:
return "MacRoman"
@property
def language(self) -> str:
return ""
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
byte_str = self.remove_xml_tags(byte_str)
for c in byte_str:
char_class = MacRoman_CharToClass[c]
freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class]
if freq == 0:
self._state = ProbingState.NOT_ME
break
self._freq_counter[freq] += 1
self._last_char_class = char_class
return self.state
def get_confidence(self) -> float:
if self.state == ProbingState.NOT_ME:
return 0.01
total = sum(self._freq_counter)
confidence = (
0.0
if total < 0.01
else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
)
confidence = max(confidence, 0.0)
# lower the confidence of MacRoman so that other, more accurate
# detectors can take priority.
confidence *= 0.73
return confidence
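
A quick sketch of exercising the new prober directly (this is internal API; the sample below uses 0x8E, which is 'é' in MacRoman, and is purely illustrative):

from chardet.macromanprober import MacRomanProber

prober = MacRomanProber()
prober.feed(b"r\x8esum\x8e writers at the caf\x8e")
print(prober.charset_name, prober.get_confidence())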

View File

@ -27,8 +27,12 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Optional, Union
from .chardistribution import CharDistributionAnalysis
from .charsetprober import CharSetProber
from .enums import MachineState, ProbingState
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, MachineState, ProbingState
class MultiByteCharSetProber(CharSetProber):
@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber):
MultiByteCharSetProber
"""
def __init__(self, lang_filter=None):
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.distribution_analyzer = None
self.coding_sm = None
self._last_char = [0, 0]
self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
self.coding_sm: Optional[CodingStateMachine] = None
self._last_char = bytearray(b"\0\0")
def reset(self):
def reset(self) -> None:
super().reset()
if self.coding_sm:
self.coding_sm.reset()
if self.distribution_analyzer:
self.distribution_analyzer.reset()
self._last_char = [0, 0]
self._last_char = bytearray(b"\0\0")
@property
def charset_name(self):
raise NotImplementedError
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
@property
def language(self):
raise NotImplementedError
def feed(self, byte_str):
for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR:
@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
return self.distribution_analyzer.get_confidence()

View File

@ -30,6 +30,7 @@
from .big5prober import Big5Prober
from .charsetgroupprober import CharSetGroupProber
from .cp949prober import CP949Prober
from .enums import LanguageFilter
from .eucjpprober import EUCJPProber
from .euckrprober import EUCKRProber
from .euctwprober import EUCTWProber
@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober
class MBCSGroupProber(CharSetGroupProber):
def __init__(self, lang_filter=None):
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
super().__init__(lang_filter=lang_filter)
self.probers = [
UTF8Prober(),

View File

@ -25,6 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState
# BIG5
@ -74,7 +75,7 @@ BIG5_ST = (
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
BIG5_SM_MODEL = {
BIG5_SM_MODEL: CodingStateMachineDict = {
"class_table": BIG5_CLS,
"class_factor": 5,
"state_table": BIG5_ST,
@ -117,7 +118,7 @@ CP949_ST = (
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
CP949_SM_MODEL = {
CP949_SM_MODEL: CodingStateMachineDict = {
"class_table": CP949_CLS,
"class_factor": 10,
"state_table": CP949_ST,
@ -173,7 +174,7 @@ EUCJP_ST = (
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
EUCJP_SM_MODEL = {
EUCJP_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCJP_CLS,
"class_factor": 6,
"state_table": EUCJP_ST,
@ -226,7 +227,7 @@ EUCKR_ST = (
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
EUCKR_SM_MODEL = {
EUCKR_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCKR_CLS,
"class_factor": 4,
"state_table": EUCKR_ST,
@ -283,7 +284,7 @@ JOHAB_ST = (
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
JOHAB_SM_MODEL = {
JOHAB_SM_MODEL: CodingStateMachineDict = {
"class_table": JOHAB_CLS,
"class_factor": 10,
"state_table": JOHAB_ST,
@ -340,7 +341,7 @@ EUCTW_ST = (
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
EUCTW_SM_MODEL = {
EUCTW_SM_MODEL: CodingStateMachineDict = {
"class_table": EUCTW_CLS,
"class_factor": 7,
"state_table": EUCTW_ST,
@ -402,7 +403,7 @@ GB2312_ST = (
# 2 here.
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
GB2312_SM_MODEL = {
GB2312_SM_MODEL: CodingStateMachineDict = {
"class_table": GB2312_CLS,
"class_factor": 7,
"state_table": GB2312_ST,
@ -458,7 +459,7 @@ SJIS_ST = (
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
SJIS_SM_MODEL = {
SJIS_SM_MODEL: CodingStateMachineDict = {
"class_table": SJIS_CLS,
"class_factor": 6,
"state_table": SJIS_ST,
@ -516,7 +517,7 @@ UCS2BE_ST = (
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
UCS2BE_SM_MODEL = {
UCS2BE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2BE_CLS,
"class_factor": 6,
"state_table": UCS2BE_ST,
@ -574,7 +575,7 @@ UCS2LE_ST = (
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
UCS2LE_SM_MODEL = {
UCS2LE_SM_MODEL: CodingStateMachineDict = {
"class_table": UCS2LE_CLS,
"class_factor": 6,
"state_table": UCS2LE_ST,
@ -651,7 +652,7 @@ UTF8_ST = (
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
UTF8_SM_MODEL = {
UTF8_SM_MODEL: CodingStateMachineDict = {
"class_table": UTF8_CLS,
"class_factor": 16,
"state_table": UTF8_ST,

View File

@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project.
"""
from string import ascii_letters
from typing import List, Optional
# TODO: Add Ukrainian (KOI8-U)
@ -33,13 +34,13 @@ class Language:
def __init__(
self,
name=None,
iso_code=None,
use_ascii=True,
charsets=None,
alphabet=None,
wiki_start_pages=None,
):
name: Optional[str] = None,
iso_code: Optional[str] = None,
use_ascii: bool = True,
charsets: Optional[List[str]] = None,
alphabet: Optional[str] = None,
wiki_start_pages: Optional[List[str]] = None,
) -> None:
super().__init__()
self.name = name
self.iso_code = iso_code
@ -55,7 +56,7 @@ class Language:
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
self.wiki_start_pages = wiki_start_pages
def __repr__(self):
def __repr__(self) -> str:
param_str = ", ".join(
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
)
@ -103,7 +104,7 @@ LANGUAGES = {
name="Danish",
iso_code="da",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="æøåÆØÅ",
wiki_start_pages=["Forside"],
),
@ -111,8 +112,8 @@ LANGUAGES = {
name="German",
iso_code="de",
use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"],
alphabet="äöüßÄÖÜ",
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="äöüßÄÖÜ",
wiki_start_pages=["Wikipedia:Hauptseite"],
),
"Greek": Language(
@ -127,7 +128,7 @@ LANGUAGES = {
name="English",
iso_code="en",
use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"],
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Main_Page"],
),
"Esperanto": Language(
@ -143,7 +144,7 @@ LANGUAGES = {
name="Spanish",
iso_code="es",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
wiki_start_pages=["Wikipedia:Portada"],
),
@ -161,7 +162,7 @@ LANGUAGES = {
name="Finnish",
iso_code="fi",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÅÄÖŠŽåäöšž",
wiki_start_pages=["Wikipedia:Etusivu"],
),
@ -169,7 +170,7 @@ LANGUAGES = {
name="French",
iso_code="fr",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
),
@ -203,7 +204,7 @@ LANGUAGES = {
name="Italian",
iso_code="it",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
wiki_start_pages=["Pagina_principale"],
),
@ -237,7 +238,7 @@ LANGUAGES = {
name="Dutch",
iso_code="nl",
use_ascii=True,
charsets=["ISO-8859-1", "WINDOWS-1252"],
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
wiki_start_pages=["Hoofdpagina"],
),
"Polish": Language(
@ -253,7 +254,7 @@ LANGUAGES = {
name="Portuguese",
iso_code="pt",
use_ascii=True,
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
wiki_start_pages=["Wikipédia:Página_principal"],
),

View File

@ -0,0 +1,16 @@
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
# TypedDict was introduced in Python 3.8.
#
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
# for Python 3.7.
from typing import TypedDict
class ResultDict(TypedDict):
encoding: Optional[str]
confidence: float
language: Optional[str]
else:
ResultDict = dict
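
ResultDict exists purely for type checkers (hence the TYPE_CHECKING guard); at runtime it is a plain dict. A sketch of what the annotation buys under a checker such as mypy (sample bytes are illustrative):

from chardet import detect
from chardet.resultdict import ResultDict

result: ResultDict = detect(b"\xc3\xa9chantillon")
encoding = result["encoding"]      # Optional[str] to the type checker
confidence = result["confidence"]  # float
# result["typo_key"]               # a checker would reject this key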

View File

@ -26,23 +26,20 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from collections import namedtuple
from typing import Dict, List, NamedTuple, Optional, Union
from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
SingleByteCharSetModel = namedtuple(
"SingleByteCharSetModel",
[
"charset_name",
"language",
"char_to_order_map",
"language_model",
"typical_positive_ratio",
"keep_ascii_letters",
"alphabet",
],
)
class SingleByteCharSetModel(NamedTuple):
charset_name: str
language: str
char_to_order_map: Dict[int, int]
language_model: Dict[int, Dict[int, int]]
typical_positive_ratio: float
keep_ascii_letters: bool
alphabet: str
class SingleByteCharSetProber(CharSetProber):
@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber):
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
def __init__(self, model, is_reversed=False, name_prober=None):
def __init__(
self,
model: SingleByteCharSetModel,
is_reversed: bool = False,
name_prober: Optional[CharSetProber] = None,
) -> None:
super().__init__()
self._model = model
# TRUE if we need to reverse every pair in the model lookup
self._reversed = is_reversed
# Optional auxiliary prober for name decision
self._name_prober = name_prober
self._last_order = None
self._seq_counters = None
self._total_seqs = None
self._total_char = None
self._control_char = None
self._freq_char = None
self._last_order = 255
self._seq_counters: List[int] = []
self._total_seqs = 0
self._total_char = 0
self._control_char = 0
self._freq_char = 0
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
# char order of last character
self._last_order = 255
@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber):
self._freq_char = 0
@property
def charset_name(self):
def charset_name(self) -> Optional[str]:
if self._name_prober:
return self._name_prober.charset_name
return self._model.charset_name
@property
def language(self):
def language(self) -> Optional[str]:
if self._name_prober:
return self._name_prober.language
return self._model.language
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
# TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str)
@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
r = 0.01
if self._total_seqs > 0:
r = (

View File

@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(

View File

@ -25,6 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Union
from .chardistribution import SJISDistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL
class SJISProber(MultiByteCharSetProber):
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
self.distribution_analyzer = SJISDistributionAnalysis()
self.context_analyzer = SJISContextAnalysis()
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
self.context_analyzer.reset()
@property
def charset_name(self):
def charset_name(self) -> str:
return self.context_analyzer.charset_name
@property
def language(self):
def language(self) -> str:
return "Japanese"
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
assert self.coding_sm is not None
assert self.distribution_analyzer is not None
for i, byte in enumerate(byte_str):
coding_state = self.coding_sm.next_state(byte)
if coding_state == MachineState.ERROR:
@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
assert self.distribution_analyzer is not None
context_conf = self.context_analyzer.get_confidence()
distrib_conf = self.distribution_analyzer.get_confidence()
return max(context_conf, distrib_conf)

View File

@ -39,12 +39,16 @@ class a user of ``chardet`` should use.
import codecs
import logging
import re
from typing import List, Optional, Union
from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .macromanprober import MacRomanProber
from .mbcsgroupprober import MBCSGroupProber
from .resultdict import ResultDict
from .sbcsgroupprober import SBCSGroupProber
from .utf1632prober import UTF1632Prober
@ -80,34 +84,55 @@ class UniversalDetector:
"iso-8859-9": "Windows-1254",
"iso-8859-13": "Windows-1257",
}
# Based on https://encoding.spec.whatwg.org/#names-and-labels
# but altered to match Python names for encodings and remove mappings
# that break tests.
LEGACY_MAP = {
"ascii": "Windows-1252",
"iso-8859-1": "Windows-1252",
"tis-620": "ISO-8859-11",
"iso-8859-9": "Windows-1254",
"gb2312": "GB18030",
"euc-kr": "CP949",
"utf-16le": "UTF-16",
}
def __init__(self, lang_filter=LanguageFilter.ALL):
self._esc_charset_prober = None
self._utf1632_prober = None
self._charset_probers = []
self.result = None
self.done = None
self._got_data = None
self._input_state = None
self._last_char = None
def __init__(
self,
lang_filter: LanguageFilter = LanguageFilter.ALL,
should_rename_legacy: bool = False,
) -> None:
self._esc_charset_prober: Optional[EscCharSetProber] = None
self._utf1632_prober: Optional[UTF1632Prober] = None
self._charset_probers: List[CharSetProber] = []
self.result: ResultDict = {
"encoding": None,
"confidence": 0.0,
"language": None,
}
self.done = False
self._got_data = False
self._input_state = InputState.PURE_ASCII
self._last_char = b""
self.lang_filter = lang_filter
self.logger = logging.getLogger(__name__)
self._has_win_bytes = None
self._has_win_bytes = False
self.should_rename_legacy = should_rename_legacy
self.reset()
@property
def input_state(self):
def input_state(self) -> int:
return self._input_state
@property
def has_win_bytes(self):
def has_win_bytes(self) -> bool:
return self._has_win_bytes
@property
def charset_probers(self):
def charset_probers(self) -> List[CharSetProber]:
return self._charset_probers
def reset(self):
def reset(self) -> None:
"""
Reset the UniversalDetector and all of its probers back to their
initial states. This is called by ``__init__``, so you only need to
@ -126,7 +151,7 @@ class UniversalDetector:
for prober in self._charset_probers:
prober.reset()
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
"""
Takes a chunk of a document and feeds it through all of the relevant
charset probers.
@ -166,6 +191,7 @@ class UniversalDetector:
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-3412",
"confidence": 1.0,
"language": "",
@ -173,6 +199,7 @@ class UniversalDetector:
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {
# TODO: This encoding is not supported by Python. Should remove?
"encoding": "X-ISO-10646-UCS-4-2143",
"confidence": 1.0,
"language": "",
@ -242,6 +269,7 @@ class UniversalDetector:
if self.lang_filter & LanguageFilter.NON_CJK:
self._charset_probers.append(SBCSGroupProber())
self._charset_probers.append(Latin1Prober())
self._charset_probers.append(MacRomanProber())
for prober in self._charset_probers:
if prober.feed(byte_str) == ProbingState.FOUND_IT:
self.result = {
@ -254,7 +282,7 @@ class UniversalDetector:
if self.WIN_BYTE_DETECTOR.search(byte_str):
self._has_win_bytes = True
def close(self):
def close(self) -> ResultDict:
"""
Stop analyzing the current document and come up with a final
prediction.
@ -288,7 +316,8 @@ class UniversalDetector:
max_prober = prober
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
charset_name = max_prober.charset_name
lower_charset_name = max_prober.charset_name.lower()
assert charset_name is not None
lower_charset_name = charset_name.lower()
confidence = max_prober.get_confidence()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
@ -297,6 +326,11 @@ class UniversalDetector:
charset_name = self.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
# Rename legacy encodings with superset encodings if asked
if self.should_rename_legacy:
charset_name = self.LEGACY_MAP.get(
(charset_name or "").lower(), charset_name
)
self.result = {
"encoding": charset_name,
"confidence": confidence,

View File

@ -18,6 +18,8 @@
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import List, Union
from .charsetprober import CharSetProber
from .enums import ProbingState
@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber):
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
EXPECTED_RATIO = 0.94
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.position = 0
self.zeros_at_mod = [0] * 4
@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber):
self.first_half_surrogate_pair_detected_16le = False
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
self.position = 0
self.zeros_at_mod = [0] * 4
@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber):
self.quad = [0, 0, 0, 0]
@property
def charset_name(self):
def charset_name(self) -> str:
if self.is_likely_utf32be():
return "utf-32be"
if self.is_likely_utf32le():
@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber):
return "utf-16"
@property
def language(self):
def language(self) -> str:
return ""
def approx_32bit_chars(self):
def approx_32bit_chars(self) -> float:
return max(1.0, self.position / 4.0)
def approx_16bit_chars(self):
def approx_16bit_chars(self) -> float:
return max(1.0, self.position / 2.0)
def is_likely_utf32be(self):
def is_likely_utf32be(self) -> bool:
approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf32be
)
def is_likely_utf32le(self):
def is_likely_utf32le(self) -> bool:
approx_chars = self.approx_32bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf32le
)
def is_likely_utf16be(self):
def is_likely_utf16be(self) -> bool:
approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf16be
)
def is_likely_utf16le(self):
def is_likely_utf16le(self) -> bool:
approx_chars = self.approx_16bit_chars()
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber):
and not self.invalid_utf16le
)
def validate_utf32_characters(self, quad):
def validate_utf32_characters(self, quad: List[int]) -> None:
"""
Validate if the quad of bytes is valid UTF-32.
@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber):
):
self.invalid_utf32le = True
def validate_utf16_characters(self, pair):
def validate_utf16_characters(self, pair: List[int]) -> None:
"""
Validate if the pair of bytes is valid UTF-16.
@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber):
else:
self.invalid_utf16le = True
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
mod4 = self.position % 4
self.quad[mod4] = c
@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber):
return self.state
@property
def state(self):
def state(self) -> ProbingState:
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
# terminal, decided states
return self._state
@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber):
self._state = ProbingState.NOT_ME
return self._state
def get_confidence(self):
def get_confidence(self) -> float:
return (
0.85
if (

View File

@ -25,6 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from typing import Union
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import MachineState, ProbingState
@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL
class UTF8Prober(CharSetProber):
ONE_CHAR_PROB = 0.5
def __init__(self):
def __init__(self) -> None:
super().__init__()
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
self._num_mb_chars = None
self._num_mb_chars = 0
self.reset()
def reset(self):
def reset(self) -> None:
super().reset()
self.coding_sm.reset()
self._num_mb_chars = 0
@property
def charset_name(self):
def charset_name(self) -> str:
return "utf-8"
@property
def language(self):
def language(self) -> str:
return ""
def feed(self, byte_str):
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
for c in byte_str:
coding_state = self.coding_sm.next_state(c)
if coding_state == MachineState.ERROR:
@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber):
return self.state
def get_confidence(self):
def get_confidence(self) -> float:
unlike = 0.99
if self._num_mb_chars < 6:
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars

View File

@ -1,9 +1,9 @@
"""
This module exists only to simplify retrieving the version number of chardet
from within setup.py and from chardet subpackages.
from within setuptools and from chardet subpackages.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
__version__ = "5.0.0"
__version__ = "5.1.0"
VERSION = __version__.split(".")

View File

@ -9,7 +9,7 @@ pyparsing==3.0.9
pyproject-hooks==1.0.0
requests==2.28.2
certifi==2022.12.7
chardet==5.0.0
chardet==5.1.0
idna==3.4
urllib3==1.26.12
rich==12.6.0