mirror of https://github.com/pypa/pip
Upgrade chardet to 5.1.0
This commit is contained in:
parent
1c110bede6
commit
be20a75c10
|
@ -0,0 +1 @@
|
||||||
|
Upgrade chardet to 5.1.0
|
|
@ -1 +0,0 @@
|
||||||
from chardet import *
|
|
|
@ -15,19 +15,29 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
from .enums import InputState
|
from .enums import InputState
|
||||||
|
from .resultdict import ResultDict
|
||||||
from .universaldetector import UniversalDetector
|
from .universaldetector import UniversalDetector
|
||||||
from .version import VERSION, __version__
|
from .version import VERSION, __version__
|
||||||
|
|
||||||
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
|
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]
|
||||||
|
|
||||||
|
|
||||||
def detect(byte_str):
|
def detect(
|
||||||
|
byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False
|
||||||
|
) -> ResultDict:
|
||||||
"""
|
"""
|
||||||
Detect the encoding of the given byte string.
|
Detect the encoding of the given byte string.
|
||||||
|
|
||||||
:param byte_str: The byte sequence to examine.
|
:param byte_str: The byte sequence to examine.
|
||||||
:type byte_str: ``bytes`` or ``bytearray``
|
:type byte_str: ``bytes`` or ``bytearray``
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings
|
||||||
|
to their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
if not isinstance(byte_str, bytes):
|
if not isinstance(byte_str, bytes):
|
||||||
|
@ -35,12 +45,16 @@ def detect(byte_str):
|
||||||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||||
)
|
)
|
||||||
byte_str = bytearray(byte_str)
|
byte_str = bytearray(byte_str)
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
detector.feed(byte_str)
|
detector.feed(byte_str)
|
||||||
return detector.close()
|
return detector.close()
|
||||||
|
|
||||||
|
|
||||||
def detect_all(byte_str, ignore_threshold=False):
|
def detect_all(
|
||||||
|
byte_str: Union[bytes, bytearray],
|
||||||
|
ignore_threshold: bool = False,
|
||||||
|
should_rename_legacy: bool = False,
|
||||||
|
) -> List[ResultDict]:
|
||||||
"""
|
"""
|
||||||
Detect all the possible encodings of the given byte string.
|
Detect all the possible encodings of the given byte string.
|
||||||
|
|
||||||
|
@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False):
|
||||||
``UniversalDetector.MINIMUM_THRESHOLD``
|
``UniversalDetector.MINIMUM_THRESHOLD``
|
||||||
in results.
|
in results.
|
||||||
:type ignore_threshold: ``bool``
|
:type ignore_threshold: ``bool``
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings
|
||||||
|
to their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
if not isinstance(byte_str, bytes):
|
if not isinstance(byte_str, bytes):
|
||||||
|
@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False):
|
||||||
)
|
)
|
||||||
byte_str = bytearray(byte_str)
|
byte_str = bytearray(byte_str)
|
||||||
|
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
detector.feed(byte_str)
|
detector.feed(byte_str)
|
||||||
detector.close()
|
detector.close()
|
||||||
|
|
||||||
if detector.input_state == InputState.HIGH_BYTE:
|
if detector.input_state == InputState.HIGH_BYTE:
|
||||||
results = []
|
results: List[ResultDict] = []
|
||||||
probers = []
|
probers: List[CharSetProber] = []
|
||||||
for prober in detector.charset_probers:
|
for prober in detector.charset_probers:
|
||||||
if hasattr(prober, "probers"):
|
if isinstance(prober, CharSetGroupProber):
|
||||||
probers.extend(p for p in prober.probers)
|
probers.extend(p for p in prober.probers)
|
||||||
else:
|
else:
|
||||||
probers.append(prober)
|
probers.append(prober)
|
||||||
|
@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False):
|
||||||
charset_name = detector.ISO_WIN_MAP.get(
|
charset_name = detector.ISO_WIN_MAP.get(
|
||||||
lower_charset_name, charset_name
|
lower_charset_name, charset_name
|
||||||
)
|
)
|
||||||
|
# Rename legacy encodings with superset encodings if asked
|
||||||
|
if should_rename_legacy:
|
||||||
|
charset_name = detector.LEGACY_MAP.get(
|
||||||
|
charset_name.lower(), charset_name
|
||||||
|
)
|
||||||
results.append(
|
results.append(
|
||||||
{
|
{
|
||||||
"encoding": charset_name,
|
"encoding": charset_name,
|
||||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class Big5Prober(MultiByteCharSetProber):
|
class Big5Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
|
||||||
self.distribution_analyzer = Big5DistributionAnalysis()
|
self.distribution_analyzer = Big5DistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "Big5"
|
return "Big5"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Chinese"
|
return "Chinese"
|
||||||
|
|
|
@ -25,6 +25,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Tuple, Union
|
||||||
|
|
||||||
from .big5freq import (
|
from .big5freq import (
|
||||||
BIG5_CHAR_TO_FREQ_ORDER,
|
BIG5_CHAR_TO_FREQ_ORDER,
|
||||||
BIG5_TABLE_SIZE,
|
BIG5_TABLE_SIZE,
|
||||||
|
@ -59,22 +61,22 @@ class CharDistributionAnalysis:
|
||||||
SURE_NO = 0.01
|
SURE_NO = 0.01
|
||||||
MINIMUM_DATA_THRESHOLD = 3
|
MINIMUM_DATA_THRESHOLD = 3
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
# Mapping table to get frequency order from char order (get from
|
# Mapping table to get frequency order from char order (get from
|
||||||
# GetOrder())
|
# GetOrder())
|
||||||
self._char_to_freq_order = tuple()
|
self._char_to_freq_order: Tuple[int, ...] = tuple()
|
||||||
self._table_size = None # Size of above table
|
self._table_size = 0 # Size of above table
|
||||||
# This is a constant value which varies from language to language,
|
# This is a constant value which varies from language to language,
|
||||||
# used in calculating confidence. See
|
# used in calculating confidence. See
|
||||||
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
|
||||||
# for further detail.
|
# for further detail.
|
||||||
self.typical_distribution_ratio = None
|
self.typical_distribution_ratio = 0.0
|
||||||
self._done = None
|
self._done = False
|
||||||
self._total_chars = None
|
self._total_chars = 0
|
||||||
self._freq_chars = None
|
self._freq_chars = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
"""reset analyser, clear any state"""
|
"""reset analyser, clear any state"""
|
||||||
# If this flag is set to True, detection is done and conclusion has
|
# If this flag is set to True, detection is done and conclusion has
|
||||||
# been made
|
# been made
|
||||||
|
@ -83,7 +85,7 @@ class CharDistributionAnalysis:
|
||||||
# The number of characters whose frequency order is less than 512
|
# The number of characters whose frequency order is less than 512
|
||||||
self._freq_chars = 0
|
self._freq_chars = 0
|
||||||
|
|
||||||
def feed(self, char, char_len):
|
def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
|
||||||
"""feed a character with known length"""
|
"""feed a character with known length"""
|
||||||
if char_len == 2:
|
if char_len == 2:
|
||||||
# we only care about 2-bytes character in our distribution analysis
|
# we only care about 2-bytes character in our distribution analysis
|
||||||
|
@ -97,7 +99,7 @@ class CharDistributionAnalysis:
|
||||||
if 512 > self._char_to_freq_order[order]:
|
if 512 > self._char_to_freq_order[order]:
|
||||||
self._freq_chars += 1
|
self._freq_chars += 1
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
"""return confidence based on existing data"""
|
"""return confidence based on existing data"""
|
||||||
# if we didn't receive any character in our consideration range,
|
# if we didn't receive any character in our consideration range,
|
||||||
# return negative answer
|
# return negative answer
|
||||||
|
@ -114,12 +116,12 @@ class CharDistributionAnalysis:
|
||||||
# normalize confidence (we don't want to be 100% sure)
|
# normalize confidence (we don't want to be 100% sure)
|
||||||
return self.SURE_YES
|
return self.SURE_YES
|
||||||
|
|
||||||
def got_enough_data(self):
|
def got_enough_data(self) -> bool:
|
||||||
# It is not necessary to receive all data to draw conclusion.
|
# It is not necessary to receive all data to draw conclusion.
|
||||||
# For charset detection, certain amount of data is enough
|
# For charset detection, certain amount of data is enough
|
||||||
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
return self._total_chars > self.ENOUGH_DATA_THRESHOLD
|
||||||
|
|
||||||
def get_order(self, _):
|
def get_order(self, _: Union[bytes, bytearray]) -> int:
|
||||||
# We do not handle characters based on the original encoding string,
|
# We do not handle characters based on the original encoding string,
|
||||||
# but convert this encoding string to a number, here called order.
|
# but convert this encoding string to a number, here called order.
|
||||||
# This allows multiple encodings of a language to share one frequency
|
# This allows multiple encodings of a language to share one frequency
|
||||||
|
@ -128,13 +130,13 @@ class CharDistributionAnalysis:
|
||||||
|
|
||||||
|
|
||||||
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = EUCTW_TABLE_SIZE
|
self._table_size = EUCTW_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-TW encoding, we are interested
|
# for euc-TW encoding, we are interested
|
||||||
# first byte range: 0xc4 -- 0xfe
|
# first byte range: 0xc4 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = EUCKR_TABLE_SIZE
|
self._table_size = EUCKR_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-KR encoding, we are interested
|
# for euc-KR encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = EUCKR_TABLE_SIZE
|
self._table_size = EUCKR_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
first_char = byte_str[0]
|
first_char = byte_str[0]
|
||||||
if 0x88 <= first_char < 0xD4:
|
if 0x88 <= first_char < 0xD4:
|
||||||
code = first_char * 256 + byte_str[1]
|
code = first_char * 256 + byte_str[1]
|
||||||
|
@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = GB2312_TABLE_SIZE
|
self._table_size = GB2312_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for GB2312 encoding, we are interested
|
# for GB2312 encoding, we are interested
|
||||||
# first byte range: 0xb0 -- 0xfe
|
# first byte range: 0xb0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = BIG5_TABLE_SIZE
|
self._table_size = BIG5_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for big5 encoding, we are interested
|
# for big5 encoding, we are interested
|
||||||
# first byte range: 0xa4 -- 0xfe
|
# first byte range: 0xa4 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
|
||||||
|
@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = JIS_TABLE_SIZE
|
self._table_size = JIS_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for sjis encoding, we are interested
|
# for sjis encoding, we are interested
|
||||||
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
|
||||||
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
|
||||||
|
@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
|
||||||
self._table_size = JIS_TABLE_SIZE
|
self._table_size = JIS_TABLE_SIZE
|
||||||
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
|
||||||
# for euc-JP encoding, we are interested
|
# for euc-JP encoding, we are interested
|
||||||
# first byte range: 0xa0 -- 0xfe
|
# first byte range: 0xa0 -- 0xfe
|
||||||
# second byte range: 0xa1 -- 0xfe
|
# second byte range: 0xa1 -- 0xfe
|
||||||
|
|
|
@ -25,29 +25,30 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import LanguageFilter, ProbingState
|
||||||
|
|
||||||
|
|
||||||
class CharSetGroupProber(CharSetProber):
|
class CharSetGroupProber(CharSetProber):
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super().__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self._active_num = 0
|
self._active_num = 0
|
||||||
self.probers = []
|
self.probers: List[CharSetProber] = []
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober: Optional[CharSetProber] = None
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
self._active_num = 0
|
self._active_num = 0
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if prober:
|
prober.reset()
|
||||||
prober.reset()
|
prober.active = True
|
||||||
prober.active = True
|
self._active_num += 1
|
||||||
self._active_num += 1
|
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
self.get_confidence()
|
self.get_confidence()
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
|
@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber):
|
||||||
return self._best_guess_prober.charset_name
|
return self._best_guess_prober.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
self.get_confidence()
|
self.get_confidence()
|
||||||
if not self._best_guess_prober:
|
if not self._best_guess_prober:
|
||||||
return None
|
return None
|
||||||
return self._best_guess_prober.language
|
return self._best_guess_prober.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if not prober:
|
|
||||||
continue
|
|
||||||
if not prober.active:
|
if not prober.active:
|
||||||
continue
|
continue
|
||||||
state = prober.feed(byte_str)
|
state = prober.feed(byte_str)
|
||||||
|
@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber):
|
||||||
return self.state
|
return self.state
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
state = self.state
|
state = self.state
|
||||||
if state == ProbingState.FOUND_IT:
|
if state == ProbingState.FOUND_IT:
|
||||||
return 0.99
|
return 0.99
|
||||||
|
@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber):
|
||||||
best_conf = 0.0
|
best_conf = 0.0
|
||||||
self._best_guess_prober = None
|
self._best_guess_prober = None
|
||||||
for prober in self.probers:
|
for prober in self.probers:
|
||||||
if not prober:
|
|
||||||
continue
|
|
||||||
if not prober.active:
|
if not prober.active:
|
||||||
self.logger.debug("%s not active", prober.charset_name)
|
self.logger.debug("%s not active", prober.charset_name)
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -28,8 +28,9 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .enums import ProbingState
|
from .enums import LanguageFilter, ProbingState
|
||||||
|
|
||||||
INTERNATIONAL_WORDS_PATTERN = re.compile(
|
INTERNATIONAL_WORDS_PATTERN = re.compile(
|
||||||
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
|
b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
|
||||||
|
@ -40,35 +41,40 @@ class CharSetProber:
|
||||||
|
|
||||||
SHORTCUT_THRESHOLD = 0.95
|
SHORTCUT_THRESHOLD = 0.95
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
self._state = None
|
self._state = ProbingState.DETECTING
|
||||||
|
self.active = True
|
||||||
self.lang_filter = lang_filter
|
self.lang_filter = lang_filter
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._state = ProbingState.DETECTING
|
self._state = ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def feed(self, byte_str):
|
@property
|
||||||
|
def language(self) -> Optional[str]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self) -> ProbingState:
|
||||||
return self._state
|
return self._state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_high_byte_only(buf):
|
def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
|
||||||
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
|
buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_international_words(buf):
|
def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
|
||||||
"""
|
"""
|
||||||
We define three types of bytes:
|
We define three types of bytes:
|
||||||
alphabet: english alphabets [a-zA-Z]
|
alphabet: english alphabets [a-zA-Z]
|
||||||
|
@ -102,7 +108,7 @@ class CharSetProber:
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def remove_xml_tags(buf):
|
def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
|
||||||
"""
|
"""
|
||||||
Returns a copy of ``buf`` that retains only the sequences of English
|
Returns a copy of ``buf`` that retains only the sequences of English
|
||||||
alphabet and high byte characters that are not between <> characters.
|
alphabet and high byte characters that are not between <> characters.
|
||||||
|
@ -117,10 +123,13 @@ class CharSetProber:
|
||||||
|
|
||||||
for curr, buf_char in enumerate(buf):
|
for curr, buf_char in enumerate(buf):
|
||||||
# Check if we're coming out of or entering an XML tag
|
# Check if we're coming out of or entering an XML tag
|
||||||
if buf_char == b">":
|
|
||||||
|
# https://github.com/python/typeshed/issues/8182
|
||||||
|
if buf_char == b">": # type: ignore[comparison-overlap]
|
||||||
prev = curr + 1
|
prev = curr + 1
|
||||||
in_tag = False
|
in_tag = False
|
||||||
elif buf_char == b"<":
|
# https://github.com/python/typeshed/issues/8182
|
||||||
|
elif buf_char == b"<": # type: ignore[comparison-overlap]
|
||||||
if curr > prev and not in_tag:
|
if curr > prev and not in_tag:
|
||||||
# Keep everything after last non-extended-ASCII,
|
# Keep everything after last non-extended-ASCII,
|
||||||
# non-alphabetic character
|
# non-alphabetic character
|
||||||
|
|
|
@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
from .. import __version__
|
from .. import __version__
|
||||||
from ..universaldetector import UniversalDetector
|
from ..universaldetector import UniversalDetector
|
||||||
|
|
||||||
|
|
||||||
def description_of(lines, name="stdin"):
|
def description_of(
|
||||||
|
lines: Iterable[bytes],
|
||||||
|
name: str = "stdin",
|
||||||
|
minimal: bool = False,
|
||||||
|
should_rename_legacy: bool = False,
|
||||||
|
) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Return a string describing the probable encoding of a file or
|
Return a string describing the probable encoding of a file or
|
||||||
list of strings.
|
list of strings.
|
||||||
|
@ -29,8 +35,11 @@ def description_of(lines, name="stdin"):
|
||||||
:type lines: Iterable of bytes
|
:type lines: Iterable of bytes
|
||||||
:param name: Name of file or collection of lines
|
:param name: Name of file or collection of lines
|
||||||
:type name: str
|
:type name: str
|
||||||
|
:param should_rename_legacy: Should we rename legacy encodings to
|
||||||
|
their more modern equivalents?
|
||||||
|
:type should_rename_legacy: ``bool``
|
||||||
"""
|
"""
|
||||||
u = UniversalDetector()
|
u = UniversalDetector(should_rename_legacy=should_rename_legacy)
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = bytearray(line)
|
line = bytearray(line)
|
||||||
u.feed(line)
|
u.feed(line)
|
||||||
|
@ -39,12 +48,14 @@ def description_of(lines, name="stdin"):
|
||||||
break
|
break
|
||||||
u.close()
|
u.close()
|
||||||
result = u.result
|
result = u.result
|
||||||
|
if minimal:
|
||||||
|
return result["encoding"]
|
||||||
if result["encoding"]:
|
if result["encoding"]:
|
||||||
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
|
return f'{name}: {result["encoding"]} with confidence {result["confidence"]}'
|
||||||
return f"{name}: no result"
|
return f"{name}: no result"
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv: Optional[List[str]] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Handles command line arguments and gets things started.
|
Handles command line arguments and gets things started.
|
||||||
|
|
||||||
|
@ -54,17 +65,28 @@ def main(argv=None):
|
||||||
"""
|
"""
|
||||||
# Get command line arguments
|
# Get command line arguments
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Takes one or more file paths and reports their detected \
|
description=(
|
||||||
encodings"
|
"Takes one or more file paths and reports their detected encodings"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"input",
|
"input",
|
||||||
help="File whose encoding we would like to determine. \
|
help="File whose encoding we would like to determine. (default: stdin)",
|
||||||
(default: stdin)",
|
|
||||||
type=argparse.FileType("rb"),
|
type=argparse.FileType("rb"),
|
||||||
nargs="*",
|
nargs="*",
|
||||||
default=[sys.stdin.buffer],
|
default=[sys.stdin.buffer],
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--minimal",
|
||||||
|
help="Print only the encoding to standard output",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-l",
|
||||||
|
"--legacy",
|
||||||
|
help="Rename legacy encodings to more modern ones.",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--version", action="version", version=f"%(prog)s {__version__}"
|
"--version", action="version", version=f"%(prog)s {__version__}"
|
||||||
)
|
)
|
||||||
|
@ -79,7 +101,11 @@ def main(argv=None):
|
||||||
"--help\n",
|
"--help\n",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
print(description_of(f, f.name))
|
print(
|
||||||
|
description_of(
|
||||||
|
f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
|
|
||||||
|
@ -53,18 +54,19 @@ class CodingStateMachine:
|
||||||
encoding from consideration from here on.
|
encoding from consideration from here on.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, sm):
|
def __init__(self, sm: CodingStateMachineDict) -> None:
|
||||||
self._model = sm
|
self._model = sm
|
||||||
self._curr_byte_pos = 0
|
self._curr_byte_pos = 0
|
||||||
self._curr_char_len = 0
|
self._curr_char_len = 0
|
||||||
self._curr_state = None
|
self._curr_state = MachineState.START
|
||||||
|
self.active = True
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._curr_state = MachineState.START
|
self._curr_state = MachineState.START
|
||||||
|
|
||||||
def next_state(self, c):
|
def next_state(self, c: int) -> int:
|
||||||
# for each byte we get its class
|
# for each byte we get its class
|
||||||
# if it is first byte, we also get byte length
|
# if it is first byte, we also get byte length
|
||||||
byte_class = self._model["class_table"][c]
|
byte_class = self._model["class_table"][c]
|
||||||
|
@ -77,12 +79,12 @@ class CodingStateMachine:
|
||||||
self._curr_byte_pos += 1
|
self._curr_byte_pos += 1
|
||||||
return self._curr_state
|
return self._curr_state
|
||||||
|
|
||||||
def get_current_charlen(self):
|
def get_current_charlen(self) -> int:
|
||||||
return self._curr_char_len
|
return self._curr_char_len
|
||||||
|
|
||||||
def get_coding_state_machine(self):
|
def get_coding_state_machine(self) -> str:
|
||||||
return self._model["name"]
|
return self._model["name"]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return self._model["language"]
|
return self._model["language"]
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
from typing import TYPE_CHECKING, Tuple
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# TypedDict was introduced in Python 3.8.
|
||||||
|
#
|
||||||
|
# TODO: Remove the else block and TYPE_CHECKING check when dropping support
|
||||||
|
# for Python 3.7.
|
||||||
|
from typing import TypedDict
|
||||||
|
|
||||||
|
class CodingStateMachineDict(TypedDict, total=False):
|
||||||
|
class_table: Tuple[int, ...]
|
||||||
|
class_factor: int
|
||||||
|
state_table: Tuple[int, ...]
|
||||||
|
char_len_table: Tuple[int, ...]
|
||||||
|
name: str
|
||||||
|
language: str # Optional key
|
||||||
|
|
||||||
|
else:
|
||||||
|
CodingStateMachineDict = dict
|
|
@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class CP949Prober(MultiByteCharSetProber):
|
class CP949Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
|
||||||
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
# NOTE: CP949 is a superset of EUC-KR, so the distribution should be
|
||||||
|
@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber):
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "CP949"
|
return "CP949"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Korean"
|
return "Korean"
|
||||||
|
|
|
@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package.
|
||||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from enum import Enum, Flag
|
||||||
|
|
||||||
|
|
||||||
class InputState:
|
class InputState:
|
||||||
"""
|
"""
|
||||||
|
@ -15,12 +17,13 @@ class InputState:
|
||||||
HIGH_BYTE = 2
|
HIGH_BYTE = 2
|
||||||
|
|
||||||
|
|
||||||
class LanguageFilter:
|
class LanguageFilter(Flag):
|
||||||
"""
|
"""
|
||||||
This enum represents the different language filters we can apply to a
|
This enum represents the different language filters we can apply to a
|
||||||
``UniversalDetector``.
|
``UniversalDetector``.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
NONE = 0x00
|
||||||
CHINESE_SIMPLIFIED = 0x01
|
CHINESE_SIMPLIFIED = 0x01
|
||||||
CHINESE_TRADITIONAL = 0x02
|
CHINESE_TRADITIONAL = 0x02
|
||||||
JAPANESE = 0x04
|
JAPANESE = 0x04
|
||||||
|
@ -31,7 +34,7 @@ class LanguageFilter:
|
||||||
CJK = CHINESE | JAPANESE | KOREAN
|
CJK = CHINESE | JAPANESE | KOREAN
|
||||||
|
|
||||||
|
|
||||||
class ProbingState:
|
class ProbingState(Enum):
|
||||||
"""
|
"""
|
||||||
This enum represents the different states a prober can be in.
|
This enum represents the different states a prober can be in.
|
||||||
"""
|
"""
|
||||||
|
@ -62,7 +65,7 @@ class SequenceLikelihood:
|
||||||
POSITIVE = 3
|
POSITIVE = 3
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_num_categories(cls):
|
def get_num_categories(cls) -> int:
|
||||||
""":returns: The number of likelihood categories in the enum."""
|
""":returns: The number of likelihood categories in the enum."""
|
||||||
return 4
|
return 4
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .enums import LanguageFilter, MachineState, ProbingState
|
from .enums import LanguageFilter, MachineState, ProbingState
|
||||||
|
@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber):
|
||||||
identify these encodings.
|
identify these encodings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super().__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.coding_sm = []
|
self.coding_sm = []
|
||||||
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
|
||||||
|
@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber):
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
|
||||||
if self.lang_filter & LanguageFilter.KOREAN:
|
if self.lang_filter & LanguageFilter.KOREAN:
|
||||||
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
|
||||||
self.active_sm_count = None
|
self.active_sm_count = 0
|
||||||
self._detected_charset = None
|
self._detected_charset: Optional[str] = None
|
||||||
self._detected_language = None
|
self._detected_language: Optional[str] = None
|
||||||
self._state = None
|
self._state = ProbingState.DETECTING
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
for coding_sm in self.coding_sm:
|
for coding_sm in self.coding_sm:
|
||||||
if not coding_sm:
|
|
||||||
continue
|
|
||||||
coding_sm.active = True
|
coding_sm.active = True
|
||||||
coding_sm.reset()
|
coding_sm.reset()
|
||||||
self.active_sm_count = len(self.coding_sm)
|
self.active_sm_count = len(self.coding_sm)
|
||||||
|
@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber):
|
||||||
self._detected_language = None
|
self._detected_language = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
return self._detected_charset
|
return self._detected_charset
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
return self._detected_language
|
return self._detected_language
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
return 0.99 if self._detected_charset else 0.00
|
return 0.99 if self._detected_charset else 0.00
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
for coding_sm in self.coding_sm:
|
for coding_sm in self.coding_sm:
|
||||||
if not coding_sm or not coding_sm.active:
|
if not coding_sm.active:
|
||||||
continue
|
continue
|
||||||
coding_state = coding_sm.next_state(c)
|
coding_state = coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR
|
||||||
|
|
||||||
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
HZ_SM_MODEL = {
|
HZ_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": HZ_CLS,
|
"class_table": HZ_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": HZ_ST,
|
"state_table": HZ_ST,
|
||||||
|
@ -134,7 +135,7 @@ ISO2022CN_ST = (
|
||||||
|
|
||||||
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022CN_SM_MODEL = {
|
ISO2022CN_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": ISO2022CN_CLS,
|
"class_table": ISO2022CN_CLS,
|
||||||
"class_factor": 9,
|
"class_factor": 9,
|
||||||
"state_table": ISO2022CN_ST,
|
"state_table": ISO2022CN_ST,
|
||||||
|
@ -194,7 +195,7 @@ ISO2022JP_ST = (
|
||||||
|
|
||||||
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022JP_SM_MODEL = {
|
ISO2022JP_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": ISO2022JP_CLS,
|
"class_table": ISO2022JP_CLS,
|
||||||
"class_factor": 10,
|
"class_factor": 10,
|
||||||
"state_table": ISO2022JP_ST,
|
"state_table": ISO2022JP_ST,
|
||||||
|
@ -250,7 +251,7 @@ ISO2022KR_ST = (
|
||||||
|
|
||||||
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0)
|
||||||
|
|
||||||
ISO2022KR_SM_MODEL = {
|
ISO2022KR_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": ISO2022KR_CLS,
|
"class_table": ISO2022KR_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": ISO2022KR_ST,
|
"state_table": ISO2022KR_ST,
|
||||||
|
|
|
@ -25,6 +25,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from .chardistribution import EUCJPDistributionAnalysis
|
from .chardistribution import EUCJPDistributionAnalysis
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .enums import MachineState, ProbingState
|
from .enums import MachineState, ProbingState
|
||||||
|
@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCJPProber(MultiByteCharSetProber):
|
class EUCJPProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
self.distribution_analyzer = EUCJPDistributionAnalysis()
|
||||||
self.context_analyzer = EUCJPContextAnalysis()
|
self.context_analyzer = EUCJPContextAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
self.context_analyzer.reset()
|
self.context_analyzer.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-JP"
|
return "EUC-JP"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Japanese"
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
|
assert self.coding_sm is not None
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
for i, byte in enumerate(byte_str):
|
for i, byte in enumerate(byte_str):
|
||||||
# PY3K: byte_str is a byte array, so byte is an int, not a byte
|
# PY3K: byte_str is a byte array, so byte is an int, not a byte
|
||||||
coding_state = self.coding_sm.next_state(byte)
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
|
@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
context_conf = self.context_analyzer.get_confidence()
|
context_conf = self.context_analyzer.get_confidence()
|
||||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||||
return max(context_conf, distrib_conf)
|
return max(context_conf, distrib_conf)
|
||||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCKRProber(MultiByteCharSetProber):
|
class EUCKRProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
self.distribution_analyzer = EUCKRDistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-KR"
|
return "EUC-KR"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Korean"
|
return "Korean"
|
||||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class EUCTWProber(MultiByteCharSetProber):
|
class EUCTWProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
|
||||||
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
self.distribution_analyzer = EUCTWDistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "EUC-TW"
|
return "EUC-TW"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Taiwan"
|
return "Taiwan"
|
||||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class GB2312Prober(MultiByteCharSetProber):
|
class GB2312Prober(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
|
self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
|
||||||
self.distribution_analyzer = GB2312DistributionAnalysis()
|
self.distribution_analyzer = GB2312DistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "GB2312"
|
return "GB2312"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Chinese"
|
return "Chinese"
|
||||||
|
|
|
@ -25,8 +25,11 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
from .sbcharsetprober import SingleByteCharSetProber
|
||||||
|
|
||||||
# This prober doesn't actually recognize a language or a charset.
|
# This prober doesn't actually recognize a language or a charset.
|
||||||
# It is a helper prober for the use of the Hebrew model probers
|
# It is a helper prober for the use of the Hebrew model probers
|
||||||
|
@ -127,6 +130,7 @@ from .enums import ProbingState
|
||||||
|
|
||||||
|
|
||||||
class HebrewProber(CharSetProber):
|
class HebrewProber(CharSetProber):
|
||||||
|
SPACE = 0x20
|
||||||
# windows-1255 / ISO-8859-8 code points of interest
|
# windows-1255 / ISO-8859-8 code points of interest
|
||||||
FINAL_KAF = 0xEA
|
FINAL_KAF = 0xEA
|
||||||
NORMAL_KAF = 0xEB
|
NORMAL_KAF = 0xEB
|
||||||
|
@ -152,31 +156,35 @@ class HebrewProber(CharSetProber):
|
||||||
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
VISUAL_HEBREW_NAME = "ISO-8859-8"
|
||||||
LOGICAL_HEBREW_NAME = "windows-1255"
|
LOGICAL_HEBREW_NAME = "windows-1255"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._final_char_logical_score = None
|
self._final_char_logical_score = 0
|
||||||
self._final_char_visual_score = None
|
self._final_char_visual_score = 0
|
||||||
self._prev = None
|
self._prev = self.SPACE
|
||||||
self._before_prev = None
|
self._before_prev = self.SPACE
|
||||||
self._logical_prober = None
|
self._logical_prober: Optional[SingleByteCharSetProber] = None
|
||||||
self._visual_prober = None
|
self._visual_prober: Optional[SingleByteCharSetProber] = None
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._final_char_logical_score = 0
|
self._final_char_logical_score = 0
|
||||||
self._final_char_visual_score = 0
|
self._final_char_visual_score = 0
|
||||||
# The two last characters seen in the previous buffer,
|
# The two last characters seen in the previous buffer,
|
||||||
# mPrev and mBeforePrev are initialized to space in order to simulate
|
# mPrev and mBeforePrev are initialized to space in order to simulate
|
||||||
# a word delimiter at the beginning of the data
|
# a word delimiter at the beginning of the data
|
||||||
self._prev = " "
|
self._prev = self.SPACE
|
||||||
self._before_prev = " "
|
self._before_prev = self.SPACE
|
||||||
# These probers are owned by the group prober.
|
# These probers are owned by the group prober.
|
||||||
|
|
||||||
def set_model_probers(self, logical_prober, visual_prober):
|
def set_model_probers(
|
||||||
|
self,
|
||||||
|
logical_prober: SingleByteCharSetProber,
|
||||||
|
visual_prober: SingleByteCharSetProber,
|
||||||
|
) -> None:
|
||||||
self._logical_prober = logical_prober
|
self._logical_prober = logical_prober
|
||||||
self._visual_prober = visual_prober
|
self._visual_prober = visual_prober
|
||||||
|
|
||||||
def is_final(self, c):
|
def is_final(self, c: int) -> bool:
|
||||||
return c in [
|
return c in [
|
||||||
self.FINAL_KAF,
|
self.FINAL_KAF,
|
||||||
self.FINAL_MEM,
|
self.FINAL_MEM,
|
||||||
|
@ -185,7 +193,7 @@ class HebrewProber(CharSetProber):
|
||||||
self.FINAL_TSADI,
|
self.FINAL_TSADI,
|
||||||
]
|
]
|
||||||
|
|
||||||
def is_non_final(self, c):
|
def is_non_final(self, c: int) -> bool:
|
||||||
# The normal Tsadi is not a good Non-Final letter due to words like
|
# The normal Tsadi is not a good Non-Final letter due to words like
|
||||||
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
# 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||||
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
# apostrophe is converted to a space in FilterWithoutEnglishLetters
|
||||||
|
@ -198,7 +206,7 @@ class HebrewProber(CharSetProber):
|
||||||
# since these words are quite rare.
|
# since these words are quite rare.
|
||||||
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
|
return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
# Final letter analysis for logical-visual decision.
|
# Final letter analysis for logical-visual decision.
|
||||||
# Look for evidence that the received buffer is either logical Hebrew
|
# Look for evidence that the received buffer is either logical Hebrew
|
||||||
# or visual Hebrew.
|
# or visual Hebrew.
|
||||||
|
@ -232,9 +240,9 @@ class HebrewProber(CharSetProber):
|
||||||
byte_str = self.filter_high_byte_only(byte_str)
|
byte_str = self.filter_high_byte_only(byte_str)
|
||||||
|
|
||||||
for cur in byte_str:
|
for cur in byte_str:
|
||||||
if cur == " ":
|
if cur == self.SPACE:
|
||||||
# We stand on a space - a word just ended
|
# We stand on a space - a word just ended
|
||||||
if self._before_prev != " ":
|
if self._before_prev != self.SPACE:
|
||||||
# next-to-last char was not a space so self._prev is not a
|
# next-to-last char was not a space so self._prev is not a
|
||||||
# 1 letter word
|
# 1 letter word
|
||||||
if self.is_final(self._prev):
|
if self.is_final(self._prev):
|
||||||
|
@ -247,9 +255,9 @@ class HebrewProber(CharSetProber):
|
||||||
else:
|
else:
|
||||||
# Not standing on a space
|
# Not standing on a space
|
||||||
if (
|
if (
|
||||||
(self._before_prev == " ")
|
(self._before_prev == self.SPACE)
|
||||||
and (self.is_final(self._prev))
|
and (self.is_final(self._prev))
|
||||||
and (cur != " ")
|
and (cur != self.SPACE)
|
||||||
):
|
):
|
||||||
# case (3) [-2:space][-1:final letter][cur:not space]
|
# case (3) [-2:space][-1:final letter][cur:not space]
|
||||||
self._final_char_visual_score += 1
|
self._final_char_visual_score += 1
|
||||||
|
@ -261,7 +269,10 @@ class HebrewProber(CharSetProber):
|
||||||
return ProbingState.DETECTING
|
return ProbingState.DETECTING
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
|
assert self._logical_prober is not None
|
||||||
|
assert self._visual_prober is not None
|
||||||
|
|
||||||
# Make the decision: is it Logical or Visual?
|
# Make the decision: is it Logical or Visual?
|
||||||
# If the final letter score distance is dominant enough, rely on it.
|
# If the final letter score distance is dominant enough, rely on it.
|
||||||
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
finalsub = self._final_char_logical_score - self._final_char_visual_score
|
||||||
|
@ -289,11 +300,14 @@ class HebrewProber(CharSetProber):
|
||||||
return self.LOGICAL_HEBREW_NAME
|
return self.LOGICAL_HEBREW_NAME
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Hebrew"
|
return "Hebrew"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self) -> ProbingState:
|
||||||
|
assert self._logical_prober is not None
|
||||||
|
assert self._visual_prober is not None
|
||||||
|
|
||||||
# Remain active as long as any of the model probers are active.
|
# Remain active as long as any of the model probers are active.
|
||||||
if (self._logical_prober.state == ProbingState.NOT_ME) and (
|
if (self._logical_prober.state == ProbingState.NOT_ME) and (
|
||||||
self._visual_prober.state == ProbingState.NOT_ME
|
self._visual_prober.state == ProbingState.NOT_ME
|
||||||
|
|
|
@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class JOHABProber(MultiByteCharSetProber):
|
class JOHABProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
|
self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL)
|
||||||
self.distribution_analyzer = JOHABDistributionAnalysis()
|
self.distribution_analyzer = JOHABDistributionAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "Johab"
|
return "Johab"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Korean"
|
return "Korean"
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Tuple, Union
|
||||||
|
|
||||||
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
@ -123,15 +124,15 @@ class JapaneseContextAnalysis:
|
||||||
MAX_REL_THRESHOLD = 1000
|
MAX_REL_THRESHOLD = 1000
|
||||||
MINIMUM_DATA_THRESHOLD = 4
|
MINIMUM_DATA_THRESHOLD = 4
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self._total_rel = None
|
self._total_rel = 0
|
||||||
self._rel_sample = None
|
self._rel_sample: List[int] = []
|
||||||
self._need_to_skip_char_num = None
|
self._need_to_skip_char_num = 0
|
||||||
self._last_char_order = None
|
self._last_char_order = -1
|
||||||
self._done = None
|
self._done = False
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._total_rel = 0 # total sequence received
|
self._total_rel = 0 # total sequence received
|
||||||
# category counters, each integer counts sequence in its category
|
# category counters, each integer counts sequence in its category
|
||||||
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
self._rel_sample = [0] * self.NUM_OF_CATEGORY
|
||||||
|
@ -143,7 +144,7 @@ class JapaneseContextAnalysis:
|
||||||
# been made
|
# been made
|
||||||
self._done = False
|
self._done = False
|
||||||
|
|
||||||
def feed(self, byte_str, num_bytes):
|
def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None:
|
||||||
if self._done:
|
if self._done:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -172,29 +173,29 @@ class JapaneseContextAnalysis:
|
||||||
] += 1
|
] += 1
|
||||||
self._last_char_order = order
|
self._last_char_order = order
|
||||||
|
|
||||||
def got_enough_data(self):
|
def got_enough_data(self) -> bool:
|
||||||
return self._total_rel > self.ENOUGH_REL_THRESHOLD
|
return self._total_rel > self.ENOUGH_REL_THRESHOLD
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
# This is just one way to calculate confidence. It works well for me.
|
# This is just one way to calculate confidence. It works well for me.
|
||||||
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
|
if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
|
||||||
return (self._total_rel - self._rel_sample[0]) / self._total_rel
|
return (self._total_rel - self._rel_sample[0]) / self._total_rel
|
||||||
return self.DONT_KNOW
|
return self.DONT_KNOW
|
||||||
|
|
||||||
def get_order(self, _):
|
def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
|
|
||||||
|
|
||||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._charset_name = "SHIFT_JIS"
|
self._charset_name = "SHIFT_JIS"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return self._charset_name
|
return self._charset_name
|
||||||
|
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
|
@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||||
|
|
||||||
|
|
||||||
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
class EUCJPContextAnalysis(JapaneseContextAnalysis):
|
||||||
def get_order(self, byte_str):
|
def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]:
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return -1, 1
|
return -1, 1
|
||||||
# find out current char's byte length
|
# find out current char's byte length
|
||||||
|
|
|
@ -26,6 +26,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
@ -96,26 +98,26 @@ Latin1ClassModel = (
|
||||||
|
|
||||||
|
|
||||||
class Latin1Prober(CharSetProber):
|
class Latin1Prober(CharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._last_char_class = None
|
self._last_char_class = OTH
|
||||||
self._freq_counter = None
|
self._freq_counter: List[int] = []
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
self._last_char_class = OTH
|
self._last_char_class = OTH
|
||||||
self._freq_counter = [0] * FREQ_CAT_NUM
|
self._freq_counter = [0] * FREQ_CAT_NUM
|
||||||
super().reset()
|
super().reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "ISO-8859-1"
|
return "ISO-8859-1"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
byte_str = self.remove_xml_tags(byte_str)
|
byte_str = self.remove_xml_tags(byte_str)
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
char_class = Latin1_CharToClass[c]
|
char_class = Latin1_CharToClass[c]
|
||||||
|
@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
if self.state == ProbingState.NOT_ME:
|
if self.state == ProbingState.NOT_ME:
|
||||||
return 0.01
|
return 0.01
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,162 @@
|
||||||
|
######################## BEGIN LICENSE BLOCK ########################
|
||||||
|
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
|
||||||
|
# The Original Code is Mozilla Universal charset detector code.
|
||||||
|
#
|
||||||
|
# The Initial Developer of the Original Code is
|
||||||
|
# Netscape Communications Corporation.
|
||||||
|
# Portions created by the Initial Developer are Copyright (C) 2001
|
||||||
|
# the Initial Developer. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Contributor(s):
|
||||||
|
# Rob Speer - adapt to MacRoman encoding
|
||||||
|
# Mark Pilgrim - port to Python
|
||||||
|
# Shy Shalom - original C code
|
||||||
|
#
|
||||||
|
# This library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with this library; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
|
# 02110-1301 USA
|
||||||
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
# Number of likelihood buckets the prober tallies. Bucket indices are the
# values stored in MacRomanClassModel (0 = illegal ... 3 = very likely).
FREQ_CAT_NUM = 4

# Character classes. Every byte value is mapped to one of these by
# MacRoman_CharToClass, and pairs of consecutive classes index into
# MacRomanClassModel.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes

# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable.  This should let MacRoman get out of the way of more likely
# encodings in most situations.

# fmt: off
# Byte value (0x00-0xFF) -> character class; one row per eight byte values.
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)

# Likelihood of seeing one character class directly after another, stored as
# a flattened CLASS_NUM x CLASS_NUM matrix: the row is the previous class,
# the column is the current class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
|
class MacRomanProber(CharSetProber):
    """Prober that estimates how plausibly a byte stream is MacRoman text.

    Each byte is mapped to a character class via ``MacRoman_CharToClass``;
    the (previous class, current class) pair is then looked up in
    ``MacRomanClassModel`` to obtain a likelihood bucket (0-3).  The prober
    counts how often each bucket is hit and derives its confidence from
    those counts.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Restore the initial state so a new document can be fed."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM

        # express the prior that MacRoman is a somewhat rare encoding;
        # this can be done by starting out in a slightly improbable state
        # that must be overcome (the seeded "normal" bucket inflates the
        # total that get_confidence() divides by)
        self._freq_counter[2] = 10

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the encoding this prober detects."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language reported for this prober; always the empty string."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Update the likelihood counters with another chunk of input.

        :param byte_str: The bytes to examine.
        :returns: The prober state after consuming the chunk.  The state is
            set to ``ProbingState.NOT_ME`` as soon as a class transition
            marked illegal (0) in ``MacRomanClassModel`` is seen.
        """
        # NOTE(review): remove_xml_tags is inherited from CharSetProber;
        # presumably it strips markup before analysis -- confirm there.
        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = MacRoman_CharToClass[c]
            # Row = previous character class, column = current one.
            freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the input seen so far is MacRoman.

        :returns: ``0.01`` once the prober has ruled itself out; otherwise
            ``(very_likely - 20 * very_unlikely) / total``, clamped at zero
            and scaled by ``0.73``.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        # The guard avoids dividing by zero when nothing has been counted.
        confidence = (
            0.0
            if total < 0.01
            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
        )
        confidence = max(confidence, 0.0)
        # lower the confidence of MacRoman so that other, more accurate
        # detectors can take priority.
        confidence *= 0.73
        return confidence
|
|
@ -27,8 +27,12 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from .chardistribution import CharDistributionAnalysis
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import MachineState, ProbingState
|
from .codingstatemachine import CodingStateMachine
|
||||||
|
from .enums import LanguageFilter, MachineState, ProbingState
|
||||||
|
|
||||||
|
|
||||||
class MultiByteCharSetProber(CharSetProber):
|
class MultiByteCharSetProber(CharSetProber):
|
||||||
|
@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber):
|
||||||
MultiByteCharSetProber
|
MultiByteCharSetProber
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super().__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.distribution_analyzer = None
|
self.distribution_analyzer: Optional[CharDistributionAnalysis] = None
|
||||||
self.coding_sm = None
|
self.coding_sm: Optional[CodingStateMachine] = None
|
||||||
self._last_char = [0, 0]
|
self._last_char = bytearray(b"\0\0")
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
if self.coding_sm:
|
if self.coding_sm:
|
||||||
self.coding_sm.reset()
|
self.coding_sm.reset()
|
||||||
if self.distribution_analyzer:
|
if self.distribution_analyzer:
|
||||||
self.distribution_analyzer.reset()
|
self.distribution_analyzer.reset()
|
||||||
self._last_char = [0, 0]
|
self._last_char = bytearray(b"\0\0")
|
||||||
|
|
||||||
@property
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
def charset_name(self):
|
assert self.coding_sm is not None
|
||||||
raise NotImplementedError
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
@property
|
|
||||||
def language(self):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def feed(self, byte_str):
|
|
||||||
for i, byte in enumerate(byte_str):
|
for i, byte in enumerate(byte_str):
|
||||||
coding_state = self.coding_sm.next_state(byte)
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
|
@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
return self.distribution_analyzer.get_confidence()
|
return self.distribution_analyzer.get_confidence()
|
||||||
|
|
|
@ -30,6 +30,7 @@
|
||||||
from .big5prober import Big5Prober
|
from .big5prober import Big5Prober
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from .cp949prober import CP949Prober
|
from .cp949prober import CP949Prober
|
||||||
|
from .enums import LanguageFilter
|
||||||
from .eucjpprober import EUCJPProber
|
from .eucjpprober import EUCJPProber
|
||||||
from .euckrprober import EUCKRProber
|
from .euckrprober import EUCKRProber
|
||||||
from .euctwprober import EUCTWProber
|
from .euctwprober import EUCTWProber
|
||||||
|
@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober
|
||||||
|
|
||||||
|
|
||||||
class MBCSGroupProber(CharSetGroupProber):
|
class MBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self, lang_filter=None):
|
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
|
||||||
super().__init__(lang_filter=lang_filter)
|
super().__init__(lang_filter=lang_filter)
|
||||||
self.probers = [
|
self.probers = [
|
||||||
UTF8Prober(),
|
UTF8Prober(),
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from .codingstatemachinedict import CodingStateMachineDict
|
||||||
from .enums import MachineState
|
from .enums import MachineState
|
||||||
|
|
||||||
# BIG5
|
# BIG5
|
||||||
|
@ -74,7 +75,7 @@ BIG5_ST = (
|
||||||
|
|
||||||
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)
|
||||||
|
|
||||||
BIG5_SM_MODEL = {
|
BIG5_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": BIG5_CLS,
|
"class_table": BIG5_CLS,
|
||||||
"class_factor": 5,
|
"class_factor": 5,
|
||||||
"state_table": BIG5_ST,
|
"state_table": BIG5_ST,
|
||||||
|
@ -117,7 +118,7 @@ CP949_ST = (
|
||||||
|
|
||||||
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
|
||||||
|
|
||||||
CP949_SM_MODEL = {
|
CP949_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": CP949_CLS,
|
"class_table": CP949_CLS,
|
||||||
"class_factor": 10,
|
"class_factor": 10,
|
||||||
"state_table": CP949_ST,
|
"state_table": CP949_ST,
|
||||||
|
@ -173,7 +174,7 @@ EUCJP_ST = (
|
||||||
|
|
||||||
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)
|
||||||
|
|
||||||
EUCJP_SM_MODEL = {
|
EUCJP_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": EUCJP_CLS,
|
"class_table": EUCJP_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": EUCJP_ST,
|
"state_table": EUCJP_ST,
|
||||||
|
@ -226,7 +227,7 @@ EUCKR_ST = (
|
||||||
|
|
||||||
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)
|
||||||
|
|
||||||
EUCKR_SM_MODEL = {
|
EUCKR_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": EUCKR_CLS,
|
"class_table": EUCKR_CLS,
|
||||||
"class_factor": 4,
|
"class_factor": 4,
|
||||||
"state_table": EUCKR_ST,
|
"state_table": EUCKR_ST,
|
||||||
|
@ -283,7 +284,7 @@ JOHAB_ST = (
|
||||||
|
|
||||||
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
|
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)
|
||||||
|
|
||||||
JOHAB_SM_MODEL = {
|
JOHAB_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": JOHAB_CLS,
|
"class_table": JOHAB_CLS,
|
||||||
"class_factor": 10,
|
"class_factor": 10,
|
||||||
"state_table": JOHAB_ST,
|
"state_table": JOHAB_ST,
|
||||||
|
@ -340,7 +341,7 @@ EUCTW_ST = (
|
||||||
|
|
||||||
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3)
|
||||||
|
|
||||||
EUCTW_SM_MODEL = {
|
EUCTW_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": EUCTW_CLS,
|
"class_table": EUCTW_CLS,
|
||||||
"class_factor": 7,
|
"class_factor": 7,
|
||||||
"state_table": EUCTW_ST,
|
"state_table": EUCTW_ST,
|
||||||
|
@ -402,7 +403,7 @@ GB2312_ST = (
|
||||||
# 2 here.
|
# 2 here.
|
||||||
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2)
|
||||||
|
|
||||||
GB2312_SM_MODEL = {
|
GB2312_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": GB2312_CLS,
|
"class_table": GB2312_CLS,
|
||||||
"class_factor": 7,
|
"class_factor": 7,
|
||||||
"state_table": GB2312_ST,
|
"state_table": GB2312_ST,
|
||||||
|
@ -458,7 +459,7 @@ SJIS_ST = (
|
||||||
|
|
||||||
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)
|
||||||
|
|
||||||
SJIS_SM_MODEL = {
|
SJIS_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": SJIS_CLS,
|
"class_table": SJIS_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": SJIS_ST,
|
"state_table": SJIS_ST,
|
||||||
|
@ -516,7 +517,7 @@ UCS2BE_ST = (
|
||||||
|
|
||||||
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)
|
||||||
|
|
||||||
UCS2BE_SM_MODEL = {
|
UCS2BE_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": UCS2BE_CLS,
|
"class_table": UCS2BE_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": UCS2BE_ST,
|
"state_table": UCS2BE_ST,
|
||||||
|
@ -574,7 +575,7 @@ UCS2LE_ST = (
|
||||||
|
|
||||||
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)
|
||||||
|
|
||||||
UCS2LE_SM_MODEL = {
|
UCS2LE_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": UCS2LE_CLS,
|
"class_table": UCS2LE_CLS,
|
||||||
"class_factor": 6,
|
"class_factor": 6,
|
||||||
"state_table": UCS2LE_ST,
|
"state_table": UCS2LE_ST,
|
||||||
|
@ -651,7 +652,7 @@ UTF8_ST = (
|
||||||
|
|
||||||
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6)
|
||||||
|
|
||||||
UTF8_SM_MODEL = {
|
UTF8_SM_MODEL: CodingStateMachineDict = {
|
||||||
"class_table": UTF8_CLS,
|
"class_table": UTF8_CLS,
|
||||||
"class_factor": 16,
|
"class_factor": 16,
|
||||||
"state_table": UTF8_ST,
|
"state_table": UTF8_ST,
|
||||||
|
|
|
@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from string import ascii_letters
|
from string import ascii_letters
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
# TODO: Add Ukrainian (KOI8-U)
|
# TODO: Add Ukrainian (KOI8-U)
|
||||||
|
|
||||||
|
@ -33,13 +34,13 @@ class Language:
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
name=None,
|
name: Optional[str] = None,
|
||||||
iso_code=None,
|
iso_code: Optional[str] = None,
|
||||||
use_ascii=True,
|
use_ascii: bool = True,
|
||||||
charsets=None,
|
charsets: Optional[List[str]] = None,
|
||||||
alphabet=None,
|
alphabet: Optional[str] = None,
|
||||||
wiki_start_pages=None,
|
wiki_start_pages: Optional[List[str]] = None,
|
||||||
):
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
self.iso_code = iso_code
|
self.iso_code = iso_code
|
||||||
|
@ -55,7 +56,7 @@ class Language:
|
||||||
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
|
||||||
self.wiki_start_pages = wiki_start_pages
|
self.wiki_start_pages = wiki_start_pages
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
param_str = ", ".join(
|
param_str = ", ".join(
|
||||||
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
||||||
)
|
)
|
||||||
|
@ -103,7 +104,7 @@ LANGUAGES = {
|
||||||
name="Danish",
|
name="Danish",
|
||||||
iso_code="da",
|
iso_code="da",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="æøåÆØÅ",
|
alphabet="æøåÆØÅ",
|
||||||
wiki_start_pages=["Forside"],
|
wiki_start_pages=["Forside"],
|
||||||
),
|
),
|
||||||
|
@ -111,8 +112,8 @@ LANGUAGES = {
|
||||||
name="German",
|
name="German",
|
||||||
iso_code="de",
|
iso_code="de",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="äöüßÄÖÜ",
|
alphabet="äöüßẞÄÖÜ",
|
||||||
wiki_start_pages=["Wikipedia:Hauptseite"],
|
wiki_start_pages=["Wikipedia:Hauptseite"],
|
||||||
),
|
),
|
||||||
"Greek": Language(
|
"Greek": Language(
|
||||||
|
@ -127,7 +128,7 @@ LANGUAGES = {
|
||||||
name="English",
|
name="English",
|
||||||
iso_code="en",
|
iso_code="en",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||||
wiki_start_pages=["Main_Page"],
|
wiki_start_pages=["Main_Page"],
|
||||||
),
|
),
|
||||||
"Esperanto": Language(
|
"Esperanto": Language(
|
||||||
|
@ -143,7 +144,7 @@ LANGUAGES = {
|
||||||
name="Spanish",
|
name="Spanish",
|
||||||
iso_code="es",
|
iso_code="es",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
|
||||||
wiki_start_pages=["Wikipedia:Portada"],
|
wiki_start_pages=["Wikipedia:Portada"],
|
||||||
),
|
),
|
||||||
|
@ -161,7 +162,7 @@ LANGUAGES = {
|
||||||
name="Finnish",
|
name="Finnish",
|
||||||
iso_code="fi",
|
iso_code="fi",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="ÅÄÖŠŽåäöšž",
|
alphabet="ÅÄÖŠŽåäöšž",
|
||||||
wiki_start_pages=["Wikipedia:Etusivu"],
|
wiki_start_pages=["Wikipedia:Etusivu"],
|
||||||
),
|
),
|
||||||
|
@ -169,7 +170,7 @@ LANGUAGES = {
|
||||||
name="French",
|
name="French",
|
||||||
iso_code="fr",
|
iso_code="fr",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
|
||||||
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
|
||||||
),
|
),
|
||||||
|
@ -203,7 +204,7 @@ LANGUAGES = {
|
||||||
name="Italian",
|
name="Italian",
|
||||||
iso_code="it",
|
iso_code="it",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
alphabet="ÀÈÉÌÒÓÙàèéìòóù",
|
||||||
wiki_start_pages=["Pagina_principale"],
|
wiki_start_pages=["Pagina_principale"],
|
||||||
),
|
),
|
||||||
|
@ -237,7 +238,7 @@ LANGUAGES = {
|
||||||
name="Dutch",
|
name="Dutch",
|
||||||
iso_code="nl",
|
iso_code="nl",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
|
||||||
wiki_start_pages=["Hoofdpagina"],
|
wiki_start_pages=["Hoofdpagina"],
|
||||||
),
|
),
|
||||||
"Polish": Language(
|
"Polish": Language(
|
||||||
|
@ -253,7 +254,7 @@ LANGUAGES = {
|
||||||
name="Portuguese",
|
name="Portuguese",
|
||||||
iso_code="pt",
|
iso_code="pt",
|
||||||
use_ascii=True,
|
use_ascii=True,
|
||||||
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
|
charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
|
||||||
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
|
||||||
wiki_start_pages=["Wikipédia:Página_principal"],
|
wiki_start_pages=["Wikipédia:Página_principal"],
|
||||||
),
|
),
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
    # TypedDict was introduced in Python 3.8.
    #
    # TODO: Remove the else block and TYPE_CHECKING check when dropping support
    # for Python 3.7.
    from typing import TypedDict

    class ResultDict(TypedDict):
        """Detection result mapping, as seen by static type checkers."""

        encoding: Optional[str]
        confidence: float
        language: Optional[str]

else:
    # At runtime the name is just an alias for the built-in dict, so no
    # TypedDict import is needed on interpreters that lack it.
    ResultDict = dict
|
|
@ -26,23 +26,20 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from collections import namedtuple
|
from typing import Dict, List, NamedTuple, Optional, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||||
|
|
||||||
SingleByteCharSetModel = namedtuple(
|
|
||||||
"SingleByteCharSetModel",
|
class SingleByteCharSetModel(NamedTuple):
|
||||||
[
|
charset_name: str
|
||||||
"charset_name",
|
language: str
|
||||||
"language",
|
char_to_order_map: Dict[int, int]
|
||||||
"char_to_order_map",
|
language_model: Dict[int, Dict[int, int]]
|
||||||
"language_model",
|
typical_positive_ratio: float
|
||||||
"typical_positive_ratio",
|
keep_ascii_letters: bool
|
||||||
"keep_ascii_letters",
|
alphabet: str
|
||||||
"alphabet",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class SingleByteCharSetProber(CharSetProber):
|
class SingleByteCharSetProber(CharSetProber):
|
||||||
|
@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
POSITIVE_SHORTCUT_THRESHOLD = 0.95
|
||||||
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
|
||||||
|
|
||||||
def __init__(self, model, is_reversed=False, name_prober=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: SingleByteCharSetModel,
|
||||||
|
is_reversed: bool = False,
|
||||||
|
name_prober: Optional[CharSetProber] = None,
|
||||||
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._model = model
|
self._model = model
|
||||||
# TRUE if we need to reverse every pair in the model lookup
|
# TRUE if we need to reverse every pair in the model lookup
|
||||||
self._reversed = is_reversed
|
self._reversed = is_reversed
|
||||||
# Optional auxiliary prober for name decision
|
# Optional auxiliary prober for name decision
|
||||||
self._name_prober = name_prober
|
self._name_prober = name_prober
|
||||||
self._last_order = None
|
self._last_order = 255
|
||||||
self._seq_counters = None
|
self._seq_counters: List[int] = []
|
||||||
self._total_seqs = None
|
self._total_seqs = 0
|
||||||
self._total_char = None
|
self._total_char = 0
|
||||||
self._control_char = None
|
self._control_char = 0
|
||||||
self._freq_char = None
|
self._freq_char = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
# char order of last character
|
# char order of last character
|
||||||
self._last_order = 255
|
self._last_order = 255
|
||||||
|
@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
self._freq_char = 0
|
self._freq_char = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> Optional[str]:
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.charset_name
|
return self._name_prober.charset_name
|
||||||
return self._model.charset_name
|
return self._model.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> Optional[str]:
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.language
|
return self._name_prober.language
|
||||||
return self._model.language
|
return self._model.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
# TODO: Make filter_international_words keep things in self.alphabet
|
# TODO: Make filter_international_words keep things in self.alphabet
|
||||||
if not self._model.keep_ascii_letters:
|
if not self._model.keep_ascii_letters:
|
||||||
byte_str = self.filter_international_words(byte_str)
|
byte_str = self.filter_international_words(byte_str)
|
||||||
|
@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
r = 0.01
|
r = 0.01
|
||||||
if self._total_seqs > 0:
|
if self._total_seqs > 0:
|
||||||
r = (
|
r = (
|
||||||
|
|
|
@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber
|
||||||
|
|
||||||
|
|
||||||
class SBCSGroupProber(CharSetGroupProber):
|
class SBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
hebrew_prober = HebrewProber()
|
hebrew_prober = HebrewProber()
|
||||||
logical_hebrew_prober = SingleByteCharSetProber(
|
logical_hebrew_prober = SingleByteCharSetProber(
|
||||||
|
|
|
@ -25,6 +25,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from .chardistribution import SJISDistributionAnalysis
|
from .chardistribution import SJISDistributionAnalysis
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .enums import MachineState, ProbingState
|
from .enums import MachineState, ProbingState
|
||||||
|
@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL
|
||||||
|
|
||||||
|
|
||||||
class SJISProber(MultiByteCharSetProber):
|
class SJISProber(MultiByteCharSetProber):
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
|
||||||
self.distribution_analyzer = SJISDistributionAnalysis()
|
self.distribution_analyzer = SJISDistributionAnalysis()
|
||||||
self.context_analyzer = SJISContextAnalysis()
|
self.context_analyzer = SJISContextAnalysis()
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
self.context_analyzer.reset()
|
self.context_analyzer.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return self.context_analyzer.charset_name
|
return self.context_analyzer.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return "Japanese"
|
return "Japanese"
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
|
assert self.coding_sm is not None
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
for i, byte in enumerate(byte_str):
|
for i, byte in enumerate(byte_str):
|
||||||
coding_state = self.coding_sm.next_state(byte)
|
coding_state = self.coding_sm.next_state(byte)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
|
@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
|
assert self.distribution_analyzer is not None
|
||||||
|
|
||||||
context_conf = self.context_analyzer.get_confidence()
|
context_conf = self.context_analyzer.get_confidence()
|
||||||
distrib_conf = self.distribution_analyzer.get_confidence()
|
distrib_conf = self.distribution_analyzer.get_confidence()
|
||||||
return max(context_conf, distrib_conf)
|
return max(context_conf, distrib_conf)
|
||||||
|
|
|
@ -39,12 +39,16 @@ class a user of ``chardet`` should use.
|
||||||
import codecs
|
import codecs
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
|
from .charsetprober import CharSetProber
|
||||||
from .enums import InputState, LanguageFilter, ProbingState
|
from .enums import InputState, LanguageFilter, ProbingState
|
||||||
from .escprober import EscCharSetProber
|
from .escprober import EscCharSetProber
|
||||||
from .latin1prober import Latin1Prober
|
from .latin1prober import Latin1Prober
|
||||||
|
from .macromanprober import MacRomanProber
|
||||||
from .mbcsgroupprober import MBCSGroupProber
|
from .mbcsgroupprober import MBCSGroupProber
|
||||||
|
from .resultdict import ResultDict
|
||||||
from .sbcsgroupprober import SBCSGroupProber
|
from .sbcsgroupprober import SBCSGroupProber
|
||||||
from .utf1632prober import UTF1632Prober
|
from .utf1632prober import UTF1632Prober
|
||||||
|
|
||||||
|
@ -80,34 +84,55 @@ class UniversalDetector:
|
||||||
"iso-8859-9": "Windows-1254",
|
"iso-8859-9": "Windows-1254",
|
||||||
"iso-8859-13": "Windows-1257",
|
"iso-8859-13": "Windows-1257",
|
||||||
}
|
}
|
||||||
|
# Based on https://encoding.spec.whatwg.org/#names-and-labels
|
||||||
|
# but altered to match Python names for encodings and remove mappings
|
||||||
|
# that break tests.
|
||||||
|
LEGACY_MAP = {
|
||||||
|
"ascii": "Windows-1252",
|
||||||
|
"iso-8859-1": "Windows-1252",
|
||||||
|
"tis-620": "ISO-8859-11",
|
||||||
|
"iso-8859-9": "Windows-1254",
|
||||||
|
"gb2312": "GB18030",
|
||||||
|
"euc-kr": "CP949",
|
||||||
|
"utf-16le": "UTF-16",
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, lang_filter=LanguageFilter.ALL):
|
def __init__(
|
||||||
self._esc_charset_prober = None
|
self,
|
||||||
self._utf1632_prober = None
|
lang_filter: LanguageFilter = LanguageFilter.ALL,
|
||||||
self._charset_probers = []
|
should_rename_legacy: bool = False,
|
||||||
self.result = None
|
) -> None:
|
||||||
self.done = None
|
self._esc_charset_prober: Optional[EscCharSetProber] = None
|
||||||
self._got_data = None
|
self._utf1632_prober: Optional[UTF1632Prober] = None
|
||||||
self._input_state = None
|
self._charset_probers: List[CharSetProber] = []
|
||||||
self._last_char = None
|
self.result: ResultDict = {
|
||||||
|
"encoding": None,
|
||||||
|
"confidence": 0.0,
|
||||||
|
"language": None,
|
||||||
|
}
|
||||||
|
self.done = False
|
||||||
|
self._got_data = False
|
||||||
|
self._input_state = InputState.PURE_ASCII
|
||||||
|
self._last_char = b""
|
||||||
self.lang_filter = lang_filter
|
self.lang_filter = lang_filter
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
self._has_win_bytes = None
|
self._has_win_bytes = False
|
||||||
|
self.should_rename_legacy = should_rename_legacy
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def input_state(self):
|
def input_state(self) -> int:
|
||||||
return self._input_state
|
return self._input_state
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def has_win_bytes(self):
|
def has_win_bytes(self) -> bool:
|
||||||
return self._has_win_bytes
|
return self._has_win_bytes
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_probers(self):
|
def charset_probers(self) -> List[CharSetProber]:
|
||||||
return self._charset_probers
|
return self._charset_probers
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
"""
|
"""
|
||||||
Reset the UniversalDetector and all of its probers back to their
|
Reset the UniversalDetector and all of its probers back to their
|
||||||
initial states. This is called by ``__init__``, so you only need to
|
initial states. This is called by ``__init__``, so you only need to
|
||||||
|
@ -126,7 +151,7 @@ class UniversalDetector:
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
prober.reset()
|
prober.reset()
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> None:
|
||||||
"""
|
"""
|
||||||
Takes a chunk of a document and feeds it through all of the relevant
|
Takes a chunk of a document and feeds it through all of the relevant
|
||||||
charset probers.
|
charset probers.
|
||||||
|
@ -166,6 +191,7 @@ class UniversalDetector:
|
||||||
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
|
elif byte_str.startswith(b"\xFE\xFF\x00\x00"):
|
||||||
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
||||||
self.result = {
|
self.result = {
|
||||||
|
# TODO: This encoding is not supported by Python. Should remove?
|
||||||
"encoding": "X-ISO-10646-UCS-4-3412",
|
"encoding": "X-ISO-10646-UCS-4-3412",
|
||||||
"confidence": 1.0,
|
"confidence": 1.0,
|
||||||
"language": "",
|
"language": "",
|
||||||
|
@ -173,6 +199,7 @@ class UniversalDetector:
|
||||||
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
|
elif byte_str.startswith(b"\x00\x00\xFF\xFE"):
|
||||||
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
self.result = {
|
self.result = {
|
||||||
|
# TODO: This encoding is not supported by Python. Should remove?
|
||||||
"encoding": "X-ISO-10646-UCS-4-2143",
|
"encoding": "X-ISO-10646-UCS-4-2143",
|
||||||
"confidence": 1.0,
|
"confidence": 1.0,
|
||||||
"language": "",
|
"language": "",
|
||||||
|
@ -242,6 +269,7 @@ class UniversalDetector:
|
||||||
if self.lang_filter & LanguageFilter.NON_CJK:
|
if self.lang_filter & LanguageFilter.NON_CJK:
|
||||||
self._charset_probers.append(SBCSGroupProber())
|
self._charset_probers.append(SBCSGroupProber())
|
||||||
self._charset_probers.append(Latin1Prober())
|
self._charset_probers.append(Latin1Prober())
|
||||||
|
self._charset_probers.append(MacRomanProber())
|
||||||
for prober in self._charset_probers:
|
for prober in self._charset_probers:
|
||||||
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
if prober.feed(byte_str) == ProbingState.FOUND_IT:
|
||||||
self.result = {
|
self.result = {
|
||||||
|
@ -254,7 +282,7 @@ class UniversalDetector:
|
||||||
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
if self.WIN_BYTE_DETECTOR.search(byte_str):
|
||||||
self._has_win_bytes = True
|
self._has_win_bytes = True
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> ResultDict:
|
||||||
"""
|
"""
|
||||||
Stop analyzing the current document and come up with a final
|
Stop analyzing the current document and come up with a final
|
||||||
prediction.
|
prediction.
|
||||||
|
@ -288,7 +316,8 @@ class UniversalDetector:
|
||||||
max_prober = prober
|
max_prober = prober
|
||||||
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
|
||||||
charset_name = max_prober.charset_name
|
charset_name = max_prober.charset_name
|
||||||
lower_charset_name = max_prober.charset_name.lower()
|
assert charset_name is not None
|
||||||
|
lower_charset_name = charset_name.lower()
|
||||||
confidence = max_prober.get_confidence()
|
confidence = max_prober.get_confidence()
|
||||||
# Use Windows encoding name instead of ISO-8859 if we saw any
|
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||||
# extra Windows-specific bytes
|
# extra Windows-specific bytes
|
||||||
|
@ -297,6 +326,11 @@ class UniversalDetector:
|
||||||
charset_name = self.ISO_WIN_MAP.get(
|
charset_name = self.ISO_WIN_MAP.get(
|
||||||
lower_charset_name, charset_name
|
lower_charset_name, charset_name
|
||||||
)
|
)
|
||||||
|
# Rename legacy encodings with superset encodings if asked
|
||||||
|
if self.should_rename_legacy:
|
||||||
|
charset_name = self.LEGACY_MAP.get(
|
||||||
|
(charset_name or "").lower(), charset_name
|
||||||
|
)
|
||||||
self.result = {
|
self.result = {
|
||||||
"encoding": charset_name,
|
"encoding": charset_name,
|
||||||
"confidence": confidence,
|
"confidence": confidence,
|
||||||
|
|
|
@ -18,6 +18,8 @@
|
||||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import ProbingState
|
from .enums import ProbingState
|
||||||
|
|
||||||
|
@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
|
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
|
||||||
EXPECTED_RATIO = 0.94
|
EXPECTED_RATIO = 0.94
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.position = 0
|
self.position = 0
|
||||||
self.zeros_at_mod = [0] * 4
|
self.zeros_at_mod = [0] * 4
|
||||||
|
@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
self.first_half_surrogate_pair_detected_16le = False
|
self.first_half_surrogate_pair_detected_16le = False
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
self.position = 0
|
self.position = 0
|
||||||
self.zeros_at_mod = [0] * 4
|
self.zeros_at_mod = [0] * 4
|
||||||
|
@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
self.quad = [0, 0, 0, 0]
|
self.quad = [0, 0, 0, 0]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
if self.is_likely_utf32be():
|
if self.is_likely_utf32be():
|
||||||
return "utf-32be"
|
return "utf-32be"
|
||||||
if self.is_likely_utf32le():
|
if self.is_likely_utf32le():
|
||||||
|
@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber):
|
||||||
return "utf-16"
|
return "utf-16"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def approx_32bit_chars(self):
|
def approx_32bit_chars(self) -> float:
|
||||||
return max(1.0, self.position / 4.0)
|
return max(1.0, self.position / 4.0)
|
||||||
|
|
||||||
def approx_16bit_chars(self):
|
def approx_16bit_chars(self) -> float:
|
||||||
return max(1.0, self.position / 2.0)
|
return max(1.0, self.position / 2.0)
|
||||||
|
|
||||||
def is_likely_utf32be(self):
|
def is_likely_utf32be(self) -> bool:
|
||||||
approx_chars = self.approx_32bit_chars()
|
approx_chars = self.approx_32bit_chars()
|
||||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
and not self.invalid_utf32be
|
and not self.invalid_utf32be
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_likely_utf32le(self):
|
def is_likely_utf32le(self) -> bool:
|
||||||
approx_chars = self.approx_32bit_chars()
|
approx_chars = self.approx_32bit_chars()
|
||||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
|
||||||
|
@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
and not self.invalid_utf32le
|
and not self.invalid_utf32le
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_likely_utf16be(self):
|
def is_likely_utf16be(self) -> bool:
|
||||||
approx_chars = self.approx_16bit_chars()
|
approx_chars = self.approx_16bit_chars()
|
||||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
|
(self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
|
||||||
|
@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
and not self.invalid_utf16be
|
and not self.invalid_utf16be
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_likely_utf16le(self):
|
def is_likely_utf16le(self) -> bool:
|
||||||
approx_chars = self.approx_16bit_chars()
|
approx_chars = self.approx_16bit_chars()
|
||||||
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
|
||||||
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
|
(self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
|
||||||
|
@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
and not self.invalid_utf16le
|
and not self.invalid_utf16le
|
||||||
)
|
)
|
||||||
|
|
||||||
def validate_utf32_characters(self, quad):
|
def validate_utf32_characters(self, quad: List[int]) -> None:
|
||||||
"""
|
"""
|
||||||
Validate if the quad of bytes is valid UTF-32.
|
Validate if the quad of bytes is valid UTF-32.
|
||||||
|
|
||||||
|
@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
):
|
):
|
||||||
self.invalid_utf32le = True
|
self.invalid_utf32le = True
|
||||||
|
|
||||||
def validate_utf16_characters(self, pair):
|
def validate_utf16_characters(self, pair: List[int]) -> None:
|
||||||
"""
|
"""
|
||||||
Validate if the pair of bytes is valid UTF-16.
|
Validate if the pair of bytes is valid UTF-16.
|
||||||
|
|
||||||
|
@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
else:
|
else:
|
||||||
self.invalid_utf16le = True
|
self.invalid_utf16le = True
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
mod4 = self.position % 4
|
mod4 = self.position % 4
|
||||||
self.quad[mod4] = c
|
self.quad[mod4] = c
|
||||||
|
@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def state(self):
|
def state(self) -> ProbingState:
|
||||||
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
|
if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
|
||||||
# terminal, decided states
|
# terminal, decided states
|
||||||
return self._state
|
return self._state
|
||||||
|
@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber):
|
||||||
self._state = ProbingState.NOT_ME
|
self._state = ProbingState.NOT_ME
|
||||||
return self._state
|
return self._state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
return (
|
return (
|
||||||
0.85
|
0.85
|
||||||
if (
|
if (
|
||||||
|
|
|
@ -25,6 +25,8 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .codingstatemachine import CodingStateMachine
|
from .codingstatemachine import CodingStateMachine
|
||||||
from .enums import MachineState, ProbingState
|
from .enums import MachineState, ProbingState
|
||||||
|
@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL
|
||||||
class UTF8Prober(CharSetProber):
|
class UTF8Prober(CharSetProber):
|
||||||
ONE_CHAR_PROB = 0.5
|
ONE_CHAR_PROB = 0.5
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
|
||||||
self._num_mb_chars = None
|
self._num_mb_chars = 0
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self) -> None:
|
||||||
super().reset()
|
super().reset()
|
||||||
self.coding_sm.reset()
|
self.coding_sm.reset()
|
||||||
self._num_mb_chars = 0
|
self._num_mb_chars = 0
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def charset_name(self):
|
def charset_name(self) -> str:
|
||||||
return "utf-8"
|
return "utf-8"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
|
||||||
for c in byte_str:
|
for c in byte_str:
|
||||||
coding_state = self.coding_sm.next_state(c)
|
coding_state = self.coding_sm.next_state(c)
|
||||||
if coding_state == MachineState.ERROR:
|
if coding_state == MachineState.ERROR:
|
||||||
|
@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber):
|
||||||
|
|
||||||
return self.state
|
return self.state
|
||||||
|
|
||||||
def get_confidence(self):
|
def get_confidence(self) -> float:
|
||||||
unlike = 0.99
|
unlike = 0.99
|
||||||
if self._num_mb_chars < 6:
|
if self._num_mb_chars < 6:
|
||||||
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
unlike *= self.ONE_CHAR_PROB**self._num_mb_chars
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
"""
|
"""
|
||||||
This module exists only to simplify retrieving the version number of chardet
|
This module exists only to simplify retrieving the version number of chardet
|
||||||
from within setup.py and from chardet subpackages.
|
from within setuptools and from chardet subpackages.
|
||||||
|
|
||||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "5.0.0"
|
__version__ = "5.1.0"
|
||||||
VERSION = __version__.split(".")
|
VERSION = __version__.split(".")
|
||||||
|
|
|
@ -9,7 +9,7 @@ pyparsing==3.0.9
|
||||||
pyproject-hooks==1.0.0
|
pyproject-hooks==1.0.0
|
||||||
requests==2.28.2
|
requests==2.28.2
|
||||||
certifi==2022.12.7
|
certifi==2022.12.7
|
||||||
chardet==5.0.0
|
chardet==5.1.0
|
||||||
idna==3.4
|
idna==3.4
|
||||||
urllib3==1.26.12
|
urllib3==1.26.12
|
||||||
rich==12.6.0
|
rich==12.6.0
|
||||||
|
|
Loading…
Reference in New Issue