Skip to content

Commit f286f98

Browse files
committed
feat(vendor): Update vendored deps
1 parent 6379009 commit f286f98

File tree

109 files changed

+23044
-14824
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

109 files changed

+23044
-14824
lines changed

vendor/bin/normalizer

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
#!/usr/local/opt/python@3.11/bin/python3.11
1+
#!/usr/local/python/3.12.1/bin/python3
22
# -*- coding: utf-8 -*-
33
import re
44
import sys
5-
from charset_normalizer.cli.normalizer import cli_detect
5+
from charset_normalizer.cli import cli_detect
66
if __name__ == '__main__':
77
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
88
sys.exit(cli_detect())

vendor/certifi/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .core import contents, where
22

33
__all__ = ["contents", "where"]
4-
__version__ = "2023.05.07"
4+
__version__ = "2024.08.30"

vendor/certifi/cacert.pem

Lines changed: 590 additions & 250 deletions
Large diffs are not rendered by default.

vendor/certifi/core.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
This module returns the installation location of cacert.pem or its contents.
66
"""
77
import sys
8+
import atexit
9+
10+
def exit_cacert_ctx() -> None:
11+
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
812

913

1014
if sys.version_info >= (3, 11):
@@ -35,6 +39,7 @@ def where() -> str:
3539
# we will also store that at the global level as well.
3640
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
3741
_CACERT_PATH = str(_CACERT_CTX.__enter__())
42+
atexit.register(exit_cacert_ctx)
3843

3944
return _CACERT_PATH
4045

@@ -70,6 +75,7 @@ def where() -> str:
7075
# we will also store that at the global level as well.
7176
_CACERT_CTX = get_path("certifi", "cacert.pem")
7277
_CACERT_PATH = str(_CACERT_CTX.__enter__())
78+
atexit.register(exit_cacert_ctx)
7379

7480
return _CACERT_PATH
7581

vendor/charset_normalizer/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"""
2222
import logging
2323

24-
from .api import from_bytes, from_fp, from_path
24+
from .api import from_bytes, from_fp, from_path, is_binary
2525
from .legacy import detect
2626
from .models import CharsetMatch, CharsetMatches
2727
from .utils import set_logging_handler
@@ -31,6 +31,7 @@
3131
"from_fp",
3232
"from_path",
3333
"from_bytes",
34+
"is_binary",
3435
"detect",
3536
"CharsetMatch",
3637
"CharsetMatches",
vendor/charset_normalizer/__main__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .cli import cli_detect
2+
3+
if __name__ == "__main__":
4+
cli_detect()

vendor/charset_normalizer/api.py

Lines changed: 136 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import logging
22
from os import PathLike
3-
from typing import Any, BinaryIO, List, Optional, Set
3+
from typing import BinaryIO, List, Optional, Set, Union
44

55
from .cd import (
66
coherence_ratio,
@@ -31,7 +31,7 @@
3131

3232

3333
def from_bytes(
34-
sequences: bytes,
34+
sequences: Union[bytes, bytearray],
3535
steps: int = 5,
3636
chunk_size: int = 512,
3737
threshold: float = 0.2,
@@ -40,6 +40,7 @@ def from_bytes(
4040
preemptive_behaviour: bool = True,
4141
explain: bool = False,
4242
language_threshold: float = 0.1,
43+
enable_fallback: bool = True,
4344
) -> CharsetMatches:
4445
"""
4546
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -158,6 +159,8 @@ def from_bytes(
158159

159160
results: CharsetMatches = CharsetMatches()
160161

162+
early_stop_results: CharsetMatches = CharsetMatches()
163+
161164
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
162165

163166
if sig_encoding is not None:
@@ -220,16 +223,20 @@ def from_bytes(
220223
try:
221224
if is_too_large_sequence and is_multi_byte_decoder is False:
222225
str(
223-
sequences[: int(50e4)]
224-
if strip_sig_or_bom is False
225-
else sequences[len(sig_payload) : int(50e4)],
226+
(
227+
sequences[: int(50e4)]
228+
if strip_sig_or_bom is False
229+
else sequences[len(sig_payload) : int(50e4)]
230+
),
226231
encoding=encoding_iana,
227232
)
228233
else:
229234
decoded_payload = str(
230-
sequences
231-
if strip_sig_or_bom is False
232-
else sequences[len(sig_payload) :],
235+
(
236+
sequences
237+
if strip_sig_or_bom is False
238+
else sequences[len(sig_payload) :]
239+
),
233240
encoding=encoding_iana,
234241
)
235242
except (UnicodeDecodeError, LookupError) as e:
@@ -361,11 +368,18 @@ def from_bytes(
361368
)
362369
# Preparing those fallbacks in case we got nothing.
363370
if (
364-
encoding_iana in ["ascii", "utf_8", specified_encoding]
371+
enable_fallback
372+
and encoding_iana in ["ascii", "utf_8", specified_encoding]
365373
and not lazy_str_hard_failure
366374
):
367375
fallback_entry = CharsetMatch(
368-
sequences, encoding_iana, threshold, False, [], decoded_payload
376+
sequences,
377+
encoding_iana,
378+
threshold,
379+
False,
380+
[],
381+
decoded_payload,
382+
preemptive_declaration=specified_encoding,
369383
)
370384
if encoding_iana == specified_encoding:
371385
fallback_specified = fallback_entry
@@ -419,28 +433,58 @@ def from_bytes(
419433
),
420434
)
421435

422-
results.append(
423-
CharsetMatch(
424-
sequences,
425-
encoding_iana,
426-
mean_mess_ratio,
427-
bom_or_sig_available,
428-
cd_ratios_merged,
429-
decoded_payload,
430-
)
436+
current_match = CharsetMatch(
437+
sequences,
438+
encoding_iana,
439+
mean_mess_ratio,
440+
bom_or_sig_available,
441+
cd_ratios_merged,
442+
(
443+
decoded_payload
444+
if (
445+
is_too_large_sequence is False
446+
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
447+
)
448+
else None
449+
),
450+
preemptive_declaration=specified_encoding,
431451
)
432452

453+
results.append(current_match)
454+
433455
if (
434456
encoding_iana in [specified_encoding, "ascii", "utf_8"]
435457
and mean_mess_ratio < 0.1
436458
):
459+
# If md says nothing to worry about, then... stop immediately!
460+
if mean_mess_ratio == 0.0:
461+
logger.debug(
462+
"Encoding detection: %s is most likely the one.",
463+
current_match.encoding,
464+
)
465+
if explain:
466+
logger.removeHandler(explain_handler)
467+
logger.setLevel(previous_logger_level)
468+
return CharsetMatches([current_match])
469+
470+
early_stop_results.append(current_match)
471+
472+
if (
473+
len(early_stop_results)
474+
and (specified_encoding is None or specified_encoding in tested)
475+
and "ascii" in tested
476+
and "utf_8" in tested
477+
):
478+
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
437479
logger.debug(
438-
"Encoding detection: %s is most likely the one.", encoding_iana
480+
"Encoding detection: %s is most likely the one.",
481+
probable_result.encoding,
439482
)
440483
if explain:
441484
logger.removeHandler(explain_handler)
442485
logger.setLevel(previous_logger_level)
443-
return CharsetMatches([results[encoding_iana]])
486+
487+
return CharsetMatches([probable_result])
444488

445489
if encoding_iana == sig_encoding:
446490
logger.debug(
@@ -507,6 +551,7 @@ def from_fp(
507551
preemptive_behaviour: bool = True,
508552
explain: bool = False,
509553
language_threshold: float = 0.1,
554+
enable_fallback: bool = True,
510555
) -> CharsetMatches:
511556
"""
512557
Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -522,11 +567,12 @@ def from_fp(
522567
preemptive_behaviour,
523568
explain,
524569
language_threshold,
570+
enable_fallback,
525571
)
526572

527573

528574
def from_path(
529-
path: "PathLike[Any]",
575+
path: Union[str, bytes, PathLike], # type: ignore[type-arg]
530576
steps: int = 5,
531577
chunk_size: int = 512,
532578
threshold: float = 0.20,
@@ -535,6 +581,7 @@ def from_path(
535581
preemptive_behaviour: bool = True,
536582
explain: bool = False,
537583
language_threshold: float = 0.1,
584+
enable_fallback: bool = True,
538585
) -> CharsetMatches:
539586
"""
540587
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -551,4 +598,71 @@ def from_path(
551598
preemptive_behaviour,
552599
explain,
553600
language_threshold,
601+
enable_fallback,
554602
)
603+
604+
605+
def is_binary(
606+
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
607+
steps: int = 5,
608+
chunk_size: int = 512,
609+
threshold: float = 0.20,
610+
cp_isolation: Optional[List[str]] = None,
611+
cp_exclusion: Optional[List[str]] = None,
612+
preemptive_behaviour: bool = True,
613+
explain: bool = False,
614+
language_threshold: float = 0.1,
615+
enable_fallback: bool = False,
616+
) -> bool:
617+
"""
618+
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
619+
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
620+
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
621+
"""
622+
if isinstance(fp_or_path_or_payload, (str, PathLike)):
623+
guesses = from_path(
624+
fp_or_path_or_payload,
625+
steps=steps,
626+
chunk_size=chunk_size,
627+
threshold=threshold,
628+
cp_isolation=cp_isolation,
629+
cp_exclusion=cp_exclusion,
630+
preemptive_behaviour=preemptive_behaviour,
631+
explain=explain,
632+
language_threshold=language_threshold,
633+
enable_fallback=enable_fallback,
634+
)
635+
elif isinstance(
636+
fp_or_path_or_payload,
637+
(
638+
bytes,
639+
bytearray,
640+
),
641+
):
642+
guesses = from_bytes(
643+
fp_or_path_or_payload,
644+
steps=steps,
645+
chunk_size=chunk_size,
646+
threshold=threshold,
647+
cp_isolation=cp_isolation,
648+
cp_exclusion=cp_exclusion,
649+
preemptive_behaviour=preemptive_behaviour,
650+
explain=explain,
651+
language_threshold=language_threshold,
652+
enable_fallback=enable_fallback,
653+
)
654+
else:
655+
guesses = from_fp(
656+
fp_or_path_or_payload,
657+
steps=steps,
658+
chunk_size=chunk_size,
659+
threshold=threshold,
660+
cp_isolation=cp_isolation,
661+
cp_exclusion=cp_exclusion,
662+
preemptive_behaviour=preemptive_behaviour,
663+
explain=explain,
664+
language_threshold=language_threshold,
665+
enable_fallback=enable_fallback,
666+
)
667+
668+
return not guesses

0 commit comments

Comments
 (0)