diff --git a/NOTICE b/NOTICE index 7f8d102..3b91b93 100644 --- a/NOTICE +++ b/NOTICE @@ -2,8 +2,17 @@ Copyright (2023) Databricks, Inc. This Software includes software developed at Databricks (https://www.databricks.com/) and its use is subject to the included LICENSE file. +____________________ This Software contains code from the following open source projects, licensed under the Apache 2.0 license: Databricks SDK for Python - https://github.com/databricks/databricks-sdk-py Copyright 2023 Databricks, Inc. All rights reserved. License - https://github.com/databricks/databricks-sdk-py/blob/main/LICENSE + + +____________________ +This Software contains code from the following open source projects, licensed under the GNU Lesser GPL v2: + +chardet - https://github.com/chardet/chardet +Copyright 2005-2024 Mark Pilgrim, Maintainer: Dan Blanchard +License - https://github.com/chardet/chardet/blob/main/LICENSE \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d0fc7e1..da57d7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,10 @@ classifiers = [ dependencies = ["databricks-sdk>=0.16.0"] [project.optional-dependencies] -yaml = ["PyYAML>=6.0.0,<7.0.0"] +yaml = [ + "PyYAML>=6.0.0,<7.0.0", + "chardet>=5.1.0,<6.0.0", +] [project.urls] Issues = "https://github.com/databrickslabs/blueprint/issues" diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py index 1cf29f2..5c85837 100644 --- a/src/databricks/labs/blueprint/paths.py +++ b/src/databricks/labs/blueprint/paths.py @@ -18,6 +18,7 @@ from typing import BinaryIO, Literal, NoReturn, TextIO, TypeVar from urllib.parse import quote_from_bytes as urlquote_from_bytes +import chardet from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError, ResourceDoesNotExist from databricks.sdk.service.files import FileInfo @@ -1150,6 +1151,7 @@ def decode_with_bom( a text-based IO wrapper that will decode the underlying binary-mode 
file as text. """ use_encoding: str | None + _chardet_confidence_threshold: float = 0.6 if encoding is not None: use_encoding = encoding else: @@ -1157,7 +1159,12 @@ def decode_with_bom( if use_encoding is None and detect_xml: use_encoding = _detect_encoding_xml(file, preserve_position=True) if use_encoding is None: - use_encoding = locale.getpreferredencoding() + result = chardet.detect(file.read()) + use_encoding = result["encoding"] or locale.getpreferredencoding() + if result["confidence"] < _chardet_confidence_threshold: + logger.debug(f"Low confidence ({result['confidence']}) in detected encoding: {result}") + use_encoding = locale.getpreferredencoding() + file.seek(0) return io.TextIOWrapper(file, encoding=use_encoding, errors=errors, newline=newline) diff --git a/tests/unit/test_paths.py b/tests/unit/test_paths.py index 82c7b51..6c4ab9b 100644 --- a/tests/unit/test_paths.py +++ b/tests/unit/test_paths.py @@ -1128,6 +1128,7 @@ def test_read_xml_file_default_utf8(tmp_path: Path, monkeypatch) -> None: path.write_text(example, encoding="utf-8") # Verify the monkey-patching means we're not defaulting to UTF-8. + # With chardet this would likely work, unless the confidence score is less than 0.6; for this example it is 0.506. monkeypatch.setattr(locale, "getpreferredencoding", lambda: "Windows-1252") assert locale.getpreferredencoding() != "UTF-8" assert read_text(path, detect_xml=False) != example