Skip to content

Commit 82642a8

Browse files
authored
Update the workspace content cache used during linting (#2953)
## Changes This PR began as an effort to retrofit the `WorkspaceCache` (and associated implementation) with type hints to help ensure correctness. This proved to be quite difficult due to the dynamic nature of the cache and binary vs. text modes. As such, this PR updates the `WorkspaceCache` in the following ways: - We now only cache the binary content of workspace files. This simplifies the implementation substantially and avoids correctness issues with respect to various options that can be specified to control decoding as binary content. (Decoding is well optimised compared to upstream/downstream processing, so this is a reasonable tradeoff.) - The cache now resolves paths, ensuring that different ways of referring to the same content are handled as a single cache entry. - We now raise an error immediately if a non-absolute path is requested: workspace paths are always absolute, so this will catch errors earlier. - For the paths where we check for a BOM-marker prior to reading as text, if the opened stream for the path is seekable then we only open it once instead of twice. ### Tests - added (and updated existing) unit tests - existing integration tests
1 parent c66893f commit 82642a8

File tree

7 files changed

+224
-148
lines changed

7 files changed

+224
-148
lines changed

src/databricks/labs/ucx/mixins/cached_workspace_path.py

Lines changed: 86 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -3,137 +3,136 @@
33
import os
44
from collections import OrderedDict
55
from collections.abc import Generator
6-
from io import StringIO, BytesIO
6+
from io import BytesIO
7+
from pathlib import PurePosixPath
8+
from typing import IO, TypeVar
79

810
from databricks.sdk import WorkspaceClient
911
from databricks.sdk.service.workspace import ObjectInfo
1012
from databricks.labs.blueprint.paths import WorkspacePath
1113

14+
from databricks.labs.ucx.source_code.base import decode_with_bom
1215

13-
class _CachedIO:
1416

15-
def __init__(self, content):
16-
self._content = content
17-
self._index = 0
18-
19-
def __enter__(self):
20-
return self
21-
22-
def __exit__(self, exc_type, exc_val, exc_tb):
23-
return False
17+
# lru_cache won't let us invalidate cache entries
18+
# so we provide our own custom lru_cache
19+
class _PathLruCache:
2420

25-
def read(self, *args, **_kwargs):
26-
count = -1 if len(args) < 1 or args[0] < 1 else args[0]
27-
if count == -1:
28-
return self._content
29-
start = self._index
30-
end = self._index + count
31-
if start >= len(self._content):
32-
return None
33-
self._index = self._index + count
34-
return self._content[start:end]
21+
_datas: OrderedDict[PurePosixPath, bytes]
22+
"""Cached binary data of files, keyed by workspace path, ordered from oldest to newest."""
3523

36-
def __iter__(self):
37-
if isinstance(self._content, str):
38-
yield from StringIO(self._content)
39-
return
40-
yield from self._as_string_io().__iter__()
24+
_max_entries: int
25+
"""The maximum number of entries to hold in the cache."""
4126

42-
def with_mode(self, mode: str):
43-
if 'b' in mode:
44-
return self._as_bytes_io()
45-
return self._as_string_io()
27+
def __init__(self, max_entries: int) -> None:
28+
# Ordered from oldest to newest.
29+
self._datas = OrderedDict()
30+
self._max_entries = max_entries
4631

47-
def _as_bytes_io(self):
48-
if isinstance(self._content, bytes):
49-
return self
50-
return BytesIO(self._content.encode("utf-8-sig"))
32+
@classmethod
33+
def _normalize(cls, path: _CachedPath) -> PurePosixPath:
34+
# Note: must not return the same instance that was passed in, to avoid circular references (and memory leaks).
35+
return PurePosixPath(*path.parts)
5136

52-
def _as_string_io(self):
53-
if isinstance(self._content, str):
54-
return self
55-
return StringIO(self._content.decode("utf-8"))
37+
def load(self, cached_path: _CachedPath, buffering: int = -1) -> bytes:
38+
normalized_path = self._normalize(cached_path)
5639

40+
data = self._datas.get(normalized_path, None)
41+
if data is not None:
42+
self._datas.move_to_end(normalized_path)
43+
return data
5744

58-
# lru_cache won't let us invalidate cache entries
59-
# so we provide our own custom lru_cache
60-
class _PathLruCache:
61-
62-
def __init__(self, max_entries: int):
63-
self._datas: OrderedDict[str, bytes | str] = OrderedDict()
64-
self._max_entries = max_entries
65-
66-
def open(self, cached_path: _CachedPath, mode, buffering, encoding, errors, newline):
67-
path = str(cached_path)
68-
if path in self._datas:
69-
self._datas.move_to_end(path)
70-
return _CachedIO(self._datas[path]).with_mode(mode)
71-
io_obj = WorkspacePath.open(cached_path, mode, buffering, encoding, errors, newline)
72-
# can't read twice from an IO so need to cache data rather than the io object
73-
data = io_obj.read()
74-
self._datas[path] = data
75-
result = _CachedIO(data).with_mode(mode)
76-
if len(self._datas) > self._max_entries:
45+
# Need to bypass the _CachedPath.open() override to actually open and retrieve the file content.
46+
with WorkspacePath.open(cached_path, mode="rb", buffering=buffering) as workspace_file:
47+
data = workspace_file.read()
48+
if self._max_entries <= len(self._datas):
7749
self._datas.popitem(last=False)
78-
return result
50+
self._datas[normalized_path] = data
51+
return data
7952

80-
def clear(self):
53+
def clear(self) -> None:
8154
self._datas.clear()
8255

83-
def remove(self, path: str):
84-
if path in self._datas:
85-
self._datas.pop(path)
56+
def remove(self, path: _CachedPath) -> None:
57+
del self._datas[self._normalize(path)]
8658

8759

8860
class _CachedPath(WorkspacePath):
89-
def __init__(self, cache: _PathLruCache, ws: WorkspaceClient, *args: str | bytes | os.PathLike):
61+
def __init__(self, cache: _PathLruCache, ws: WorkspaceClient, *args: str | bytes | os.PathLike) -> None:
9062
super().__init__(ws, *args)
9163
self._cache = cache
9264

93-
def with_object_info(self, object_info: ObjectInfo):
94-
self._cached_object_info = object_info
95-
return self
96-
97-
def with_segments(self, *path_segments: bytes | str | os.PathLike) -> _CachedPath:
65+
@classmethod
66+
def _from_object_info_with_cache(
67+
cls,
68+
cache: _PathLruCache,
69+
ws: WorkspaceClient,
70+
object_info: ObjectInfo,
71+
) -> _CachedPath:
72+
assert object_info.path
73+
path = cls(cache, ws, object_info.path)
74+
path._cached_object_info = object_info
75+
return path
76+
77+
def with_segments(self: _CachedPathT, *path_segments: bytes | str | os.PathLike) -> _CachedPathT:
9878
return type(self)(self._cache, self._ws, *path_segments)
9979

10080
def iterdir(self) -> Generator[_CachedPath, None, None]:
81+
# Variant of the superclass implementation that preserves the cache, as well as the client.
10182
for object_info in self._ws.workspace.list(self.as_posix()):
102-
path = object_info.path
103-
if path is None:
104-
msg = f"Cannot initialise without object path: {object_info}"
105-
raise ValueError(msg)
106-
child = _CachedPath(self._cache, self._ws, path)
107-
yield child.with_object_info(object_info)
108-
109-
def open(
83+
yield self._from_object_info_with_cache(self._cache, self._ws, object_info)
84+
85+
def open( # type: ignore[override]
11086
self,
11187
mode: str = "r",
11288
buffering: int = -1,
11389
encoding: str | None = None,
11490
errors: str | None = None,
11591
newline: str | None = None,
116-
):
117-
# only cache reads
118-
if 'r' in mode:
119-
return self._cache.open(self, mode, buffering, encoding, errors, newline)
120-
self._cache.remove(str(self))
121-
return super().open(mode, buffering, encoding, errors, newline)
92+
) -> IO:
93+
# We only cache reads; if a write happens we use the default implementation (and evict any cache entry).
94+
if 'w' in mode:
95+
self._cache.remove(self)
96+
return super().open(mode, buffering, encoding, errors, newline)
97+
98+
binary_data = self._cache.load(self, buffering=buffering)
99+
binary_io = BytesIO(binary_data)
100+
if 'b' in mode:
101+
return binary_io
122102

123-
def _cached_open(self, mode: str, buffering: int, encoding: str | None, errors: str | None, newline: str | None):
124-
return super().open(mode, buffering, encoding, errors, newline)
103+
return decode_with_bom(binary_io, encoding, errors, newline)
125104

126105
# _rename calls unlink so no need to override it
127106
def unlink(self, missing_ok: bool = False) -> None:
128-
self._cache.remove(str(self))
107+
self._cache.remove(self)
129108
return super().unlink(missing_ok)
130109

131110

111+
_CachedPathT = TypeVar("_CachedPathT", bound=_CachedPath)
112+
113+
132114
class WorkspaceCache:
133115

134-
def __init__(self, ws: WorkspaceClient, max_entries=2048):
116+
class InvalidWorkspacePath(ValueError):
117+
pass
118+
119+
def __init__(self, ws: WorkspaceClient, max_entries: int = 2048) -> None:
135120
self._ws = ws
136121
self._cache = _PathLruCache(max_entries)
137122

138-
def get_path(self, path: str):
123+
def get_workspace_path(self, path: str) -> WorkspacePath:
124+
"""Obtain a `WorkspacePath` instance for a path that refers to a workspace file or notebook.
125+
126+
The instance returned participates in this content cache: the first time the path is opened the content will
127+
be immediately retrieved (prior to reading) and cached.
128+
129+
Args:
130+
path: a valid workspace path (must be absolute)
131+
Raises:
132+
WorkspaceCache.InvalidWorkspacePath: this is raised immediately if the supplied path is not a syntactically
133+
valid workspace path. (This is not raised if the path is syntactically valid but does not exist.)
134+
"""
135+
if not path.startswith("/"):
136+
msg = f"Invalid workspace path; must be absolute and start with a slash ('/'): {path}"
137+
raise WorkspaceCache.InvalidWorkspacePath(msg)
139138
return _CachedPath(self._cache, self._ws, path)

src/databricks/labs/ucx/source_code/base.py

Lines changed: 68 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22

33
import codecs
44
import dataclasses
5-
import locale
5+
import io
66
import logging
77
import sys
88
from abc import abstractmethod, ABC
99
from collections.abc import Iterable
1010
from dataclasses import dataclass, field
1111
from datetime import datetime
1212
from pathlib import Path
13-
from typing import Any
13+
from typing import Any, BinaryIO, TextIO
1414

1515
from astroid import AstroidSyntaxError, NodeNG # type: ignore
1616
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
@@ -482,18 +482,71 @@ def file_language(path: Path) -> Language | None:
482482
return SUPPORTED_EXTENSION_LANGUAGES.get(path.suffix.lower())
483483

484484

485-
def guess_encoding(path: Path) -> str:
486-
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
487-
with path.open('rb') as _file:
488-
raw = _file.read(4)
489-
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
490-
return 'utf-32'
491-
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
492-
return 'utf-16'
493-
if raw.startswith(codecs.BOM_UTF8):
494-
return 'utf-8-sig'
495-
# no BOM, let's use default encoding
496-
return locale.getpreferredencoding(False)
485+
def _detect_encoding_bom(binary_io: BinaryIO, *, preserve_position: bool) -> str | None:
486+
# Peek at the first (up to) 4 bytes, preserving the file position if requested.
487+
position = binary_io.tell() if preserve_position else None
488+
try:
489+
maybe_bom = binary_io.read(4)
490+
finally:
491+
if position is not None:
492+
binary_io.seek(position)
493+
# For these encodings, TextIOWrapper will skip over the BOM during decoding.
494+
if maybe_bom.startswith(codecs.BOM_UTF32_LE) or maybe_bom.startswith(codecs.BOM_UTF32_BE):
495+
return "utf-32"
496+
if maybe_bom.startswith(codecs.BOM_UTF16_LE) or maybe_bom.startswith(codecs.BOM_UTF16_BE):
497+
return "utf-16"
498+
if maybe_bom.startswith(codecs.BOM_UTF8):
499+
return "utf-8-sig"
500+
return None
501+
502+
503+
def decode_with_bom(
504+
file: BinaryIO,
505+
encoding: str | None = None,
506+
errors: str | None = None,
507+
newline: str | None = None,
508+
) -> TextIO:
509+
"""Wrap an open binary file with a text decoder.
510+
511+
This has the same semantics as the built-in `open()` call, except that if the encoding is not specified and the
512+
file is seekable then it will be checked for a BOM. If a BOM marker is found, that encoding is used. When neither
513+
an encoding nor a BOM are present the encoding of the system locale is used.
514+
515+
Args:
516+
file: the open (binary) file to wrap in text mode.
517+
encoding: force decoding with a specific encoding. If not present, the file BOM and system locale are used.
518+
errors: how decoding errors should be handled, as per open().
519+
newline: how newlines should be handled, as per open().
520+
Raises:
521+
ValueError: if the encoding should be detected via potential BOM marker but the file is not seekable.
522+
Returns:
523+
a text-based IO wrapper that will decode the underlying binary-mode file as text.
524+
"""
525+
use_encoding = _detect_encoding_bom(file, preserve_position=True) if encoding is None else encoding
526+
return io.TextIOWrapper(file, encoding=use_encoding, errors=errors, newline=newline)
527+
528+
529+
def read_text(path: Path, size: int = -1) -> str:
530+
"""Read a file as text, decoding according to the BOM marker if that is present.
531+
532+
This differs from the normal `.read_text()` method on `Path`, which does not support BOM markers.
533+
534+
Arguments:
535+
path: the path to read text from.
536+
size: how much text (measured in characters) to read. If negative, all text is read. Less may be read if the
537+
file is smaller than the specified size.
538+
Returns:
539+
The string content of the file, up to the specified size.
540+
"""
541+
with path.open("rb") as binary_io:
542+
# If the open file is seekable, we can detect the BOM and decode without re-opening.
543+
if binary_io.seekable():
544+
with decode_with_bom(binary_io) as f:
545+
return f.read(size)
546+
encoding = _detect_encoding_bom(binary_io, preserve_position=False)
547+
# Otherwise having read the BOM there's no way to rewind so we need to re-open and read from that.
548+
with path.open("rt", encoding=encoding) as f:
549+
return f.read(size)
497550

498551

499552
# duplicated from CellLanguage to prevent cyclic import
@@ -513,8 +566,7 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool:
513566
if content is not None:
514567
return content.startswith(magic_header)
515568
try:
516-
with path.open('rt', encoding=guess_encoding(path)) as f:
517-
file_header = f.read(len(magic_header))
569+
file_header = read_text(path, size=len(magic_header))
518570
except (FileNotFoundError, UnicodeDecodeError, PermissionError):
519571
logger.warning(f"Could not read file {path}")
520572
return False

0 commit comments

Comments
 (0)