Skip to content

Commit ac57c46

Browse files
authored
Safe read files in more places (#3394)
## Changes Handle reading ### Linked issues Resolves #3386 ### Functionality - [x] modified existing command: `databricks labs ucx lint-local-code` - [x] modified existing workflow: `assessment` ### Tests - [ ] added and modified unit tests
1 parent a1eb27f commit ac57c46

File tree

5 files changed

+106
-50
lines changed

5 files changed

+106
-50
lines changed

src/databricks/labs/ucx/source_code/base.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,20 @@ def read_text(path: Path, size: int = -1) -> str:
459459
return f.read(size)
460460

461461

462+
def safe_read_text(path: Path, size: int = -1) -> str | None:
463+
"""Safe read a text file by handling reading exceptions, see :func:_read_text.
464+
465+
Returns
466+
str : Content of file
467+
None : If error occurred during reading.
468+
"""
469+
try:
470+
return read_text(path, size=size)
471+
except (FileNotFoundError, UnicodeDecodeError, PermissionError) as e:
472+
logger.warning(f"Could not read file: {path}", exc_info=e)
473+
return None
474+
475+
462476
# duplicated from CellLanguage to prevent cyclic import
463477
LANGUAGE_COMMENT_PREFIXES = {Language.PYTHON: '#', Language.SCALA: '//', Language.SQL: '--'}
464478
NOTEBOOK_HEADER = "Databricks notebook source"
@@ -475,9 +489,5 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool:
475489
magic_header = f"{LANGUAGE_COMMENT_PREFIXES.get(language)} {NOTEBOOK_HEADER}"
476490
if content is not None:
477491
return content.startswith(magic_header)
478-
try:
479-
file_header = read_text(path, size=len(magic_header))
480-
except (FileNotFoundError, UnicodeDecodeError, PermissionError):
481-
logger.warning(f"Could not read file {path}")
482-
return False
492+
file_header = safe_read_text(path, size=len(magic_header))
483493
return file_header == magic_header

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
SourceInfo,
3535
UsedTable,
3636
LineageAtom,
37-
read_text,
37+
safe_read_text,
3838
)
3939
from databricks.labs.ucx.source_code.directfs_access import (
4040
DirectFsAccessCrawler,
@@ -633,7 +633,9 @@ def _process_dependency(
633633
logger.warning(f"Unknown language for {dependency.path}")
634634
return
635635
cell_language = CellLanguage.of_language(language)
636-
source = read_text(dependency.path)
636+
source = safe_read_text(dependency.path)
637+
if not source:
638+
return
637639
if is_a_notebook(dependency.path):
638640
yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree)
639641
elif dependency.path.is_file():

src/databricks/labs/ucx/source_code/notebooks/sources.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import locale
55
import logging
66
from collections.abc import Iterable
7-
from functools import cached_property
87
from pathlib import Path
98
from typing import cast
109

@@ -14,14 +13,15 @@
1413

1514
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
1615
from databricks.labs.ucx.source_code.base import (
17-
Advice,
18-
Failure,
19-
Linter,
20-
CurrentSessionState,
21-
Advisory,
2216
file_language,
2317
is_a_notebook,
18+
safe_read_text,
2419
read_text,
20+
Advice,
21+
Advisory,
22+
CurrentSessionState,
23+
Failure,
24+
Linter,
2525
)
2626

2727
from databricks.labs.ucx.source_code.graph import (
@@ -297,7 +297,9 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None:
297297
if language is not Language.PYTHON:
298298
logger.warning(f"Unsupported notebook language: {language}")
299299
return None
300-
source = read_text(resolved)
300+
source = safe_read_text(resolved)
301+
if not source:
302+
return None
301303
return Notebook.parse(path, source, language)
302304

303305
def _linter(self, language: Language) -> Linter:
@@ -315,7 +317,9 @@ def _load_source_from_run_cell(self, run_cell: RunCell) -> None:
315317
language = file_language(resolved)
316318
if language is not Language.PYTHON:
317319
return
318-
source = read_text(resolved)
320+
source = safe_read_text(resolved)
321+
if not source:
322+
return
319323
notebook = Notebook.parse(path, source, language)
320324
for cell in notebook.cells:
321325
if isinstance(cell, RunCell):
@@ -390,16 +394,11 @@ def __init__(
390394
self._inherited_tree = inherited_tree
391395
self._content = content
392396

393-
@cached_property
394-
def _source_code(self) -> str:
395-
if self._content is None:
396-
self._content = read_text(self._path)
397-
return self._content
398-
399397
def lint(self) -> Iterable[Advice]:
400398
encoding = locale.getpreferredencoding(False)
401399
try:
402-
is_notebook = self._is_notebook()
400+
# Not using `safe_read_text` here to surface read errors
401+
self._content = self._content or read_text(self._path)
403402
except FileNotFoundError:
404403
failure_message = f"File not found: {self._path}"
405404
yield Failure("file-not-found", failure_message, 0, 0, 1, 1)
@@ -413,19 +412,21 @@ def lint(self) -> Iterable[Advice]:
413412
yield Failure("file-permission", failure_message, 0, 0, 1, 1)
414413
return
415414

416-
if is_notebook:
415+
if self._is_notebook():
417416
yield from self._lint_notebook()
418417
else:
419418
yield from self._lint_file()
420419

421420
def _is_notebook(self) -> bool:
421+
assert self._content is not None, "Content should be read from path before calling this method"
422422
# pre-check to avoid loading unsupported content
423423
language = file_language(self._path)
424424
if not language:
425425
return False
426-
return is_a_notebook(self._path, self._source_code)
426+
return is_a_notebook(self._path, self._content)
427427

428428
def _lint_file(self) -> Iterable[Advice]:
429+
assert self._content is not None, "Content should be read from path before calling this method"
429430
language = file_language(self._path)
430431
if not language:
431432
suffix = self._path.suffix.lower()
@@ -440,17 +441,18 @@ def _lint_file(self) -> Iterable[Advice]:
440441
linter = self._ctx.linter(language)
441442
if self._inherited_tree is not None and isinstance(linter, PythonSequentialLinter):
442443
linter.append_tree(self._inherited_tree)
443-
yield from linter.lint(self._source_code)
444+
yield from linter.lint(self._content)
444445
except ValueError as err:
445446
failure_message = f"Error while parsing content of {self._path.as_posix()}: {err}"
446447
yield Failure("unsupported-content", failure_message, 0, 0, 1, 1)
447448

448449
def _lint_notebook(self) -> Iterable[Advice]:
450+
assert self._content is not None, "Content should be read from path before calling this method"
449451
language = file_language(self._path)
450452
if not language:
451453
yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1)
452454
return
453-
notebook = Notebook.parse(self._path, self._source_code, language)
455+
notebook = Notebook.parse(self._path, self._content, language)
454456
notebook_linter = NotebookLinter(
455457
self._ctx,
456458
self._path_lookup,
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import codecs
2+
import logging
3+
from pathlib import Path
4+
from unittest.mock import create_autospec
5+
6+
import pytest
7+
8+
from databricks.labs.ucx.source_code.base import read_text, safe_read_text
9+
10+
11+
@pytest.mark.parametrize(
12+
"exception",
13+
[
14+
FileNotFoundError(),
15+
UnicodeDecodeError("utf-8", b"\x80\x81\x82", 0, 1, "invalid start byte"),
16+
PermissionError(),
17+
],
18+
)
19+
def test_safe_read_text_handles_and_warns_read_errors(caplog, exception: Exception) -> None:
20+
path = create_autospec(Path)
21+
path.open.side_effect = exception
22+
23+
with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.aggregate"):
24+
text = safe_read_text(path)
25+
26+
assert not text
27+
assert f"Could not read file: {path}" in caplog.messages
28+
29+
30+
@pytest.mark.parametrize(
31+
"bom, encoding",
32+
[
33+
(codecs.BOM_UTF8, "utf-8"),
34+
(codecs.BOM_UTF16_LE, "utf-16-le"),
35+
(codecs.BOM_UTF16_BE, "utf-16-be"),
36+
(codecs.BOM_UTF32_LE, "utf-32-le"),
37+
(codecs.BOM_UTF32_BE, "utf-32-be"),
38+
],
39+
)
40+
def test_read__encoded_file_with_bom(tmp_path, bom, encoding) -> None:
41+
path = tmp_path / "file.py"
42+
path.write_bytes(bom + "a = 12".encode(encoding))
43+
44+
text = read_text(path)
45+
46+
assert text == "a = 12"

tests/unit/source_code/notebooks/test_sources.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,6 @@ def test_file_linter_lints_supported_language(path, content, migration_index, mo
1919
assert not advices
2020

2121

22-
@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
23-
def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup) -> None:
24-
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
25-
advices = list(linter.lint())
26-
assert [advice.code for advice in advices] == ["unsupported-language"]
27-
28-
29-
class FriendFileLinter(FileLinter):
30-
31-
def source_code(self):
32-
return self._source_code
33-
34-
35-
def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> None:
36-
linter = FriendFileLinter(
37-
LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path("whatever"), None, "a=b"
38-
)
39-
assert linter.source_code() == "a=b"
40-
41-
4222
@pytest.mark.parametrize(
4323
"bom, encoding",
4424
[
@@ -49,11 +29,25 @@ def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> No
4929
(codecs.BOM_UTF32_BE, "utf-32-be"),
5030
],
5131
)
52-
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup) -> None:
32+
def test_file_linter_lints_supported_language_encoded_file_with_bom(
33+
tmp_path, migration_index, mock_path_lookup, bom, encoding
34+
) -> None:
5335
path = tmp_path / "file.py"
5436
path.write_bytes(bom + "a = 12".encode(encoding))
55-
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path)
56-
assert linter.source_code() == "a = 12"
37+
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path, None)
38+
39+
advices = list(linter.lint())
40+
41+
assert not advices
42+
43+
44+
@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
45+
def test_file_linter_lints_not_yet_supported_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
46+
path = tmp_path / path
47+
path.touch()
48+
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
49+
advices = list(linter.lint())
50+
assert [advice.code for advice in advices] == ["unsupported-language"]
5751

5852

5953
@pytest.mark.parametrize(
@@ -75,7 +69,9 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa
7569
".DS_Store", # on MacOS
7670
],
7771
)
78-
def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup) -> None:
72+
def test_file_linter_lints_ignorable_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
73+
path = tmp_path / path
74+
path.touch()
7975
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None)
8076
advices = list(linter.lint())
8177
assert not advices

0 commit comments

Comments
 (0)