Commit 59d9218

Improve README.py and logging messages (#433)
Fixes #425
Parent: 11753d9

File tree: 6 files changed (+37, -71 lines)

6 files changed

+37
-71
lines changed

src/databricks/labs/ucx/config.py

Lines changed: 3 additions & 0 deletions
@@ -228,3 +228,6 @@ def from_dict(cls, raw: dict):
 
     def to_workspace_client(self) -> WorkspaceClient:
         return WorkspaceClient(config=self.to_databricks_config())
+
+    def replace_inventory_variable(self, text: str) -> str:
+        return text.replace("$inventory", f"hive_metastore.{self.inventory_database}")
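For context, a quick sketch of what the new helper does. Constructing `WorkspaceConfig` with just `inventory_database` is an assumption for illustration; in practice the value comes from the installer's configuration file:

```python
# Hypothetical usage of WorkspaceConfig.replace_inventory_variable; the
# inventory_database value "ucx" and the bare constructor call are assumptions.
from databricks.labs.ucx.config import WorkspaceConfig

cfg = WorkspaceConfig(inventory_database="ucx")
sql = cfg.replace_inventory_variable("SELECT * FROM $inventory.tables")
assert sql == "SELECT * FROM hive_metastore.ucx.tables"
```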

src/databricks/labs/ucx/framework/tasks.py

Lines changed: 20 additions & 50 deletions
@@ -23,6 +23,18 @@ class Task:
     dashboard: str = None
 
 
+@staticmethod
+def _remove_extra_indentation(doc: str) -> str:
+    lines = doc.splitlines()
+    stripped = []
+    for line in lines:
+        if line.startswith(" " * 4):
+            stripped.append(line[4:])
+        else:
+            stripped.append(line)
+    return "\n".join(stripped)
+
+
 def task(workflow, *, depends_on=None, job_cluster="main", notebook: str | None = None, dashboard: str | None = None):
     def decorator(func):
         @wraps(func)

@@ -59,7 +71,7 @@ def wrapper(*args, **kwargs):
             task_id=len(_TASKS),
             workflow=workflow,
             name=func.__name__,
-            doc=func.__doc__,
+            doc=_remove_extra_indentation(func.__doc__),
             fn=func,
             depends_on=deps,
             job_cluster=job_cluster,

@@ -77,60 +89,18 @@ def trigger(*argv):
     if "config" not in args:
         msg = "no --config specified"
         raise KeyError(msg)
+
     task_name = args.get("task", "not specified")
-    # `{{parent_run_id}}` is the run of entire workflow, whereas `{{run_id}}` is the run of a task
-    workflow_run_id = args.get("parent_run_id", "unknown_run_id")
-    job_id = args.get("job_id")
     if task_name not in _TASKS:
         msg = f'task "{task_name}" not found. Valid tasks are: {", ".join(_TASKS.keys())}'
         raise KeyError(msg)
 
     current_task = _TASKS[task_name]
     print(current_task.doc)
 
-    config_path = Path(args["config"])
-    cfg = WorkspaceConfig.from_file(config_path)
-
-    # see https://docs.python.org/3/howto/logging-cookbook.html
-    databricks_logger = logging.getLogger("databricks")
-    databricks_logger.setLevel(logging.DEBUG)
-
-    ucx_logger = logging.getLogger("databricks.labs.ucx")
-    ucx_logger.setLevel(logging.DEBUG)
-
-    log_path = config_path.parent / "logs" / current_task.workflow / f"run-{workflow_run_id}"
-    log_path.mkdir(parents=True, exist_ok=True)
-
-    log_file = log_path / f"{task_name}.log"
-    file_handler = logging.FileHandler(log_file.as_posix())
-    log_format = "%(asctime)s %(levelname)s [%(name)s] {%(threadName)s} %(message)s"
-    log_formatter = logging.Formatter(fmt=log_format, datefmt="%H:%M:%S")
-    file_handler.setFormatter(log_formatter)
-    file_handler.setLevel(logging.DEBUG)
-
-    console_handler = _install(cfg.log_level)
-    databricks_logger.removeHandler(console_handler)
-    databricks_logger.addHandler(file_handler)
-
-    ucx_logger.info(f"See debug logs at {log_file}")
-
-    log_readme = log_path.joinpath("README.md")
-    if not log_readme.exists():
-        # this may race when run from multiple tasks, but let's accept the risk for now.
-        with log_readme.open(mode="w") as f:
-            f.write(f"# Logs for the UCX {current_task.workflow} workflow\n")
-            f.write("This folder contains UCX log files.\n\n")
-            f.write(f"See the [{current_task.workflow} job](/#job/{job_id}) and ")
-            f.write(f"[run #{workflow_run_id}](/#job/{job_id}/run/{workflow_run_id})\n")
-
-    try:
-        current_task.fn(cfg)
-    except BaseException as error:
-        log_file_for_cli = str(log_file).lstrip("/Workspace")
-        cli_command = f"databricks workspace export /{log_file_for_cli}"
-        ucx_logger.error(f"Task crashed. Execute `{cli_command}` locally to troubleshoot with more details. {error}")
-        databricks_logger.debug("Task crash details", exc_info=error)
-        file_handler.flush()
-        raise
-    finally:
-        file_handler.close()
+    _install()
+
+    cfg = WorkspaceConfig.from_file(Path(args["config"]))
+    logging.getLogger("databricks").setLevel(cfg.log_level)
+
+    current_task.fn(cfg)
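The `_remove_extra_indentation` helper moved here from `install.py` strips one four-space indent level from each docstring line, so task docstrings render flush-left when printed or embedded in the README. A self-contained sketch of the same logic:

```python
# Standalone equivalent of the dedent logic moved into framework/tasks.py:
# drop a single leading 4-space indent from every line that has one.
def remove_extra_indentation(doc: str) -> str:
    return "\n".join(
        line[4:] if line.startswith(" " * 4) else line
        for line in doc.splitlines()
    )

doc = """Iterates over all tables.
    Persists their metadata in the inventory database."""
print(remove_extra_indentation(doc))
# Iterates over all tables.
# Persists their metadata in the inventory database.
```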

src/databricks/labs/ucx/install.py

Lines changed: 2 additions & 14 deletions
@@ -153,7 +153,7 @@ def _create_dashboards(self):
             remote_folder=f"{self._install_folder}/queries",
             name=self._name("UCX Assessment"),
             warehouse_id=self._warehouse_id,
-            query_text_callback=self._replace_inventory_variable,
+            query_text_callback=self._current_config.replace_inventory_variable,
         )
         self._dashboards["assessment"] = dash.create_dashboard()

@@ -351,17 +351,6 @@ def _step_list(cls) -> list[str]:
             step_list.append(task.workflow)
         return step_list
 
-    @staticmethod
-    def _remove_extra_indentation(doc: str) -> str:
-        lines = doc.splitlines()
-        stripped = []
-        for line in lines:
-            if line.startswith(" " * 4):
-                stripped.append(line[4:])
-            else:
-                stripped.append(line)
-        return "\n".join(stripped)
-
     def _create_readme(self):
         md = [
             "# UCX - The Unity Catalog Migration Assistant",

@@ -386,8 +375,7 @@ def _create_readme(self):
         for t in self._sorted_tasks():
             if t.workflow != step_name:
                 continue
-            doc = self._remove_extra_indentation(t.doc)
-            doc = self._replace_inventory_variable(doc)
+            doc = self._current_config.replace_inventory_variable(t.doc)
             md.append(f"### `{t.name}`\n\n")
             md.append(f"{doc}\n")
             md.append("\n\n")

src/databricks/labs/ucx/runtime.py

Lines changed: 8 additions & 3 deletions
@@ -35,7 +35,7 @@ def setup_schema(cfg: WorkspaceConfig):
 def crawl_tables(_: WorkspaceConfig):
     """Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such
     as _database name_, _table name_, _table type_, _table location_, etc., in the Delta table named
-    `${inventory_database}.tables`. The `inventory_database` placeholder is set in the configuration file. The metadata
+    `$inventory_database.tables`. Note that the `inventory_database` is set in the configuration file. The metadata
     stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that
     cannot easily be migrated to Unity Catalog."""

@@ -47,10 +47,10 @@ def setup_tacl(_: WorkspaceConfig):
 
 @task("assessment", depends_on=[crawl_tables, setup_tacl], job_cluster="tacl")
 def crawl_grants(cfg: WorkspaceConfig):
-    """Scans the previously created Delta table named `${inventory_database}.tables` and issues a `SHOW GRANTS`
+    """Scans the previously created Delta table named `$inventory_database.tables` and issues a `SHOW GRANTS`
     statement for every object to retrieve the permissions it has assigned to it. The permissions include information
     such as the _principal_, _action type_, and the _table_ it applies to. This is persisted in the Delta table
-    `${inventory_database}.grants`. Other, migration related jobs use this inventory table to convert the legacy Table
+    `$inventory_database.grants`. Other, migration related jobs use this inventory table to convert the legacy Table
     ACLs to Unity Catalog permissions.
 
     Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table

@@ -126,8 +126,10 @@ def assess_pipelines(cfg: WorkspaceConfig):
     """This module scans through all the Pipelines and identifies those pipelines which has Azure Service Principals
     embedded (who has been given access to the Azure storage accounts via spark configurations) in the pipeline
     configurations.
+
     It looks for:
     - all the pipelines which has Azure Service Principal embedded in the pipeline configuration
+
     Subsequently, a list of all the pipelines with matching configurations are stored in the
     `$inventory.pipelines` table."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())

@@ -140,8 +142,10 @@ def assess_azure_service_principals(cfg: WorkspaceConfig):
     """This module scans through all the clusters configurations, cluster policies, job cluster configurations,
     Pipeline configurations, Warehouse configuration and identifies all the Azure Service Principals who has been
     given access to the Azure storage accounts via spark configurations referred in those entities.
+
     It looks in:
     - all those entities and prepares a list of Azure Service Principal embedded in their configurations
+
     Subsequently, the list of all the Azure Service Principals referred in those configurations are saved
     in the `$inventory.azure_service_principals` table."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())

@@ -153,6 +157,7 @@
 def assess_global_init_scripts(cfg: WorkspaceConfig):
     """This module scans through all the global init scripts and identifies if there is an Azure Service Principal
     who has been given access to the Azure storage accounts via spark configurations referred in those scripts.
+
     It looks in:
     - the list of all the global init scripts are saved in the `$inventory.azure_service_principals` table."""
     ws = WorkspaceClient(config=cfg.to_databricks_config())
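These docstrings double as the per-task documentation in the generated README, which is why their placeholder spelling matters. A hypothetical task declaration in the same style shows how the pieces fit together (the task itself is made up, written as if it were added inside `runtime.py` where `@task` and `crawl_tables` are already in scope):

```python
# Hypothetical task mirroring the @task usage above. Its docstring is
# dedented by _remove_extra_indentation and its $inventory_database
# placeholder expanded when the installer generates the README.
@task("assessment", depends_on=[crawl_tables])
def count_inventory_tables(cfg: WorkspaceConfig):
    """Counts the rows in the `$inventory_database.tables` inventory table.
    Continuation lines like this one are dedented before rendering."""
    ...
```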

tests/unit/assessment/test_dashboard.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def test_dashboard(mocker):
         remote_folder="/users/not_a_real_user/queries",
         name="Assessment",
         warehouse_id="000000",
-        query_text_callback=installer._replace_inventory_variable,
+        query_text_callback=installer._current_config.replace_inventory_variable,
     )
     dashboard = dash.create_dashboard()
     assert dashboard is not None

tests/unit/test_install.py

Lines changed: 3 additions & 3 deletions
@@ -359,9 +359,9 @@ def test_create_readme(mocker):
 
 
 def test_replace_pydoc(mocker):
-    ws = mocker.Mock()
-    install = WorkspaceInstaller(ws)
-    doc = install._remove_extra_indentation(
+    from databricks.labs.ucx.framework.tasks import _remove_extra_indentation
+
+    doc = _remove_extra_indentation(
         """Test1
         Test2
         Test3"""
