
# Commit 369893a: Writing log readme in multiprocess safe way (#794)
## Changes

1. Changed writing of the readme log to be multiprocess safe. Without this, the assessment tasks occasionally fail with the error: `FileExistsError: [Errno 17] File exists: '/Workspace/Users/[email protected]/.ucx/logs/assessment/run-340934288146817/README.md'`
2. Skip the long-running integration test for Redash.
3. Check that job settings and tasks are present while crawling.

### Tests

- [x] manually tested
1 parent: b249e86

5 files changed: 186 additions, 43 deletions

src/databricks/labs/ucx/assessment/azure.py (17 additions, 15 deletions)

```diff
@@ -118,21 +118,23 @@ def _get_azure_spn_list(self, config: dict) -> list:
 
     def _get_cluster_configs_from_all_jobs(self, all_jobs, all_clusters_by_id):
         for j in all_jobs:
-            if j.settings.job_clusters is not None:
-                for jc in j.settings.job_clusters:
-                    if jc.new_cluster is None:
-                        continue
-                    yield j, jc.new_cluster
-
-            for t in j.settings.tasks:
-                if t.existing_cluster_id is not None:
-                    interactive_cluster = all_clusters_by_id.get(t.existing_cluster_id, None)
-                    if interactive_cluster is None:
-                        continue
-                    yield j, interactive_cluster
-
-                elif t.new_cluster is not None:
-                    yield j, t.new_cluster
+            if j.settings is not None:
+                if j.settings.job_clusters is not None:
+                    for jc in j.settings.job_clusters:
+                        if jc.new_cluster is None:
+                            continue
+                        yield j, jc.new_cluster
+
+                if j.settings.tasks is not None:
+                    for t in j.settings.tasks:
+                        if t.existing_cluster_id is not None:
+                            interactive_cluster = all_clusters_by_id.get(t.existing_cluster_id, None)
+                            if interactive_cluster is None:
+                                continue
+                            yield j, interactive_cluster
+
+                        elif t.new_cluster is not None:
+                            yield j, t.new_cluster
 
     def _get_relevant_service_principals(self) -> list:
         relevant_service_principals = []
```
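The extra `if j.settings is not None:` level matters because the Jobs API can return `BaseJob` entries with no `settings` at all; the old generator dereferenced `j.settings.job_clusters` and `j.settings.tasks` unconditionally and crashed the crawl. A minimal sketch of the failure mode, with hypothetical sample data:

```python
# Hypothetical sample data; only the None-handling behavior is the point.
from databricks.sdk.service.jobs import BaseJob

all_jobs = [BaseJob(job_id=42, settings=None)]

for j in all_jobs:
    # Before this commit: j.settings.job_clusters raised
    # AttributeError: 'NoneType' object has no attribute 'job_clusters'
    if j.settings is None:
        continue  # the new guard skips such jobs instead of crashing
    # ... inspect j.settings.job_clusters and j.settings.tasks ...
```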

src/databricks/labs/ucx/assessment/jobs.py (28 additions, 26 deletions)

```diff
@@ -36,21 +36,23 @@ def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
     @staticmethod
     def _get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id):
         for j in all_jobs:
-            if j.settings.job_clusters is not None:
-                for jc in j.settings.job_clusters:
-                    if jc.new_cluster is None:
-                        continue
-                    yield j, jc.new_cluster
-
-            for t in j.settings.tasks:
-                if t.existing_cluster_id is not None:
-                    interactive_cluster = all_clusters_by_id.get(t.existing_cluster_id, None)
-                    if interactive_cluster is None:
-                        continue
-                    yield j, interactive_cluster
-
-                elif t.new_cluster is not None:
-                    yield j, t.new_cluster
+            if j.settings is not None:
+                if j.settings.job_clusters is not None:
+                    for jc in j.settings.job_clusters:
+                        if jc.new_cluster is None:
+                            continue
+                        yield j, jc.new_cluster
+
+                if j.settings.tasks is not None:
+                    for t in j.settings.tasks:
+                        if t.existing_cluster_id is not None:
+                            interactive_cluster = all_clusters_by_id.get(t.existing_cluster_id, None)
+                            if interactive_cluster is None:
+                                continue
+                            yield j, interactive_cluster
+
+                        elif t.new_cluster is not None:
+                            yield j, t.new_cluster
 
     def _crawl(self) -> Iterable[JobInfo]:
         all_jobs = list(self._ws.jobs.list(expand_tasks=True))
@@ -71,17 +73,17 @@ def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> Iterable[
             )
 
             job_settings = job.settings
-            assert job_settings is not None
-            job_name = job_settings.name
-            if not job_name:
-                job_name = "Unknown"
-            job_details[job.job_id] = JobInfo(
-                job_id=str(job.job_id),
-                job_name=job_name,
-                creator=job.creator_user_name,
-                success=1,
-                failures="[]",
-            )
+            if job_settings is not None:
+                job_name = job_settings.name
+                if not job_name:
+                    job_name = "Unknown"
+                job_details[job.job_id] = JobInfo(
+                    job_id=str(job.job_id),
+                    job_name=job_name,
+                    creator=job.creator_user_name,
+                    success=1,
+                    failures="[]",
+                )
 
         for job, cluster_config in self._get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id):
             support_status = spark_version_compatibility(cluster_config.spark_version)
```
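The `_assess_jobs` hunk also changes failure semantics: previously a single job with `settings=None` tripped the `assert` and aborted the whole assessment, whereas now such jobs are simply left out of `job_details`. A toy illustration of the difference, using plain dicts rather than the SDK types:

```python
# Toy data; illustrates assert-vs-guard semantics only.
jobs = [{"job_id": 1, "settings": {"name": "etl"}}, {"job_id": 2, "settings": None}]

def crawl_strict(jobs):  # old behavior: one bad job aborts the crawl
    details = {}
    for job in jobs:
        assert job["settings"] is not None  # AssertionError on job 2
        details[job["job_id"]] = job["settings"]["name"] or "Unknown"
    return details

def crawl_lenient(jobs):  # new behavior: the bad job is skipped
    details = {}
    for job in jobs:
        if job["settings"] is not None:
            details[job["job_id"]] = job["settings"]["name"] or "Unknown"
    return details

assert crawl_lenient(jobs) == {1: "etl"}  # job 2 is silently omitted
```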

src/databricks/labs/ucx/framework/tasks.py (36 additions, 2 deletions)

```diff
@@ -1,12 +1,16 @@
 import logging
+import os
 from collections.abc import Callable
+from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from functools import wraps
 from logging.handlers import TimedRotatingFileHandler
 from pathlib import Path
 
 from databricks.labs.blueprint.logger import install_logger
 from databricks.sdk.core import Config
+from databricks.sdk.retries import retried
 
 from databricks.labs.ucx.__about__ import __version__
 from databricks.labs.ucx.config import WorkspaceConfig
@@ -116,6 +120,36 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
+@retried(on=[FileExistsError], timeout=timedelta(seconds=5))
+def _create_lock(lockfile_name):
+    while True:  # wait until the lock file can be opened
+        f = os.open(lockfile_name, os.O_CREAT | os.O_EXCL)
+        break
+    return f
+
+
+@contextmanager
+def _exclusive_open(filename: str, *args, **kwargs):
+    """Open a file with exclusive access across multiple processes.
+    Requires write access to the directory containing the file.
+
+    Arguments are the same as the built-in open.
+
+    Returns a context manager that closes the file and releases the lock.
+    """
+    lockfile_name = filename + ".lock"
+    lockfile = _create_lock(lockfile_name)
+
+    try:
+        with open(filename, *args, **kwargs) as f:
+            yield f
+    finally:
+        try:
+            os.close(lockfile)
+        finally:
+            os.unlink(lockfile_name)
+
+
 def trigger(*argv):
     args = dict(a[2:].split("=") for a in argv if "--" == a[0:2])
     if "config" not in args:
@@ -168,8 +202,8 @@ def trigger(*argv):
 
     log_readme = log_path.joinpath("README.md")
     if not log_readme.exists():
-        # this may race when run from multiple tasks, but let's accept the risk for now.
-        with log_readme.open(mode="w") as f:
+        # this may race when run from multiple tasks, therefore it must be multiprocess safe
+        with _exclusive_open(str(log_readme), mode="w") as f:
             f.write(f"# Logs for the UCX {current_task.workflow} workflow\n")
             f.write("This folder contains UCX log files.\n\n")
             f.write(f"See the [{current_task.workflow} job](/#job/{job_id}) and ")
```

tests/integration/workspace_access/test_redash.py (2 additions, 0 deletions)

```diff
@@ -1,5 +1,6 @@
 import logging
 from datetime import timedelta
+from unittest import skip
 
 from databricks.sdk.errors import NotFound
 from databricks.sdk.retries import retried
@@ -60,6 +61,7 @@ def test_permissions_for_redash(
 # Redash group permissions are cached for up to 10 mins. If a group is renamed, redash permissions api returns
 # the old name for some time. Therefore, we need to allow at least 10 mins in the timeout for checking the permissions
 # after group rename.
+@skip  # skipping as it takes 5-10 mins to execute
 @retried(on=[NotFound], timeout=timedelta(minutes=13))
 def test_permissions_for_redash_after_group_is_renamed(
     ws,
```

tests/unit/assessment/test_jobs.py (103 additions, 0 deletions)

```diff
@@ -100,6 +100,68 @@ def test_job_assessment():
     assert result_set[1].success == 0
 
 
+def test_job_assessment_no_job_tasks():
+    sample_jobs = [
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="[email protected]",
+            job_id=536591785949415,
+            settings=JobSettings(
+                compute=None,
+                continuous=None,
+                tasks=None,
+                timeout_seconds=0,
+            ),
+        ),
+    ]
+
+    sample_clusters = [
+        ClusterDetails(
+            autoscale=AutoScale(min_workers=1, max_workers=6),
+            spark_conf={"spark.databricks.delta.preview.enabled": "true"},
+            spark_context_id=5134472582179566666,
+            spark_env_vars=None,
+            spark_version="13.3.x-cpu-ml-scala2.12",
+            cluster_id="0810-229933-chicago99",
+            cluster_source=ClusterSource.JOB,
+        ),
+    ]
+    ws = Mock()
+    result_set = JobsCrawler(ws, MockBackend(), "ucx")._assess_jobs(
+        sample_jobs, {c.cluster_id: c for c in sample_clusters}
+    )
+    assert len(result_set) == 1
+    assert result_set[0].success == 1
+
+
+def test_job_assessment_no_job_settings():
+    sample_jobs = [
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="[email protected]",
+            job_id=536591785949415,
+            settings=None,
+        ),
+    ]
+
+    sample_clusters = [
+        ClusterDetails(
+            autoscale=AutoScale(min_workers=1, max_workers=6),
+            spark_conf={"spark.databricks.delta.preview.enabled": "true"},
+            spark_context_id=5134472582179566666,
+            spark_env_vars=None,
+            spark_version="13.3.x-cpu-ml-scala2.12",
+            cluster_id="0810-229933-chicago99",
+            cluster_source=ClusterSource.JOB,
+        ),
+    ]
+    ws = Mock()
+    result_set = JobsCrawler(ws, MockBackend(), "ucx")._assess_jobs(
+        sample_jobs, {c.cluster_id: c for c in sample_clusters}
+    )
+    assert len(result_set) == 0
+
+
 def test_job_assessment_for_azure_spark_config():
     sample_jobs = [
         BaseJob(
@@ -243,6 +305,47 @@ def test_job_assessment_for_azure_spark_config():
     assert result_set[2].success == 0
 
 
+def test_jobs_assessment_with_spn_cluster_no_job_tasks(mocker):
+    sample_jobs = [
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="[email protected]",
+            job_id=536591785949415,
+            settings=JobSettings(
+                compute=None,
+                continuous=None,
+                tasks=None,
+                timeout_seconds=0,
+            ),
+        )
+    ]
+
+    ws = mocker.Mock()
+    ws.clusters.list.return_value = []
+    ws.jobs.list.return_value = sample_jobs
+
+    jobs = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx")._list_all_jobs_with_spn_in_spark_conf()
+    assert len(jobs) == 0
+
+
+def test_jobs_assessment_with_spn_cluster_no_job_settings(mocker):
+    sample_jobs = [
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="[email protected]",
+            job_id=536591785949415,
+            settings=None,
+        )
+    ]
+
+    ws = mocker.Mock()
+    ws.clusters.list.return_value = []
+    ws.jobs.list.return_value = sample_jobs
+
+    jobs = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx")._list_all_jobs_with_spn_in_spark_conf()
+    assert len(jobs) == 0
+
+
 def test_jobs_assessment_with_spn_cluster_policy_not_found(mocker):
     sample_jobs = [
         BaseJob(
```
