Commit f0ead4b

Migrates Pipelines crawled during the assessment phase (#2778)
## Changes

Added PipelineMigrator to help with migration of DLT pipelines

### Linked issues

Adds #3098

### Functionality

- [ ] added relevant user documentation
- [x] added a new workflow

### Tests

- [ ] manually tested
- [x] added unit tests
- [x] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent 3094fd2 commit f0ead4b

7 files changed: +488 -0 lines changed

src/databricks/labs/ucx/cli.py

Lines changed: 17 additions & 0 deletions
@@ -886,5 +886,22 @@ def enable_hms_federation(w: WorkspaceClient, _: Prompts, ctx: WorkspaceContext
     ctx.federation_enabler.enable()
 
 
+@ucx.command
+def migrate_dlt_pipelines(
+    w: WorkspaceClient,
+    ctx: WorkspaceContext | None = None,
+    run_as_collection: bool = False,
+    a: AccountClient | None = None,
+) -> None:
+    """Migrate DLT pipelines to UC"""
+    if ctx:
+        workspace_contexts = [ctx]
+    else:
+        workspace_contexts = _get_workspace_contexts(w, a, run_as_collection)
+
+    for workspace_context in workspace_contexts:
+        workspace_context.pipelines_migrator.migrate_pipelines()
+
+
 if __name__ == "__main__":
     ucx()
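
The new command mirrors the other collection-aware UCX commands: it resolves one or more workspace contexts and calls `pipelines_migrator.migrate_pipelines()` on each. A minimal sketch of driving it programmatically (it assumes an existing UCX installation; the WorkspaceContext import path and constructor call mirror how cli.py builds contexts and are not part of this diff):

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.cli import migrate_dlt_pipelines
from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext

w = WorkspaceClient()              # credentials resolved from the environment
ctx = WorkspaceContext(w)          # assumed constructor, as used by other ucx commands
migrate_dlt_pipelines(w, ctx=ctx)  # same effect as ctx.pipelines_migrator.migrate_pipelines()

From a terminal, the command is presumably exposed through the labs CLI as `databricks labs ucx migrate-dlt-pipelines`.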

src/databricks/labs/ucx/contexts/application.py

Lines changed: 17 additions & 0 deletions
@@ -12,6 +12,9 @@
 from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2
 from databricks.labs.lsql.backends import SqlBackend
 
+from databricks.labs.ucx.assessment.jobs import JobsCrawler
+from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler
+from databricks.labs.ucx.hive_metastore.pipelines_migrate import PipelinesMigrator
 from databricks.labs.ucx.recon.data_comparator import StandardDataComparator
 from databricks.labs.ucx.recon.data_profiler import StandardDataProfiler
 from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever
@@ -266,6 +269,10 @@ def udf_ownership(self) -> UdfOwnership:
     def tables_crawler(self) -> TablesCrawler:
         return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
 
+    @cached_property
+    def jobs_crawler(self) -> JobsCrawler:
+        return JobsCrawler(self.workspace_client, self.sql_backend, self.inventory_database)
+
     @cached_property
     def table_ownership(self) -> TableOwnership:
         return TableOwnership(
@@ -337,6 +344,12 @@ def acl_migrator(self):
     def table_ownership_grant_loader(self) -> TableOwnershipGrantLoader:
         return TableOwnershipGrantLoader(self.tables_crawler, self.default_securable_ownership)
 
+    @cached_property
+    def pipelines_migrator(self) -> PipelinesMigrator:
+        return PipelinesMigrator(
+            self.workspace_client, self.pipelines_crawler, self.jobs_crawler, self.config.ucx_catalog
+        )
+
     @cached_property
     def migrate_grants(self) -> MigrateGrants:
         # owner grants have to come first
@@ -365,6 +378,10 @@ def mounts_crawler(self) -> MountsCrawler:
             self.config.enable_hms_federation,
         )
 
+    @cached_property
+    def pipelines_crawler(self) -> PipelinesCrawler:
+        return PipelinesCrawler(self.workspace_client, self.sql_backend, self.inventory_database)
+
     @cached_property
     def azure_service_principal_crawler(self) -> AzureServicePrincipalCrawler:
         return AzureServicePrincipalCrawler(self.workspace_client, self.sql_backend, self.inventory_database)
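
These additions follow the context's existing `@cached_property` wiring: each collaborator is built lazily, once per context, and `pipelines_migrator` reuses the `pipelines_crawler` and `jobs_crawler` instances from the same context. A self-contained sketch of that pattern (the class and attribute names below are illustrative, not the real UCX types):

from functools import cached_property


class ExampleContext:
    @cached_property
    def crawler(self) -> list:
        print("building crawler once")   # runs only on first access
        return []

    @cached_property
    def migrator(self) -> tuple:
        # reuses the crawler instance built (and cached) above
        return ("migrator", self.crawler)


ctx = ExampleContext()
assert ctx.migrator[1] is ctx.crawler    # one lazily built crawler per context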

src/databricks/labs/ucx/hive_metastore/pipelines_migrate.py

Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
+import logging
+from functools import partial
+from typing import BinaryIO
+
+from databricks.labs.blueprint.parallel import Threads
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.errors import DatabricksError
+from databricks.sdk.service.jobs import PipelineTask, Task, JobSettings
+
+from databricks.labs.ucx.assessment.jobs import JobsCrawler
+from databricks.labs.ucx.assessment.pipelines import PipelineInfo, PipelinesCrawler
+
+logger = logging.getLogger(__name__)
+
+
+class PipelinesMigrator:
+    """
+    PipelinesMigrator is responsible for migrating pipelines from HMS to UC
+    It uses the DLT Migration API to migrate the pipelines and also updates the jobs associated with the pipelines if any
+    The class also provides an option to skip the pipelines that are already migrated or the ones that are explicitly skipped
+
+    :param ws: WorkspaceClient
+    :param pipelines_crawler: PipelinesCrawler
+    :param catalog_name: str
+    :param skip_pipeline_ids: list[str] | None
+    """
+
+    def __init__(
+        self,
+        ws: WorkspaceClient,
+        pipelines_crawler: PipelinesCrawler,
+        jobs_crawler: JobsCrawler,
+        catalog_name: str,
+        skip_pipeline_ids: list[str] | None = None,
+    ):
+        self._ws = ws
+        self._pipeline_crawler = pipelines_crawler
+        self._jobs_crawler = jobs_crawler
+        self._catalog_name = catalog_name
+        self._skip_pipeline_ids = skip_pipeline_ids or []
+        self._pipeline_job_tasks_mapping: dict[str, list[dict]] = {}
+
+    def _populate_pipeline_job_tasks_mapping(self) -> None:
+        """
+        Populates the pipeline_job_tasks_mapping dictionary with the pipeline_id as key and the list of jobs associated with the pipeline
+        """
+        jobs = self._jobs_crawler.snapshot()
+
+        for job in jobs:
+            if not job.job_id:
+                continue
+
+            job_details = self._ws.jobs.get(int(job.job_id))
+            if not job_details.settings or not job_details.settings.tasks:
+                continue
+
+            for task in job_details.settings.tasks:
+                if not task.pipeline_task:
+                    continue
+                pipeline_id = task.pipeline_task.pipeline_id
+                job_info = {"job_id": job.job_id, "task_key": task.task_key}
+                if pipeline_id not in self._pipeline_job_tasks_mapping:
+                    self._pipeline_job_tasks_mapping[pipeline_id] = [job_info]
+                else:
+                    self._pipeline_job_tasks_mapping[pipeline_id].append(job_info)
+                logger.info(f"Found job:{job.job_id} task:{task.task_key} associated with pipeline {pipeline_id}")
+
+    def _get_pipelines_to_migrate(self) -> list[PipelineInfo]:
+        """
+        Returns the list of pipelines in the current workspace
+        """
+        return list(self._pipeline_crawler.snapshot())
+
+    def migrate_pipelines(self) -> None:
+        """
+        Migrate the pipelines from HMS to UC. Public method to be called to start the pipeline migration process
+        """
+        self._populate_pipeline_job_tasks_mapping()
+        self._migrate_pipelines()
+
+    def _migrate_pipelines(self) -> list[partial[dict | bool | list | BinaryIO]]:
+        """
+        Create tasks to parallely migrate the pipelines
+        """
+        # get pipelines to migrate
+        pipelines_to_migrate = self._get_pipelines_to_migrate()
+        logger.info(f"Found {len(pipelines_to_migrate)} pipelines to migrate")
+
+        tasks = []
+        for pipeline in pipelines_to_migrate:
+            if pipeline.pipeline_id in self._skip_pipeline_ids:
+                logger.info(f"Skipping pipeline {pipeline.pipeline_id}")
+                continue
+            tasks.append(partial(self._migrate_pipeline, pipeline))
+        if not tasks:
+            return []
+        Threads.strict("migrate pipelines", tasks)
+        return tasks
+
+    def _migrate_pipeline(self, pipeline: PipelineInfo) -> dict | list | BinaryIO | bool:
+        """
+        Private method to clone the pipeline and handle the exceptions
+        """
+        try:
+            return self._clone_pipeline(pipeline)
+        except DatabricksError as e:
+            if "Cloning from Hive Metastore to Unity Catalog is currently not supported" in str(e):
+                logger.error(f"{e}: Please contact Databricks to enable DLT HMS to UC migration API on this workspace")
+                return False
+            logger.error(f"Failed to migrate pipeline {pipeline.pipeline_id}: {e}")
+            return False
+
+    def _clone_pipeline(self, pipeline: PipelineInfo) -> dict | list | BinaryIO:
+        """
+        This method calls the DLT Migration API to clone the pipeline
+        Stop and rename the old pipeline before cloning the new pipeline
+        Call the DLT Migration API to clone the pipeline
+        Update the jobs associated with the pipeline to point to the new pipeline
+        """
+        # Need to get the pipeline again to get the libraries
+        # else updating name fails with libraries not provided error
+        get_pipeline = self._ws.pipelines.get(pipeline.pipeline_id)
+        if get_pipeline.spec:
+            if get_pipeline.spec.catalog:
+                # Skip if the pipeline is already migrated to UC
+                logger.info(f"Pipeline {pipeline.pipeline_id} is already migrated to UC")
+                return []
+
+            # Stop HMS pipeline
+            self._ws.pipelines.stop(pipeline.pipeline_id)
+            # Rename old pipeline first
+            self._ws.pipelines.update(
+                pipeline.pipeline_id,
+                name=f"{pipeline.pipeline_name} [OLD]",
+                clusters=get_pipeline.spec.clusters if get_pipeline.spec.clusters else None,
+                storage=get_pipeline.spec.storage if get_pipeline.spec.storage else None,
+                continuous=get_pipeline.spec.continuous if get_pipeline.spec.continuous else None,
+                deployment=get_pipeline.spec.deployment if get_pipeline.spec.deployment else None,
+                target=get_pipeline.spec.target if get_pipeline.spec.target else None,
+                libraries=get_pipeline.spec.libraries if get_pipeline.spec.libraries else None,
+            )
+
+        # Clone pipeline
+        headers = {
+            'Accept': 'application/json',
+            'Content-Type': 'application/json',
+        }
+        body = {
+            'catalog': self._catalog_name,
+            'clone_mode': 'MIGRATE_TO_UC',
+            'configuration': {'pipelines.migration.ignoreExplicitPath': 'true'},
+            'name': f"{pipeline.pipeline_name}",
+        }
+        res = self._ws.api_client.do(
+            'POST', f'/api/2.0/pipelines/{pipeline.pipeline_id}/clone', body=body, headers=headers
+        )
+        assert isinstance(res, dict)
+        if 'pipeline_id' not in res:
+            logger.warning(f"Failed to clone pipeline {pipeline.pipeline_id}")
+            return res
+
+        # After successful clone, update jobs
+        # Currently there is no SDK method available to migrate the DLT pipelines
+        # We are directly using the DLT Migration API in the interim, once the SDK method is available, we can replace this
+        if pipeline.pipeline_id in self._pipeline_job_tasks_mapping:
+            for pipeline_job_task_mapping in self._pipeline_job_tasks_mapping[pipeline.pipeline_id]:
+                self._ws.jobs.update(
+                    pipeline_job_task_mapping['job_id'],
+                    new_settings=JobSettings(
+                        tasks=[
+                            Task(
+                                pipeline_task=PipelineTask(pipeline_id=str(res.get('pipeline_id'))),
+                                task_key=pipeline_job_task_mapping['task_key'],
+                            )
+                        ]
+                    ),
+                )
+                logger.info(f"Updated job {pipeline_job_task_mapping['job_id']} with new pipeline {res['pipeline_id']}")
+
+        # TODO:
+        # Check the error from UI
+        # BAD_REQUEST: Standard_D4pds_v6 is a Graviton instance and is not compatible with runtime dlt:14.1.21-delta-pipelines-dlt-release-2024.41-rc0-commit-f32d838-image-894c190.
+        return res
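
Taken together, migrate_pipelines() builds the job-to-pipeline mapping, clones each HMS pipeline into the target UC catalog through the /api/2.0/pipelines/{pipeline_id}/clone endpoint, and repoints any jobs that referenced the old pipeline. A minimal usage sketch that mirrors the wiring in contexts/application.py (the warehouse id, inventory schema, catalog name and pipeline id below are placeholders):

from databricks.labs.lsql.backends import StatementExecutionBackend
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.assessment.jobs import JobsCrawler
from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler
from databricks.labs.ucx.hive_metastore.pipelines_migrate import PipelinesMigrator

ws = WorkspaceClient()
sql_backend = StatementExecutionBackend(ws, warehouse_id="<warehouse-id>")  # placeholder warehouse
migrator = PipelinesMigrator(
    ws,
    PipelinesCrawler(ws, sql_backend, "ucx"),   # "ucx" inventory schema is a placeholder
    JobsCrawler(ws, sql_backend, "ucx"),
    catalog_name="main",                        # target UC catalog is a placeholder
    skip_pipeline_ids=["1234-abcd-5678"],       # pipelines to leave untouched
)
migrator.migrate_pipelines()

The crawlers are the same ones used during the assessment phase, which is where the set of pipelines to migrate comes from, per the commit title.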

tests/integration/conftest.py

Lines changed: 6 additions & 0 deletions
@@ -580,6 +580,12 @@ def make_dashboard(self, **kwargs) -> Dashboard:
         self._dashboards.append(dashboard)
         return dashboard
 
+    def make_notebook(self, **kwargs):
+        return self._make_notebook(**kwargs)
+
+    def make_catalog(self, **kwargs):
+        return self._make_catalog(**kwargs)
+
     def make_linting_resources(self) -> None:
         """Make resources to lint."""
         self.make_job(content="spark.read.parquet('dbfs://mnt/notebook/')")
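
These passthroughs expose the underlying notebook and catalog factories on the test context, presumably so the new pipeline-migration integration tests can provision their own source notebook and target UC catalog. A hypothetical test sketch (the runtime_ctx fixture name is an assumption based on this conftest, not something this diff adds):

def test_pipeline_migration_fixtures(runtime_ctx):
    catalog = runtime_ctx.make_catalog()      # disposable UC catalog for the cloned pipeline
    notebook = runtime_ctx.make_notebook()    # disposable notebook to back a DLT pipeline
    assert catalog.name
    assert notebook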
