
Commit 6fe1b60

Merge pull request #3 from databricks-industry-solutions/feat/pytest
add databricks pytest runner. 13 of 14 tests run.
2 parents fd6ffd8 + 8a31d5b commit 6fe1b60

File tree: 3 files changed (+26, −88 lines)

databricks.yml · zipdcm/conftest.py · zipdcm/db_runner.py

databricks.yml

Lines changed: 6 additions & 6 deletions

@@ -30,19 +30,18 @@ resources:
   jobs:
     demo_workflow:
       name: "${var.project_name} - Pytest Workflow"
+      max_concurrent_runs: 4
       tasks:
         - task_key: dbrunner
-          spark_python_task:
-            python_file: /Workspace/Users/douglas.moore@databricks.com/python-data-sources-x/zipdcm/db_runner.py
+          notebook_task:
+            notebook_path: /Workspace/Users/douglas.moore@databricks.com/python-data-sources-x/zipdcm/db_runner
+            source: WORKSPACE
           existing_cluster_id: 0519-014005-pr11dvi3
           libraries:
             - pypi:
                 package: pyspark==4.0.0.dev1
-      git_source:
-        git_url: https://github.com/databricks-industry-solutions/python-data-sources.git
-        git_provider: gitHub
-        git_branch: feat/zipdcm
       tags:
+        dev: douglas_moore
         owner: douglas.moore@databricks.com
         solacc: pixels
       queue:
@@ -56,4 +55,5 @@ resources:
             - pytest==8.3.5
       budget_policy_id: d8e5830d-97cb-40b9-bd65-063434295162
 
+
 # For more options and schema, see: https://docs.databricks.com/aws/en/dev-tools/bundles/settings
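The bundle change above swaps the spark_python_task for a notebook_task pointing at the db_runner notebook in the workspace and drops the git_source block. As a hedged illustration only (not part of this commit), once the bundle is deployed the same job could be triggered programmatically. This sketch assumes the Databricks SDK for Python with its run_now_and_wait helper, credentials available in the environment, and a placeholder job_id looked up from the deployed "Pytest Workflow" job:

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.jobs import RunResultState

# Resolves host/token from environment variables or a configured profile.
workspace = WorkspaceClient()

# Placeholder job_id: substitute the ID of the deployed "Pytest Workflow" job.
run = workspace.jobs.run_now_and_wait(job_id=123456789012345)
if run.state.result_state != RunResultState.SUCCESS:
    raise Exception(f"Pytest workflow failed with state {run.state.result_state}")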

zipdcm/conftest.py

Lines changed: 2 additions & 4 deletions

@@ -11,10 +11,8 @@ def spark() -> SparkSession:
     the cluster in the remote Databricks workspace. Unit tests do not
     have access to this SparkSession by default.
     """
-    #sparkSession = DatabricksSession.builder.getOrCreate()
-    sparkSession = (SparkSession.builder
-        .master("local[*]")
-        .getOrCreate())
+    #sparkSession = DatabricksSession.builder.serverless(True).getOrCreate()
+    sparkSession = (SparkSession.builder.getOrCreate())
     sparkSession.dataSource.register(ZipDCMDataSource)
     return sparkSession
 
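With this change the fixture no longer builds a local[*] session; it reuses whatever SparkSession getOrCreate resolves on the cluster, and the commented-out line hints at a Databricks Connect serverless alternative. As a hedged sketch of how a test would consume the fixture, assuming a hypothetical file zipdcm/smoke_test.py (pytest discovers it because the filename ends in "_test.py" and the function name starts with "test_"):

# Hypothetical smoke test; the `spark` argument is injected by the fixture in conftest.py.
def test_spark_session_is_usable(spark):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.count() == 2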

zipdcm/db_runner.py

Lines changed: 18 additions & 78 deletions

@@ -1,84 +1,24 @@
-import configparser
-import io
+# Databricks notebook source
+import pytest
 import os
+import sys
 
-from databricks.sdk import WorkspaceClient
-from databricks.sdk.service.compute import ClusterSpec, DataSecurityMode, RuntimeEngine
-from databricks.sdk.service.jobs import (
-    GitProvider,
-    GitSource,
-    JobAccessControlRequest,
-    JobPermissionLevel,
-    NotebookTask,
-    RunResultState,
-    Source,
-    Task,
-)
+# Run all tests in the connected directory in the remote Databricks workspace.
+# By default, pytest searches through all files with filenames ending with
+# "_test.py" for tests. Within each of these files, pytest runs each function
+# with a function name beginning with "test_".
 
-WATCH_DOGS_EMAILS = os.environ.get("WATCH_DOGS_EMAILS", "").split(",")
+# Get the path to the directory for this file in the workspace.
+dir_root = os.path.abspath(".")
+print(dir_root)
+# Switch to the root directory.
+os.chdir(dir_root)
 
-#config = configparser.ConfigParser()
-#config.read_file(io.StringIO(os.environ["DB_PROFILES"]))
-#config = config["DEMO"]
-#os.environ["DATABRICKS_HOST"] = config["host"]
-#os.environ["DATABRICKS_TOKEN"] = config["token"]
+# Skip writing .pyc files to the bytecode cache on the cluster.
+sys.dont_write_bytecode = True
 
-branch = os.getenv("GITHUB_HEAD_REF", "main")
+# Now run pytest from the root directory, using the
 
-# Create workspace client using host and token
-workspace = WorkspaceClient()
-user = workspace.current_user.me().user_name
-nodes = [
-    node
-    for node in workspace.clusters.list_node_types().node_types
-    if not node.is_deprecated and node.num_cores == 4.0 and node.is_io_cache_enabled
-]
-acl = [JobAccessControlRequest(user_name=user, permission_level=JobPermissionLevel.IS_OWNER)]
-
-for watcher in WATCH_DOGS_EMAILS:
-    # Check if the watcher is a valid user
-    ww_list = list(
-        workspace.users.list(
-            attributes="id,userName", sort_by="userName", filter=f"userName eq '{watcher}'"
-        )
-    )
-    if len(ww_list) >= 1 and watcher != user:
-        acl.append(
-            JobAccessControlRequest(
-                user_name=watcher,
-                permission_level=JobPermissionLevel.CAN_VIEW,
-            )
-        )
-
-repo_url = "https://github.com/databricks-industry-solutions/python-data-sources.git"
-
-# Define the git source
-git_source = GitSource(git_url=repo_url, git_provider=GitProvider.GIT_HUB, git_branch=branch)
-
-# Define the job cluster
-cluster_spec = ClusterSpec(
-    num_workers=0,
-    spark_version="17.0.x-scala2.13",
-    node_type_id=nodes[0].node_type_id,
-    spark_conf={"spark.master": "local[*, 4]"},
-    data_security_mode=DataSecurityMode.SINGLE_USER,
-    runtime_engine=RuntimeEngine.STANDARD,
-)
-
-# Define the notebook task
-notebook_task = NotebookTask(
-    notebook_path="pytest_databricks",
-    base_parameters={},
-    source=Source.GIT,
-)
-
-# Define the task
-task = Task(task_key="notebook_task", notebook_task=notebook_task, new_cluster=cluster_spec)
-
-# Submit the task
-run_response = workspace.jobs.submit_and_wait(
-    run_name="pixels_gitaction_test", tasks=[task], git_source=git_source, access_control_list=acl
-)
-
-if run_response.state.result_state != RunResultState.SUCCESS:
-    raise Exception(f"Job failed with state {run_response.state.result_state}")
+#
+retcode = pytest.main(["-v", "."])
+dbutils.notebook.exit(retcode)
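Note that dbutils.notebook.exit only reports the pytest return code as the notebook's exit value; the Databricks task itself still completes successfully even if some tests fail. A hedged, hypothetical variant of the final cells that instead fails the task on any non-zero pytest exit code (assuming the same notebook context, where dbutils is provided by the Databricks runtime):

import pytest

retcode = pytest.main(["-v", "."])
if retcode != pytest.ExitCode.OK:
    # Raising here marks the Databricks job run as failed when tests fail.
    raise RuntimeError(f"pytest exited with code {retcode}")
dbutils.notebook.exit(str(retcode))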

0 commit comments
