Skip to content

Commit a320c69

Browse files
committed
Add integration test for the new Files API client
1 parent e82e5f8 commit a320c69

File tree

2 files changed

+138
-1
lines changed

2 files changed

+138
-1
lines changed

databricks/sdk/mixins/files.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,7 @@ def perform():
10901090
# a 503 or 500 response, then you need to resume the interrupted upload from where it left off.
10911091

10921092
# Let's follow that for all potentially retryable status codes.
1093+
# Together with the catch block below we replicate the logic in _retry_idempotent_operation().
10931094
if upload_response.status_code in self._RETRYABLE_STATUS_CODES:
10941095
if retry_count < self._config.multipart_upload_max_retries:
10951096
retry_count += 1
@@ -1100,7 +1101,7 @@ def perform():
11001101
retry_count = 0
11011102

11021103
except RequestException as e:
1103-
# Let's do the same for retryable network errors
1104+
# Let's do the same for retryable network errors.
11041105
if _BaseClient._is_retryable(e) and retry_count < self._config.multipart_upload_max_retries:
11051106
retry_count += 1
11061107
upload_response = retrieve_upload_status()

tests/integration/test_files.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
1+
import datetime
12
import io
23
import logging
34
import pathlib
45
import platform
6+
import re
57
import time
8+
from textwrap import dedent
69
from typing import Callable, List, Tuple, Union
710

811
import pytest
912

1013
from databricks.sdk.core import DatabricksError
14+
from databricks.sdk.errors.sdk import OperationFailed
1115
from databricks.sdk.service.catalog import VolumeType
1216

1317

@@ -382,3 +386,135 @@ def test_files_api_download_benchmark(ucws, files_api, random):
382386
)
383387
min_str = str(best[0]) + "kb" if best[0] else "None"
384388
logging.info("Fastest chunk size: %s in %f seconds", min_str, best[1])
389+
390+
391+
@pytest.mark.parametrize("is_serverless", [True, False], ids=["Classic", "Serverless"])
@pytest.mark.parametrize("use_new_files_api_client", [True, False], ids=["Default client", "Experimental client"])
def test_files_api_in_cluster(ucws, random, env_or_skip, is_serverless, use_new_files_api_client):
    """Round-trip a 100 MiB file through the Files API from inside a job run.

    Generates a launcher script, uploads it to DBFS, and submits it as a
    one-off job on either classic or serverless compute. The script uploads
    random content to a UC volume, downloads it back, and checks length and
    SHA-256. This test then greps the task output to assert that the expected
    Files API client class (FilesExt when the experimental-client env var is
    set, FilesAPI otherwise) was actually used on the cluster.
    """
    from databricks.sdk.service import compute, jobs

    sdk_package = "databricks-sdk"
    experimental_client_env_var = "DATABRICKS_ENABLE_EXPERIMENTAL_FILES_API_CLIENT"

    launcher_file_path = f"/home/{ucws.current_user.me().user_name}/test_launcher.py"

    schema = "filesit-" + random()
    volume = "filesit-" + random()
    with ResourceWithCleanup.create_schema(ucws, "main", schema), ResourceWithCleanup.create_volume(
        ucws, "main", schema, volume
    ):
        cloud_file_path = f"/Volumes/main/{schema}/{volume}/test-{random()}.txt"
        file_size = 100 * 1024 * 1024

        # Decide which client implementation the launcher should activate and
        # which class name it is expected to report back in its output.
        if use_new_files_api_client:
            enable_new_files_api_env = f"os.environ['{experimental_client_env_var}'] = 'True'"
            expected_files_api_client_class = "FilesExt"
        else:
            enable_new_files_api_env = ""
            expected_files_api_client_class = "FilesAPI"

        # Marker the launcher prints; the class name follows it on the same line.
        using_files_api_client_msg = "Using files API client: "

        # Script executed on the cluster. It prints progress markers that this
        # test later extracts from the task run logs.
        launcher_source = f"""
        from databricks.sdk import WorkspaceClient
        import io
        import os
        import hashlib
        import logging

        logging.basicConfig(level=logging.DEBUG)

        {enable_new_files_api_env}

        file_size = {file_size}
        original_content = os.urandom(file_size)
        cloud_file_path = '{cloud_file_path}'

        w = WorkspaceClient()
        print(f"Using SDK: {{w.config._product_info}}")

        print(f"{using_files_api_client_msg}{{type(w.files).__name__}}")

        w.files.upload(cloud_file_path, io.BytesIO(original_content), overwrite=True)
        print("Upload succeeded")

        response = w.files.download(cloud_file_path)
        resulting_content = response.contents.read()
        print("Download succeeded")

        def hash(data: bytes):
            sha256 = hashlib.sha256()
            sha256.update(data)
            return sha256.hexdigest()

        if len(resulting_content) != len(original_content):
            raise ValueError(f"Content length does not match: expected {{len(original_content)}}, actual {{len(resulting_content)}}")

        expected_hash = hash(original_content)
        actual_hash = hash(resulting_content)
        if actual_hash != expected_hash:
            raise ValueError(f"Content hash does not match: expected {{expected_hash}}, actual {{actual_hash}}")

        print(f"Contents of size {{len(resulting_content)}} match")
        """

        with ucws.dbfs.open(launcher_file_path, write=True, overwrite=True) as f:
            f.write(dedent(launcher_source).encode())

        if is_serverless:
            # With no job_cluster_key, existing_cluster_id, or new_cluster on
            # the task, it runs on serverless compute; the SDK dependency is
            # then declared via a job environment.
            cluster_spec = None
            env_key = "test_env"
            job_environments = [jobs.JobEnvironment(env_key, compute.Environment("test", [sdk_package]))]
            task_libraries = []
        else:
            cluster_spec = compute.ClusterSpec(
                spark_version=ucws.clusters.select_spark_version(long_term_support=True),
                instance_pool_id=env_or_skip("TEST_INSTANCE_POOL_ID"),
                num_workers=1,
            )
            # On classic compute the SDK dependency is attached as a task library.
            env_key = None
            job_environments = []
            task_libraries = [compute.Library(pypi=compute.PythonPyPiLibrary(package=sdk_package))]

        waiter = ucws.jobs.submit(
            run_name=f"py-sdk-{random(8)}",
            tasks=[
                jobs.SubmitTask(
                    task_key="task1",
                    new_cluster=cluster_spec,
                    spark_python_task=jobs.SparkPythonTask(python_file=f"dbfs:{launcher_file_path}"),
                    libraries=task_libraries,
                    environment_key=env_key,
                )
            ],
            environments=job_environments,
        )

        def report_progress(r: jobs.Run):
            # Periodic callback from waiter.result(): log each task's state.
            statuses = [f"{t.task_key}: {t.state.life_cycle_state}" for t in r.tasks]
            logging.info(f'Run status: {", ".join(statuses)}')

        logging.info(f"Waiting for the job run: {waiter.run_id}")
        try:
            job_run = waiter.result(timeout=datetime.timedelta(minutes=15), callback=report_progress)
            task_run_id = job_run.tasks[0].run_id
            task_run_logs = ucws.jobs.get_run_output(task_run_id).logs
            logging.info(f"Run finished, output: {task_run_logs}")
            # The launcher prints the concrete client class right after the
            # marker; verify it matches the variant under test.
            match = re.search(f"{using_files_api_client_msg}(.*)$", task_run_logs, re.MULTILINE)
            assert match is not None
            assert match.group(1) == expected_files_api_client_class
        except OperationFailed:
            # Surface the remote error and logs instead of the generic failure.
            job_run = ucws.jobs.get_run(waiter.run_id)
            task_run_id = job_run.tasks[0].run_id
            task_run_output = ucws.jobs.get_run_output(task_run_id)
            raise ValueError(
                f"Run failed, error: {task_run_output.error}, error trace: {task_run_output.error_trace}, output: {task_run_output.logs}"
            )

0 commit comments

Comments (0)