From 08222390279a0fe573cc051612ae5cc9836f7fca Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 8 Jun 2025 21:03:56 +0000
Subject: [PATCH 1/2] Add integration tests for `dvc import-url`

This commit introduces a new test file `tests/func/test_import_url_integration.py`
to validate the `dvc import-url` command with various remote storage types.

The tests cover:
- HTTP: Using a local HTTP server to serve a data file.
- AWS S3: Using `moto` to mock an S3 bucket.
- Google Cloud Storage (GCS): Using `testcontainers` with a GCS emulator.
- Azure Blob Storage: Using `testcontainers` with the Azurite emulator.

Each test verifies that `dvc import-url` successfully imports a file from the
respective remote, creates the corresponding .dvc file, and that the content
of the imported file matches the source.

Fixtures are used to set up and tear down the necessary services and
resources for each test.
---
 tests/func/test_import_url_integration.py | 231 ++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 tests/func/test_import_url_integration.py

diff --git a/tests/func/test_import_url_integration.py b/tests/func/test_import_url_integration.py
new file mode 100644
index 0000000000..b2cfdcc0d2
--- /dev/null
+++ b/tests/func/test_import_url_integration.py
@@ -0,0 +1,231 @@
+# End-to-end tests for `dvc import-url` with various remote types.
+ +import os +import pytest +import subprocess +import threading +from http.server import SimpleHTTPRequestHandler, HTTPServer + +import boto3 +from moto import mock_s3 + +from google.cloud import storage +from testcontainers.google import GCloudContainer + +from azure.storage.blob import BlobServiceClient +from testcontainers.azurite import AzuriteContainer + +@pytest.fixture +def http_server(tmp_path): + data_file = tmp_path / "data.txt" + data_file.write_text("test http data") + + class Handler(SimpleHTTPRequestHandler): + def __init__(self, *args, **kwargs): + super().__init__(*args, directory=str(tmp_path), **kwargs) + + host = "localhost" + port = 8000 + server_address = (host, port) + HTTPServer.allow_reuse_address = True + httpd = HTTPServer(server_address, Handler) + + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = True + server_thread.start() + + yield f"http://{host}:{port}" + + httpd.shutdown() + httpd.server_close() + server_thread.join() + +def test_import_url_http(tmp_path, dvc, http_server): + http_url = f"{http_server}/data.txt" + dest_file = tmp_path / "dest.txt" + + result = subprocess.run( + ["dvc", "import-url", http_url, str(dest_file)], + cwd=str(tmp_path), + capture_output=True, + text=True, + ) + assert result.returncode == 0, f"Error: {result.stderr}" + + assert dest_file.exists() + assert dest_file.read_text() == "test http data" + + dvc_file = dest_file.with_suffix(".dvc") + assert dvc_file.exists() + + os.remove(dest_file) + os.remove(dvc_file) + +@pytest.fixture +def s3_bucket(tmp_path): # tmp_path is kept for consistency, though not strictly used by moto for bucket creation + with mock_s3(): + s3 = boto3.client("s3", region_name="us-east-1") + bucket_name = "mybucket" + s3.create_bucket(Bucket=bucket_name) + + data_content = "test s3 data" + s3.put_object(Bucket=bucket_name, Key="data.txt", Body=data_content) + + yield f"s3://{bucket_name}" + # moto handles cleanup when the context manager exits 
+ +def test_import_url_s3(tmp_path, dvc, s3_bucket): + s3_url = f"{s3_bucket}/data.txt" + dest_file = tmp_path / "dest_s3.txt" + + env = os.environ.copy() + env["AWS_ACCESS_KEY_ID"] = "testing" + env["AWS_SECRET_ACCESS_KEY"] = "testing" + env["AWS_SECURITY_TOKEN"] = "testing" + env["AWS_SESSION_TOKEN"] = "testing" + env["AWS_DEFAULT_REGION"] = "us-east-1" + + result = subprocess.run( + ["dvc", "import-url", s3_url, str(dest_file)], + cwd=str(tmp_path), + capture_output=True, + text=True, + env=env, + ) + assert result.returncode == 0, f"Error: {result.stderr}" + + assert dest_file.exists() + assert dest_file.read_text() == "test s3 data" + + dvc_file = dest_file.with_suffix(".dvc") + assert dvc_file.exists() + + os.remove(dest_file) + os.remove(dvc_file) + +@pytest.fixture(scope="session") +def gcs_emulator(): + try: + container = GCloudContainer() + container.start() + os.environ["STORAGE_EMULATOR_HOST"] = container.get_container_host_ip() + ":" + container.get_exposed_port(4443) + # Unset credentials to use anonymous access with emulator, GCLOUD_PROJECT is a dummy project + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" + os.environ["GCLOUD_PROJECT"] = "test-project" + yield container + finally: + if 'container' in locals(): + container.stop() + # Clean up environment variables + os.environ.pop("STORAGE_EMULATOR_HOST", None) + os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None) + os.environ.pop("GCLOUD_PROJECT", None) + +@pytest.fixture +def gcs_bucket(gcs_emulator, tmp_path): # tmp_path can be used if creating local files to upload + # Client automatically uses STORAGE_EMULATOR_HOST from environment + client = storage.Client(project="test-project") # Ensure project matches if client doesn't pick from env + + bucket_name = "mygcsbucket" + try: + bucket = client.create_bucket(bucket_name) + except Exception as e: + # Handle cases where bucket might already exist (e.g. 
from a previous interrupted run if emulator state persists) + if "conflict" in str(e).lower() or "Your previous request to create the named bucket succeeded and you already own it" in str(e): + bucket = client.get_bucket(bucket_name) + else: + raise + + data_content = "test gcs data" + blob = bucket.blob("data.txt") + blob.upload_from_string(data_content) + + yield f"gs://{bucket_name}" + # Emulator is session-scoped; explicit bucket cleanup can be added if needed + # but typically emulator provides fresh state or is torn down. + +def test_import_url_gcs(tmp_path, dvc, gcs_bucket): + gcs_url = f"{gcs_bucket}/data.txt" + dest_file = tmp_path / "dest_gcs.txt" + + # Environment for subprocess, ensuring it also sees the emulator settings + env = os.environ.copy() + # STORAGE_EMULATOR_HOST, GOOGLE_APPLICATION_CREDENTIALS, GCLOUD_PROJECT are set by gcs_emulator + + result = subprocess.run( + ["dvc", "import-url", gcs_url, str(dest_file)], + cwd=str(tmp_path), + capture_output=True, + text=True, + env=env, + ) + assert result.returncode == 0, f"Error: {result.stderr}" + + assert dest_file.exists() + assert dest_file.read_text() == "test gcs data" + + dvc_file = dest_file.with_suffix(".dvc") + assert dvc_file.exists() + + os.remove(dest_file) + os.remove(dvc_file) + +@pytest.fixture(scope="session") +def azurite_emulator(): + try: + container = AzuriteContainer() # Uses default image mcr.microsoft.com/azure-storage/azurite + container.start() + # The AzuriteContainer class provides a method to get the connection string + os.environ["AZURE_STORAGE_CONNECTION_STRING"] = container.get_connection_string() + yield container + finally: + if 'container' in locals(): + container.stop() + os.environ.pop("AZURE_STORAGE_CONNECTION_STRING", None) + +@pytest.fixture +def azure_blob_container_fixture(azurite_emulator, tmp_path): # tmp_path for consistency or local file prep + connection_string = azurite_emulator.get_connection_string() + blob_service_client = 
BlobServiceClient.from_connection_string(connection_string) + + container_name = "myazurecontainer" + try: + container_client = blob_service_client.create_container(container_name) + except Exception as e: # Handle if container already exists (e.g. ResourceExistsError) + if "already exists" in str(e).lower() or "The specified container already exists" in str(e): + container_client = blob_service_client.get_container_client(container_name) + else: + raise + + data_content = "test azure data" + blob_client = container_client.get_blob_client("data.txt") + blob_client.upload_blob(data_content, overwrite=True) + + yield f"azure://{container_name}" + # Emulator is session-scoped; explicit container cleanup can be added if needed. + +def test_import_url_azure(tmp_path, dvc, azure_blob_container_fixture): + azure_url = f"{azure_blob_container_fixture}/data.txt" # Using the renamed fixture + dest_file = tmp_path / "dest_azure.txt" + + # Environment for subprocess, ensuring it sees the emulator settings + env = os.environ.copy() + # AZURE_STORAGE_CONNECTION_STRING should be set by the azurite_emulator fixture + + result = subprocess.run( + ["dvc", "import-url", azure_url, str(dest_file)], + cwd=str(tmp_path), + capture_output=True, + text=True, + env=env, + ) + assert result.returncode == 0, f"Error: {result.stderr}" + + assert dest_file.exists() + assert dest_file.read_text() == "test azure data" + + dvc_file = dest_file.with_suffix(".dvc") + assert dvc_file.exists() + + os.remove(dest_file) + os.remove(dvc_file) From 3375966263cba9a6119a1450b00f9592862418b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Jun 2025 21:04:39 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/func/test_import_url_integration.py | 78 ++++++++++++++++------- 1 file changed, 56 insertions(+), 22 deletions(-) diff --git 
a/tests/func/test_import_url_integration.py b/tests/func/test_import_url_integration.py index b2cfdcc0d2..a8074a5548 100644 --- a/tests/func/test_import_url_integration.py +++ b/tests/func/test_import_url_integration.py @@ -1,19 +1,18 @@ # End-to-end tests for `dvc import-url` with various remote types. import os -import pytest import subprocess import threading -from http.server import SimpleHTTPRequestHandler, HTTPServer +from http.server import HTTPServer, SimpleHTTPRequestHandler import boto3 -from moto import mock_s3 - +import pytest +from azure.storage.blob import BlobServiceClient from google.cloud import storage +from moto import mock_s3 +from testcontainers.azurite import AzuriteContainer from testcontainers.google import GCloudContainer -from azure.storage.blob import BlobServiceClient -from testcontainers.azurite import AzuriteContainer @pytest.fixture def http_server(tmp_path): @@ -40,6 +39,7 @@ def __init__(self, *args, **kwargs): httpd.server_close() server_thread.join() + def test_import_url_http(tmp_path, dvc, http_server): http_url = f"{http_server}/data.txt" dest_file = tmp_path / "dest.txt" @@ -49,6 +49,7 @@ def test_import_url_http(tmp_path, dvc, http_server): cwd=str(tmp_path), capture_output=True, text=True, + check=False, ) assert result.returncode == 0, f"Error: {result.stderr}" @@ -61,8 +62,11 @@ def test_import_url_http(tmp_path, dvc, http_server): os.remove(dest_file) os.remove(dvc_file) + @pytest.fixture -def s3_bucket(tmp_path): # tmp_path is kept for consistency, though not strictly used by moto for bucket creation +def s3_bucket( + tmp_path, +): # tmp_path is kept for consistency, though not strictly used by moto for bucket creation with mock_s3(): s3 = boto3.client("s3", region_name="us-east-1") bucket_name = "mybucket" @@ -74,6 +78,7 @@ def s3_bucket(tmp_path): # tmp_path is kept for consistency, though not strictly yield f"s3://{bucket_name}" # moto handles cleanup when the context manager exits + def test_import_url_s3(tmp_path, 
dvc, s3_bucket): s3_url = f"{s3_bucket}/data.txt" dest_file = tmp_path / "dest_s3.txt" @@ -91,6 +96,7 @@ def test_import_url_s3(tmp_path, dvc, s3_bucket): capture_output=True, text=True, env=env, + check=False, ) assert result.returncode == 0, f"Error: {result.stderr}" @@ -103,35 +109,47 @@ def test_import_url_s3(tmp_path, dvc, s3_bucket): os.remove(dest_file) os.remove(dvc_file) + @pytest.fixture(scope="session") def gcs_emulator(): try: container = GCloudContainer() container.start() - os.environ["STORAGE_EMULATOR_HOST"] = container.get_container_host_ip() + ":" + container.get_exposed_port(4443) + os.environ["STORAGE_EMULATOR_HOST"] = ( + container.get_container_host_ip() + ":" + container.get_exposed_port(4443) + ) # Unset credentials to use anonymous access with emulator, GCLOUD_PROJECT is a dummy project os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "" os.environ["GCLOUD_PROJECT"] = "test-project" yield container finally: - if 'container' in locals(): + if "container" in locals(): container.stop() # Clean up environment variables os.environ.pop("STORAGE_EMULATOR_HOST", None) os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None) os.environ.pop("GCLOUD_PROJECT", None) + @pytest.fixture -def gcs_bucket(gcs_emulator, tmp_path): # tmp_path can be used if creating local files to upload +def gcs_bucket( + gcs_emulator, tmp_path +): # tmp_path can be used if creating local files to upload # Client automatically uses STORAGE_EMULATOR_HOST from environment - client = storage.Client(project="test-project") # Ensure project matches if client doesn't pick from env + client = storage.Client( + project="test-project" + ) # Ensure project matches if client doesn't pick from env bucket_name = "mygcsbucket" try: bucket = client.create_bucket(bucket_name) except Exception as e: # Handle cases where bucket might already exist (e.g. 
from a previous interrupted run if emulator state persists) - if "conflict" in str(e).lower() or "Your previous request to create the named bucket succeeded and you already own it" in str(e): + if ( + "conflict" in str(e).lower() + or "Your previous request to create the named bucket succeeded and you already own it" + in str(e) + ): bucket = client.get_bucket(bucket_name) else: raise @@ -140,10 +158,11 @@ def gcs_bucket(gcs_emulator, tmp_path): # tmp_path can be used if creating local blob = bucket.blob("data.txt") blob.upload_from_string(data_content) - yield f"gs://{bucket_name}" + return f"gs://{bucket_name}" # Emulator is session-scoped; explicit bucket cleanup can be added if needed # but typically emulator provides fresh state or is torn down. + def test_import_url_gcs(tmp_path, dvc, gcs_bucket): gcs_url = f"{gcs_bucket}/data.txt" dest_file = tmp_path / "dest_gcs.txt" @@ -158,6 +177,7 @@ def test_import_url_gcs(tmp_path, dvc, gcs_bucket): capture_output=True, text=True, env=env, + check=False, ) assert result.returncode == 0, f"Error: {result.stderr}" @@ -170,42 +190,55 @@ def test_import_url_gcs(tmp_path, dvc, gcs_bucket): os.remove(dest_file) os.remove(dvc_file) + @pytest.fixture(scope="session") def azurite_emulator(): try: - container = AzuriteContainer() # Uses default image mcr.microsoft.com/azure-storage/azurite + container = ( + AzuriteContainer() + ) # Uses default image mcr.microsoft.com/azure-storage/azurite container.start() # The AzuriteContainer class provides a method to get the connection string - os.environ["AZURE_STORAGE_CONNECTION_STRING"] = container.get_connection_string() + os.environ["AZURE_STORAGE_CONNECTION_STRING"] = ( + container.get_connection_string() + ) yield container finally: - if 'container' in locals(): + if "container" in locals(): container.stop() os.environ.pop("AZURE_STORAGE_CONNECTION_STRING", None) + @pytest.fixture -def azure_blob_container_fixture(azurite_emulator, tmp_path): # tmp_path for consistency or local file 
prep +def azure_blob_container_fixture( + azurite_emulator, tmp_path +): # tmp_path for consistency or local file prep connection_string = azurite_emulator.get_connection_string() blob_service_client = BlobServiceClient.from_connection_string(connection_string) container_name = "myazurecontainer" try: container_client = blob_service_client.create_container(container_name) - except Exception as e: # Handle if container already exists (e.g. ResourceExistsError) - if "already exists" in str(e).lower() or "The specified container already exists" in str(e): + except ( + Exception + ) as e: # Handle if container already exists (e.g. ResourceExistsError) + if "already exists" in str( + e + ).lower() or "The specified container already exists" in str(e): container_client = blob_service_client.get_container_client(container_name) - else: + else: raise data_content = "test azure data" blob_client = container_client.get_blob_client("data.txt") blob_client.upload_blob(data_content, overwrite=True) - yield f"azure://{container_name}" + return f"azure://{container_name}" # Emulator is session-scoped; explicit container cleanup can be added if needed. + def test_import_url_azure(tmp_path, dvc, azure_blob_container_fixture): - azure_url = f"{azure_blob_container_fixture}/data.txt" # Using the renamed fixture + azure_url = f"{azure_blob_container_fixture}/data.txt" # Using the renamed fixture dest_file = tmp_path / "dest_azure.txt" # Environment for subprocess, ensuring it sees the emulator settings @@ -218,6 +251,7 @@ def test_import_url_azure(tmp_path, dvc, azure_blob_container_fixture): capture_output=True, text=True, env=env, + check=False, ) assert result.returncode == 0, f"Error: {result.stderr}"