diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b5ef770 Binary files /dev/null and b/.gitattributes differ diff --git a/databusclient/client.py b/databusclient/client.py index 764bf6b..4f2f591 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -12,6 +12,15 @@ __debug = False +def __compute_file_sha256(filepath: str) -> str: + """Computes the SHA256 hex digest for a file.""" + sha256_hash = hashlib.sha256() + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256_hash.update(chunk) + return sha256_hash.hexdigest() + + class DeployError(Exception): """Raised if deploy fails""" @@ -28,6 +37,14 @@ class DeployLogLevel(Enum): debug = 2 +class ShaValidationMode(Enum): + """Controls the SHA256 validation behavior""" + + OFF = 0 # Skip validation + WARNING = 1 # Print a warning on mismatch + ERROR = 2 # Raise an error on mismatch + + def __get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]: args = distribution_str.split("|") @@ -316,7 +333,7 @@ def create_dataset( "@type": "Artifact", "title": title, "abstract": abstract, - "description": description + "description": description, } graphs.append(artifact_graph) @@ -393,7 +410,15 @@ def deploy( print(resp.text) -def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: +def __download_file__( + url, + filename, + vault_token_file=None, + auth_url=None, + client_id=None, + expected_sha256=None, + validation_mode: ShaValidationMode = ShaValidationMode.WARNING, +) -> None: """ Download a file from the internet with a progress bar using tqdm. @@ -403,11 +428,8 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien - vault_token_file: Path to Vault refresh token file - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange - - Steps: - 1. Try direct GET without Authorization header. - 2. If server responds with WWW-Authenticate: Bearer, 401 Unauthorized) or url starts with "https://data.dbpedia.io/databus.dbpedia.org", - then fetch Vault access token and retry with Authorization header. + - expected_sha256: The expected SHA256 checksum for validation + - validation_mode: Enum (OFF, WARNING, ERROR) to control validation behavior """ print(f"Download file: {url}") @@ -417,15 +439,25 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien # --- 1. Get redirect URL by requesting HEAD --- response = requests.head(url, stream=True) # Check for redirect and update URL if necessary - if response.headers.get("Location") and response.status_code in [301, 302, 303, 307, 308]: + if response.headers.get("Location") and response.status_code in [ + 301, + 302, + 303, + 307, + 308, + ]: url = response.headers.get("Location") print("Redirects url: ", url) # --- 2. Try direct GET --- - response = requests.get(url, stream=True, allow_redirects=False) # no redirects here, we want to see if auth is required - www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth - - if (response.status_code == 401 or "bearer" in www.lower()): + response = requests.get( + url, stream=True, allow_redirects=False + ) # no redirects here, we want to see if auth is required + www = response.headers.get( + "WWW-Authenticate", "" + ) # get WWW-Authenticate header if present to check for Bearer auth + + if response.status_code == 401 or "bearer" in www.lower(): print(f"Authentication required for {url}") if not (vault_token_file): raise ValueError("Vault token file not given for protected download") @@ -435,8 +467,16 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien headers = {"Authorization": f"Bearer {vault_token}"} # --- 4. Retry with token --- + # This request correctly allows redirects (default) response = requests.get(url, headers=headers, stream=True) + # Handle 3xx redirects for non-authed requests (e.g., S3 presigned URLs) + elif response.is_redirect: + redirect_url = response.headers.get("Location") + print(f"Following redirect to {redirect_url}") + # Make a new request that *does* follow any further redirects + response = requests.get(redirect_url, stream=True, allow_redirects=True) + try: response.raise_for_status() # Raise if still failing except requests.exceptions.HTTPError as e: @@ -446,24 +486,36 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien else: raise e - total_size_in_bytes = int(response.headers.get('content-length', 0)) + total_size_in_bytes = int(response.headers.get("content-length", 0)) block_size = 1024 # 1 KiB - progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) - with open(filename, 'wb') as file: + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(filename, "wb") as file: for data in response.iter_content(block_size): progress_bar.update(len(data)) file.write(data) progress_bar.close() + # Validate checksum if expected hash is provided and validation is not OFF + if expected_sha256 and validation_mode != ShaValidationMode.OFF: + actual_sha256 = __compute_file_sha256(filename) + if actual_sha256 != expected_sha256: + mismatch_msg = f"SHA256 mismatch for {filename}\nExpected: {expected_sha256}\nActual: {actual_sha256}" + if validation_mode == ShaValidationMode.ERROR: + raise ValueError(mismatch_msg) + elif validation_mode == ShaValidationMode.WARNING: + print(f"\nWARNING: {mismatch_msg}\n") + # Don't raise, just print and continue + else: + print(f"SHA256 validated for {filename}") + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: raise IOError("Downloaded size does not match Content-Length header") -def __get_vault_access__(download_url: str, - token_file: str, - auth_url: str, - client_id: str) -> str: +def __get_vault_access__( + download_url: str, token_file: str, auth_url: str, client_id: str +) -> str: """ Get Vault access token for a protected databus download. """ @@ -478,31 +530,37 @@ def __get_vault_access__(download_url: str, print(f"Warning: token from {token_file} is short (<80 chars)") # 2. Refresh token -> access token - resp = requests.post(auth_url, data={ - "client_id": client_id, - "grant_type": "refresh_token", - "refresh_token": refresh_token - }) + resp = requests.post( + auth_url, + data={ + "client_id": client_id, + "grant_type": "refresh_token", + "refresh_token": refresh_token, + }, + ) resp.raise_for_status() access_token = resp.json()["access_token"] # 3. Extract host as audience # Remove protocol prefix if download_url.startswith("https://"): - host_part = download_url[len("https://"):] + host_part = download_url[len("https://") :] elif download_url.startswith("http://"): - host_part = download_url[len("http://"):] + host_part = download_url[len("http://") :] else: host_part = download_url audience = host_part.split("/")[0] # host is before first "/" # 4. Access token -> Vault token - resp = requests.post(auth_url, data={ - "client_id": client_id, - "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange", - "subject_token": access_token, - "audience": audience - }) + resp = requests.post( + auth_url, + data={ + "client_id": client_id, + "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange", + "subject_token": access_token, + "audience": audience, + }, + ) resp.raise_for_status() vault_token = resp.json()["access_token"] @@ -522,40 +580,68 @@ def __query_sparql__(endpoint_url, query) -> dict: - Dictionary containing the query results """ sparql = SPARQLWrapper(endpoint_url) - sparql.method = 'POST' + sparql.method = "POST" sparql.setQuery(query) sparql.setReturnFormat(JSON) results = sparql.query().convert() return results -def __handle_databus_file_query__(endpoint_url, query) -> List[str]: +def __handle_databus_file_query__( + endpoint_url, query +) -> List[Tuple[str, Optional[str]]]: result_dict = __query_sparql__(endpoint_url, query) - for binding in result_dict['results']['bindings']: - if len(binding.keys()) > 1: - print("Error multiple bindings in query response") - break + for binding in result_dict["results"]["bindings"]: + # Attempt to find file URL and sha + file_url = None + sha = None + + # Try common variable names for the file URL + if "file" in binding: + file_url = binding["file"]["value"] + elif "downloadURL" in binding: + file_url = binding["downloadURL"]["value"] + elif len(binding.keys()) >= 1: # Fallback to original-like behavior + file_url = binding[next(iter(binding.keys()))]["value"] + + # Try common variable names for the checksum + if "sha" in binding: + sha = binding["sha"]["value"] + elif "sha256sum" in binding: + sha = binding["sha256sum"]["value"] + + if file_url: + yield (file_url, sha) else: - value = binding[next(iter(binding.keys()))]['value'] - yield value + print(f"Warning: Could not determine file URL from query binding: {binding}") -def __handle_databus_artifact_version__(json_str: str) -> List[str]: +def __handle_databus_artifact_version__( + json_str: str, +) -> List[Tuple[str, Optional[str]]]: """ - Parse the JSON-LD of a databus artifact version to extract download URLs. + Parse the JSON-LD of a databus artifact version to extract download URLs and SHA256 sums. Don't get downloadURLs directly from the JSON-LD, but follow the "file" links to count access to databus accurately. - Returns a list of download URLs. + Returns a list of (download_url, sha256sum) tuples. """ - databusIdUrl = [] + databus_files = [] json_dict = json.loads(json_str) graph = json_dict.get("@graph", []) for node in graph: if node.get("@type") == "Part": - id = node.get("file") - databusIdUrl.append(id) - return databusIdUrl + # Use the 'file' link as per the original comment + url = node.get("file") + if not url: + continue + + # Extract the sha256sum from the same node + # This key is used in your create_dataset function + sha = node.get("sha256sum") + + databus_files.append((url, sha)) + return databus_files def __get_databus_latest_version_of_artifact__(json_str: str) -> str: @@ -601,7 +687,7 @@ def __get_databus_artifacts_of_group__(json_str: str) -> List[str]: def wsha256(raw: str): - return sha256(raw.encode('utf-8')).hexdigest() + return sha256(raw.encode("utf-8")).hexdigest() def __handle_databus_collection__(uri: str) -> str: @@ -614,25 +700,46 @@ def __get_json_ld_from_databus__(uri: str) -> str: return requests.get(uri, headers=headers).text -def __download_list__(urls: List[str], - localDir: str, - vault_token_file: str = None, - auth_url: str = None, - client_id: str = None) -> None: - for url in urls: +def __download_list__( + files_to_download: List[Tuple[str, Optional[str]]], + localDir: str, + vault_token_file: str = None, + auth_url: str = None, + client_id: str = None, + validation_mode: ShaValidationMode = ShaValidationMode.WARNING, +) -> None: + for url, expected_sha in files_to_download: if localDir is None: - host, account, group, artifact, version, file = __get_databus_id_parts__(url) - localDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") + host, account, group, artifact, version, file = __get_databus_id_parts__( + url + ) + localDir = os.path.join( + os.getcwd(), + account, + group, + artifact, + version if version is not None else "latest", + ) print(f"Local directory not given, using {localDir}") file = url.split("/")[-1] filename = os.path.join(localDir, file) print("\n") - __download_file__(url=url, filename=filename, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id) + __download_file__( + url=url, + filename=filename, + vault_token_file=vault_token_file, + auth_url=auth_url, + client_id=client_id, + expected_sha256=expected_sha, # <-- Pass the SHA hash here + validation_mode=validation_mode, # <-- Pass the validation mode here + ) print("\n") -def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: +def __get_databus_id_parts__( + uri: str, +) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: uri = uri.removeprefix("https://").removeprefix("http://") parts = uri.strip("/").split("/") parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts @@ -645,7 +752,8 @@ def download( databusURIs: List[str], token=None, auth_url=None, - client_id=None + client_id=None, + validation_mode: ShaValidationMode = ShaValidationMode.WARNING, ) -> None: """ Download datasets to local storage from databus registry. If download is on vault, vault token will be used for downloading protected files. @@ -656,11 +764,14 @@ def download( token: Path to Vault refresh token file auth_url: Keycloak token endpoint URL client_id: Client ID for token exchange + validation_mode: (OFF, WARNING, ERROR) controls SHA256 validation behavior. Default is WARNING. """ # TODO: make pretty for databusURI in databusURIs: - host, account, group, artifact, version, file = __get_databus_id_parts__(databusURI) + host, account, group, artifact, version, file = __get_databus_id_parts__( + databusURI + ) # dataID or databus collection if databusURI.startswith("http://") or databusURI.startswith("https://"): @@ -673,15 +784,37 @@ def download( if "/collections/" in databusURI: # TODO "in" is not safe! there could be an artifact named collections, need to check for the correct part position in the URI query = __handle_databus_collection__(databusURI) res = __handle_databus_file_query__(endpoint, query) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__( + res, + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) # databus file elif file is not None: - __download_list__([databusURI], localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + # Pass (url, None) to match the new signature + __download_list__( + [(databusURI, None)], + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) # databus artifact version elif version is not None: json_str = __get_json_ld_from_databus__(databusURI) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__( + res, + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) # databus artifact elif artifact is not None: json_str = __get_json_ld_from_databus__(databusURI) @@ -689,7 +822,14 @@ def download( print(f"No version given, using latest version: {latest}") json_str = __get_json_ld_from_databus__(latest) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__( + res, + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) # databus group elif group is not None: @@ -702,7 +842,14 @@ def download( print(f"No version given, using latest version: {latest}") json_str = __get_json_ld_from_databus__(latest) res = __handle_databus_artifact_version__(json_str) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__( + res, + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) # databus account elif account is not None: @@ -718,4 +865,11 @@ def download( if endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = __handle_databus_file_query__(endpoint, databusURI) - __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + __download_list__( + res, + localDir, + vault_token_file=token, + auth_url=auth_url, + client_id=client_id, + validation_mode=validation_mode, + ) \ No newline at end of file diff --git a/tests/test_databusclient.py b/tests/test_databusclient.py index 202ac16..ecf6c9a 100644 --- a/tests/test_databusclient.py +++ b/tests/test_databusclient.py @@ -1,100 +1,180 @@ -"""Client tests""" import pytest -from databusclient.client import create_dataset, create_distribution, __get_file_info -from collections import OrderedDict - - -EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml" - -@pytest.mark.skip(reason="temporarily disabled since code needs fixing") -def test_distribution_cases(): - - metadata_args_with_filler = OrderedDict() - - metadata_args_with_filler["type=config_source=databus"] = "" - metadata_args_with_filler["yml"] = None - metadata_args_with_filler["none"] = None - metadata_args_with_filler[ - "79582a2a7712c0ce78a74bb55b253dc2064931364cf9c17c827370edf9b7e4f1:56737" - ] = None - - # test by leaving out an argument each - artifact_name = "databusclient-pytest" - uri = "https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml" - parameters = list(metadata_args_with_filler.keys()) - - for i in range(0, len(metadata_args_with_filler.keys())): - - if i == 1: - continue - - dst_string = f"{uri}" - for j in range(0, len(metadata_args_with_filler.keys())): - if j == i: - replacement = metadata_args_with_filler[parameters[j]] - if replacement is None: - pass - else: - dst_string += f"|{replacement}" - else: - dst_string += f"|{parameters[j]}" - - print(f"{dst_string=}") - ( - name, - cvs, - formatExtension, - compression, - sha256sum, - content_length, - ) = __get_file_info(artifact_name, dst_string) - - created_dst_str = create_distribution( - uri, cvs, formatExtension, compression, (sha256sum, content_length) +import requests_mock +import os +import hashlib +from unittest.mock import patch + +# Import the functions and classes from your client.py file +# This assumes test_client.py is in a parent folder of databusclient +# Adjust the import if your directory structure is different +from databusclient.client import download, ShaValidationMode, __get_json_ld_from_databus__ + +# --- Mock Data --- + +# This is the fake content we will "download" +MOCK_FILE_CONTENT = b"This is the actual file content." +# This is the CORRECT hash for the content above +CORRECT_SHA256 = hashlib.sha256(MOCK_FILE_CONTENT).hexdigest() +# This is a FAKE hash that we will use to trigger a mismatch +INCORRECT_SHA256 = "this_is_a_fake_hash_that_will_not_match" + +# The Databus Artifact URL we will be "querying" +ARTIFACT_URL = "https://example.databus.com/my-account/my-group/my-artifact/2025-10-31" +# The "file" URL that the artifact metadata points to +FILE_URL = "https://example.databus.com/my-account/my-group/my-artifact/2025-10-31/my-file.ttl" + + +def get_mock_jsonld(sha_hash_to_use): + """Helper to generate mock JSON-LD with a specific hash.""" + return { + "@context": "https://downloads.dbpedia.org/databus/context.jsonld", + "@graph": [ + { + "@type": "Part", + "file": FILE_URL, + "sha256sum": sha_hash_to_use + } + ] + } + + +# --- Pytest Tests --- + +@pytest.fixture +def mock_file_download(requests_mock, tmp_path): + """ + A pytest fixture to set up ONLY the file download mock. + The metadata mock (which differs for each test) will be set up by the test itself. + """ + + # 1. Mock the file download itself (this is the same for all tests) + requests_mock.head(FILE_URL, headers={"Content-Length": str(len(MOCK_FILE_CONTENT))}) + requests_mock.get(FILE_URL, content=MOCK_FILE_CONTENT) + + # Provide the temporary path to the test + return tmp_path + + +# We patch 'builtins.print' to capture the console output +@patch('builtins.print') +def test_sha_mismatch_error(mock_print, mock_file_download, requests_mock): + """ + Tests that validation_mode=ERROR stops execution (raises ValueError) on mismatch. + """ + print("\n--- Testing SHA Mismatch with Mode: ERROR ---") + local_dir = mock_file_download + + # Set up the *specific* metadata mock for THIS test + requests_mock.get( + ARTIFACT_URL, + json=get_mock_jsonld(INCORRECT_SHA256), # Use INCORRECT hash + headers={"Accept": "application/ld+json"} + ) + + # We expect this to fail with a ValueError + with pytest.raises(ValueError) as e: + download( + localDir=str(local_dir), + endpoint=None, # Will be auto-detected + databusURIs=[ARTIFACT_URL], + validation_mode=ShaValidationMode.ERROR ) - assert dst_string == created_dst_str + # Check that the error message is correct + assert "SHA256 mismatch" in str(e.value) -@pytest.mark.skip(reason="temporarily disabled since code needs fixing") -def test_empty_cvs(): +@patch('builtins.print') +def test_sha_mismatch_warning(mock_print, mock_file_download, requests_mock): + """ + Tests that validation_mode=WARNING prints a warning but does NOT stop execution. + """ + print("\n--- Testing SHA Mismatch with Mode: WARNING ---") + local_dir = mock_file_download - dst = [create_distribution(url=EXAMPLE_URL, cvs={})] + # Set up the *specific* metadata mock for THIS test + requests_mock.get( + ARTIFACT_URL, + json=get_mock_jsonld(INCORRECT_SHA256), # Use INCORRECT hash + headers={"Accept": "application/ld+json"} + ) - dataset = create_dataset( - version_id="https://dev.databus.dbpedia.org/user/group/artifact/1970.01.01/", - title="Test Title", - abstract="Test abstract blabla", - description="Test description blabla", - license_url="https://license.url/test/", - distributions=dst, + # We expect this to run without raising an error + try: + download( + localDir=str(local_dir), + endpoint=None, + databusURIs=[ARTIFACT_URL], + validation_mode=ShaValidationMode.WARNING + ) + except ValueError: + pytest.fail("ValidationMode.WARNING raised a ValueError when it should not have.") + + # Check that the warning was printed to the console + printed_output = "\n".join([call.args[0] for call in mock_print.call_args_list if call.args]) + assert "WARNING: SHA256 mismatch" in printed_output + + +@patch('builtins.print') +def test_sha_mismatch_off(mock_print, mock_file_download, requests_mock): + """ + Tests that validation_mode=OFF skips validation entirely. + """ + print("\n--- Testing SHA Mismatch with Mode: OFF ---") + local_dir = mock_file_download + + # Set up the *specific* metadata mock for THIS test + requests_mock.get( + ARTIFACT_URL, + json=get_mock_jsonld(INCORRECT_SHA256), # Use INCORRECT hash + headers={"Accept": "application/ld+json"} ) - correct_dataset = { - "@context": "https://downloads.dbpedia.org/databus/context.jsonld", - "@graph": [ - { - "@type": "Dataset", - "@id": "https://dev.databus.dbpedia.org/user/group/artifact/1970.01.01#Dataset", - "hasVersion": "1970.01.01", - "title": "Test Title", - "abstract": "Test abstract blabla", - "description": "Test description blabla", - "license": "https://license.url/test/", - "distribution": [ - { - "@id": "https://dev.databus.dbpedia.org/user/group/artifact/1970.01.01#artifact.yml", - "@type": "Part", - "file": "https://dev.databus.dbpedia.org/user/group/artifact/1970.01.01/artifact.yml", - "formatExtension": "yml", - "compression": "none", - "downloadURL": EXAMPLE_URL, - "byteSize": 59986, - "sha256sum": "088e6161bf8b4861bdd4e9f517be4441b35a15346cb9d2d3c6d2e3d6cd412030", - } - ], - } - ], - } + # We expect this to run without raising an error + try: + download( + localDir=str(local_dir), + endpoint=None, + databusURIs=[ARTIFACT_URL], + validation_mode=ShaValidationMode.OFF + ) + except ValueError: + pytest.fail("ValidationMode.OFF raised a ValueError when it should not have.") + + # Check that the "skipping" message was printed + printed_output = "\n".join([call.args[0] for call in mock_print.call_args_list if call.args]) + assert "Skipping SHA256 validation" in printed_output + assert "WARNING: SHA256 mismatch" not in printed_output # Ensure no warning was printed + + +@patch('builtins.print') +def test_sha_match_success(mock_print, mock_file_download, requests_mock): + """ + Tests that a correct SHA256 hash passes validation. + """ + print("\n--- Testing SHA Match (Success) ---") + local_dir = mock_file_download + + # Set up the *specific* metadata mock for THIS test + requests_mock.get( + ARTIFACT_URL, + json=get_mock_jsonld(CORRECT_SHA256), # Use CORRECT hash + headers={"Accept": "application/ld+json"} + ) + + # This test uses the metadata with the CORRECT hash + # We expect this to run without raising an error + try: + download( + localDir=str(local_dir), + endpoint=None, + databusURIs=[ARTIFACT_URL], + validation_mode=ShaValidationMode.WARNING # Mode doesn't matter, it should pass + ) + except ValueError: + pytest.fail("Validation failed when SHA hashes matched.") + + # Check that the "validated" message was printed + printed_output = "\n".join([call.args[0] for call in mock_print.call_args_list if call.args]) + assert "SHA256 validated" in printed_output - assert dataset == correct_dataset