diff --git a/tests/unit/adapter/test_datastore_api.py b/tests/unit/adapter/test_datastore_api.py
new file mode 100644
index 00000000..d1fda422
--- /dev/null
+++ b/tests/unit/adapter/test_datastore_api.py
@@ -0,0 +1,187 @@
+import os
+
+import pytest
+from requests_mock import Mocker as RequestsMocker
+
+from job_executor.adapter import datastore_api
+from job_executor.adapter.datastore_api.models import (
+    Job,
+    JobParameters,
+    JobStatus,
+    Operation,
+    ReleaseStatus,
+    UserInfo,
+)
+from job_executor.common.exceptions import HttpResponseError
+
+DATASTORE_API_URL = os.environ["DATASTORE_API_URL"]
+DATASTORE_RDN = os.environ["DATASTORE_RDN"]
+JOB_ID = "123"
+JOB_LIST = [
+    Job(
+        job_id=JOB_ID,
+        datastore_rdn=DATASTORE_RDN,
+        status=JobStatus.QUEUED,
+        parameters=JobParameters(target="INNTEKT", operation=Operation.CHANGE),
+        log=[],
+        created_at="2022-05-18T11:40:22.519222",
+        created_by=UserInfo(
+            user_id="123-123-123", first_name="Data", last_name="Admin"
+        ),
+    ),
+    Job(
+        job_id=JOB_ID,
+        datastore_rdn=DATASTORE_RDN,
+        status=JobStatus.QUEUED,
+        parameters=JobParameters(
+            operation=Operation.SET_STATUS,
+            target="KJOENN",
+            release_status=ReleaseStatus.PENDING_RELEASE,
+        ),
+        log=[],
+        created_at="2022-05-18T11:40:22.519222",
+        created_by=UserInfo(
+            user_id="123-123-123", first_name="Data", last_name="Admin"
+        ),
+    ),
+]
+LOG_MESSAGE = "log message"
+DESCRIPTION = "new description"
+ERROR_RESPONSE = "Internal Server Error"
+
+
+def test_get_jobs(requests_mock: RequestsMocker):
+    """get_jobs returns the jobs served by the mocked GET /jobs endpoint."""
+    requests_mock.get(
+        f"{DATASTORE_API_URL}/jobs",
+        json=[
+            job.model_dump(by_alias=True, exclude_none=True) for job in JOB_LIST
+        ],
+    )
+    jobs = datastore_api.get_jobs()
+    assert jobs == JOB_LIST
+    assert len(requests_mock.request_history) == 1
+
+
+def test_update_job_status(requests_mock: RequestsMocker):
+    """update_job_status PUTs the status, and the log when one is passed."""
+    requests_mock.put(
+        f"{DATASTORE_API_URL}/jobs/{JOB_ID}", json={"message": "OK"}
+    )
+    datastore_api.update_job_status(JOB_ID, JobStatus.QUEUED)
+    datastore_api.update_job_status(JOB_ID, JobStatus.QUEUED, LOG_MESSAGE)
+    request_history = requests_mock.request_history
+    assert len(request_history) == 2
+    assert request_history[0].json() == {"status": "queued"}
+    assert request_history[1].json() == {
+        "status": "queued",
+        "log": LOG_MESSAGE,
+    }
+
+
+def test_update_description(requests_mock: RequestsMocker):
+    """update_description PUTs a body holding only the description."""
+    requests_mock.put(
+        f"{DATASTORE_API_URL}/jobs/{JOB_ID}", json={"message": "OK"}
+    )
+    datastore_api.update_description(JOB_ID, DESCRIPTION)
+    request_history = requests_mock.request_history
+    assert len(request_history) == 1
+    assert request_history[0].json() == {"description": DESCRIPTION}
+
+
+def test_no_connection(requests_mock: RequestsMocker):
+    """HTTP 500 responses surface as HttpResponseError with the body text."""
+    requests_mock.get(
+        f"{DATASTORE_API_URL}/jobs", status_code=500, text=ERROR_RESPONSE
+    )
+    requests_mock.put(
+        f"{DATASTORE_API_URL}/jobs/{JOB_ID}",
+        status_code=500,
+        text=ERROR_RESPONSE,
+    )
+    with pytest.raises(HttpResponseError) as e:
+        datastore_api.get_jobs()
+    assert ERROR_RESPONSE in str(e.value)
+    with pytest.raises(HttpResponseError) as e:
+        datastore_api.update_job_status(JOB_ID, JobStatus.QUEUED)
+    assert ERROR_RESPONSE in str(e.value)
+    with pytest.raises(HttpResponseError) as e:
+        datastore_api.update_description(JOB_ID, DESCRIPTION)
+    assert ERROR_RESPONSE in str(e.value)
+
+
+def test_get_maintenance_status(requests_mock: RequestsMocker):
+    """get_maintenance_status exposes the paused flag from the response."""
+    requests_mock.get(
+        f"{DATASTORE_API_URL}/maintenance-statuses/latest",
+        json={
+            "paused": False,
+            "msg": "OK",
+            "timestamp": "2023-05-08T06:31:00.519222",
+        },
+    )
+    maintenance_status = datastore_api.get_maintenance_status()
+    assert maintenance_status.paused is False
+
+
+def test_get_maintenance_status_error(requests_mock: RequestsMocker):
+    """An HTTP 500 from the maintenance endpoint raises HttpResponseError."""
+    requests_mock.get(
+        f"{DATASTORE_API_URL}/maintenance-statuses/latest",
+        status_code=500,
+        text=ERROR_RESPONSE,
+    )
+    with pytest.raises(HttpResponseError) as e:
+        datastore_api.get_maintenance_status()
+    assert ERROR_RESPONSE in str(e.value)
+
+
+@pytest.mark.parametrize(
+    "is_paused,expected_result",
+    [
+        (
+            True,
+            datastore_api.JobQueryResult(
+                built_jobs=JOB_LIST,
+                queued_manager_jobs=[],
+                queued_worker_jobs=[],
+            ),
+        ),
+        (
+            False,
+            datastore_api.JobQueryResult(
+                built_jobs=JOB_LIST,
+                queued_manager_jobs=JOB_LIST,
+                queued_worker_jobs=JOB_LIST,
+            ),
+        ),
+    ],
+)
+def test_query_for_jobs(is_paused, expected_result, requests_mock, monkeypatch):
+    monkeypatch.setattr(
+        "job_executor.adapter.datastore_api.is_system_paused", lambda: is_paused
+    )
+
+    # Always return built jobs even if system is paused
+    # If system is paused, return empty list for queued and queued_manager jobs
+    def mock_get_jobs(job_status=None, operations=None):
+        if job_status == "built":
+            return JOB_LIST
+        elif job_status == "queued":
+            return JOB_LIST if not is_paused else []
+        elif job_status == "queued_manager":
+            return JOB_LIST if not is_paused else []
+
+    monkeypatch.setattr(
+        "job_executor.adapter.datastore_api.get_jobs", mock_get_jobs
+    )
+
+    result = datastore_api.query_for_jobs()
+    assert result.built_jobs == JOB_LIST
+    if is_paused:
+        assert result.queued_manager_jobs == []
+        assert result.queued_worker_jobs == []
+    else:
+        assert result.queued_manager_jobs == JOB_LIST
+        assert result.queued_worker_jobs == JOB_LIST
diff --git a/tests/unit/adapter/test_local_storage.py b/tests/unit/adapter/test_local_storage.py
new file mode 100644
index 00000000..a796c67d
--- /dev/null
+++ b/tests/unit/adapter/test_local_storage.py
@@ -0,0 +1,215 @@
+import json
+import os
+import shutil
+from pathlib import Path
+
+import pytest
+
+from job_executor.adapter.fs import LocalStorageAdapter
+from job_executor.adapter.fs.models.datastore_versions import (
+    DatastoreVersions,
+    DraftVersion,
+)
+from job_executor.adapter.fs.models.metadata import (
+    MetadataAll,
+)
+from job_executor.common.exceptions import LocalStorageError
+
+DATASTORE_DIR = "tests/unit/resources/adapter/local_storage/TEST_DATASTORE"
+WORKING_DIR = DATASTORE_DIR + "_working"
+DATASTORE_DATA_DIR = f"{DATASTORE_DIR}/data"
+
+local_storage = LocalStorageAdapter(Path(DATASTORE_DIR))
+
+DATASTORE_VERSIONS_PATH = f"{DATASTORE_DIR}/datastore/datastore_versions.json"
+DRAFT_METADATA_ALL_PATH = f"{DATASTORE_DIR}/datastore/metadata_all__draft.json"
+DRAFT_VERSION_PATH = f"{DATASTORE_DIR}/datastore/draft_version.json"
+DATA_VERSIONS_PATH = f"{DATASTORE_DIR}/datastore/data_versions__1_0.json"
+METADATA_ALL_PATH = f"{DATASTORE_DIR}/datastore/metadata_all__1_0_0.json"
+
+DRAFT_DATASET_NAME = "UTDANNING"
+DRAFT_DATA_PATH = f"{DATASTORE_DATA_DIR}/UTDANNING/UTDANNING__DRAFT.parquet"
+
+DRAFT2_DATASET_NAME = "BRUTTO_INNTEKT"
+RELEASED_DRAFT2_DATA_PATH = (
+    f"{DATASTORE_DATA_DIR}/BRUTTO_INNTEKT/BRUTTO_INNTEKT__1_1"
+)
+
+WORKING_DIR_DATASET = "FOEDESTED"
+MOVED_WORKING_DIR_DATASET_DATA_PATH = (
+    f"{DATASTORE_DATA_DIR}/FOEDESTED/FOEDESTED__DRAFT.parquet"
+)
+
+
+def setup_function():
+    """Back up tests/unit/resources, replacing any stale backup."""
+    if os.path.isdir("tests/unit/resources_backup"):
+        shutil.rmtree("tests/unit/resources_backup")
+    shutil.copytree("tests/unit/resources", "tests/unit/resources_backup")
+
+
+def teardown_function():
+    """Restore tests/unit/resources from the backup taken in setup."""
+    shutil.rmtree("tests/unit/resources")
+    shutil.move("tests/unit/resources_backup", "tests/unit/resources")
+
+
+def read_json(file_path: str) -> dict:
+    """Return the parsed JSON content of the UTF-8 file at file_path."""
+    with open(file_path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def test_make_dataset_dir():
+    """make_dataset_dir creates the dataset directory under data/."""
+    local_storage.datastore_dir.make_dataset_dir(WORKING_DIR_DATASET)
+    assert os.path.isdir(f"{DATASTORE_DATA_DIR}/{WORKING_DIR_DATASET}")
+
+
+def test_get_data_versions():
+    """get_data_versions("1_0_0") matches data_versions__1_0.json."""
+    assert local_storage.datastore_dir.get_data_versions("1_0_0") == read_json(
+        DATA_VERSIONS_PATH
+    )
+
+
+def test_write_data_versions():
+    """write_data_versions overwrites data_versions__1_0.json."""
+    local_storage.datastore_dir.write_data_versions({}, "1_0_0")
+    assert read_json(DATA_VERSIONS_PATH) == {}
+
+
+def test_get_draft_version():
+    """get_draft_version returns a DraftVersion instance."""
+    assert isinstance(
+        local_storage.datastore_dir.get_draft_version(), DraftVersion
+    )
+
+
+def test_write_draft_version():
+    """A written draft version is returned by the next read."""
+    draft_version = local_storage.datastore_dir.get_draft_version()
+    draft_version.description = "updated"
+    local_storage.datastore_dir.write_draft_version(draft_version)
+    assert (
+        local_storage.datastore_dir.get_draft_version().description == "updated"
+    )
+
+
+def test_get_datastore_versions():
+    """get_datastore_versions returns a DatastoreVersions instance."""
+    assert isinstance(
+        local_storage.datastore_dir.get_datastore_versions(), DatastoreVersions
+    )
+
+
+def test_write_datastore_versions():
+    """Written datastore versions are returned by the next read."""
+    datastore_versions = local_storage.datastore_dir.get_datastore_versions()
+    datastore_versions.description = "updated"
+    local_storage.datastore_dir.write_datastore_versions(datastore_versions)
+    assert (
+        local_storage.datastore_dir.get_datastore_versions().description
+        == "updated"
+    )
+
+
+def test_get_metadata_all():
+    """get_metadata_all returns a MetadataAll instance."""
+    assert isinstance(
+        local_storage.datastore_dir.get_metadata_all("1_0_0"), MetadataAll
+    )
+
+
+def test_write_metadata_all():
+    """Written metadata_all is returned by the next read."""
+    metadata_all = local_storage.datastore_dir.get_metadata_all("1_0_0")
+    metadata_all.data_structures = []
+    local_storage.datastore_dir.write_metadata_all(metadata_all, "1_0_0")
+    assert (
+        local_storage.datastore_dir.get_metadata_all("1_0_0").data_structures
+        == []
+    )
+
+
+def test_delete_parquet_draft():
+    """delete_parquet_draft removes the dataset's draft parquet file."""
+    local_storage.datastore_dir.delete_parquet_draft(DRAFT_DATASET_NAME)
+    assert not os.path.isfile(DRAFT_DATA_PATH)
+
+
+def test_rename_parquet_draft_to_release():
+    """rename_parquet_draft_to_release yields the <NAME>__1_1 directory."""
+    release_path = local_storage.datastore_dir.rename_parquet_draft_to_release(
+        DRAFT2_DATASET_NAME, "1_1_0"
+    )
+    assert os.path.isdir(RELEASED_DRAFT2_DATA_PATH)
+    assert release_path == f"{DRAFT2_DATASET_NAME}__1_1"
+
+
+def test_move_working_dir_parquet_to_datastore():
+    """The working dir draft parquet ends up in the dataset's data dir."""
+    local_storage.datastore_dir.make_dataset_dir(WORKING_DIR_DATASET)
+    local_storage.move_working_dir_parquet_to_datastore(WORKING_DIR_DATASET)
+    assert os.path.isfile(MOVED_WORKING_DIR_DATASET_DATA_PATH)
+
+
+def test_make_temp_directory():
+    """save_temporary_backup adds tmp/ holding the three expected files."""
+    datastore_content = os.listdir(Path(DATASTORE_DIR) / "datastore")
+    local_storage.datastore_dir.save_temporary_backup()
+    datastore_content_backup = os.listdir(Path(DATASTORE_DIR) / "datastore")
+    assert len(datastore_content_backup) == len(datastore_content) + 1
+    tmp_dir = Path(DATASTORE_DIR) / "datastore" / "tmp"
+    assert os.path.isdir(tmp_dir)
+    tmp_actual_content = os.listdir(tmp_dir)
+    tmp_expected_content = [
+        "metadata_all__DRAFT.json",
+        "datastore_versions.json",
+        "draft_version.json",
+    ]
+    assert len(tmp_actual_content) == 3
+    for content in tmp_expected_content:
+        assert content in tmp_actual_content
+
+
+def test_make_temp_directory_already_exists():
+    """A second save_temporary_backup fails while tmp/ still exists."""
+    local_storage.datastore_dir.save_temporary_backup()
+    datastore_content = os.listdir(Path(DATASTORE_DIR) / "datastore")
+    assert "tmp" in datastore_content
+    with pytest.raises(LocalStorageError) as e:
+        local_storage.datastore_dir.save_temporary_backup()
+    assert "tmp directory already exists" in str(e.value)
+
+
+def test_archive_temp_directory():
+    """archive_temporary_backup removes tmp/ from the datastore dir."""
+    local_storage.datastore_dir.save_temporary_backup()
+    datastore_content = os.listdir(Path(DATASTORE_DIR) / "datastore")
+    local_storage.datastore_dir.archive_temporary_backup()
+    datastore_content_archived = os.listdir(Path(DATASTORE_DIR) / "datastore")
+    assert len(datastore_content) == len(datastore_content_archived) + 1
+    assert not os.path.isdir(Path(DATASTORE_DIR) / "datastore" / "tmp")
+
+
+def test_archived_temp_directory_unrecognized_files():
+    """Archiving fails when tmp/ contains a file that is not expected."""
+    local_storage.datastore_dir.save_temporary_backup()
+    tmp_dir = Path(DATASTORE_DIR) / "datastore" / "tmp"
+    assert os.path.isdir(tmp_dir)
+    (tmp_dir / "newfile.txt").touch()
+
+    with pytest.raises(LocalStorageError) as e:
+        local_storage.datastore_dir.archive_temporary_backup()
+    assert "Found unrecognized files" in str(e.value)
+
+
+def test_archive_or_delete_non_existent_tmp_dir():
+    """Archiving or deleting without a tmp/ directory raises an error."""
+    with pytest.raises(LocalStorageError) as e:
+        local_storage.datastore_dir.archive_temporary_backup()
+    assert "Could not find a tmp directory to archive." in str(e.value)
+    with pytest.raises(LocalStorageError) as e:
+        local_storage.datastore_dir.delete_temporary_backup()
+    assert "Could not find a tmp directory to delete." in str(e.value)
diff --git a/tests/unit/adapter/test_pseudonym_service.py b/tests/unit/adapter/test_pseudonym_service.py
new file mode 100644
index 00000000..5088c348
--- /dev/null
+++ b/tests/unit/adapter/test_pseudonym_service.py
@@ -0,0 +1,49 @@
+import pytest
+from microdata_tools.validation.model.metadata import UnitIdType
+from requests_mock import Mocker as RequestsMocker
+
+from job_executor.adapter import pseudonym_service
+from job_executor.common.exceptions import HttpResponseError
+from job_executor.config import environment, secrets
+
+JOB_ID = "123-123-123"
+PSEUDONYM_SERVICE_URL = environment.pseudonym_service_url
+API_KEY = secrets.pseudonym_service_api_key
+
+URL = f"{PSEUDONYM_SERVICE_URL}/?unit_id_type=FNR&job_id={JOB_ID}"
+UNIT_ID_TYPE = UnitIdType.FNR
+IDENTIFIERS = ["test1", "test2"]
+PSEUDONYM_DICT = {"test1": "value", "test2": "value"}
+
+
+def test_pseudonymize(requests_mock: RequestsMocker):
+    """pseudonymize POSTs the identifiers with the API key header."""
+    requests_mock.post(URL, status_code=200, json=PSEUDONYM_DICT)
+    assert (
+        pseudonym_service.pseudonymize(IDENTIFIERS, UNIT_ID_TYPE, JOB_ID)
+        == PSEUDONYM_DICT
+    )
+    request_history = requests_mock.request_history
+    request = request_history[0]
+
+    assert len(request_history) == 1
+    assert request.url == URL
+    assert request.method == "POST"
+    assert request.json() == IDENTIFIERS
+    assert request.headers["X-API-Key"] == API_KEY
+
+
+def test_pseudonymize_bad_status(requests_mock: RequestsMocker):
+    """A 500 response raises HttpResponseError after a single request."""
+    requests_mock.post(URL, status_code=500, text="error")
+    with pytest.raises(HttpResponseError) as e:
+        pseudonym_service.pseudonymize(IDENTIFIERS, UNIT_ID_TYPE, JOB_ID)
+    request_history = requests_mock.request_history
+    request = request_history[0]
+
+    assert len(request_history) == 1
+    assert request.url == URL
+    assert request.method == "POST"
+    assert request.json() == IDENTIFIERS
+    assert request.headers["X-API-Key"] == API_KEY
+    assert "500: error" == str(e.value)
diff --git a/tests/unit/config/test_environment.py b/tests/unit/config/test_environment.py
new file mode 100644
index 00000000..26040d19
--- /dev/null
+++ b/tests/unit/config/test_environment.py
@@ -0,0 +1,14 @@
+import os
+
+from job_executor.config import environment
+
+
+def test_config_from_environment():
+    """environment mirrors the corresponding environment variables."""
+    assert environment.datastore_dir == os.environ.get("DATASTORE_DIR")
+    assert environment.pseudonym_service_url == (
+        os.environ.get("PSEUDONYM_SERVICE_URL")
+    )
+    assert environment.datastore_api_url == (
+        os.environ.get("DATASTORE_API_URL")
+    )
diff --git a/tests/unit/domain/manager/test_manager.py b/tests/unit/domain/manager/test_manager.py
new file mode 100644
index 00000000..4f3a8bde
--- /dev/null
+++ b/tests/unit/domain/manager/test_manager.py
@@ -0,0 +1,126 @@
+from dataclasses import dataclass
+
+from job_executor.domain.manager import Manager
+
+
+@dataclass
+class MockedWorker:
+    """Minimal worker stand-in: always alive, start() does nothing."""
+
+    job_id: str
+    job_size: int
+
+    def is_alive(self) -> bool:
+        return True
+
+    def start(self) -> None: ...
+
+
+def test_initial_state():
+    manager = Manager(
+        max_workers=4,
+        max_bytes_all_workers=50 * 1024**3,
+    )
+
+    assert manager.current_total_size == 0
+    assert len(manager.workers) == 0
+    manager.close_logging_thread()
+
+
+def test_can_spawn_worker():
+    manager = Manager(
+        max_workers=4,
+        max_bytes_all_workers=50 * 1024**3,
+    )
+
+    can_spawn = manager.can_spawn_new_worker(new_job_size=1)
+    assert can_spawn is True
+    manager.close_logging_thread()
+
+
+def test_cannot_spawn_worker_too_many_workers():
+    manager = Manager(
+        max_workers=4,
+        max_bytes_all_workers=50 * 1024**3,
+    )
+
+    # Register 4 jobs
+    for i in range(4):
+        worker = MockedWorker(
+            job_id=f"job_{i}",
+            job_size=1024,
+        )
+        manager.workers.append(worker)  # type: ignore
+        worker.start()
+
+    can_spawn = manager.can_spawn_new_worker(new_job_size=1024)
+    assert can_spawn is False
+    manager.close_logging_thread()
+
+
+def test_cannot_spawn_worker_size_limit_reached():
+    TWENTY_GB = 20 * 1024**3
+    manager = Manager(
+        max_workers=20,
+        max_bytes_all_workers=TWENTY_GB,
+    )
+
+    large_job = MockedWorker(
+        job_id="job_large",
+        job_size=TWENTY_GB,
+    )
+    manager.workers.append(large_job)  # type: ignore
+    large_job.start()
+
+    # Only one job active but size limit is reached cannot spawn new job
+    can_spawn = manager.can_spawn_new_worker(new_job_size=1024)
+    assert can_spawn is False
+    manager.close_logging_thread()
+
+
+def test_oversized_jobs():
+    FIFTY_GB = 50 * 1024**3
+    TEN_GB = 10 * 1024**3
+    manager = Manager(
+        max_workers=4,
+        max_bytes_all_workers=20 * 1024**3,
+    )
+
+    # This job will never be processed
+    can_spawn = manager.can_spawn_new_worker(new_job_size=FIFTY_GB)
+    assert can_spawn is False
+
+    # This job will be accepted
+    can_spawn = manager.can_spawn_new_worker(new_job_size=TEN_GB)
+    assert can_spawn is True
+    worker = MockedWorker(
+        job_id="job_2",
+        job_size=TEN_GB,
+    )
+    manager.workers.append(worker)  # type: ignore
+    worker.start()
+    manager.close_logging_thread()
+
+
+def test_unregister_job():
+    manager = Manager(
+        max_workers=4,
+        max_bytes_all_workers=50 * 1024**3,
+    )
+
+    # Register 4 jobs
+    for i in range(4):
+        worker = MockedWorker(
+            job_id=f"job_{i}",
+            job_size=1024,
+        )
+        manager.workers.append(worker)  # type: ignore
+        worker.start()
+
+    can_spawn = manager.can_spawn_new_worker(new_job_size=1024)
+    assert can_spawn is False
+
+    manager.unregister_worker("job_1")
+    can_spawn = manager.can_spawn_new_worker(new_job_size=1024)
+    assert can_spawn is True
+    manager.close_logging_thread()
diff --git a/tests/unit/domain/worker/steps/test_dataset_partitioner.py b/tests/unit/domain/worker/steps/test_dataset_partitioner.py
new file mode 100644
index 00000000..fd63c264
--- /dev/null
+++ b/tests/unit/domain/worker/steps/test_dataset_partitioner.py
@@ -0,0 +1,109 @@
+import os
+import shutil
+from pathlib import Path
+
+import pyarrow
+import pytest
+from pyarrow import parquet
+
+from job_executor.common.exceptions import BuilderStepError
+from job_executor.domain.worker.steps import dataset_partitioner
+
+WORKING_DIR = Path(
+    "tests/unit/resources/domain/worker/steps/dataset_partitioner"
+)
+JOB_ID_PARTITIONER = "321-321-321-321"
+
+TABLE_SIZE = 3000
+UNIT_ID_INPUT = [f"i{count}" for count in range(TABLE_SIZE)]
+YEARS = ["2020"] * 1000 + ["2021"] * 1000 + ["2022"] * 1000
+START_EPOCH_DAYS = [18262] * 1000 + [18628] * 1000 + [18993] * 1000
+INPUT_TABLE = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_INPUT,
+        "value": UNIT_ID_INPUT,
+        "start_year": YEARS,
+        "start_epoch_days": START_EPOCH_DAYS,
+        "stop_epoch_days": [day + 1 for day in START_EPOCH_DAYS],
+    }
+)
+
+
+def setup_function():
+    """Back up the working dir and write the input parquet into it."""
+    if os.path.isdir(f"{WORKING_DIR}_backup"):
+        shutil.rmtree(f"{WORKING_DIR}_backup")
+
+    if not os.path.isdir(WORKING_DIR):
+        os.mkdir(WORKING_DIR)
+
+    shutil.copytree(WORKING_DIR, f"{WORKING_DIR}_backup")
+
+    parquet.write_table(
+        INPUT_TABLE, WORKING_DIR / "input_pseudonymized.parquet"
+    )
+
+
+def teardown_function():
+    """Restore the working dir from the backup taken in setup."""
+    shutil.rmtree(WORKING_DIR)
+    shutil.move(f"{WORKING_DIR}_backup", WORKING_DIR)
+
+
+def test_partitioner():
+    """run writes one single-file start_year=<year> partition per year."""
+    dataset_path = Path(f"{WORKING_DIR}/input_pseudonymized.parquet")
+    dataset_partitioner.run(dataset_path, "input")
+    output_dir = dataset_path.parent / "input__DRAFT"
+
+    assert output_dir.exists()
+    # Check each year's subdirectory
+    for year in [2020, 2021, 2022]:
+        partition_path = output_dir / f"start_year={year}"
+
+        # 1. Verify the subdirectory exists
+        assert partition_path.exists() and partition_path.is_dir()
+
+        # 2. Verify each subdirectory contains exactly one file
+        files = list(partition_path.iterdir())
+        assert len(files) == 1
+
+        # 3. column names are the same except for the partition column
+        table_whole_year = pyarrow.parquet.read_table(partition_path)  # type: ignore
+        assert table_whole_year.column_names == [
+            "unit_id",
+            "value",
+            "start_epoch_days",
+            "stop_epoch_days",
+        ]
+
+        # 4. Load the parquet file and check its length
+        table_from_partition = pyarrow.parquet.read_table(files[0])  # type: ignore
+        assert len(table_from_partition) == 1000  # Each year has 1000 records
+
+        # 5. Check if start_epoch_days is within the correct start_year
+        start_epochs = table_from_partition.column(
+            "start_epoch_days"
+        ).to_pylist()
+
+        if year == 2020:
+            assert all(18262 <= epoch < 18628 for epoch in start_epochs)
+        elif year == 2021:
+            assert all(18628 <= epoch < 18993 for epoch in start_epochs)
+        elif year == 2022:
+            assert all(18993 <= epoch for epoch in start_epochs)
+        else:
+            raise AssertionError(f"Unexpected year: {year}")
+
+
+def test_partitioner_missing_start_year():
+    """run raises BuilderStepError when start_year is missing."""
+    # remove start_year column from input table
+    input_table = INPUT_TABLE.remove_column(2)
+    parquet.write_table(
+        input_table, WORKING_DIR / "input_pseudonymized.parquet"
+    )
+
+    dataset_path = Path(f"{WORKING_DIR}/input_pseudonymized.parquet")
+    with pytest.raises(BuilderStepError):
+        dataset_partitioner.run(dataset_path, "input")
diff --git a/tests/unit/domain/worker/steps/test_dataset_pseudonymizer.py b/tests/unit/domain/worker/steps/test_dataset_pseudonymizer.py
new file mode 100644
index 00000000..dfe9f869
--- /dev/null
+++ b/tests/unit/domain/worker/steps/test_dataset_pseudonymizer.py
@@ -0,0 +1,311 @@
+import json
+import os
+import shutil
+from pathlib import Path
+
+import pyarrow
+import pytest
+from pyarrow import dataset, parquet
+
+from job_executor.adapter import pseudonym_service
+from job_executor.adapter.fs.models.metadata import Metadata
+from job_executor.common.exceptions import BuilderStepError
+from job_executor.domain.worker.steps import dataset_pseudonymizer
+
+TABLE_SIZE = 1000
+UNIT_ID_INPUT = [f"i{count}" for count in range(TABLE_SIZE)]
+UNIT_ID_PSEUDONYMIZED = [count for count in range(TABLE_SIZE)]
+
+INPUT_TABLE = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_INPUT,
+        "value": UNIT_ID_INPUT,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+    }
+)
+
+INPUT_TABLE_START_YEAR = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_INPUT,
+        "value": UNIT_ID_INPUT,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "start_year": pyarrow.array(
+            [str(year) for year in [2020] * TABLE_SIZE]
+        ),
+    }
+)
+
+EXPECTED_TABLE = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_PSEUDONYMIZED,
+        "value": UNIT_ID_INPUT,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+    }
+)
+
+EXPECTED_TABLE_START_YEAR = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_PSEUDONYMIZED,
+        "value": UNIT_ID_INPUT,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "start_year": pyarrow.array(
+            [str(year) for year in [2020] * TABLE_SIZE]
+        ),
+    }
+)
+
+EXPECTED_TABLE_WITH_BOTH_PSEUDONYMIZED = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_PSEUDONYMIZED,
+        "value": UNIT_ID_PSEUDONYMIZED,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+    }
+)
+
+EXPECTED_TABLE_WITH_ONLY_VALUE_PSEUDONYMIZED = pyarrow.Table.from_pydict(
+    {
+        "unit_id": UNIT_ID_INPUT,
+        "value": UNIT_ID_PSEUDONYMIZED,
+        "start_epoch_days": pyarrow.array(
+            [18200] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+        "stop_epoch_days": pyarrow.array(
+            [18201] * TABLE_SIZE, type=pyarrow.int16()
+        ),
+    }
+)
+
+
+WORKING_DIR = Path(
+    "tests/unit/resources/domain/worker/steps/dataset_pseudonymizer"
+)
+INPUT_PARQUET_PATH = WORKING_DIR / "input.parquet"
+OUTPUT_PARQUET_FILE_NAME = "input_pseudonymized.parquet"
+
+INPUT_PARQUET_PATH_START_YEAR = WORKING_DIR / "input_start_year.parquet"
+OUTPUT_PARQUET_FILE_START_YEAR = "input_start_year_pseudonymized.parquet"
+
+
+JOB_ID = "123-123-123-123"
+PSEUDONYM_DICT = {f"i{count}": count for count in range(TABLE_SIZE)}
+with open(f"{WORKING_DIR}/metadata.json", encoding="utf-8") as file:
+    METADATA = Metadata(**json.load(file))
+with open(
+    f"{WORKING_DIR}/metadata_invalid_unit_type.json", encoding="utf-8"
+) as file:
+    INVALID_METADATA = Metadata(**json.load(file))
+with open(
+    f"{WORKING_DIR}/metadata_pseudonymize_unit_id_and_value.json",
+    encoding="utf-8",
+) as file:
+    PSEUDONYMIZE_UNIT_ID_AND_VALUE_METADATA = Metadata(**json.load(file))
+with open(
+    f"{WORKING_DIR}/metadata_pseudonymize_value.json",
+    encoding="utf-8",
+) as file:
+    PSEUDONYMIZE_ONLY_VALUE_METADATA = Metadata(**json.load(file))
+
+
+def setup_function():
+    """Back up the working dir and write both input parquet files."""
+    if os.path.isdir(f"{WORKING_DIR}_backup"):
+        shutil.rmtree(f"{WORKING_DIR}_backup")
+
+    shutil.copytree(WORKING_DIR, f"{WORKING_DIR}_backup")
+
+    parquet.write_table(INPUT_TABLE, INPUT_PARQUET_PATH)
+    parquet.write_table(INPUT_TABLE_START_YEAR, INPUT_PARQUET_PATH_START_YEAR)
+
+
+def teardown_function():
+    """Restore the working dir from the backup taken in setup."""
+    shutil.rmtree(WORKING_DIR)
+    shutil.move(f"{WORKING_DIR}_backup", WORKING_DIR)
+
+
+def test_pseudonymizer(mocker):
+    """Only unit_id is replaced by pseudonyms (int64) for METADATA."""
+    mocker.patch.object(
+        pseudonym_service, "pseudonymize", return_value=PSEUDONYM_DICT
+    )
+    assert str(
+        dataset_pseudonymizer.run(INPUT_PARQUET_PATH, METADATA, JOB_ID)
+    ) == str(OUTPUT_PARQUET_FILE_NAME)
+
+    actual_table = dataset.dataset(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_NAME
+    ).to_table()
+    _validate_content(actual_table, EXPECTED_TABLE)
+
+    expected_types = {
+        "unit_id": "int64",
+        "value": "string",
+        "start_epoch_days": "int16",
+        "stop_epoch_days": "int16",
+    }
+
+    # Checking the parquet schema is what we expect
+    _verify_parquet_schema(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_NAME, expected_types
+    )
+
+
+def test_pseudonymizer_unit_id_and_value(mocker):
+    """unit_id and value are both replaced by int64 pseudonyms."""
+    mocker.patch.object(
+        pseudonym_service, "pseudonymize", return_value=PSEUDONYM_DICT
+    )
+
+    # Pseudonymize
+    pseudonymized_output_file = dataset_pseudonymizer.run(
+        INPUT_PARQUET_PATH,
+        PSEUDONYMIZE_UNIT_ID_AND_VALUE_METADATA,
+        JOB_ID,
+    )
+    actual_table = dataset.dataset(
+        WORKING_DIR / pseudonymized_output_file
+    ).to_table()
+    _validate_content(actual_table, EXPECTED_TABLE_WITH_BOTH_PSEUDONYMIZED)
+
+    expected_types = {
+        "unit_id": "int64",
+        "value": "int64",
+        "start_epoch_days": "int16",
+        "stop_epoch_days": "int16",
+    }
+
+    # Checking the parquet schema is what we expect
+    _verify_parquet_schema(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_NAME, expected_types
+    )
+
+
+def test_pseudonymizer_only_value(mocker):
+    """Only value is replaced by pseudonyms; unit_id stays a string."""
+    mocker.patch.object(
+        pseudonym_service, "pseudonymize", return_value=PSEUDONYM_DICT
+    )
+
+    # Pseudonymize
+    pseudonymized_output_file = dataset_pseudonymizer.run(
+        INPUT_PARQUET_PATH,
+        PSEUDONYMIZE_ONLY_VALUE_METADATA,
+        JOB_ID,
+    )
+    actual_table = dataset.dataset(
+        WORKING_DIR / pseudonymized_output_file
+    ).to_table()
+    _validate_content(
+        actual_table, EXPECTED_TABLE_WITH_ONLY_VALUE_PSEUDONYMIZED
+    )
+
+    expected_types = {
+        "unit_id": "string",
+        "value": "int64",
+        "start_epoch_days": "int16",
+        "stop_epoch_days": "int16",
+    }
+
+    # Checking the parquet schema is what we expect
+    _verify_parquet_schema(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_NAME, expected_types
+    )
+
+
+def test_pseudonymizer_start_year(mocker):
+    """A start_year column passes through pseudonymization unchanged."""
+    mocker.patch.object(
+        pseudonym_service, "pseudonymize", return_value=PSEUDONYM_DICT
+    )
+    assert str(
+        dataset_pseudonymizer.run(
+            INPUT_PARQUET_PATH_START_YEAR, METADATA, JOB_ID
+        )
+    ) == str(OUTPUT_PARQUET_FILE_START_YEAR)
+
+    actual_table = dataset.dataset(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_START_YEAR
+    ).to_table()
+    _validate_content(actual_table, EXPECTED_TABLE_START_YEAR)
+
+    expected_types = {
+        "unit_id": "int64",
+        "value": "string",
+        "start_year": "string",
+        "start_epoch_days": "int16",
+        "stop_epoch_days": "int16",
+    }
+
+    # Checking the parquet schema is what we expect
+    _verify_parquet_schema(
+        WORKING_DIR / OUTPUT_PARQUET_FILE_START_YEAR, expected_types
+    )
+
+
+def test_pseudonymizer_adapter_failure():
+    """Without a mocked pseudonym service run raises BuilderStepError."""
+    with pytest.raises(BuilderStepError) as e:
+        dataset_pseudonymizer.run(INPUT_PARQUET_PATH, METADATA, JOB_ID)
+    assert "Failed to pseudonymize dataset" == str(e.value)
+
+
+def test_pseudonymizer_invalid_unit_id_type():
+    """Metadata with an invalid unit type makes run raise BuilderStepError."""
+    with pytest.raises(BuilderStepError) as e:
+        dataset_pseudonymizer.run(INPUT_PARQUET_PATH, INVALID_METADATA, JOB_ID)
+    assert "Failed to pseudonymize dataset" in str(e.value)
+
+
+# In Parquet, the physical type refers to how the data is stored.
+# INT32 and INT16 are both physically stored as INT32 in Parquet files.
+# The logical type can provide additional context about the data.
+# For example, it can tell you that a certain INT32 physical column is to be
+# interpreted as an INT16 logical type.
+def _verify_parquet_schema(parquet_file_path, expected_types):
+    """
+    Checks the logical type of each column in the parquet file to make sure
+    they are what we expect.
+    """
+    table = parquet.read_table(parquet_file_path)
+    schema = table.schema
+
+    for column_name, expected_type in expected_types.items():
+        actual_type = schema.field(column_name).type
+        assert str(actual_type) == expected_type
+
+
+def _validate_content(actual_table, expected_table):
+    """
+    Validate the content of the actual table against the expected table.
+    """
+
+    for column_name in expected_table.schema.names:
+        assert (
+            actual_table[column_name].to_pylist()
+            == expected_table[column_name].to_pylist()
+        )
diff --git a/tests/unit/domain/worker/steps/test_dataset_transformer.py b/tests/unit/domain/worker/steps/test_dataset_transformer.py
new file mode 100644
index 00000000..0c482f4b
--- /dev/null
+++ b/tests/unit/domain/worker/steps/test_dataset_transformer.py
@@ -0,0 +1,96 @@
+import pytest
+
+from job_executor.domain.worker.steps import dataset_transformer
+from tests.unit.resources.domain.worker.steps import (
+    dataset_transformer as test_data,
+)
+
+
+def test_transform_identifier():
+    """Identifier dataType is reported as Long for STRING and LONG input."""
+    # test that pseudonymized variable has dataType Long
+    assert test_data.PERSON_IDENTIFIER["dataType"] == "STRING"
+    transformed_identifier = dataset_transformer._transform_variable(
+        test_data.PERSON_IDENTIFIER, "Identifier", "2020-01-01", "2020-12-31"
+    )
+    assert transformed_identifier["dataType"] == "Long"
+
+    # test that a not pseudonymized LONG variable is reported as Long
+    assert test_data.BK_HELSTASJONSKONSULTASJON_IDENTIFIER["dataType"] == "LONG"
+    transformed_identifier = dataset_transformer._transform_variable(
+        test_data.BK_HELSTASJONSKONSULTASJON_IDENTIFIER,
+        "Identifier",
+        "2020-01-01",
+        "2020-12-31",
+    )
+    assert transformed_identifier["dataType"] == "Long"
+
+
+def test_transform_codelist():
+    """
+    Value domains with codelists get transformed to multiple
+    represented periods based on each unique period of codes
+    """
+    transformed_codelist = (
+        dataset_transformer._represented_variables_from_code_list(
+            "description", [], test_data.CODELIST
+        )
+    )
+    assert transformed_codelist == test_data.TRANSFORMED_CODELIST
+
+    """
+    SentinelAndMissingValues are included in each represented variables
+    code list. And marked as a missing value in the missingValues list
+    """
+    transformed_codelist_with_missing = (
+        dataset_transformer._represented_variables_from_code_list(
+            "description", test_data.MISSING_VALUES, test_data.CODELIST
+        )
+    )
+
+    assert (
+        transformed_codelist_with_missing
+        == test_data.TRANSFORMED_CODELIST_WITH_MISSING_VALUES
+    )
+
+    with pytest.raises(ValueError) as e:
+        dataset_transformer._represented_variables_from_code_list(
+            "description", [], []
+        )
+    assert "Code list can not be empty" in str(e.value)
+
+
+def test_dataset_with_enumerated_valuedomain():
+    """run output for the enumerated dataset matches ENUMERATED_EXPECTED."""
+    actual_metadata = dataset_transformer.run(test_data.KREFTREG_DS_ENUMERATED)
+    assert (
+        actual_metadata.model_dump(by_alias=True, exclude_none=True)
+        == test_data.ENUMERATED_EXPECTED
+    )
+
+
+def test_dataset_with_described_valuedomain():
+    """run output for the described dataset matches DESCRIBED_EXPECTED."""
+    actual_metadata = dataset_transformer.run(test_data.KREFTREG_DS_DESCRIBED)
+    assert (
+        actual_metadata.model_dump(by_alias=True, exclude_none=True)
+        == test_data.DESCRIBED_EXPECTED
+    )
+
+
+def test_dataset_with_status_type():
+    """run output for UTDANNING matches STATUS_EXPECTED."""
+    actual_metadata = dataset_transformer.run(test_data.UTDANNING)
+    assert (
+        actual_metadata.model_dump(by_alias=True, exclude_none=True)
+        == test_data.STATUS_EXPECTED
+    )
+
+
+def test_patch_dataset_with_status_type():
+    """run output for UTDANNING_PATCH matches STATUS_PATCH_EXPECTED."""
+    actual_metadata = dataset_transformer.run(test_data.UTDANNING_PATCH)
+    assert (
+        actual_metadata.model_dump(by_alias=True, exclude_none=True)
+        == test_data.STATUS_PATCH_EXPECTED
+    )
diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/BRUTTO_INNTEKT/BRUTTO_INNTEKT__1_0_0/.gitkeep b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/BRUTTO_INNTEKT/BRUTTO_INNTEKT__1_0_0/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/BRUTTO_INNTEKT/BRUTTO_INNTEKT__DRAFT/.gitkeep b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/BRUTTO_INNTEKT/BRUTTO_INNTEKT__DRAFT/.gitkeep new file mode 100644 index
00000000..e69de29b diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/UTDANNING/UTDANNING__DRAFT.parquet b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/data/UTDANNING/UTDANNING__DRAFT.parquet new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__1_0.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__1_0.json new file mode 100644 index 00000000..2f2d5dc8 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__1_0.json @@ -0,0 +1,6 @@ +{ + "FOEDSELSVEKT": "FOEDSELSVEKT__1_0.parquet", + "INNTEKT": "INNTEKT__1_0", + "KJOENN": "KJOENN__1_0.parquet", + "SIVSTAND": "SIVSTAND__1_0.parquet" +} diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__2_0.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__2_0.json new file mode 100644 index 00000000..bfaa5155 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/data_versions__2_0.json @@ -0,0 +1,5 @@ +{ + "FOEDSELSVEKT": "FOEDSELSVEKT__1_0.parquet", + "KJOENN": "KJOENN__1_0.parquet", + "SIVSTAND": "SIVSTAND__1_0.parquet" +} diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/datastore_versions.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/datastore_versions.json new file mode 100644 index 00000000..fe3173b2 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/datastore_versions.json @@ -0,0 +1,56 @@ +{ + "name": "no.jobmanager.test", + "label": "Test datastore", + "description": "Syntetiske data for unit testing av job-manager", + "versions": [ + { + "version": "2.0.0.0", + "description": "Første release", + "releaseTime": 1635299291, + "languageCode": "no", + "dataStructureUpdates": [ + { + "name": "INNTEKT", + 
"description": "Første publisering", + "operation": "REMOVE", + "releaseStatus": "DELETED" + } + ], + "updateType": "MAJOR" + }, + { + "version": "1.0.0.0", + "description": "Første release", + "releaseTime": 1635299291, + "languageCode": "no", + "dataStructureUpdates": [ + { + "name": "INNTEKT", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "RELEASED" + }, + { + "name": "SIVSTAND", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "RELEASED" + }, + { + "name": "KJOENN", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "RELEASED" + }, + { + "name": "FOEDSELSVEKT", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "RELEASED" + } + ], + "updateType": "MAJOR" + } + ] + } + diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/draft_version.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/draft_version.json new file mode 100644 index 00000000..49130202 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/draft_version.json @@ -0,0 +1,21 @@ +{ + "version": "0.0.0.1635299291", + "description": "Draft", + "releaseTime": 1635299291, + "languageCode": "no", + "dataStructureUpdates": [ + { + "name": "UTDANNING", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "DRAFT" + }, + { + "name": "BRUTTO_INNTEKT", + "description": "Første publisering", + "operation": "ADD", + "releaseStatus": "PENDING_RELEASE" + } + ], + "updateType": "MINOR" +} diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__1_0_0.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__1_0_0.json new file mode 100644 index 00000000..f065a6b7 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__1_0_0.json @@ -0,0 +1,390 @@ +{ + "dataStore": { + "name": 
"no.jobmanager.test", + "label": "Test datastore", + "description": "Syntetiske data for unit testing av job-manager", + "languageCode": "no" + }, + "languages": [ + {"code": "no", "label": "Norsk"} + ], + "dataStructures": [ + { + "name": "INNTEKT", + "populationDescription": "Alle rapporterte personinntekter i norge fra \u00e5r 1234 til \u00e5r 4321", + "languageCode": "no", + "temporality": "ACCUMULATED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["\u00d8konomi", "Samfunn"], + "temporalCoverage": { "start": 365, "stop": 18627 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "INNTEKT", + "label": "Inntekt", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Personens rapporterte inntekt", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "\u00c5rlig personinntekt", + "unitOfMeasure": "Norske Kroner" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": 
"Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "KJOENN", + "populationDescription": "Kj\u00f8nn for en populasjon", + "languageCode": "no", + "temporality": "FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse", "Helsetjenester"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "KJOENN", + "label": "Kj\u00f8nn", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Rapportert kj\u00f8nn for person", + "validPeriod": { "start": -25567 }, + "valueDomain": { + "codeList": [ + { "category": "Mann", "code": "1" }, + { "category": "Kvinne", "code": "2" } + ], + "missingValues": [] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + 
"unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "SIVSTAND", + "populationDescription": "Alle personer registrert bosatt i Norge", + "languageCode": "no", + "temporality": "EVENT", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Befolkning"], + "temporalCoverage": { "start": -18263, "stop": -6940 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "SIVSTAND", + "label": "Sivilstand", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + "validPeriod": { "start": -16071, "stop": -15526 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": "Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + }, + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + 
"validPeriod": { "start": -15525 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": "Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Registrert partner", "code": "6" }, + { "category": "Separert partner", "code": "7" }, + { "category": "Skilt partner", "code": "8" }, + { "category": "Gjenlevende partner", "code": "9" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "FOEDSELSVEKT", + "populationDescription": "F\u00f8dselsvekt for en populasjon", + "languageCode": "no", + "temporality": "FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + 
"description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenneske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "FOEDSELSVEKT", + "label": "F\u00f8dested", + "notPseudonym": true, + "dataType": "Long", + "representedVariables": [ + { + "description": "Rapportert f\u00f8dested for person", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "F\u00f8dselsvekt", + "unitOfMeasure": "Gram" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + } + ] +} diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__2_0_0.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__2_0_0.json new file mode 100644 index 00000000..7cf73979 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__2_0_0.json @@ -0,0 +1,304 @@ +{ + "dataStore": { + "name": "no.jobmanager.test", + "label": "Test datastore", + "description": "Syntetiske data for unit testing av job-manager", + 
"languageCode": "no" + }, + "languages": [ + {"code": "no", "label": "Norsk"} + ], + "dataStructures": [ + { + "name": "KJOENN", + "populationDescription": "Kj\u00f8nn for en populasjon", + "languageCode": "no", + "temporality": "FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse", "Helsetjenester"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "KJOENN", + "label": "Kj\u00f8nn", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Rapportert kj\u00f8nn for person", + "validPeriod": { "start": -25567 }, + "valueDomain": { + "codeList": [ + { "category": "Mann", "code": "1" }, + { "category": "Kvinne", "code": "2" } + ], + "missingValues": [] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": 
"Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "SIVSTAND", + "populationDescription": "Alle personer registrert bosatt i Norge", + "languageCode": "no", + "temporality": "EVENT", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Befolkning"], + "temporalCoverage": { "start": -18263, "stop": -6940 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "SIVSTAND", + "label": "Sivilstand", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + "validPeriod": { "start": -16071, "stop": -15526 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": "Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + }, + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + "validPeriod": { "start": -15525 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": 
"Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Registrert partner", "code": "6" }, + { "category": "Separert partner", "code": "7" }, + { "category": "Skilt partner", "code": "8" }, + { "category": "Gjenlevende partner", "code": "9" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "FOEDSELSVEKT", + "populationDescription": "F\u00f8dselsvekt for en populasjon", + "languageCode": "no", + "temporality": "FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, 
enkeltmenneske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "FOEDSELSVEKT", + "label": "F\u00f8dested", + "notPseudonym": true, + "dataType": "Long", + "representedVariables": [ + { + "description": "Rapportert f\u00f8dested for person", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "F\u00f8dselsvekt", + "unitOfMeasure": "Gram" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + } + ] + } + diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__DRAFT.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__DRAFT.json new file mode 100644 index 00000000..7237c0dd --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE/datastore/metadata_all__DRAFT.json @@ -0,0 +1,484 @@ +{ + "dataStore": { + "name": "no.jobmanager.test", + "label": "Test datastore", + "description": "Syntetiske data for unit testing av job-manager", + "languageCode": "no" + }, + "languages": [ + {"code": "no", "label": "Norsk"} + ], + "dataStructures": [ + { + "name": "KJOENN", + "populationDescription": "Kj\u00f8nn for en populasjon", + "languageCode": "no", + "temporality": 
"FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse", "Helsetjenester"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "KJOENN", + "label": "Kj\u00f8nn", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Rapportert kj\u00f8nn for person", + "validPeriod": { "start": -25567 }, + "valueDomain": { + "codeList": [ + { "category": "Mann", "code": "1" }, + { "category": "Kvinne", "code": "2" } + ], + "missingValues": [] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "SIVSTAND", + 
"populationDescription": "Alle personer registrert bosatt i Norge", + "languageCode": "no", + "temporality": "EVENT", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Befolkning"], + "temporalCoverage": { "start": -18263, "stop": -6940 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "SIVSTAND", + "label": "Sivilstand", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + "validPeriod": { "start": -16071, "stop": -15526 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": "Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + }, + { + "description": "Sivilstand i forhold til ekteskapslovgivningen", + "validPeriod": { "start": -15525 }, + "valueDomain": { + "codeList": [ + { "category": "Ugift", "code": "1" }, + { "category": "Gift", "code": "2" }, + { "category": "Enke/Enkemann", "code": "3" }, + { "category": "Skilt", "code": "4" }, + { "category": "Separert", "code": "5" }, + { "category": "Registrert partner", "code": "6" }, + { "category": "Separert partner", "code": "7" }, + { "category": "Skilt partner", "code": "8" }, + { "category": 
"Gjenlevende partner", "code": "9" }, + { "category": "Siviltand ukjent", "code": "0" } + ], + "missingValues": ["0"] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "FOEDSELSVEKT", + "populationDescription": "F\u00f8dselsvekt for en populasjon", + "languageCode": "no", + "temporality": "FIXED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Helse"], + "temporalCoverage": { "start": -25567, "stop": 18628 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenneske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "FOEDSELSVEKT", + "label": "F\u00f8dested", + "notPseudonym": true, + "dataType": "Long", + "representedVariables": [ + { + "description": 
"Rapportert f\u00f8dested for person", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "F\u00f8dselsvekt", + "unitOfMeasure": "Gram" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -25567, "stop": 18628 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "UTDANNING", + "populationDescription": "Utdanningniv\u00e5 for en populasjon", + "languageCode": "no", + "temporality": "EVENT", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["Befolkning"], + "temporalCoverage": { "start": -18263, "stop": -6940 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "UTDANNING", + "label": "Utdanning", + "notPseudonym": true, + "dataType": 
"String", + "representedVariables": [ + { + "description": "Utdanning for person", + "validPeriod": { "start": -16071 }, + "valueDomain": { + "codeList": [ + { "category": "Grunnskole", "code": "1" }, + { "category": "Videreg\u00e5ende skole", "code": "2" }, + { "category": "Bachelorgrad", "code": "3" }, + { "category": "Mastergrad", "code": "4" }, + { "category": "Doktorgrad", "code": "5" }, + { "category": "Ukjent utdanning", "code": "0" } + ], + "missingValues": ["0"] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": -18263, "stop": -6940 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + }, + { + "name": "BRUTTO_INNTEKT", + "populationDescription": "Alle rapporterte personinntekter i norge fra \u00e5r 1234 til \u00e5r 4321", + "languageCode": "no", + "temporality": "ACCUMULATED", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": ["\u00d8konomi", "Samfunn"], + "temporalCoverage": { "start": 365, "stop": 18627 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "Identifikator for person i Microdata", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Pseudonymisert 
personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Statistisk enhet er person (individ, enkeltmenenske)" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "BRUTTO_INNTEKT", + "label": "Inntekt", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Personens rapporterte inntekt", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "\u00c5rlig personinntekt", + "unitOfMeasure": "Norske Kroner" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/sluttdato for hendelsen", + "validPeriod": { "start": 365, "stop": 18627 }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ] + } + ] +} diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED.json new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED.json @@ -0,0 +1 @@ + diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED.parquet b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED.parquet new file mode 100644 index 00000000..e69de29b diff 
--git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED__DRAFT.json b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED__DRAFT.json new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED__DRAFT.parquet b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED__DRAFT.parquet new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED_pseudonymized.parquet b/tests/unit/resources/adapter/local_storage/TEST_DATASTORE_working/FOEDESTED_pseudonymized.parquet new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/resources/domain/worker/steps/dataset_partitioner/.gitkeep b/tests/unit/resources/domain/worker/steps/dataset_partitioner/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata.json b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata.json new file mode 100644 index 00000000..f21c153e --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata.json @@ -0,0 +1,71 @@ +{ + "name": "input", + "populationDescription": "N/A", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [], + "temporalCoverage": { "start": 16436, "stop": 18627 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Person", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ], + "format": "RandomUInt48", + "keyType": { "name": "PERSON", "label": "Person", "description": "N/A"} + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": 
"input", + "label": "input", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + } + ] +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_invalid_unit_type.json b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_invalid_unit_type.json new file mode 100644 index 00000000..c3e1ae58 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_invalid_unit_type.json @@ -0,0 +1,72 @@ +{ + "name": "input", + "populationDescription": "N/A", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [], + "temporalCoverage": { "start": 16436, "stop": 18627 }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Person", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ], + "format": "RandomUInt48", + "keyType": { "name": "INVALID", "label": "Invalid", "description": "N/A"} + } + ], + "measureVariable": { + 
"variableRole": "Measure", + "name": "input", + "label": "input", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { "start": 16436, "stop": 18627 }, + "valueDomain": { "description": "N/A", "unitOfMeasure": "N/A" } + } + ] + } + ] + } + diff --git a/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_unit_id_and_value.json b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_unit_id_and_value.json new file mode 100644 index 00000000..781fa38b --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_unit_id_and_value.json @@ -0,0 +1,107 @@ +{ + "name": "input", + "populationDescription": "N/A", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [], + "temporalCoverage": { + "start": 16436, + "stop": 18627 + }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSON", + "label": "Person", + "notPseudonym": false, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": 
{ + "name": "PERSON", + "label": "Person", + "description": "N/A" + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "input", + "label": "input", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ], + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "N/A" + } + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ] + } + ] +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_value.json b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_value.json new file mode 100644 index 00000000..d440e90e --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_pseudonymizer/metadata_pseudonymize_value.json @@ -0,0 +1,102 @@ +{ + "name": "input", + "populationDescription": "N/A", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [], + "temporalCoverage": { + "start": 16436, + "stop": 18627 + }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "input", + "label": "input", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": 
"N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48" + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "PERSON", + "label": "Person", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ], + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "N/A" + } + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "N/A", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "N/A", + "unitOfMeasure": "N/A" + } + } + ] + } + ] +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/__init__.py b/tests/unit/resources/domain/worker/steps/dataset_transformer/__init__.py new file mode 100644 index 00000000..a341bfa9 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/__init__.py @@ -0,0 +1,167 @@ +import json +from pathlib import Path + +from microdata_tools.validation.components import ( + temporal_attributes, + unit_type_variables, +) + +INPUT_DATA_PATH = Path( + "tests/unit/resources/domain/worker/steps/dataset_transformer/input" +) +EXPECTED_DATA_PATH = Path( + "tests/unit/resources/domain/worker/steps/dataset_transformer/expected" +) + +PERSON_IDENTIFIER = 
unit_type_variables.get("PERSON") +BK_HELSTASJONSKONSULTASJON_IDENTIFIER = unit_type_variables.get( + "BK_HELSESTASJONSKONSULTASJON" +) + +KREFTREG_DS_DESCRIBED = json.load(open(INPUT_DATA_PATH / "DESCRIBED.json")) +KREFTREG_DS_DESCRIBED["identifierVariables"] = [ + unit_type_variables.get("PERSON") +] +KREFTREG_DS_DESCRIBED["attributeVariables"] = [ + temporal_attributes.generate_start_time_attribute("STATUS"), + temporal_attributes.generate_stop_time_attribute("STATUS"), +] + +KREFTREG_DS_ENUMERATED = json.load(open(INPUT_DATA_PATH / "ENUMERATED.json")) +KREFTREG_DS_ENUMERATED["identifierVariables"] = [ + unit_type_variables.get("PERSON") +] +KREFTREG_DS_ENUMERATED["attributeVariables"] = [ + temporal_attributes.generate_start_time_attribute("STATUS"), + temporal_attributes.generate_stop_time_attribute("STATUS"), +] + +UTDANNING_PATCH = json.load(open(INPUT_DATA_PATH / "UTDANNING_PATCH.json")) +UTDANNING_PATCH["identifierVariables"] = [unit_type_variables.get("PERSON")] +UTDANNING_PATCH["attributeVariables"] = [ + temporal_attributes.generate_start_time_attribute("STATUS"), + temporal_attributes.generate_stop_time_attribute("STATUS"), +] + +UTDANNING = json.load(open(INPUT_DATA_PATH / "UTDANNING.json")) +UTDANNING["identifierVariables"] = [unit_type_variables.get("PERSON")] +UTDANNING["attributeVariables"] = [ + temporal_attributes.generate_start_time_attribute("STATUS"), + temporal_attributes.generate_stop_time_attribute("STATUS"), +] + +DESCRIBED_EXPECTED = json.load(open(EXPECTED_DATA_PATH / "DESCRIBED.json")) +STATUS_EXPECTED = json.load(open(EXPECTED_DATA_PATH / "UTDANNING.json")) +STATUS_PATCH_EXPECTED = json.load( + open(EXPECTED_DATA_PATH / "UTDANNING_PATCH.json") +) +ENUMERATED_EXPECTED = json.load(open(EXPECTED_DATA_PATH / "ENUMERATED.json")) + +CODELIST = [ + { + "code": "1", + "categoryTitle": [{"languageCode": "no", "value": "Grunnskole"}], + "validFrom": "1900-01-01", + "validUntil": None, + }, + { + "code": "2", + "categoryTitle": [{"languageCode": 
"no", "value": "Gymnasium"}], + "validFrom": "1910-01-01", + "validUntil": "1919-12-31", + }, + { + "code": "3", + "categoryTitle": [{"languageCode": "no", "value": "Bachelorgrad"}], + "validFrom": "1910-01-01", + "validUntil": None, + }, + { + "code": "2", + "categoryTitle": [ + {"languageCode": "no", "value": "Videregående skole"} + ], + "validFrom": "1920-01-01", + "validUntil": None, + }, + { + "code": "4", + "categoryTitle": [{"languageCode": "no", "value": "Mastergrad"}], + "validFrom": "1940-01-01", + "validUntil": None, + }, + { + "code": "5", + "categoryTitle": [{"languageCode": "no", "value": "Doktorgrad"}], + "validFrom": "1940-01-01", + "validUntil": None, + }, +] + +MISSING_VALUES = [ + { + "code": "99", + "categoryTitle": [{"languageCode": "no", "value": "Ukjent"}], + } +] + +TRANSFORMED_CODELIST = [ + { + "description": "description", + "validPeriod": {"start": -25567, "stop": -21916}, + "valueDomain": { + "codeList": [{"category": "Grunnskole", "code": "1"}], + "missingValues": [], + }, + }, + { + "description": "description", + "validPeriod": {"start": -21915, "stop": -18264}, + "valueDomain": { + "codeList": [ + {"category": "Grunnskole", "code": "1"}, + {"category": "Gymnasium", "code": "2"}, + {"category": "Bachelorgrad", "code": "3"}, + ], + "missingValues": [], + }, + }, + { + "description": "description", + "validPeriod": {"start": -18263, "stop": -10959}, + "valueDomain": { + "codeList": [ + {"category": "Grunnskole", "code": "1"}, + {"category": "Bachelorgrad", "code": "3"}, + {"category": "Videregående skole", "code": "2"}, + ], + "missingValues": [], + }, + }, + { + "description": "description", + "validPeriod": {"start": -10958}, + "valueDomain": { + "codeList": [ + {"category": "Grunnskole", "code": "1"}, + {"category": "Bachelorgrad", "code": "3"}, + {"category": "Videregående skole", "code": "2"}, + {"category": "Mastergrad", "code": "4"}, + {"category": "Doktorgrad", "code": "5"}, + ], + "missingValues": [], + }, + }, +] + 
+TRANSFORMED_CODELIST_WITH_MISSING_VALUES = [ + { + **represented, + "valueDomain": { + "codeList": represented["valueDomain"]["codeList"] + + [{"category": "Ukjent", "code": "99"}], + "missingValues": ["99"], + }, + } + for represented in TRANSFORMED_CODELIST +] diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/DESCRIBED.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/DESCRIBED.json new file mode 100644 index 00000000..1b86b402 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/DESCRIBED.json @@ -0,0 +1,115 @@ +{ + "name": "KREFTREG_DS", + "populationDescription": "Alle personer som har f\u00e5tt kreft.", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [ + "Helse", + "Helsetjenester" + ], + "temporalCoverage": { + "start": 16436, + "stop": 18627 + }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSONID_1", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "Long", + "representedVariables": [ + { + "description": "Identifikator for person i microdata", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Person er et enkeltmenneske, individ." + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "KREFTREG_DS", + "label": "Diagnosens sikkerhet", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Hvor p\u00e5litelig diagnosen i meldingen er. 
P\u00e5liteligheten vurderes ut fra sikkerhet rundt tumors malignitetspotensiale og sikkerhet rundt tumors prim\u00e6re utgangspunkt.", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Verdier for diagnosens sikkerhet", + "unitOfMeasure": "N/A" + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ], + "temporalStatusDates": [ + 16436, + 16801, + 17167, + 17532, + 17897 + ], + "temporalEnd": { + "description": "Variabelen blir ikke lenger oppdatert" + } +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/ENUMERATED.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/ENUMERATED.json new file mode 100644 index 00000000..dc38e373 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/ENUMERATED.json @@ -0,0 +1,213 @@ +{ + "name": "KREFTREG_DS", + "populationDescription": "Alle personer som har f\u00e5tt kreft.", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [ + "Helse", + "Helsetjenester" + ], + "temporalCoverage": { + "start": 16436, + "stop": 18627 + }, + "identifierVariables": [ + { + 
"variableRole": "Identifier", + "name": "PERSONID_1", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "Long", + "representedVariables": [ + { + "description": "Identifikator for person i microdata", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Person er et enkeltmenneske, individ." + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "KREFTREG_DS", + "label": "Diagnosens sikkerhet", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Hvor p\u00e5litelig diagnosen i meldingen er. P\u00e5liteligheten vurderes ut fra sikkerhet rundt tumors malignitetspotensiale og sikkerhet rundt tumors prim\u00e6re utgangspunkt.", + "validPeriod": { + "start": -6209, + "stop": 18627 + }, + "valueDomain": { + "codeList": [ + { + "category": "Det foreligger svulst med usikker malignitet og usikker topografi", + "code": "0" + }, + { + "category": "Det foreligger svulst uten p\u00e5vist malignitet, men sikker topografi", + "code": "1" + }, + { + "category": "Det foreligger svulst med sikker malignitet, men usikker topografi (benyttes bare for solide svulster)", + "code": "2" + }, + { + "category": "Det foreligger svulst med sikker malignitet og sikker topografi", + "code": "3" + }, + { + "category": "Der foreligger svulst med sikker malignitet og sikker topografi hos pasient som er registrert med premalign tilstand i samme organ mer enn fire hele m\u00e5neder forut for diagnosem\u00e5neden til aktuelle krefttilfelle", + "code": "4" + }, + { + "category": "Det foreligger klinisk sikker kreft og sikker topografi", + "code": "5" + }, + { + "category": "Det foreligger klinisk sikker kreft og sikker topografi", + "code": "6" + }, + { + "category": "Det foreligger 
histologi-/cytologimelding eller d\u00f8dsattest om svulst med usikker malignitetsgrad (usikkert benign/premalign/malign svulst) og sikker topografi f\u00f8r klinisk melding er registrert", + "code": "7" + }, + { + "category": "Ugyldig/manglende verdi", + "code": "999" + }, + { + "category": "Uidentifiserbar verdi", + "code": "888" + } + ], + "missingValues": [ + "999", + "888" + ] + } + }, + { + "description": "Hvor p\u00e5litelig diagnosen i meldingen er. P\u00e5liteligheten vurderes ut fra sikkerhet rundt tumors malignitetspotensiale og sikkerhet rundt tumors prim\u00e6re utgangspunkt.", + "validPeriod": { + "start": 18628 + }, + "valueDomain": { + "codeList": [ + { + "category": "Det foreligger svulst med usikker malignitet og usikker topografi", + "code": "0" + }, + { + "category": "Det foreligger svulst med sikker malignitet, men usikker topografi (benyttes bare for solide svulster)", + "code": "2" + }, + { + "category": "Det foreligger svulst med sikker malignitet og sikker topografi", + "code": "3" + }, + { + "category": "Der foreligger svulst med sikker malignitet og sikker topografi hos pasient som er registrert med premalign tilstand i samme organ mer enn fire hele m\u00e5neder forut for diagnosem\u00e5neden til aktuelle krefttilfelle", + "code": "4" + }, + { + "category": "Det foreligger klinisk sikker kreft og sikker topografi", + "code": "5" + }, + { + "category": "Det foreligger klinisk sikker kreft og sikker topografi", + "code": "6" + }, + { + "category": "Det foreligger histologi-/cytologimelding eller d\u00f8dsattest om svulst med usikker malignitetsgrad (usikkert benign/premalign/malign svulst) og sikker topografi f\u00f8r klinisk melding er registrert", + "code": "7" + }, + { + "category": "Ugyldig/manglende verdi", + "code": "999" + }, + { + "category": "Uidentifiserbar verdi", + "code": "888" + } + ], + "missingValues": [ + "999", + "888" + ] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + 
"label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 16436, + "stop": 18627 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ], + "temporalStatusDates": [ + 16436, + 16801, + 17167, + 17532, + 17897 + ], + "temporalEnd": { + "description": "Variabelen blir ikke lenger oppdatert", + "successors": [ + "KREFTREG_DS_1", + "KREFTREG_DS_2" + ] + } +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING.json new file mode 100644 index 00000000..af3027c3 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING.json @@ -0,0 +1,135 @@ +{ + "name": "UTDANNING", + "populationDescription": "Personer med utdannelse i Norge", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [ + "BEFOLKNING", + "UTDANNING" + ], + "temporalCoverage": { + "start": 7305, + "stop": 8766 + }, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSONID_1", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "Long", + "representedVariables": [ + { + "description": "Identifikator for person i microdata", + "validPeriod": { + "start": 7305, + "stop": 8766 + }, + "valueDomain": { + "description": "Pseudonymisert personnummer", + 
"unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Person er et enkeltmenneske, individ." + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "UTDANNING", + "label": "Utdanning", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Utdanningsniv\u00e5 for person", + "validPeriod": { + "start": -16071 + }, + "valueDomain": { + "codeList": [ + { + "category": "Grunnskole", + "code": "1" + }, + { + "category": "Videreg\u00e5ende skole", + "code": "2" + }, + { + "category": "Bachelorgrad", + "code": "3" + }, + { + "category": "Mastergrad", + "code": "4" + }, + { + "category": "Doktorgrad", + "code": "5" + } + ], + "missingValues": [] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 7305, + "stop": 8766 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": "STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": { + "start": 7305, + "stop": 8766 + }, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ], + "temporalStatusDates": [ + 7305, + 7670, + 8035, + 8401, + 8766 + ], + "temporalEnd": { + "description": "Variabelen blir ikke lenger oppdatert" + } +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING_PATCH.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING_PATCH.json new file mode 100644 
index 00000000..6c8a2387 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/expected/UTDANNING_PATCH.json @@ -0,0 +1,120 @@ +{ + "name": "UTDANNING", + "populationDescription": "Personer med utdannelse i Norge", + "languageCode": "no", + "temporality": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "subjectFields": [ + "BEFOLKNING", + "SAMFUNN" + ], + "temporalCoverage": {}, + "identifierVariables": [ + { + "variableRole": "Identifier", + "name": "PERSONID_1", + "label": "Personidentifikator", + "notPseudonym": false, + "dataType": "Long", + "representedVariables": [ + { + "description": "Identifikator for person i microdata", + "validPeriod": {}, + "valueDomain": { + "description": "Pseudonymisert personnummer", + "unitOfMeasure": "N/A" + } + } + ], + "format": "RandomUInt48", + "keyType": { + "name": "PERSON", + "label": "Person", + "description": "Person er et enkeltmenneske, individ." + } + } + ], + "measureVariable": { + "variableRole": "Measure", + "name": "UTDANNING", + "label": "Utdanning", + "notPseudonym": true, + "dataType": "String", + "representedVariables": [ + { + "description": "Utdanningsniv\u00e5 for person", + "validPeriod": { + "start": -16071 + }, + "valueDomain": { + "codeList": [ + { + "category": "Grunnskole", + "code": "1" + }, + { + "category": "Videreg\u00e5ende skole", + "code": "2" + }, + { + "category": "Bachelorgrad", + "code": "3" + }, + { + "category": "Mastergrad", + "code": "4" + }, + { + "category": "Doktorgrad", + "code": "5" + } + ], + "missingValues": [] + } + } + ] + }, + "attributeVariables": [ + { + "variableRole": "Start", + "name": "START", + "label": "Startdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Startdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": {}, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + }, + { + "variableRole": "Stop", + "name": 
"STOP", + "label": "Stoppdato", + "notPseudonym": true, + "dataType": "Instant", + "representedVariables": [ + { + "description": "Stoppdato/m\u00e5letidspunktet for hendelsen", + "validPeriod": {}, + "valueDomain": { + "description": "Dato oppgitt i dager siden 1970-01-01", + "unitOfMeasure": "N/A" + } + } + ] + } + ], + "temporalEnd": { + "description": "Variabelen blir ikke lenger oppdatert", + "successors": [ + "UTDANNING_1", + "UTDANNING_2" + ] + } +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/input/DESCRIBED.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/DESCRIBED.json new file mode 100644 index 00000000..5a51fc63 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/DESCRIBED.json @@ -0,0 +1,56 @@ +{ + "shortName": "KREFTREG_DS", + "temporalityType": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "populationDescription": [ + {"languageCode": "no", "value": "Alle personer som har fått kreft."}, + {"languageCode": "en", "value": "All persons who are suffering from cancer."} + ], + "spatialCoverageDescription": [ + {"languageCode": "no", "value": "Norge"} + ], + "subjectFields": [ + [{"languageCode": "no", "value": "Helse"}], + [{"languageCode": "no", "value": "Helsetjenester"}] + ], + "dataRevision": { + "description": [ + {"languageCode": "no", "value": "Første publisering."} + ], + "temporalCoverageStart": "2015-01-01", + "temporalCoverageLatest": "2020-12-31", + "temporalStatusDates": ["2015-01-01", "2016-01-01", "2017-01-01", "2018-01-01", "2019-01-01"], + "temporalEnd": { + "description": [{"languageCode": "no","value": "Variabelen blir ikke lenger oppdatert"}] + } + }, + "measureVariables": [ + { + "shortName": "KREFTREG_DS", + "name": [ + {"languageCode": "no", "value": "Diagnosens sikkerhet"} + ], + "description": [ + {"languageCode": "no", "value": "Hvor pålitelig diagnosen i meldingen er. 
Påliteligheten vurderes ut fra sikkerhet rundt tumors malignitetspotensiale og sikkerhet rundt tumors primære utgangspunkt."} + ], + "populationDescription": [ + {"languageCode": "no", "value": "Alle krefttilfeller, forstadier til kreft og enkelte benigne svulster for personer i Norge."} + ], + "dataType": "STRING", + "uriDefinition": ["https://metadata.kreftregisteret.no/variables/detail/8"], + "valueDomain": { + "description": [ + {"languageCode": "no", "value": "Verdier for diagnosens sikkerhet"} + ], + "uriDefinition": [ + "http://www.ssb.no/a/metadata/conceptvariable/vardok/26/nb", + "https://www.ssb.no/a/metadata/conceptvariable/vardok/26/en" + ], + "measurementUnitDescription": [ + {"languageCode": "no", "value": "N/A"}, + {"languageCode": "en", "value": "N/A"} + ] + } + } + ] + } diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/input/ENUMERATED.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/ENUMERATED.json new file mode 100644 index 00000000..84c25252 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/ENUMERATED.json @@ -0,0 +1,122 @@ +{ + "shortName": "KREFTREG_DS", + "temporalityType": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "populationDescription": [ + {"languageCode": "no", "value": "Alle personer som har fått kreft."}, + {"languageCode": "en", "value": "All persons who are suffering from cancer."} + ], + "spatialCoverageDescription": [ + {"languageCode": "no", "value": "Norge"} + ], + "subjectFields": [ + [{"languageCode": "no", "value": "Helse"}], + [{"languageCode": "no", "value": "Helsetjenester"}] + ], + "dataRevision": { + "description": [ + {"languageCode": "no", "value": "Første publisering."} + ], + "temporalCoverageStart": "2015-01-01", + "temporalCoverageLatest": "2020-12-31", + "temporalStatusDates": ["2015-01-01", "2016-01-01", "2017-01-01", "2018-01-01", "2019-01-01"], + "temporalEnd": { + "description": [{"languageCode": "no", "value": 
"Variabelen blir ikke lenger oppdatert"}], + "successors": [ + "KREFTREG_DS_1", "KREFTREG_DS_2" + ] + } + }, + "measureVariables": [ + { + "variableRole": "MEASURE", + "shortName": "KREFTREG_DS", + "name": [ + {"languageCode": "no", "value": "Diagnosens sikkerhet"} + ], + "description": [ + {"languageCode": "no", "value": "Hvor pålitelig diagnosen i meldingen er. Påliteligheten vurderes ut fra sikkerhet rundt tumors malignitetspotensiale og sikkerhet rundt tumors primære utgangspunkt."} + ], + "populationDescription": [ + {"languageCode": "no", "value": "Alle krefttilfeller, forstadier til kreft og enkelte benigne svulster for personer i Norge."} + ], + "dataType": "STRING", + "uriDefinition": ["https://metadata.kreftregisteret.no/variables/detail/8"], + "valueDomain": { + "codeList": [ + { + "code": "0", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger svulst med usikker malignitet og usikker topografi"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "1", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger svulst uten påvist malignitet, men sikker topografi"} + ], + "validFrom": "1953-01-01", + "validUntil": "2020-12-31" + }, + { + "code": "2", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger svulst med sikker malignitet, men usikker topografi (benyttes bare for solide svulster)"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "3", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger svulst med sikker malignitet og sikker topografi"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "4", + "categoryTitle": [ + {"languageCode": "no", "value": "Der foreligger svulst med sikker malignitet og sikker topografi hos pasient som er registrert med premalign tilstand i samme organ mer enn fire hele måneder forut for diagnosemåneden til aktuelle krefttilfelle"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "5", + "categoryTitle": [ + {"languageCode": "no", "value": "Det 
foreligger klinisk sikker kreft og sikker topografi"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "6", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger klinisk sikker kreft og sikker topografi"} + ], + "validFrom": "1953-01-01" + }, + { + "code": "7", + "categoryTitle": [ + {"languageCode": "no", "value": "Det foreligger histologi-/cytologimelding eller dødsattest om svulst med usikker malignitetsgrad (usikkert benign/premalign/malign svulst) og sikker topografi før klinisk melding er registrert"} + ], + "validFrom": "1953-01-01" + } + ], + "sentinelAndMissingValues" : [ + { + "code": "999", + "categoryTitle": [ + {"languageCode": "no", "value": "Ugyldig/manglende verdi"} + ] + }, + { + "code": "888", + "categoryTitle": [ + {"languageCode": "no", "value": "Uidentifiserbar verdi"} + ] + } + ] + } + } + ] + } diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING.json new file mode 100644 index 00000000..2464a2a6 --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING.json @@ -0,0 +1,141 @@ +{ + "temporalityType": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + "populationDescription": [ + { + "languageCode": "no", + "value": "Personer med utdannelse i Norge" + } + ], + "subjectFields": [ + [ + { + "value": "BEFOLKNING", + "languageCode": "no" + } + ], + [ + { + "value": "UTDANNING", + "languageCode": "no" + } + ] + ], + "spatialCoverageDescription": [ + { + "languageCode": "no", + "value": "Norge" + } + ], + "dataRevision": { + "description": [ + { + "languageCode": "no", + "value": "Første publisering." 
+ } + ], + "temporalEnd": { + "description": [ + { + "languageCode": "no", + "value": "Variabelen blir ikke lenger oppdatert" + } + ] + }, + "temporalCoverageStart": "1990-01-01", + "temporalCoverageLatest": "1994-01-01", + "temporalStatusDates": [ + "1990-01-01", + "1991-01-01", + "1992-01-01", + "1993-01-01", + "1994-01-01" + ] + }, + "measureVariables": [ + { + "name": [ + { + "languageCode": "no", + "value": "Utdanning" + } + ], + "description": [ + { + "languageCode": "no", + "value": "Utdanningsnivå for person" + } + ], + "subjectFields": [ + [ + { + "languageCode": "no", + "value": "Testdata" + } + ] + ], + "dataType": "STRING", + "valueDomain": { + "codeList": [ + { + "code": "1", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Grunnskole" + } + ], + "validFrom": "1926-01-01", + "validUntil": null + }, + { + "code": "2", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Videregående skole" + } + ], + "validFrom": "1926-01-01", + "validUntil": null + }, + { + "code": "3", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Bachelorgrad" + } + ], + "validFrom": "1926-01-01", + "validUntil": null + }, + { + "code": "4", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Mastergrad" + } + ], + "validFrom": "1926-01-01", + "validUntil": null + }, + { + "code": "5", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Doktorgrad" + } + ], + "validFrom": "1926-01-01", + "validUntil": null + } + ] + }, + "shortName": "UTDANNING" + } + ], + "shortName": "UTDANNING" +} diff --git a/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING_PATCH.json b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING_PATCH.json new file mode 100644 index 00000000..7955d69a --- /dev/null +++ b/tests/unit/resources/domain/worker/steps/dataset_transformer/input/UTDANNING_PATCH.json @@ -0,0 +1,131 @@ +{ + "temporalityType": "STATUS", + "sensitivityLevel": "PERSON_GENERAL", + 
"populationDescription": [ + { + "languageCode": "no", + "value": "Personer med utdannelse i Norge" + } + ], + "spatialCoverageDescription": [ + { + "languageCode": "no", + "value": "Norge" + } + ], + "subjectFields": [ + [ + { + "languageCode": "no", + "value": "BEFOLKNING" + } + ], + [ + { + "languageCode": "no", + "value": "SAMFUNN" + } + ] + ], + "dataRevision": { + "description": [ + { + "languageCode": "no", + "value": "Første publisering." + } + ], + "temporalEnd": { + "description": [ + { + "languageCode": "no", + "value": "Variabelen blir ikke lenger oppdatert" + } + ], + "successors": [ + "UTDANNING_1", + "UTDANNING_2" + ] + } + }, + "measureVariables": [ + { + "name": [ + { + "languageCode": "no", + "value": "Utdanning" + } + ], + "description": [ + { + "languageCode": "no", + "value": "Utdanningsnivå for person" + } + ], + "subjectFields": [ + [ + { + "languageCode": "no", + "value": "Testdata" + } + ] + ], + "dataType": "STRING", + "valueDomain": { + "codeList": [ + { + "code": "1", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Grunnskole" + } + ], + "validFrom": "1926-01-01" + }, + { + "code": "2", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Videregående skole" + } + ], + "validFrom": "1926-01-01" + }, + { + "code": "3", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Bachelorgrad" + } + ], + "validFrom": "1926-01-01" + }, + { + "code": "4", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Mastergrad" + } + ], + "validFrom": "1926-01-01" + }, + { + "code": "5", + "categoryTitle": [ + { + "languageCode": "no", + "value": "Doktorgrad" + } + ], + "validFrom": "1926-01-01" + } + ] + }, + "shortName": "UTDANNING" + } + ], + "shortName": "UTDANNING" +}